In [None]:
%pip install "loguru==0.7.3" "scikit-learn==1.6.1" "elasticsearch==8.17.0" "openai==1.59.7" "tqdm==4.67.1" "pandas==2.2.3" "bokeh==3.6.2"

## Dataset Preparation

Configure the environment variables required for the Azure OpenAI service to generate embeddings. Ensure the following environment variables are set before executing the code:

- `AZURE_OPENAI_API_KEY`
- `AZURE_OPENAI_ENDPOINT`
- `AZURE_OPENAI_DEPLOYMENT_NAME` - this should be the name of a `text-embedding-ada-002` deployment

In [1]:
import os
import json
import re

from loguru import logger
import numpy as np
from openai import AzureOpenAI
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import normalize
from tqdm import tqdm

First we define some helper functions for generating the text embeddings:

In [2]:
def chunk_text(text: str, max_chunk_size: int = 500) -> list[str]:
    """
    Split text into chunks of at most ``max_chunk_size`` whitespace-separated words.

    :param text: Input text to be chunked.
    :param max_chunk_size: Maximum number of words in each chunk; must be positive.
    :return: List of text chunks (empty when the text contains no words).
    :raises ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        # A negative size would make the range below empty and silently drop
        # the whole text; zero would fail inside ``range`` with an unrelated
        # message. Fail early with a clear error instead.
        raise ValueError(
            f"max_chunk_size must be a positive integer, got {max_chunk_size}"
        )

    words = text.split()
    return [
        " ".join(words[start : start + max_chunk_size])
        for start in range(0, len(words), max_chunk_size)
    ]


def clean_text(text: str) -> str:
    """
    Normalise raw text before it is chunked and embedded.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is deleted, and English stopwords are filtered out.

    :param text: Input text to be cleaned.
    :return: Cleaned text, with the remaining words joined by single spaces.
    """
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())
    kept_words = (
        token for token in letters_only.split() if token not in ENGLISH_STOP_WORDS
    )
    return " ".join(kept_words)


def generate_embedding_azure_openai(chunks: list[str]) -> list[np.ndarray]:
    """
    Embed each text chunk with an Azure OpenAI embeddings deployment.

    Reads ``AZURE_OPENAI_API_KEY``, ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_DEPLOYMENT_NAME`` from the environment and sends one
    embeddings request per chunk.

    :param chunks: List of text chunks.
    :return: List of numpy arrays, one embedding per chunk, in input order.
    :raises ValueError: If ``AZURE_OPENAI_DEPLOYMENT_NAME`` is unset or empty.
    """
    azure_client = AzureOpenAI(
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version="2023-05-15",
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    )
    deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
    if not deployment:
        raise ValueError(
            "Environment variable AZURE_OPENAI_DEPLOYMENT_NAME must be set."
        )

    def embed(chunk: str) -> np.ndarray:
        # Each request carries a single-element input list, so the first item
        # of the response is the embedding of this chunk.
        response = azure_client.embeddings.create(model=deployment, input=[chunk])
        return np.array(response.data[0].embedding)

    return [embed(chunk) for chunk in chunks]

Fetch the newsgroup data for the following five categories:
- rec.sport.baseball
- rec.sport.hockey 
- comp.sys.ibm.pc.hardware 
- talk.religion.misc
- sci.med

In [3]:
# The five newsgroups used throughout this notebook.
categories = [
    "rec.sport.baseball",
    "rec.sport.hockey",
    "comp.sys.ibm.pc.hardware",
    "talk.religion.misc",
    "sci.med",
]

# Load only the posts that belong to the selected categories.
newsgroups_data = fetch_20newsgroups(subset="all", categories=categories)
# Display the first raw post as a quick sanity check.
newsgroups_data["data"][0]

'From: glang@slee01.srl.ford.com (Gordon Lang)\nSubject: Re: IP numbers on Ethernet Cards\nOrganization: Ford Motor Company Research Laboratory\nLines: 30\nNNTP-Posting-Host: slee01.srl.ford.com\nX-Newsreader: Tin 1.1 PL5\n\nTigger (djohnson@moose.uvm.edu) wrote:\n: Hi!\n: \t\n: Is it possible through either pin configuration or through software\n: programming to change the IP numbers on an ethernet card?\n: \t\n: Thanks in Advance!\n: \n: -- \n: =-Dave   *Tigger!*\n: \n: djohnson@moose.uvm.edu        \'Tiggers are wonderful things!\'\n: Dave C Johnson\n\nI think you mean the ethernet numbers.  The 8 byte ethernet id is the unique\nElectronic Serial Number (ESN) assigned to each ethernet board in existence.\nThis is a "physical layer" concept.  The IP address is a higher layer protocol.\nThe analogy to telephone service is the IP address is your phone number, while\nthe particular wire pair in the cable on the pole has some (unknown to you or\nI) physical identification scheme (number)

In [25]:
# Map each integer target id to its newsgroup name. Enumerating the names
# (rather than zipping them with a hard-coded range) keeps the mapping
# complete if the list of categories ever changes.
target_map = dict(enumerate(newsgroups_data["target_names"]))
target_map

{0: 'comp.sys.ibm.pc.hardware',
 1: 'rec.sport.baseball',
 2: 'rec.sport.hockey',
 3: 'sci.med',
 4: 'talk.religion.misc'}

## Vectorize the data

Get the `text-embedding-ada-002` vectors for this dataset. For the purpose of this demo, each document is split into fixed-size word chunks, and the document vector is the element-wise maximum of its chunk vectors.

In [20]:
dataset_filepath = "data/openai_vectorized_dataset.json"
os.makedirs("data", exist_ok=True)

logger.info(
    f"Creating embeddings for newsgroup data. Number of documents: {len(newsgroups_data['data'])}"
)

data_with_vectors = []
for doc, target in tqdm(
    zip(newsgroups_data["data"], newsgroups_data["target"]),
    total=len(newsgroups_data["data"]),
):
    chunks = chunk_text(clean_text(doc))
    if not chunks:
        # Cleaning can remove every word of a document. There is then nothing
        # to embed, and np.max over an empty list would raise, so skip it.
        logger.warning("Skipping a document with no text left after cleaning.")
        continue
    chunk_vectors = generate_embedding_azure_openai(chunks)
    # Collapse the per-chunk embeddings into one document vector by taking the
    # element-wise maximum across chunks.
    doc_vector = np.max(chunk_vectors, axis=0)
    data_with_vectors.append(
        {
            "news_body": doc,
            "target": target_map[target],
            # Plain list so the record is JSON-serializable.
            "openai_vector": doc_vector.tolist(),
        }
    )

logger.info("Embeddings created for newsgroup data.")
logger.info(f"Storing dataset to {dataset_filepath}")

with open(dataset_filepath, "w") as f:
    json.dump(data_with_vectors, f)

[32m2025-01-23 10:41:29.603[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mCreating embeddings for newsgroup data. The length of the vectors is 4593[0m
100%|██████████| 4593/4593 [18:49<00:00,  4.07it/s]
[32m2025-01-23 11:00:19.268[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mEmbeddings created for newsgroup data.[0m
[32m2025-01-23 11:00:19.269[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m19[0m - [1mStoring dataset to data/openai_vectorized_dataset.json[0m


In [21]:
# Inspect which fields are stored for each vectorized document.
data_with_vectors[0].keys()

dict_keys(['news_body', 'target', 'openai_vector'])

In [22]:
# Newsgroup name (label) of the first document.
print(data_with_vectors[0]["target"])

comp.sys.ibm.pc.hardware


In [23]:
# Raw text of the first document, as stored alongside its vector.
print(data_with_vectors[0]["news_body"])

From: glang@slee01.srl.ford.com (Gordon Lang)
Subject: Re: IP numbers on Ethernet Cards
Organization: Ford Motor Company Research Laboratory
Lines: 30
NNTP-Posting-Host: slee01.srl.ford.com
X-Newsreader: Tin 1.1 PL5

Tigger (djohnson@moose.uvm.edu) wrote:
: Hi!
: 	
: Is it possible through either pin configuration or through software
: programming to change the IP numbers on an ethernet card?
: 	
: Thanks in Advance!
: 
: -- 
: =-Dave   *Tigger!*
: 
: djohnson@moose.uvm.edu        'Tiggers are wonderful things!'
: Dave C Johnson

I think you mean the ethernet numbers.  The 8 byte ethernet id is the unique
Electronic Serial Number (ESN) assigned to each ethernet board in existence.
This is a "physical layer" concept.  The IP address is a higher layer protocol.
The analogy to telephone service is the IP address is your phone number, while
the particular wire pair in the cable on the pole has some (unknown to you or
I) physical identification scheme (number).

But to answer your question 

In [26]:
# First five components of the first document's embedding vector.
print(data_with_vectors[0]["openai_vector"][:5])

[-0.003231409704312682, -0.001362027251161635, -0.008819126524031162, -0.03726506605744362, -0.0034561441279947758]


## K-means clustering

Compute cluster centers for a dataset using KMeans and save them to a JSON file. 

Cluster centers will be stored in `data/openai_cluster_centers.json`. 

In [28]:
import json
from loguru import logger
from sklearn.cluster import KMeans

dataset_filepath = "data/openai_vectorized_dataset.json"
cluster_centers_filepath = "data/openai_cluster_centers.json"

# Reload the vectorized dataset that the embedding step wrote to disk.
with open(dataset_filepath, "r") as dataset_file:
    data_with_vectors = json.load(dataset_file)

n_clusters = 5
# L2-normalize every document vector before clustering.
document_vectors = [record["openai_vector"] for record in data_with_vectors]
normalized_vectors = normalize(document_vectors)

logger.info("Started training Kmeans model...")
kmeans_model = KMeans(n_clusters=n_clusters, random_state=123).fit(normalized_vectors)

# Store the centers as nested lists so they can be written as JSON.
cluster_centers = kmeans_model.cluster_centers_.tolist()
with open(cluster_centers_filepath, "w") as centers_file:
    json.dump(cluster_centers, centers_file)

logger.info(f"Kmeans cluster centers saved at file_path: {cluster_centers_filepath}")

[32m2025-01-23 11:01:14.091[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mStarted training Kmeans model...[0m
[32m2025-01-23 11:01:14.223[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mKmeans cluster centers saved at file_path: data/openai_cluster_centers.json[0m


## Store the data in Elasticsearch

In [36]:
import os
from elasticsearch import Elasticsearch, helpers

# Connection settings are read from the environment so that no credentials
# are written into the notebook.
es_host = os.getenv("ES_HOST")
es_credentials = (os.getenv("ES_USERNAME"), os.getenv("ES_PASSWORD"))
es_client = Elasticsearch(hosts=es_host, basic_auth=es_credentials)

In [None]:
def ingest_data_to_es(es: Elasticsearch, index_name, data) -> None:
    """
    Bulk-index documents into an Elasticsearch index.

    Nothing is sent, and nothing is logged, when ``data`` is empty.

    :param es: Elasticsearch client used for the bulk request.
    :param index_name: Name of the destination index.
    :param data: Iterable of documents; each one becomes the ``_source`` of a
        bulk action.
    """
    actions = [{"_index": index_name, "_source": doc} for doc in data]
    if not actions:
        return

    helpers.bulk(es, actions)
    logger.info(f"Ingested {len(actions)} documents into index '{index_name}'")


# Index the vectorized documents; this index is the source of the later reindex.
ingest_data_to_es(
    es_client, index_name="newsgroups_openai_dataset", data=data_with_vectors
)

[32m2025-01-23 11:06:57.607[0m | [1mINFO    [0m | [36m__main__[0m:[36mingest_data_to_es[0m:[36m50[0m - [1mIngested 4593 documents into index 'newsgroups_openai_dataset'[0m


In [40]:
cluster_centers_filepath = "data/openai_cluster_centers.json"

# Reload the K-means cluster centers that the clustering step wrote to disk,
# so this cell works from the saved file rather than from kernel state.
with open(cluster_centers_filepath, "r") as f:
    cluster_centers = json.load(f)

## Add clustering ingest pipeline

In [52]:
# Script processor (Painless) that assigns every incoming document to its
# nearest cluster center:
#   - when `params.normalize` is true, the vector read from
#     `params.analysisField` is L2-normalized into a new list first,
#   - the Euclidean distance to each entry of `params.clusterCenters` is computed,
#   - the index of the nearest center is written to `ml_clustering.closestCluster`
#     and the smallest distance to `ml_clustering.minDistance`.
script = {
    "script": {
        "lang": "painless",
        "source": """ 
     double euclideanDistance(List array1, List array2) {
       double sum = 0.0;
       for (int i=0; i<array1.length; i++) {
         sum += Math.pow((array1[i]-array2[i]), 2.0);
       }
       return Math.sqrt(sum);
     }

     List l2NormalizeArray(List array) {
       double sumSquared = 0.0;
       for (element in array) {
         sumSquared += Math.pow(element, 2.0)
       }
       double l2Norm = Math.sqrt(sumSquared);
       // create a new array to not overwrite ctx
       List outputArray = new ArrayList();
       for (element in array) {
         outputArray.add(element/l2Norm)
       }
       return outputArray
     }

     List distances = new ArrayList();
     double minDistance;
     int closestCluster;

     for (int i=0; i<params.clusterCenters.length; i++) {
       double distance;
       if (params.normalize == true) {
         distance = euclideanDistance(params.clusterCenters[i], l2NormalizeArray(ctx[params.analysisField]));
       } else {
         distance = euclideanDistance(params.clusterCenters[i], ctx[params.analysisField]);
       }
       distances.add(distance);
       if (i == 0) {
         minDistance = distance;
         closestCluster = i;
       } else {
         if (distance < minDistance) {
           minDistance = distance;
           closestCluster = i;
         }
       }
     }
     ctx["ml_clustering.closestCluster"] = closestCluster;
     ctx["ml_clustering.minDistance"] = Collections.min(distances);
   """,
        "params": {
            # Centers loaded from the JSON file written by the K-means step.
            "clusterCenters": cluster_centers,
            # Document field that holds the embedding vector.
            "analysisField": "openai_vector",
            # The K-means model was fitted on L2-normalized vectors, so the
            # document vector is normalized before distances are computed.
            "normalize": True,
        },
    }
}

# Register the script as the only processor of the ingest pipeline.
es_client.ingest.put_pipeline(
    id="ml_clustering_5_newsgroup_openai", processors=[script]
)

ObjectApiResponse({'acknowledged': True})

When simulating the ingest pipeline, we can see that a cluster number is assigned to a test document.

In [63]:
# Run the pipeline in simulation mode on the second document of the dataset.
test_response = es_client.ingest.simulate(
    id="ml_clustering_5_newsgroup_openai", docs=[{"_source": data_with_vectors[1]}]
)
# Work on a copy of the processed document returned by the simulation.
source = test_response["docs"][0]["doc"]["_source"].copy()
print(source.keys())

# Cluster index that the pipeline assigned to this document.
source["ml_clustering.closestCluster"]

dict_keys(['news_body', 'openai_vector', 'target', 'ml_clustering.closestCluster', 'ml_clustering.minDistance'])


2

Finally, we will reindex the dataset through the new pipeline, which will assign `ml_clustering.closestCluster` to every document:

In [84]:
# Copy every document from the raw index into a new index, passing each one
# through the clustering ingest pipeline on the way.
body = {
    "source": {
        "index": "newsgroups_openai_dataset",
    },
    "dest": {
        "index": "newsgroups_openai_dataset_with_clustering",
        "pipeline": "ml_clustering_5_newsgroup_openai",
    },
}
# Started with wait_for_completion=False; the returned task id (task["task"])
# is used in the next cell to check whether the reindex has finished.
task = es_client.reindex(body=body, wait_for_completion=False)

After a few moments, the task will complete:

In [95]:
# Look up the reindex task by id and report its "completed" flag.
print("task completed: ", es_client.tasks.get(task_id=task["task"])["completed"])

task completed:  True


  print("task completed: ", es_client.tasks.get(task_id=task['task'])['completed'])


## Visualization

In [1]:
import os
from elasticsearch import Elasticsearch, helpers

# Connection settings are read from the environment so that no credentials
# are written into the notebook.
es_host = os.getenv("ES_HOST")
es_credentials = (os.getenv("ES_USERNAME"), os.getenv("ES_PASSWORD"))
es_client = Elasticsearch(hosts=es_host, basic_auth=es_credentials)

# Scan the clustered index and keep only the `_source` document of every hit.
data = [
    hit["_source"]
    for hit in helpers.scan(es_client, index="newsgroups_openai_dataset_with_clustering")
]

For the visualizations, first we are going to assign a label to the `ml_clustering.closestCluster`. In order to do this, we slice the data by the newsgroup name (target), then count the cluster numbers assigned in that slice, and assign a mapping based on the highest count.

In [2]:
import pandas as pd

df_openai = pd.DataFrame(data)

# For every newsgroup, find the cluster id that most of its documents were
# assigned to, and use the newsgroup name as the label of that cluster.
cluster_column = "ml_clustering.closestCluster"
open_ai_category_map = {}
for newsgroup in df_openai["target"].unique():
    in_newsgroup = df_openai["target"] == newsgroup
    cluster_counts = df_openai.loc[in_newsgroup, cluster_column].value_counts()
    # The first index entry of the counts is taken as the dominant cluster.
    dominant_cluster = int(cluster_counts.index[0])
    open_ai_category_map[dominant_cluster] = newsgroup

df_openai["cluster_label"] = df_openai[cluster_column].map(open_ai_category_map)

print(open_ai_category_map)

{4: 'comp.sys.ibm.pc.hardware', 2: 'talk.religion.misc', 1: 'rec.sport.hockey', 3: 'rec.sport.baseball', 0: 'sci.med'}


Before creating the visualizations, let's look at the in-sample accuracy and confusion matrix:

In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix, mutual_info_score

# True newsgroup of each document vs. the newsgroup label given to its cluster.
openai_true = df_openai["target"]
openai_pred = df_openai["cluster_label"]

# In-sample agreement between the cluster labels and the true newsgroups.
print("accuracy", accuracy_score(y_true=openai_true, y_pred=openai_pred))
print(confusion_matrix(y_true=openai_true, y_pred=openai_pred))

accuracy 0.9623339865011975
[[973   0   0   4   5]
 [  9 935  22   6  22]
 [  9  21 963   1   5]
 [  6   1   0 931  52]
 [  0   0   0  10 618]]


The last thing we need to do for the visualizations is to compute the t-SNE embeddings for the `text-embedding-ada-002` vectors:

In [4]:
import numpy as np
from sklearn.manifold import TSNE

# t-SNE is stochastic; fix the seed (the same value used for KMeans above) so
# the 2-D layout, and therefore the plots, are reproducible across runs.
model_openai = TSNE(n_components=2, random_state=123)
transformed_openai = model_openai.fit_transform(
    np.asarray(list(df_openai["openai_vector"]))
)

# Keep the two t-SNE coordinates next to each document for plotting.
df_openai["tsne_x"] = transformed_openai[:, 0]
df_openai["tsne_y"] = transformed_openai[:, 1]

Then we can create visualizations!

In [25]:
from bokeh.plotting import output_notebook, figure, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Category10_10 as colors_list
from bokeh.layouts import column


# Configure Bokeh to display its plots in the notebook output.
output_notebook()


def create_source(subset):
    """
    Build the Bokeh data source for one slice of the clustered DataFrame.

    Relies on the module-level ``color_map`` (newsgroup name -> color), which
    must be defined before this function is called.

    :param subset: DataFrame slice with ``tsne_x``, ``tsne_y``, ``news_body``,
        ``cluster_label`` and ``target`` columns.
    :return: ``ColumnDataSource`` holding the coordinates, a 100-character
        preview of each post, and the colors of the predicted and actual labels.
    """
    predicted_labels = subset["cluster_label"]
    columns = {
        "index": list(subset.index),
        "x": list(subset["tsne_x"]),
        "y": list(subset["tsne_y"]),
        # Only a short preview of each post is kept.
        "news_body": [body[:100] for body in subset["news_body"]],
        "cluster_number": list(predicted_labels),
        "pred_color": list(predicted_labels.map(color_map)),
        "actual_color": list(subset["target"].map(color_map)),
    }
    return ColumnDataSource(data=columns)


# Tooltip shown when hovering a point: its index, its coordinates and the
# preview of the post stored in the `news_body` column of the data source.
hover = HoverTool(
    tooltips=[
        ("index", "$index"),
        ("(x,y)", "(@x, @y)"),
        ("news_body", "@news_body"),
    ]
)

# One palette color per distinct newsgroup name; read by create_source().
color_map = dict(zip(df_openai["target"].unique(), colors_list))

# Legend label used for each newsgroup in the plots below.
groups_map = {
    "comp.sys.ibm.pc.hardware": "train: pc hardware",
    "rec.sport.baseball": "train: baseball",
    "rec.sport.hockey": "train: hockey",
    "sci.med": "train: medicine",
    "talk.religion.misc": "train: religion",
}

# t-SNE scatter plot in which every point is colored by its true newsgroup.
p = figure(width=600, height=400, toolbar_location=None, title="openai actual labels")
p.tools.append(hover)

# One scatter renderer per newsgroup so each group gets its own legend entry.
for newsgroup, legend_label in groups_map.items():
    source = create_source(df_openai[df_openai["target"] == newsgroup])
    p.scatter(
        "x",
        "y",
        size=10,
        alpha=0.25,
        line_alpha=0,
        legend_label=legend_label,
        color="actual_color",
        source=source,
    )
p.axis.visible = False
# Clicking a legend entry hides the points of that newsgroup.
p.legend.click_policy = "hide"
# Set once: an earlier assignment of 1 on the same legend was immediately
# overwritten by this value and had no effect.
p.legend.item_background_fill_alpha = 0
# Move the legend outside the plot area, to the right.
p.add_layout(p.legend[0], "right")

# t-SNE scatter plot in which every point is filled with the color of its
# predicted cluster label. Points are still grouped by their true newsgroup,
# so each legend entry toggles one true newsgroup.
p2 = figure(
    width=600, height=400, toolbar_location=None, title="openai predicted clusters"
)
p2.tools.append(hover)
for newsgroup, legend_label in groups_map.items():
    source = create_source(df_openai[df_openai["target"] == newsgroup])
    p2.scatter(
        "x",
        "y",
        size=10,
        alpha=0.25,
        line_alpha=0,
        legend_label=legend_label,
        fill_color="pred_color",
        source=source,
    )
# Hide the axes (the original set this twice; once is enough).
p2.axis.visible = False
# Clicking a legend entry hides the points of that newsgroup.
p2.legend.click_policy = "hide"
p2.legend[0].glyph_height = 0
# Move the legend outside the plot area, to the right.
p2.add_layout(p2.legend[0], "right")

# Display both figures stacked vertically: actual labels above predicted clusters.
show(column(p, p2))