In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.append("..")

In [3]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

from src.constants import (
    ITEM_METADATA_FILE,
    PROCESSED_FOLDER,
    PROJECT_ROOT_DIR,
    INT_FOLDER,
    ITEM_CLUSTERS_FILE,
)

In [4]:
item_metadata_path = PROJECT_ROOT_DIR / PROCESSED_FOLDER / ITEM_METADATA_FILE

item_metadata_df = pd.read_csv(item_metadata_path)

In [5]:
item_metadata_df

Unnamed: 0,title,summary,brief_summary,details,target_audience,item_id
0,Bulletproof Ruby on Rails Applications,This course aims to teach you how to write bet...,Gain insights into good testing practices in R...,,intermediate,5b00c097-464e-5195-9fb4-ce51218ea733
1,Business Machine Learning,AI has enabled us to develop machine learning ...,"Delve into business machine learning, gaining ...",,intermediate,cbbafad0-63b5-526c-93ff-435ebca16aa9
2,C++ Brain Teasers: Exercise Your Mind,"C++, a powerful and versatile programming lang...",Gain insights into C++ through engaging puzzle...,,intermediate,1e77eebd-3485-56f6-984e-d0efc70ff119
3,Calculating Sales Tax Using Avalara's AvaTax A...,Avalara offers automated software solutions to...,Explore Avalara's AvaTax API to automate tax c...,,beginner,5f7953c7-2fe5-5eb3-b39e-444eeafdba97
4,"Cloud Native Development with Tailwind, Google...",This course is a hands-on guidebook to walk yo...,Become a cloud pro with hands-on experience in...,,intermediate,cb662d24-384b-5dd2-b5e6-2a2928c1002d
...,...,...,...,...,...,...
1114,Embedded Programming with Modern C++,Embedded Programming with Modern C++ is highl...,Gain insights into using Modern C++ for embedd...,,intermediate,0d18a30b-7f9a-5d46-b1d4-a20680332415
1115,Introduction to Data Science with Python,Python is one of the most popular programming ...,"Delve into foundational Python programming, ex...",,beginner,feebaba9-7ffe-5438-9420-e908ad9740cc
1116,Mini Course: Build Rock Paper Scissors with Py...,"After learning the basics of Python, it’s a gr...",This course builds a classic Rock Paper Scisso...,,beginner,9a1cb6cc-99cd-5b8b-9aa1-d4c03357ef78
1117,The Ultimate Guide to Kotlin Programming,Kotlin has been growing in popularity among de...,"Gain insights into Kotlin, explore differences...",,beginner,d7957da8-84de-5b3c-a84a-5146f432fe64


In [6]:
title_df = item_metadata_df[["item_id", "title"]]
title_df

Unnamed: 0,item_id,title
0,5b00c097-464e-5195-9fb4-ce51218ea733,Bulletproof Ruby on Rails Applications
1,cbbafad0-63b5-526c-93ff-435ebca16aa9,Business Machine Learning
2,1e77eebd-3485-56f6-984e-d0efc70ff119,C++ Brain Teasers: Exercise Your Mind
3,5f7953c7-2fe5-5eb3-b39e-444eeafdba97,Calculating Sales Tax Using Avalara's AvaTax A...
4,cb662d24-384b-5dd2-b5e6-2a2928c1002d,"Cloud Native Development with Tailwind, Google..."
...,...,...
1114,0d18a30b-7f9a-5d46-b1d4-a20680332415,Embedded Programming with Modern C++
1115,feebaba9-7ffe-5438-9420-e908ad9740cc,Introduction to Data Science with Python
1116,9a1cb6cc-99cd-5b8b-9aa1-d4c03357ef78,Mini Course: Build Rock Paper Scissors with Py...
1117,d7957da8-84de-5b3c-a84a-5146f432fe64,The Ultimate Guide to Kotlin Programming


### Compute TF-IDF feature for item title


In [7]:
def title_pipeline_steps():
    steps = [
        ("tfidf", TfidfVectorizer(stop_words="english", max_features=1000, min_df=5)),
        (
            "todense",
            FunctionTransformer(lambda x: np.asarray(x.todense())),
        ),
    ]
    return steps


tfm = [
    ("title", Pipeline(steps=title_pipeline_steps()), "title"),
]

preprocessor = ColumnTransformer(transformers=tfm, remainder="drop")

In [8]:
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("kmeans", KMeans(n_clusters=150, random_state=42, n_init="auto")),
    ]
)

In [9]:
pipeline.fit(title_df)

labels = pipeline.predict(title_df)

In [10]:
title_df["cluster"] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  title_df["cluster"] = labels


In [11]:
pipeline.named_steps["kmeans"].inertia_

390.5092274245063

In [12]:
title_df

Unnamed: 0,item_id,title,cluster
0,5b00c097-464e-5195-9fb4-ce51218ea733,Bulletproof Ruby on Rails Applications,47
1,cbbafad0-63b5-526c-93ff-435ebca16aa9,Business Machine Learning,45
2,1e77eebd-3485-56f6-984e-d0efc70ff119,C++ Brain Teasers: Exercise Your Mind,131
3,5f7953c7-2fe5-5eb3-b39e-444eeafdba97,Calculating Sales Tax Using Avalara's AvaTax A...,84
4,cb662d24-384b-5dd2-b5e6-2a2928c1002d,"Cloud Native Development with Tailwind, Google...",13
...,...,...,...
1114,0d18a30b-7f9a-5d46-b1d4-a20680332415,Embedded Programming with Modern C++,126
1115,feebaba9-7ffe-5438-9420-e908ad9740cc,Introduction to Data Science with Python,89
1116,9a1cb6cc-99cd-5b8b-9aa1-d4c03357ef78,Mini Course: Build Rock Paper Scissors with Py...,121
1117,d7957da8-84de-5b3c-a84a-5146f432fe64,The Ultimate Guide to Kotlin Programming,132


In [13]:
title_df.to_csv(PROJECT_ROOT_DIR / INT_FOLDER / ITEM_CLUSTERS_FILE, index=False)

In [14]:
# Plot to visualize clusters (when i hover the points, show the title)
import plotly.express as px
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
title_features = preprocessor.transform(title_df)
title_features_2d = pca.fit_transform(title_features)
fig = px.scatter(
    x=title_features_2d[:, 0],
    y=title_features_2d[:, 1],
    color=title_df["cluster"].astype(str),
    hover_data={"title": title_df["title"]},
    title="Item Title Clusters",
)
fig.show()

In [15]:
cluster_counts = title_df["cluster"].value_counts().sort_index()
cluster_counts

cluster
0      9
1      4
2      5
3      3
4      8
      ..
145    5
146    5
147    7
148    5
149    6
Name: count, Length: 150, dtype: int64

### Do experiments with different number of clusters to choose the optimal inertia\_ value


In [16]:
max_num_clusters = 1000

exp_results = []
for n_clusters in range(2, max_num_clusters + 1, 2):
    experiments_pipeline = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            ("kmeans", KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")),
        ]
    )
    experiments_pipeline.fit(title_df)
    inertia = experiments_pipeline.named_steps["kmeans"].inertia_
    exp_results.append((n_clusters, inertia))


Number of distinct clusters (947) found smaller than n_clusters (948). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (950). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (952). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (954). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (956). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (958). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (960). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (962). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (964). Possibly due to duplicate points

In [17]:
# ensure exp_results_df exists (build from exp_results if needed)
if "exp_results_df" not in globals():
    exp_results_df = pd.DataFrame(exp_results, columns=["n_clusters", "inertia"])

fig = px.line(
    exp_results_df,
    x="n_clusters",
    y="inertia",
    title="KMeans Inertia vs Number of Clusters",
    markers=True,
    hover_data=["n_clusters", "inertia"],
)

# highlight the current pipeline's k (if available)
current_k = pipeline.named_steps["kmeans"].n_clusters
current_point = exp_results_df[exp_results_df["n_clusters"] == current_k]
if not current_point.empty:
    fig.add_scatter(
        x=current_point["n_clusters"].tolist(),
        y=current_point["inertia"].tolist(),
        mode="markers",
        marker=dict(size=12, color="red"),
        name=f"current k = {current_k}",
        hovertemplate="k=%{x}<br>inertia=%{y:.2f}<extra></extra>",
    )

fig.update_layout(
    xaxis_title="Number of Clusters",
    yaxis_title="Inertia",
    template="plotly_white",
    width=900,
    height=500,
)
fig.show()

In [18]:
# Calculate silhouette scores for different k values
from sklearn.metrics import silhouette_score

silhouette_results = []
for n_clusters in range(2, max_num_clusters + 1, 2):
    experiments_pipeline = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            ("kmeans", KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")),
        ]
    )
    experiments_pipeline.fit(title_df)
    title_features = preprocessor.transform(title_df)
    labels = experiments_pipeline.predict(title_df)
    silhouette_avg = silhouette_score(title_features, labels)
    silhouette_results.append((n_clusters, silhouette_avg))
silhouette_results_df = pd.DataFrame(
    silhouette_results, columns=["n_clusters", "silhouette_score"]
)
fig = px.line(
    silhouette_results_df,
    x="n_clusters",
    y="silhouette_score",
    title="Silhouette Score vs Number of Clusters",
    markers=True,
    hover_data=["n_clusters", "silhouette_score"],
)
fig.update_layout(
    xaxis_title="Number of Clusters",
    yaxis_title="Silhouette Score",
    template="plotly_white",
    width=900,
    height=500,
)
fig.show()


Number of distinct clusters (947) found smaller than n_clusters (948). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (950). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (952). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (954). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (956). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (958). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (960). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (962). Possibly due to duplicate points in X.


Number of distinct clusters (947) found smaller than n_clusters (964). Possibly due to duplicate points