In [None]:
'''
Train a KMeans model on TF-IDF features to group similar papers.
Evaluate it with the silhouette score and save the trained model artifact.
'''

In [2]:
!pip install --upgrade s3fs

Collecting s3fs
  Downloading s3fs-2025.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)
  Downloading aiobotocore-2.19.0-py3-none-any.whl.metadata (23 kB)
Collecting fsspec==2025.2.0.* (from s3fs)
  Downloading fsspec-2025.2.0-py3-none-any.whl.metadata (11 kB)
Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading aioitertools-0.12.0-py3-none-any.whl.metadata (3.8 kB)
Collecting botocore<1.36.4,>=1.36.0 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading botocore-1.36.3-py3-none-any.whl.metadata (5.7 kB)
Downloading s3fs-2025.2.0-py3-none-any.whl (30 kB)
Downloading fsspec-2025.2.0-py3-none-any.whl (184 kB)
Downloading aiobotocore-2.19.0-py3-none-any.whl (77 kB)
Downloading aioitertools-0.12.0-py3-none-any.whl (24 kB)
Downloading botocore-1.36.3-py3-none-any.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m229.7 MB/s[0m eta [36m0:00:00[0m
Installing collected

In [15]:
# Model_Training.ipynb file
import boto3
import sagemaker
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# --- AWS settings used throughout the notebook ---
region = "us-east-1"
bucket_name = "arxiv-project-bucket"
role = "arn:aws:iam::221082214706:role/MYLabRole"

# SageMaker session built on a boto3 session pinned to the configured region
boto_session = boto3.Session(region_name=region)
sess = sagemaker.Session(boto_session=boto_session)

# --- Read the processed training data from S3 into a DataFrame ---
s3_path = f"s3://{bucket_name}/processed/train"
df_features = pd.read_parquet(s3_path)
print("Loaded processed data shape:", df_features.shape)

Loaded processed data shape: (1853230, 25)


In [9]:
# feature_columns is first needed in this cell, so derive it here instead of
# relying on a definition made further down the notebook (which breaks a
# top-to-bottom run). The last 10 columns hold the TF-IDF features.
feature_columns = df_features.columns[-10:]

# Number of missing values in each TF-IDF column
print(df_features[feature_columns].isna().sum())

10            0
100      280000
11      1293230
12       593230
15      1153230
20       140000
2007    1783230
2d       173230
30      1503230
3d       383230
dtype: int64


In [None]:
'''
This issue occurs because the TF-IDF vectorizer is fitted independently on each chunk,
so the vocabulary (and therefore the set of columns) can vary between chunks.
When we read or concatenate all the processed Parquet files,
pandas performs a union of the columns. Rows from chunks that didn’t
include a token (such as a year, or numbers in general) will have NaN in that column
instead of 0, even though logically a missing token should be a zero.
'''

In [10]:
# A missing TF-IDF value means "token absent", so it is replaced with 0.
# Only the TF-IDF columns (the last 10) are filled: a frame-wide fillna(0)
# would also write a numeric 0 into text metadata columns that happen to be
# empty, mixing types in those columns.
tfidf_columns = df_features.columns[-10:]
df_features[tfidf_columns] = df_features[tfidf_columns].fillna(0)

In [11]:
# Re-count missing values per TF-IDF column after the fill
remaining_nulls = df_features[feature_columns].isna().sum()
print(remaining_nulls)

10      0
100     0
11      0
12      0
15      0
20      0
2007    0
2d      0
30      0
3d      0
dtype: int64


In [12]:
# The TF-IDF features occupy the final 10 columns of the frame;
# extract them as a plain NumPy matrix for the clustering step.
feature_columns = df_features.columns[-10:]
X = df_features.loc[:, feature_columns].values

In [13]:
# Fit KMeans with 5 clusters and a fixed seed, and keep the
# cluster label assigned to every row of X.
num_clusters = 5
kmeans_params = {"n_clusters": num_clusters, "random_state": 39}
kmeans = KMeans(**kmeans_params)
clusters = kmeans.fit_predict(X)

In [18]:
# The silhouette score is too expensive to compute on the full dataset, so it
# is evaluated on a random subsample that is then reduced with PCA.
sample_size = 100000  # roughly 5.4% of the 1,853,230 loaded rows
if len(X) > sample_size:
    # Seeded generator so the same rows are drawn on every run and the
    # reported score is reproducible.
    rng = np.random.default_rng(39)
    indices = rng.choice(len(X), size=sample_size, replace=False)
    X_sample = X[indices]
    clusters_sample = clusters[indices]
else:
    # Dataset is already small enough: use all of it
    X_sample = X
    clusters_sample = clusters

# Reduce the sample to 2 principal components (the kernel died without this).
# NOTE: KMeans was fitted on the original feature space, so a silhouette score
# measured on this 2-D projection is only a proxy for cluster quality there.
pca = PCA(n_components=2, random_state=39)
X_sample_pca = pca.fit_transform(X_sample)

In [19]:
# Silhouette score of the cluster labels, measured on the PCA-reduced subsample
score = silhouette_score(X_sample_pca, labels=clusters_sample)
print("Silhouette Score (on PCA-reduced sample):", score)

Silhouette Score (on PCA-reduced sample): 0.913128583984273


In [20]:
# Serialize the fitted KMeans model to a local file, then upload it to S3
model_path = "kmeans_arxiv_model_reduced_sample.joblib"
joblib.dump(kmeans, model_path)
model_s3_prefix = "models"
# upload_data returns the S3 URI of the uploaded file; use it directly rather
# than rebuilding the path by hand, which could drift from the real upload.
model_s3_path = sess.upload_data(model_path, bucket=bucket_name, key_prefix=model_s3_prefix)
print("Model uploaded to:", model_s3_path)

Model uploaded to: s3://arxiv-project-bucket/models/kmeans_arxiv_model_reduced_sample.joblib


In [None]:
# Important clarification: the model artifact needs to be a .tar.gz file, so use the two cells below instead of the one above

In [24]:
import tarfile

model_filename = "kmeans_arxiv_model_reduced_sample.joblib"
archive_filename = "model.tar.gz"

# Write the fitted model here so this cell does not depend on the earlier
# save cell having been run: the file must exist before it can be archived.
joblib.dump(kmeans, model_filename)

# Package the serialized model as a gzip-compressed tarball
with tarfile.open(archive_filename, "w:gz") as tar:
    tar.add(model_filename)

In [25]:
# Upload the archive under the "models" prefix. upload_data returns the S3 URI
# of the uploaded file, so report that instead of a hand-built path.
model_s3_path = sess.upload_data(archive_filename, bucket=bucket_name, key_prefix="models")
print("Model uploaded to:", model_s3_path)

Model uploaded to: s3://arxiv-project-bucket/models/model.tar.gz


In [21]:
# Attach each row's KMeans cluster label as a new "cluster" column and
# print the first rows as a preview.
# NOTE: this adds a trailing column in place, so re-running the earlier cell
# that selects features with df_features.columns[-10:] after this one would
# include "cluster" among the selected columns.
df_features["cluster"] = clusters
print(df_features.head())

          id            submitter  \
0  0807.5056  Momme Winkelnkemper   
1  0811.3632          Raseong Kim   
2  0707.0830  A. M. Dee McDougall   
3  0705.2205         Nir Piterman   
4  0711.2351         Dan Butnariu   

                                             authors  \
0  M. Winkelnkemper, M. Dworzak, T. P. Bartel, A....   
1  Raseong Kim, Supriyo Datta, and Mark S. Lundstrom   
2  A.M.D. McDougall and A.W. Hood (University of ...   
3                                       Nir Piterman   
4                         Dan Butnariu, Gabor Kassay   

                                               title  \
0  Origin of the Broad Lifetime Distribution of L...   
1  Influence of Dimensionality on Thermoelectric ...   
2  A New Look at Mode Conversion in a Stratified ...   
3  From Nondeterministic B\"uchi and Streett Auto...   
4  A Proximal-Projection Method for Finding Zeros...   

                                            comments  \
0  5 pages, 4 figures. accepted at Physica Stat