In [1]:
'''
In this notebook, we train our model. How we train it has ramifications for
deployment further down the pipeline: the scikit-learn container used for model
deployment runs version 1.2.0, so the model must be trained with the same
version of scikit-learn. Otherwise we will encounter hidden errors that result
in an empty endpoint being deployed.
'''

'\nIn this notebook, we train our model, however, there are ramifications in how\nwe train the model that affect our deployment further down the pipeline. In model \ndeployment, the scikit-learn container we use is 1.2.0. We need to train our \nmodel on the same version of scikit-learn or we will encounter hidden errors \nthat deploy an empty endpoint. \n'

In [2]:
# Ensure it is 1.2.0 and matches deployment container
!pip install --upgrade scikit-learn==1.2.0

Collecting scikit-learn==1.2.0
  Downloading scikit_learn-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m107.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.2.0


In [3]:
!pip install awswrangler



In [4]:
# Config, imports, the lot
import boto3
import sagemaker
import sys
import pandas as pd
import numpy as np
import joblib
import os
import io
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import tarfile
import awswrangler as wr

# Settings shared by the cells below
region = "us-east-1"
bucket_name = "arxiv-project-bucket"
# NOTE(review): `role` is not referenced by any cell visible in this notebook;
# presumably it is kept for the deployment step — confirm before removing.
role = "arn:aws:iam::221082214706:role/MYLabRole"

# SageMaker session built on an explicit boto3 session for the chosen region
boto_sess = boto3.Session(region_name=region)
sess = sagemaker.Session(boto_session=boto_sess)
print(f"Using bucket: {bucket_name}")



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


Using bucket: arxiv-project-bucket


In [5]:
import sklearn

# Version the deployment container runs; the trained model must be produced
# with exactly this scikit-learn version.
required_sklearn_version = "1.2.0"

print(sys.executable)
print("scikit-learn version:", sklearn.__version__)

# Fail fast instead of only printing: installing a package into a kernel that
# has already imported it does not swap the loaded module, so a mismatch here
# would otherwise surface much later as a broken endpoint.
if sklearn.__version__ != required_sklearn_version:
    raise RuntimeError(
        f"scikit-learn {sklearn.__version__} is loaded, but the deployment "
        f"container requires {required_sklearn_version}. Install the pinned "
        "version and restart the kernel before training."
    )

/home/ec2-user/anaconda3/envs/python3/bin/python
scikit-learn version: 1.2.0


In [6]:
# The transformed training split lives under this S3 prefix.
s3_train_path = f"s3://{bucket_name}/processed_csv/train/"

# awswrangler reads every CSV file under the prefix into a single DataFrame.
df_train = wr.s3.read_csv(s3_train_path)

train_shape = df_train.shape
print("Loaded processed training data shape:", train_shape)

Loaded processed training data shape: (648786, 59)


In [7]:
# Clustering runs on the numeric feature columns only, i.e. those whose
# name begins with "svd_".
svd_cols = list(filter(lambda name: name.startswith("svd_"), df_train.columns))
if len(svd_cols) == 0:
    raise ValueError("No columns starting with 'svd_' found in the training data.")

# Plain NumPy matrix of the selected columns, in DataFrame column order
X = df_train[svd_cols].to_numpy()
print("Feature matrix shape (X):", X.shape)

Feature matrix shape (X): (648786, 50)


In [None]:
# KMeans model and silhouette score
num_clusters = 3
random_seed = 39  # shared by the clustering and the evaluation sample

# n_init is spelled out: 10 is the scikit-learn 1.2 default, and passing it
# explicitly avoids the FutureWarning about the default changing in 1.4.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_seed)
clusters = kmeans.fit_predict(X)
print("KMeans clustering complete.")

# Given the computation time, the silhouette score is computed on a random
# sample rather than on the full matrix: it relies on pairwise distances, so
# its cost grows quadratically with the number of rows scored.
# Resource limitation was a strong motivating factor behind many decisions.
# Lower sample_size if this cell takes too long.
sample_size = 200000
if len(X) > sample_size:
    # Seeded generator so the sample, and therefore the score, is reproducible.
    rng = np.random.default_rng(random_seed)
    indices = rng.choice(len(X), sample_size, replace=False)
    X_sample = X[indices]
    # Reuse the labels from fit_predict instead of a second predict() pass;
    # fit_predict is documented as equivalent to fit(X) followed by predict(X).
    clusters_sample = clusters[indices]
else:
    X_sample = X
    clusters_sample = clusters

score = silhouette_score(X_sample, clusters_sample)
print("Silhouette Score (on sampled, reduced data):", score)



KMeans clustering complete.


In [None]:
# Store each row's KMeans assignment next to its features so the clusters
# can be inspected from the DataFrame.
df_train["cluster"] = kmeans.labels_

# Short preview only; the full frame is far too large to print
print(
    "Sample of training data with cluster assignments:",
    df_train.head(),
    sep="\n",
)

In [None]:
# Save and upload the trained model to S3
model_filename = "kmeans_arxiv_model.joblib"
joblib.dump(kmeans, model_filename)
print("Model saved locally as:", model_filename)

# Archive the model into a tar.gz file for the sklearn container in next notebook
archive_filename = "model.tar.gz"
with tarfile.open(archive_filename, "w:gz") as tar:
    tar.add(model_filename)
print("Model archived as:", archive_filename)

# Upload the archive to S3. upload_data returns the S3 URI of the uploaded
# object, so use that directly rather than rebuilding the path by hand; the
# printed location then always matches where the file actually went.
model_s3_path = sess.upload_data(archive_filename, bucket=bucket_name, key_prefix="models")
print("Model uploaded to:", model_s3_path)