In [1]:
from sklearn.decomposition import PCA
import hopsworks
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

In [2]:
project = hopsworks.login(api_key_value="wOWQmzzHeITT5wlJ.FkOButuQ3XpDXSUO1LnTuaNfD7SPWolfF1hateoistsLFFsBF7upULn5z6qKPOIB")
fs = project.get_feature_store() 
headline_fg = fs.get_feature_group(
    name='headlines_new', 
    version=1,
)
headline_df = headline_fg.read()
headline_df = headline_df.sort_values(by='pubdate')

print(headline_df.head())
print(headline_df.tail())

2024-12-12 21:42:58,423 INFO: Initializing external client
2024-12-12 21:42:58,425 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-12 21:42:59,600 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1163414
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.25s) 
                       pubdate  \
2949 2023-12-01 00:00:00+00:00   
8838 2023-12-01 00:00:00+00:00   
1247 2023-12-01 00:00:00+00:00   
2108 2023-12-01 00:00:00+00:00   
5064 2023-12-01 00:00:00+00:00   

                                                  title  
2949  Danny Macklin: Police find ex-AFC Wimbledon ch...  
8838  Gaza's fear and anger as ceasefire ends and fi...  
1247  COP28: 'The Earth does not belong to us' - Kin...  
2108  Belgium 1-1 Scotland: Visitors relegated to Na...  
5064  Premiership: Harlequins 36-3 Sale - Marcus Smi...  
                        pubdate  \
14702 2024-12-03 00:00:00+00:00   
14703 2024-12-03 00:00:0

In [3]:
def df_between_dates(df, start_date, end_date):
    return df[(df['pubdate'] >= start_date) & (df['pubdate'] <= end_date)]

now = datetime.now()
past_year = now + relativedelta(years=-1)
#df_week = df_between_dates(headline_df, '2024-11-01', '2024-12-3')
dfs = {}
dfs['year'] = df_between_dates(headline_df, past_year.strftime("%Y-%m-%d"), now.strftime("%Y-%m-%d"))

In [4]:
emb_model = SentenceTransformer('all-MiniLM-L6-v2')

2024-12-12 21:43:24,182 INFO: Use pytorch device_name: cpu
2024-12-12 21:43:24,183 INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [5]:
#headlines = headline_df['title'][13000:13500].tolist()
headlines = {}
embeddings = {}
for key in dfs.keys():
    headlines[key]= dfs[key]['title'].tolist()
    embeddings[key] = emb_model.encode(headlines[key])

Batches:   0%|          | 0/445 [00:00<?, ?it/s]

In [6]:
def training(embeddings, headlines, n_clusters=3):
    kmeans = KMeans(n_clusters=n_clusters, random_state=22)
    clusters = kmeans.fit(embeddings) # 1, 2 or 3... for each headline
    pca = PCA(n_components=2)
    pca.fit(embeddings)
    return kmeans, pca
n_clusters=5
kmeans,pca = training(embeddings['year'], headlines['year'], n_clusters)

In [7]:
emb =  emb_model.encode(['police', 'Trump'])
clusters = kmeans.predict(emb)
print(clusters)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[4 2]


In [8]:
import pickle
with open("pca_model.pkl", "wb") as pca_file:
    pickle.dump(pca, pca_file)

with open("kmeans_model.pkl", "wb") as kmeans_file:
    pickle.dump(kmeans, kmeans_file)

In [9]:
mr = project.get_model_registry()

pca_model = mr.python.create_model(
    name="PCA_Model",
    description="PCA model for dimensionality reduction"
)
kmeans_model = mr.python.create_model(
    name="KMeans_Model",
    description="KMeans clustering model"
)
pca_model.save('pca_model.pkl')
kmeans_model.save('kmeans_model.pkl')

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/5360 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1163414/models/PCA_Model/2


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/65205 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1163414/models/KMeans_Model/2


Model(name: 'KMeans_Model', version: 2)

Now write headlines and embeddings to a dataframe for fast acess

In [13]:
print(embeddings['year'].shape)
print(len(headlines['year']))

(14220, 384)
14220


In [14]:
import pandas as pd
df = pd.DataFrame(embeddings['year'], columns=[f"dim_{i}" for i in range(384)])
df["headlines"] = headlines['year']
feature_group = fs.get_or_create_feature_group(
    name="high_dimensional_vectors",
    version=2,
    description="Feature group for 384-dimensional vectors",
    primary_key=["headlines"], 
    online_enabled=True 
)
feature_group.insert(df, write_options={"wait_for_job": True})

print("384-dimensional vectors successfully written to Hopsworks Feature Store.")

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1163414/fs/1154117/fg/1385877


Uploading Dataframe: 100.00% |██████████| Rows 14220/14220 | Elapsed Time: 01:35 | Remaining Time: 00:00


Launching job: high_dimensional_vectors_2_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1163414/jobs/named/high_dimensional_vectors_2_offline_fg_materialization/executions
2024-12-12 21:53:51,418 INFO: Waiting for execution to finish. Current state: SUBMITTED. Final status: UNDEFINED
2024-12-12 21:53:54,549 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED
2024-12-12 22:04:42,423 INFO: Waiting for log aggregation to finish.
2024-12-12 22:05:10,360 INFO: Execution finished successfully.
384-dimensional vectors successfully written to Hopsworks Feature Store.


In [16]:
#size mb, vectors 100 times larger
import sys
print(sys.getsizeof(embeddings['year'])/ (1024**2))
print(sys.getsizeof(headlines['year'])/ (1024**2))

20.8302001953125
0.10854339599609375
