# Using ALS + KMeans to get meme clusters

In [4]:
from datetime import datetime

import polars as pl
from implicit.als import AlternatingLeastSquares
from scipy import sparse
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

In [5]:
# strftime-style pattern used below to parse the `sent_at` column:
# full month name, day of month without padding, 4-digit year,
# 12-hour clock with AM/PM (e.g. "March 5, 2024, 09:30 PM").
dtm_fmt = "%B %-d, %Y, %I:%M %p"

In [11]:
# Load the raw reaction log and turn it into signed feedback:
#   * `sent_at` is parsed into a datetime using `dtm_fmt`;
#   * `reaction_id` becomes +1 where it equals 1 and -1 everywhere else,
#     including rows where the reaction is missing;
#   * only rows sent before 2024-04-01 are kept.
user_meme_df = (
    pl.read_csv('user_meme_reaction_240301_240413.csv')
    .with_columns(
        pl.col('sent_at').str.to_datetime(dtm_fmt),
        # Native expression instead of a per-row Python lambda: it stays
        # vectorised inside polars. A null reaction compares as null, so it is
        # explicitly turned into False and falls through to the -1 branch.
        pl.when((pl.col('reaction_id') == 1).fill_null(False))
        .then(pl.lit(1, dtype=pl.Int64))
        .otherwise(pl.lit(-1, dtype=pl.Int64))
        .alias('reaction_id'),
    )
    .filter(pl.col('sent_at') < datetime(2024, 4, 1))
)

In [13]:
# Fit one encoder per id column so raw ids can be mapped to contiguous
# integer indices (and back via `classes_`).
user_ids = user_meme_df.get_column('user_id').unique().to_list()
meme_ids = user_meme_df.get_column('meme_id').unique().to_list()
le_user = LabelEncoder().fit(user_ids)
le_meme = LabelEncoder().fit(meme_ids)

In [14]:
# Number of distinct users / memes known to the encoders; these become the
# dimensions of the interaction matrix.
n_users = len(le_user.classes_)
n_memes = len(le_meme.classes_)

In [15]:
# Row-aligned arrays describing every interaction: encoded user index,
# encoded meme index and the signed reaction value.
users = le_user.transform(user_meme_df['user_id'])
memes = le_meme.transform(user_meme_df['meme_id'])
reactions = user_meme_df['reaction_id'].to_numpy()

In [16]:
# Assemble the (n_users x n_memes) interaction matrix in COO form, then
# convert it to CSR.
# NOTE(review): repeated (user, meme) pairs are summed by the COO -> CSR
# conversion — confirm that is the intended handling of repeat reactions.
interactions = sparse.coo_array(
    (reactions, (users, memes)),
    shape=(n_users, n_memes),
)
user_meme = interactions.tocsr()

In [None]:
# ALS matrix factorisation with 64 latent factors per user / meme.
# The factors are initialised randomly, so the seed is fixed to keep the
# learned embeddings (and the clusters derived from them) repeatable across
# reruns. NOTE(review): bit-for-bit determinism may still depend on the
# backend / thread count — verify if exact reproducibility matters.
model = AlternatingLeastSquares(factors=64, random_state=42)

In [None]:
# Train the ALS model on the user x meme interaction matrix.
# NOTE(review): implicit >= 0.5 expects a user-by-item matrix here (older
# releases expected item-by-user) — confirm the installed version so that
# `item_factors` rows correspond to memes.
model.fit(user_meme)

In [19]:
# Learned item-side factors, used below as meme embeddings for clustering.
# Presumably one row per meme in `le_meme.classes_` order — verify.
item_embeds = model.item_factors

In [20]:
# KMeans with 10 clusters over the meme embeddings.
# Centroid initialisation is random; fixing the seed makes the cluster
# assignments written to disk repeatable between runs.
cluster = KMeans(n_clusters=10, random_state=42)

In [21]:
# Fit KMeans on the embeddings and get one cluster label per embedding row.
clusters = cluster.fit_predict(item_embeds)

In [22]:
# Cluster size overview: how many embedding rows were assigned to each label.
pl.Series(clusters).value_counts()

Unnamed: 0_level_0,count
i32,u32
0,2410
5,26
9,3352
1,1145
3,3755
4,16242
8,591
6,1561
7,315
2,364


In [23]:
# Pair every original meme id with its cluster label (both sequences are
# walked in lockstep) and persist the mapping as a parquet file.
res = [
    {'meme_id': meme_id, 'cluster_id': cluster_id}
    for meme_id, cluster_id in zip(le_meme.classes_, clusters)
]
meme_clusters = pl.DataFrame(res)
meme_clusters.write_parquet('meme_clusters.pq')