In [None]:
import os

import pandas as pd
import tensorflow as tf

In [None]:
# Directory holding the ratings / movies / tags CSV files.
data_dir = os.path.join(".", "data", "ml-latest-small")

# Ratings: the raw "timestamp" column holds seconds since the Unix epoch and is
# replaced in place by proper datetimes.
ratings_df = pd.read_csv(os.path.join(data_dir, "ratings.csv")).assign(
    timestamp=lambda frame: pd.to_datetime(
        frame["timestamp"], unit="s", origin="unix"
    )
)

# Movie metadata is used exactly as stored on disk.
movies_df = pd.read_csv(os.path.join(data_dir, "movies.csv"))

# Tags get the same epoch-seconds -> datetime conversion as the ratings.
tags_df = pd.read_csv(os.path.join(data_dir, "tags.csv")).assign(
    timestamp=lambda frame: pd.to_datetime(
        frame["timestamp"], unit="s", origin="unix"
    )
)

# One row per rating, with the columns of the matching movie appended.
raw_df = ratings_df.merge(movies_df, on="movieId")
raw_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,1996-11-08 06:36:02,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,2005-01-25 06:52:26,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,2017-11-13 12:59:30,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,2011-05-18 05:28:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,2016-11-19 08:55:49,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,2016-11-19 08:43:18,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,2017-05-03 20:53:14,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,2017-05-03 21:59:49,Blair Witch (2016),Horror|Thriller


In [None]:
# Per-movie rating statistics, one row per movieId.
# The aggregations are named by string so pandas uses its built-in grouped
# reductions; the previous `pd.DataFrame.mean` was a DataFrame method being
# applied to each group's rating Series, and `len` was called once per group.
movie_stats = (
    raw_df.groupby("movieId")
    .agg(
        num_ratings=pd.NamedAgg(column="rating", aggfunc="size"),
        mean_rating=pd.NamedAgg(column="rating", aggfunc="mean"),
    )
    .assign(
        # Damped mean: the extra 4 in the denominator pulls the score of
        # movies with few ratings toward 0.
        mod_mean_rating=lambda x: x["num_ratings"]
        * x["mean_rating"]
        / (x["num_ratings"] + 4)
    )
)
# Attach the per-movie statistics to every rating row and centre each rating on
# its movie's mean rating.
df = raw_df.join(movie_stats, on="movieId").assign(
    mean_centered_rating=lambda x: x["rating"] - x["mean_rating"]
)
df  # type: ignore

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,num_ratings,mean_rating,mod_mean_rating,mean_centered_rating
0,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093,3.849315,0.07907
1,5,1,4.0,1996-11-08 06:36:02,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093,3.849315,0.07907
2,7,1,4.5,2005-01-25 06:52:26,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093,3.849315,0.57907
3,15,1,2.5,2017-11-13 12:59:30,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093,3.849315,-1.42093
4,17,1,4.5,2011-05-18 05:28:03,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215,3.92093,3.849315,0.57907
...,...,...,...,...,...,...,...,...,...,...
100831,610,160341,2.5,2016-11-19 08:55:49,Bloodmoon (1997),Action|Thriller,1,2.50000,0.500000,0.00000
100832,610,160527,4.5,2016-11-19 08:43:18,Sympathy for the Underdog (1971),Action|Crime|Drama,1,4.50000,0.900000,0.00000
100833,610,160836,3.0,2017-05-03 20:53:14,Hazard (2005),Action|Drama|Thriller,1,3.00000,0.600000,0.00000
100834,610,163937,3.5,2017-05-03 21:59:49,Blair Witch (2016),Horror|Thriller,1,3.50000,0.700000,0.00000


In [None]:
# Wide table: one row per userId, one column per movieId, holding the
# mean-centred rating. User/movie pairs without a rating are filled with 0.
data_table = pd.pivot(
    df, index="userId", columns="movieId", values="mean_centered_rating"
).fillna(value=0)
data_table.tail(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606,-1.42093,0.0,0.0,0.0,0.0,0.0,-0.685185,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.07907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,-1.42093,-1.431818,-1.259615,0.0,0.0,0.0,0.0,0.0,0.0,0.503788,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,-0.92093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.503788,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
610,1.07907,0.0,0.0,0.0,0.0,1.053922,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Pairwise Pearson correlation between the movie columns of data_table.
corr_df = data_table.corr(method="pearson")

In [None]:
# The 20 movies whose rating columns correlate most strongly with movieId 10
# (movie 10 itself comes first with a correlation of 1).
corr_df[10].sort_values(ascending=False).iloc[:20]

movieId
10        1.000000
1722      0.361334
3082      0.354190
52694     0.330937
64249     0.323501
167746    0.309567
67295     0.301176
49272     0.295883
107       0.291848
3398      0.267956
6794      0.263775
164909    0.259932
2046      0.259744
6550      0.257671
109846    0.256586
648       0.255358
78637     0.254637
106438    0.251612
46337     0.251612
3599      0.250916
Name: 10, dtype: float64

In [None]:
# data_table's columns are movieId values (it is pivoted on "movieId"), so the
# two titles are resolved to their ids before being used as column labels;
# indexing data_table with the title strings raises KeyError.
title_a, title_b = ["xXx (2002)", "Star Wars: Episode II - Attack of the Clones (2002)"]
# If a title occurs more than once in movies_df, its first movieId is used.
movie_id_by_title = movies_df.drop_duplicates(subset="title").set_index("title")[
    "movieId"
]
A, B = movie_id_by_title.loc[[title_a, title_b]]  # KeyError if a title is unknown
# tf.losses.cosine_similarity returns the negated similarity (it is a loss),
# hence the leading minus sign.
-tf.losses.cosine_similarity(data_table[A], data_table[B]).numpy()  # type: ignore

In [None]:
# Keep only the users whose entries in both the A and B columns differ from 0
# by more than a small tolerance, and show just those two columns.
both_nonzero = data_table[A].abs().gt(1e-4) & data_table[B].abs().gt(1e-4)
data_table.loc[both_nonzero, [A, B]]