In [11]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [12]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [13]:
select_movies = ratings.movieId.value_counts().head(1000).index.to_list()
movies = movies.loc[movies.movieId.isin(select_movies)]
ratings = ratings.loc[ratings.movieId.isin(select_movies)]

In [14]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [15]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807


In [16]:
m = movies.copy()
m['genres'] = m['genres'].str.split('|')
m = m.explode('genres')
m = m.pivot(index='movieId', columns='genres', values='title')
m = ~m.isna()
m = m.astype(int)
m.head(2)

genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [7]:
r = ratings.copy()

In [8]:
r['hour'] = r['timestamp'].apply(lambda x: datetime.fromtimestamp(x).hour)
r.head()

Unnamed: 0,userId,movieId,rating,timestamp,hour
0,1,16,4.0,1217897793,6
1,1,24,1.5,1217895807,5
2,1,32,4.0,1217896246,6
3,1,47,4.0,1217896556,6
4,1,50,4.0,1217896523,6


In [19]:
users = pd.read_csv('users.csv')
users

Unnamed: 0,userId,age,time_spent_per_day
0,1,16,3.976315
1,2,24,1.891303
2,3,20,4.521478
3,4,23,2.095284
4,5,35,1.759860
...,...,...,...
663,664,22,5.288101
664,665,20,5.220446
665,666,19,3.262313
666,667,17,3.674356


In [21]:
users = pd.read_csv('users.csv')
users = users.merge(r.groupby('userId').rating.mean().reset_index(), on='userId')
users = users.merge(r.groupby('userId').hour.mean().reset_index(), on='userId')
users.head(2)

Unnamed: 0,userId,age,time_spent_per_day,rating,hour
0,1,16,3.976315,3.691589,5.616822
1,2,24,1.891303,3.923077,21.0


In [22]:
u = users.copy()
u = u.set_index('userId')
u.columns = ['age', 'time_spent_per_day', 'u_avg_rating', 'hour']

scaler = StandardScaler()
u = pd.DataFrame(scaler.fit_transform(u), columns=u.columns, index=u.index)
u.head(2)

Unnamed: 0_level_0,age,time_spent_per_day,u_avg_rating,hour
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-1.470292,0.341073,-0.073572,-0.882006
2,-0.135616,-1.079947,0.426461,1.477906


In [23]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523
...,...,...,...,...
105148,668,109374,4.0,1415763807
105151,668,109487,4.0,1415421771
105185,668,111759,3.0,1413728719
105205,668,112852,4.0,1433992065


In [25]:
X = ratings[['movieId', 'userId', 'rating']].copy()

In [27]:
X = X.merge(u.reset_index(), on='userId', how='right')


In [29]:
X.head()

Unnamed: 0,movieId,userId,rating,age,time_spent_per_day,u_avg_rating,hour
0,16,1,4.0,-1.470292,0.341073,-0.073572,-0.882006
1,24,1,1.5,-1.470292,0.341073,-0.073572,-0.882006
2,32,1,4.0,-1.470292,0.341073,-0.073572,-0.882006
3,47,1,4.0,-1.470292,0.341073,-0.073572,-0.882006
4,50,1,4.0,-1.470292,0.341073,-0.073572,-0.882006


In [31]:
X = X.merge(m.reset_index(), on='movieId', how='right')
X

Unnamed: 0,movieId,userId,rating,age,time_spent_per_day,u_avg_rating,hour,Action,Adventure,Animation,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,2,5.0,-0.135616,-1.079947,0.426461,1.477906,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,5,4.0,1.699565,-1.169532,-1.859363,-1.664898,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,8,5.0,0.364888,0.298545,0.160605,1.324497,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,11,4.0,-1.303458,0.513712,-0.380602,0.557454,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,14,4.0,-0.302450,1.251552,-0.379415,0.557454,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63245,116797,606,4.0,-0.302450,1.470639,0.126240,1.491852,0,0,0,...,0,0,0,0,0,0,0,1,1,0
63246,116797,619,4.0,0.364888,1.140836,-0.398649,0.898946,0,0,0,...,0,0,0,0,0,0,0,1,1,0
63247,116797,622,4.0,-0.636119,-0.301899,0.350543,-0.147432,0,0,0,...,0,0,0,0,0,0,0,1,1,0
63248,116797,642,4.0,-0.802954,-0.783329,0.719685,-0.489334,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [32]:
X = X.drop(columns = ['movieId', 'userId'])
y = X.pop('rating')

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [34]:
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [35]:
from sklearn.metrics import mean_squared_error as mse
mse(y_test, y_pred)**0.5

0.8868700141216211

In [36]:
ratings.loc[ratings.userId==1].head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [38]:
#recommend movies for user 5
user id =5

SyntaxError: invalid syntax (3813949547.py, line 1)

In [39]:
u.loc[5]

age                   1.699565
time_spent_per_day   -1.169532
u_avg_rating         -1.859363
hour                 -1.664898
Name: 5, dtype: float64

In [40]:
m.loc[1]

genres
Action         0
Adventure      1
Animation      1
Children       1
Comedy         1
Crime          0
Documentary    0
Drama          0
Fantasy        1
Film-Noir      0
Horror         0
IMAX           0
Musical        0
Mystery        0
Romance        0
Sci-Fi         0
Thriller       0
War            0
Western        0
Name: 1, dtype: int64

In [41]:
model.predict(u.loc[1].append(m.loc[1]).values.reshape(1,-1))

array([3.71729074])

In [42]:
## COllabroative Filtering
from cmfrec import CMF

In [43]:
rm_raw = ratings[['userId', 'movieId', 'rating']].copy()
rm_raw.columns = ['UserId', 'ItemId', 'Rating']  # Lib requires specific column names
rm_raw.head(2)

Unnamed: 0,UserId,ItemId,Rating
0,1,16,4.0
1,1,24,1.5


In [46]:
rm_raw.shape

(63250, 3)

In [51]:
CMF?

[0;31mInit signature:[0m
[0mCMF[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mk[0m[0;34m=[0m[0;36m40[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlambda_[0m[0;34m=[0m[0;36m10.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmethod[0m[0;34m=[0m[0;34m'als'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_cg[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muser_bias[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mitem_bias[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcenter[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0madd_implicit_features[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mscale_lam[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mscale_lam_sideinfo[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mscale_bias_const[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m  

In [44]:
model = CMF(method="als", k=2, lambda_=0.1, user_bias=False, item_bias=False, verbose=False)
model.fit(rm_raw)

Collective matrix factorization model
(explicit-feedback variant)


In [47]:
model.A_.shape

(668, 2)

In [48]:
model.B_.shape

(1000, 2)

In [49]:
rm_raw.Rating.mean(),model.glob_mean_

(3.6659130434782607, 3.6659131050109863)

In [50]:
top_items = model.topN(user=1, n=10)
movies.loc[movies.movieId.isin(top_items)]

Unnamed: 0,movieId,title,genres
279,318,"Shawshank Redemption, The (1994)",Crime|Drama
743,922,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Drama|Film-Noir|Romance
744,923,Citizen Kane (1941),Drama|Mystery
938,1172,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama
973,1212,"Third Man, The (1949)",Film-Noir|Mystery|Thriller
978,1217,Ran (1985),Drama|War
4472,5971,My Neighbor Totoro (Tonari no Totoro) (1988),Animation|Children|Drama|Fantasy
5390,7502,Band of Brothers (2001),Action|Drama|War
6958,44555,"Lives of Others, The (Das leben der Anderen) (...",Drama|Romance|Thriller
9908,109374,"Grand Budapest Hotel, The (2014)",Comedy|Drama


In [None]:
rm_raw = ratings[['userId', 'movieId', 'rating']].copy()
rm_raw.columns = ['UserId', 'ItemId', 'Rating']  # Lib requires specific column names
model = CMF(method="als", k=2, lambda_=0.1, user_bias=False, item_bias=False, verbose=False)
model.fit(rm_raw)
top_items = model.topN(user=1, n=10)