#  movie recommentation with Dask

In [3]:
import numpy as np
from sklearn.decomposition import NMF
import pandas as pd
import pandas_profiling as pp 
import dask.dataframe as dd 

In [4]:
df = dd.read_csv('ml-latest/ratings.csv')

In [5]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [6]:
df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 4 entries, userId to timestamp
dtypes: float64(1), int64(3)

In [7]:
df = df.drop('timestamp', axis =1)

In [8]:
df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 3 entries, userId to rating
dtypes: float64(1), int64(2)

In [9]:
df 

Unnamed: 0_level_0,userId,movieId,rating
npartitions=12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,int64,int64,float64
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [10]:
 df.rating.mean()

dd.Scalar<series-..., dtype=float64>

In [11]:
 df.rating.mean().compute() 

3.5304452124932677

In [12]:
df.isna().sum().compute()

userId     0
movieId    0
rating     0
dtype: int64

In [13]:
df = df.categorize(columns=['userId'])

In [14]:
df2 = df.pivot_table(index= 'movieId', columns='userId', values= 'rating' )

## first attempt to machine learning with dask df 

In [15]:
%matplotlib inline
import dask_ml.datasets
import dask_ml.cluster
import matplotlib.pyplot as plt
from dask_ml.cluster import KMeans

In [None]:
X = df2.persist() 

In [None]:
km = dask_ml.cluster.KMeans(n_clusters=3, init_max_iter=2, oversampling_factor=10)
%time km.fit(X)

In [None]:
fig, ax = plt.subplots()
ax.scatter(df2[::10000, 0], df2[::10000, 1], marker='.', c=km.labels_[::10000],
           cmap='viridis', alpha=0.25);

In [None]:
km.labels_

In [None]:
km.labels_[:10].compute()

In [56]:
df2.isna().sum().sum()

5830804

In [60]:
df2.notna().sum().sum()

100836

### filling the missing values with median

In [12]:
med_values = df2.median().median()

In [14]:
df2.fillna(med_values,inplace=True)

In [15]:
df2

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,3.5,4.0,3.5,3.5,4.0,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
2,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
3,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
4,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
5,4.0,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,3.5,3.5,3.5,3.5,3.5,2.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
607,4.0,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
608,2.5,2.0,2.0,3.5,3.5,3.5,3.5,3.5,3.5,4.0,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
609,3.0,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,4.0,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5


## train the NMF model

In [8]:
m = NMF(n_components=30)

In [9]:
m.fit(df.pivot(index='userId', columns='movieId', values='rating'))

ValueError: Unstacked DataFrame is too big, causing int32 overflow

### checking the matrices

In [23]:
Q = m.components_
P = m.transform(df2)
error = m.reconstruction_err_ #this is an absolute score, so no intuition from looking at in isolation! 
P.shape, Q.shape, error

((610, 30), (30, 9724), 260.3998528350142)

### make a prediction based on new user input

In [144]:
# the ratings went from 0.5 to 5
ratings = [0.5, 1, 1.5, 2., 2.5, 3, 3.5, 4, 4.5, 5, np.nan]

In [132]:
#finding the probabilities of a rating to appear if we have all values:
prob_ratings = []
for i in ratings:
    prob = len(df['rating'][df['rating'] == i])/ len(df)
    prob_ratings.append(prob)

In [133]:
#probabilities that there is a Nan value:
prob_nan = df2.isna().sum().sum() / (610*9724)
# probability that there is a rating for a movie for a user:
prob_notna = 1 - prob_nan

In [134]:
#probabilities for the ratings:
prob_rated = np.asarray(prob_ratings) * prob_notna

In [137]:
prob_rated = prob_rated.tolist()

In [138]:
prob_rated.append(prob_nan)

In [127]:
prob_rated.append(prob_nan)

In [141]:
prob_rated 

[0.00023096479219912182,
 0.0004738992926071032,
 0.00030194010425447246,
 0.0012730037561281525,
 0.0009356602895657856,
 0.0033796724008874415,
 0.002214564606078587,
 0.0045211779541577,
 0.0014415911956895554,
 0.002227208664045692,
 0.9830003169443864]

In [147]:
# generate a user input , so a new row of 9724 values corresponding to random ratings for each of the 9724 movies
new_user_input = pd.Series(np.random.choice(ratings, 9724, p=prob_rated))

In [148]:
new_user_input

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
9719   NaN
9720   NaN
9721   NaN
9722   NaN
9723   NaN
Length: 9724, dtype: float64

In [149]:
new_user_input.isna().sum()

9543

In [150]:
#Fill missing data
new_user_input = new_user_input.fillna(med_values)

In [153]:
# make sure the new input has >1 dimension & has as many columns as there are films!
new_user_input = np.array(new_user_input).reshape(1,9724)

In [155]:
#Prediction step 1 - generate extra a user_P
user_P = m.transform(new_user_input)

In [156]:
#new user R - reconstruct R but for this new user only
user_R = np.dot(user_P,Q)

In [157]:
user_R #impute with median - order of recommendations is the same as below

array([[3.58367722, 3.50758858, 3.4737011 , ..., 3.49973142, 3.49973142,
        3.50017344]])

In [158]:
user_R.shape

(1, 9724)