In [2]:
import pandas as pd
from pandas import DataFrame as df

# Load the three MovieLens 25M tables: titles/genres, external ids, raw ratings.
movie_name_genres = pd.read_csv('ml-25m/movies.csv')
movie_imdbid_tmdbid = pd.read_csv('ml-25m/links.csv')
movie_ratings = pd.read_csv('ml-25m/ratings.csv')

# Titles/genres joined with the imdb/tmdb ids.
merged_names_ids = movie_name_genres.merge(movie_imdbid_tmdbid, on='movieId')

# Per-movie vote count and mean rating, each as a two-column frame keyed by movieId.
df_votes = (
    movie_ratings.groupby('movieId')['movieId']
    .count()
    .to_frame(name='votes')
    .reset_index()
)
df_avgrating = (
    movie_ratings.groupby('movieId')['rating']
    .mean()
    .to_frame(name='avg_rating')
    .reset_index()
)
df_votes_avgrating_merged = df_votes.merge(df_avgrating, on='movieId')

# Left join: every row of the links table is kept, with or without rating statistics.
df_merged_moviedata = movie_imdbid_tmdbid.merge(
    df_votes_avgrating_merged, how='left', on='movieId'
)

# Final table: title, genres, ids, votes and average rating per movie.
merged_moviedata = merged_names_ids.merge(
    df_merged_moviedata, on=['movieId', 'tmdbId', 'imdbId']
)

# Turn the pipe-separated genre string into a list; the placeholder becomes an empty list.
merged_moviedata['genres'] = merged_moviedata['genres'].apply(
    lambda genre_text: [] if genre_text == '(no genres listed)' else genre_text.split('|')
)

In [3]:
import warnings
# Installs a blanket 'ignore' filter: from this point on no Python warning is shown
# (no category or module restriction is given), until the warning filters are changed again.
# NOTE(review): this also hides library deprecation/convergence warnings raised by later cells.
warnings.filterwarnings('ignore')

#### Reading 'rating' data

In [4]:
# u.data is tab-separated and has no header row, so the column labels are set by hand.
ratings_df = pd.read_csv('ml-100k/u.data', delimiter='\t', header=None)
ratings_df.columns = ['userId', 'movieId', 'rating', 'timestamp']

In [5]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [6]:
# u.item is pipe-separated with no header row; only the first two fields (id, title) are read.
movies_info = pd.read_csv(
    'ml-100k/u.item',
    delimiter='|',
    header=None,
    encoding='ISO-8859-1',
    usecols=[0, 1],
)
movies_info.columns = ['movieId', 'title']

In [7]:
movies_info

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [8]:
# Flag every 100k title that also appears, verbatim, among the 25M titles.
movies_info = df1 = movies_info.assign(exist=movies_info['title'].isin(merged_moviedata['title']))
# tmdb ids for the unmatched titles. skip_blank_lines=False keeps blank lines as NaN rows
# instead of dropping them, so the row count is preserved.
# NOTE(review): assumed to be listed in the same row order as the unmatched titles,
# because the insert below is positional — confirm against how the CSV was produced.
tmdbids = pd.read_csv('ml-100k/100K_idsOfUnmatchedtitles.csv', skip_blank_lines=False) # this contains unmatched title ids
# .copy() makes an independent frame, so insert() does not write into a slice of df1.
movies_info_false = df1[~df1['exist']].copy()
# Insert the ids as a 1-D column (fourth position) rather than as an (n, 1) array.
movies_info_false.insert(3, 'tmdbId', tmdbids.iloc[:, 0].values)
movies_info_false

Unnamed: 0,movieId,title,exist,tmdbId
6,7,Twelve Monkeys (1995),False,63.0
10,11,Seven (Se7en) (1995),False,807.0
13,14,"Postino, Il (1994)",False,11010.0
17,18,"White Balloon, The (1995)",False,46785.0
18,19,Antonia's Line (1995),False,880.0
...,...,...,...,...
1666,1667,"Next Step, The (1995)",False,291634.0
1676,1677,Sweet Nothing (1995),False,124851.0
1677,1678,Mat' i syn (1997),False,44361.0
1680,1681,You So Crazy (1994),False,38129.0


In [9]:
# Preview only: drop() returns a new frame and the result is not assigned here,
# so movies_info_false itself still carries the 'exist' column after this cell.
movies_info_false.drop('exist', axis=1)

Unnamed: 0,movieId,title,tmdbId
6,7,Twelve Monkeys (1995),63.0
10,11,Seven (Se7en) (1995),807.0
13,14,"Postino, Il (1994)",11010.0
17,18,"White Balloon, The (1995)",46785.0
18,19,Antonia's Line (1995),880.0
...,...,...,...
1666,1667,"Next Step, The (1995)",291634.0
1676,1677,Sweet Nothing (1995),124851.0
1677,1678,Mat' i syn (1997),44361.0
1680,1681,You So Crazy (1994),38129.0


In [10]:
# Rows of the 100k table whose title was found in the 25M table.
movies_info_true = df1.loc[df1['exist']]

In [11]:
# This step merges matched title rows of 25M and 100k.
# The result keeps the row labels of the 100k table (movies_info_true): later cells align
# on that index. The labels are carried through the merge as an explicit column, because
# recent pandas releases reject `on=` combined with `left_index=True` (MergeError).
movies_info = (
    merged_moviedata
    .merge(movies_info_true.reset_index(), on='title', how='inner')
    .set_index('index')
    .rename_axis(None)
)

In [12]:
movies_info

Unnamed: 0,movieId_x,title,genres,imdbId,tmdbId,votes,avg_rating,movieId_y,exist
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",114709,862.0,57309.0,3.893708,1,True
754,2,Jumanji (1995),"[Adventure, Children, Fantasy]",113497,8844.0,24228.0,3.251527,755,True
1027,3,Grumpier Old Men (1995),"[Comedy, Romance]",113228,15602.0,11804.0,3.142028,1028,True
1310,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",114885,31357.0,2523.0,2.853547,1311,True
755,5,Father of the Bride Part II (1995),[Comedy],113041,11862.0,11714.0,3.058434,756,True
...,...,...,...,...,...,...,...,...,...
314,2320,Apt Pupil (1998),"[Drama, Thriller]",118636,9445.0,1721.0,3.103719,315,True
908,2563,Dangerous Beauty (1998),[Drama],118892,8583.0,865.0,3.646821,909,True
616,4970,"Blue Angel, The (Blaue Engel, Der) (1930)",[Drama],20697,228.0,1005.0,3.889055,617,True
586,6531,"Hour of the Pig, The (1993)","[Crime, Drama, Mystery]",107146,70912.0,149.0,3.359060,587,True


In [13]:
# Keep the 100k movie id (movieId_y), the title and the tmdb id; drop the other merge columns.
movies_info = movies_info.loc[:, ['movieId_y', 'title', 'tmdbId']]
movies_info

Unnamed: 0,movieId_y,title,tmdbId
0,1,Toy Story (1995),862.0
754,755,Jumanji (1995),8844.0
1027,1028,Grumpier Old Men (1995),15602.0
1310,1311,Waiting to Exhale (1995),31357.0
755,756,Father of the Bride Part II (1995),11862.0
...,...,...,...
314,315,Apt Pupil (1998),9445.0
908,909,Dangerous Beauty (1998),8583.0
616,617,"Blue Angel, The (Blaue Engel, Der) (1930)",228.0
586,587,"Hour of the Pig, The (1993)",70912.0


In [14]:
movies_info.columns = ['movieId', 'title', 'tmdbId']
# The below step merges unmatched with matched to get the required df to work on with tmdbids.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. sort=False keeps
# the columns in order of appearance. Row labels are preserved, as with append.
movies_info = (
    pd.concat([movies_info, movies_info_false], sort=False)
    .sort_values('movieId')
    .drop('exist', axis=1)
    .drop_duplicates('movieId')      # one row per 100k movie id (first occurrence kept)
)

In [32]:
movies_info

Unnamed: 0,movieId,title,tmdbId
0,1,Toy Story (1995),862.0
1,2,GoldenEye (1995),710.0
2,3,Four Rooms (1995),5.0
3,4,Get Shorty (1995),8012.0
4,5,Copycat (1995),1710.0
...,...,...,...
1677,1678,Mat' i syn (1997),44361.0
1678,1679,B. Monkey (1998),2923.0
1679,1680,Sliding Doors (1998),10215.0
1680,1681,You So Crazy (1994),38129.0


#### Defining X and y

In [16]:
# X holds the (userId, movieId) pairs, y the rating given for each pair.
X = ratings_df.loc[:, ['userId', 'movieId']].to_numpy()
y = ratings_df['rating'].to_numpy()

#### Defining a Rating matrix R

In [17]:
from scipy import sparse
import numpy as np

def ConvertToDense(X, y ,shape):
    '''
    Build the dense rating matrix R from (userId, movieId) pairs and their ratings.

    X ........ 2-D array; column 0 holds the userIds, column 1 the movieIds
    y ........ rating for each row of X
    shape .... (n_users, n_movies) of the returned matrix

    Ids are used directly as row/column positions and are expected to start at 1,
    so the matrix is built one row and one column larger and the unused zeroth
    row and column are cut off. Cells without a rating are 0.
    '''
    user_ids, movie_ids = X[:, 0], X[:, 1]
    padded_shape = (shape[0] + 1, shape[1] + 1)
    padded = sparse.csr_matrix((y, (user_ids, movie_ids)), shape=padded_shape).toarray()
    return padded[1:, 1:]

# ConvertToDense places each rating at [userId, movieId], so R must be as large as the
# biggest id in each dimension. The number of distinct ids gives the same shape only when
# the ids are contiguous from 1; with gaps it would be too small and the conversion fails.
n_users = int(ratings_df['userId'].max())
n_movies = int(ratings_df['movieId'].max())
R_shape = (n_users, n_movies)

R = ConvertToDense(X, y, R_shape)
print(R.shape)

(943, 1682)


In [18]:
from sklearn.decomposition import NMF

# Factorize R into user factors (Theta) and movie factors (M) with 20 latent components.
nmf_model = NMF(n_components=20)
nmf_model.fit(R)
Theta = nmf_model.transform(R)      # W: one row per user
M = nmf_model.components_.T         # H.T: one row per movie

# Predicted ratings, laid out like R (users x movies).
R_pred = M.dot(Theta.T).T

In [19]:
def GetShape(filename):
    '''
    Return the (n_users, n_items) shape needed for the rating matrix of a ratings file.

    filename ... tab-separated file without header: userId, movieId, rating, timestamp

    The shape is taken from the largest userId / movieId, because ConvertToDense uses
    the ids as row/column positions. For ids that run contiguously from 1 this equals
    the number of distinct ids; for ids with gaps the distinct count would be too small.
    '''
    ratings_df = pd.read_csv(filename, sep='\t', header=None)
    ratings_df.columns = ['userId', 'movieId', 'rating', 'timestamp']
    n_users = int(ratings_df['userId'].max())
    n_items = int(ratings_df['movieId'].max())
    return (n_users, n_items)

def LoadData(filename, R_shape):
    '''
    Read a ratings file and return (X, y, R).

    filename ... tab-separated file without header: userId, movieId, rating, timestamp
    R_shape .... (n_users, n_items) passed on to ConvertToDense

    X is the array of (userId, movieId) pairs, y the matching ratings and
    R the dense rating matrix built from them.
    '''
    column_names = ['userId', 'movieId', 'rating', 'timestamp']
    ratings = pd.read_csv(filename, sep='\t', header=None)
    ratings.columns = column_names
    pairs = ratings[['userId', 'movieId']].values
    targets = ratings['rating'].values
    return pairs, targets, ConvertToDense(pairs, targets, R_shape)
 
# Rebuild R_shape, X, y and R from the file through the helper functions;
# this replaces the values assigned to those names in the earlier cells.
R_shape = GetShape('ml-100k/u.data') 
X, y, R = LoadData('ml-100k/u.data', R_shape)

In [20]:
from sklearn.model_selection import train_test_split, KFold

# Hold out 25% of the ratings. The fixed seed makes the split repeatable from run to run,
# in line with the random_state=0 used for the CV splitter and the NMF estimators.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# Both matrices share R_shape, so a user/movie keeps the same position in each of them.
R_train = ConvertToDense(X_train, y_train, R_shape)
R_test = ConvertToDense(X_test, y_test, R_shape)
X_train.shape, X_test.shape

((75000, 2), (25000, 2))

In [21]:
from sklearn.decomposition import NMF

# Baseline NMF settings used for the first cross-validation run.
parametersNMF = dict(
    n_components=20,       # number of latent factors
    init='random',
    random_state=0,
    alpha=0.01,            # regularization term
    l1_ratio=0,            # set regularization = L2
    max_iter=15,
)

# Build the estimator from the parametersNMF settings.
# NOTE(review): `alpha` is a single regularization argument; newer scikit-learn releases
# appear to use `alpha_W` / `alpha_H` instead — confirm against the installed version.
estimator = NMF(**parametersNMF)

In [22]:
from sklearn.metrics import mean_squared_error

def get_rmse(pred, actual):
    '''
    Root mean squared error between pred and actual, restricted to the
    cells where actual is non-zero (cells holding 0 are skipped).
    '''
    rated = actual.nonzero()
    pred_rated = pred[rated].flatten()
    actual_rated = actual[rated].flatten()
    return np.sqrt(mean_squared_error(pred_rated, actual_rated))

In [23]:
import time
# K-fold cross-validation of the baseline estimator: average test RMSE over the folds.
err = 0
n_iter = 0.
n_splits = 5
kf = KFold(n_splits=n_splits)     # same value as the divisor used for the averages below
for train_index, test_index in kf.split(X):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Converting sparse array to dense array
    R_train = ConvertToDense(X_train, y_train, R_shape)
    R_test = ConvertToDense(X_test, y_test, R_shape)

    # Training (matrix factorization)
    t0 = time.time()
    estimator.fit(R_train)
    Theta = estimator.transform(R_train)       # user features
    M = estimator.components_.T                # item features
    print("Fit in {0:.3f}s".format(time.time() - t0))
    n_iter += estimator.n_iter_

    # Making the predictions, clipped to the valid rating range [1, 5]
    R_pred = np.clip(M.dot(Theta.T).T, 1., 5.)

    # Computing the error on the validation set (once per fold, then reused)
    fold_rmse = get_rmse(R_pred, R_test)
    err += fold_rmse
    print(fold_rmse)

print("*** RMSE Error : ", err/n_splits)
print("Mean number of iterations:", n_iter/n_splits)

Fit in 0.339s
2.5571903519847945
Fit in 0.223s
2.462036122910539
Fit in 0.214s
2.4246259117422664
Fit in 0.362s
2.4267105088047214
Fit in 0.209s
2.473506165218397
*** RMSE Error :  2.4688138121321437
Mean number of iterations: 15.0


In [24]:
from sklearn.model_selection import ShuffleSplit
# Splitter for the grid search: 5 shuffled splits, each holding out 25% as test set,
# with a fixed seed (random_state=0).
cv = ShuffleSplit(n_splits=5, test_size=.25, random_state=0) 

In [25]:
from itertools import product

param =        {
                    'n_components' : [15, 20, 25, 35],
                    'alpha' : [0.001, 0.01, 0.1],
                    'l1_ratio' : [0],
                    'max_iter' : [15, 20, 25, 50]
                }
param_names = ['n_components', 'alpha', 'l1_ratio', 'max_iter']

# nb of folds in ShuffleSplit CV, read from the splitter so the averages always match it
n_splits = cv.get_n_splits()

# Every parameter combination, in the same order as four nested loops over param_names.
combos = list(product(*(param[name] for name in param_names)))

# Keep track of RMSE and parameters: one row per combination, turned into a frame at the end
grid_rows = []

# Performing the Grid search
for i, (n_components, alpha, l1_ratio, max_iter) in enumerate(combos):

    print('Search', i, '/', len(combos) - 1)

    # The settings are the same for every fold of this combination
    parametersNMF = {
        'n_components' : n_components,
        'init' : 'random',
        'random_state' : 0,
        'alpha' : alpha,
        'l1_ratio' : l1_ratio,
        'max_iter' : max_iter}

    err = 0
    n_iter = 0
    for train_index, test_index in cv.split(X):

        X_train_cv, X_test_cv = X[train_index], X[test_index]
        y_train_cv, y_test_cv = y[train_index], y[test_index]

        # Converting sparse array to dense array
        R_train = ConvertToDense(X_train_cv, y_train_cv, R_shape)
        R_test = ConvertToDense(X_test_cv, y_test_cv, R_shape)

        # Training (matrix factorization)
        estimator = NMF(**parametersNMF)
        estimator.fit(R_train)
        Theta = estimator.transform(R_train)       # user features
        M = estimator.components_.T                # item features
        n_iter += estimator.n_iter_

        # Making the predictions, clipped to the valid rating range [1, 5]
        R_pred = np.clip(M.dot(Theta.T).T, 1., 5.)

        # Computing the error on the validation set
        err += get_rmse(R_pred, R_test)

    # Stored as floats so the printed row looks the same as a row read back from the frame
    row = [float(n_components), float(alpha), float(l1_ratio), float(max_iter), float(err/n_splits)]
    grid_rows.append(row)
    print(row, "Mean number of iterations:", n_iter/n_splits)

grid_search = pd.DataFrame(grid_rows, columns=param_names + ['RMSE'])

best_params = grid_search.sort_values('RMSE')[:1]
print('*** best params ***')
print(best_params)

Search 0 / 47
[15.0, 0.001, 0.0, 15.0, 2.4171804516490822] Mean number of iterations: 15.0
Search 1 / 47
[15.0, 0.001, 0.0, 20.0, 2.4160247733334357] Mean number of iterations: 20.0
Search 2 / 47
[15.0, 0.001, 0.0, 25.0, 2.415261256250394] Mean number of iterations: 25.0
Search 3 / 47
[15.0, 0.001, 0.0, 50.0, 2.4132728039065556] Mean number of iterations: 50.0
Search 4 / 47
[15.0, 0.01, 0.0, 15.0, 2.4172053164630096] Mean number of iterations: 15.0
Search 5 / 47
[15.0, 0.01, 0.0, 20.0, 2.4160482609565954] Mean number of iterations: 20.0
Search 6 / 47
[15.0, 0.01, 0.0, 25.0, 2.4152846427192323] Mean number of iterations: 25.0
Search 7 / 47
[15.0, 0.01, 0.0, 50.0, 2.4132854218703597] Mean number of iterations: 50.0
Search 8 / 47
[15.0, 0.1, 0.0, 15.0, 2.417455034066946] Mean number of iterations: 15.0
Search 9 / 47
[15.0, 0.1, 0.0, 20.0, 2.41628185785686] Mean number of iterations: 20.0
Search 10 / 47
[15.0, 0.1, 0.0, 25.0, 2.415514901327386] Mean number of iterations: 25.0
Search 11 / 4

In [26]:
# NMF settings used for the final model.
parametersNMF_opt = dict(
    n_components=15,       # number of latent factors
    init='random',
    random_state=0,
    alpha=0.001,           # regularization term
    l1_ratio=0.0,          # set regularization = L2
    max_iter=50,
)

In [27]:
# Refit on the full rating matrix R with the parametersNMF_opt settings.
estimator = NMF(**parametersNMF_opt)
estimator.fit(R)
Theta = estimator.transform(R)            # user features
M = estimator.components_.T               # movie features

# Predicted ratings (users x movies), clipped to the valid range [1, 5]
R_pred = np.clip(M.dot(Theta.T).T, 1., 5.)

In [28]:
R_pred

array([[4.30397667, 1.7618573 , 1.21023095, ..., 1.        , 1.        ,
        1.        ],
       [2.26448777, 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [2.08311383, 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.43348709, 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.59474244, 2.03823438, 1.        , ..., 1.        , 1.        ,
        1.        ]])

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between movies, computed from the rows of M
# (M is components_.T, so each row is one movie's latent-feature vector).
item_sim = cosine_similarity(M)

In [30]:
def make_recommendation_newuser(item_sim, movie_idx, k=5, movies=None):
    '''
    Print the k movies most similar to a selected movie.

    item_sim ...... square movie-by-movie similarity matrix
    movie_idx ..... 1-based position of the selected movie (row movie_idx-1 of item_sim)
    k ............. number of movies to recommend
    movies ........ frame with the columns movieId, title, tmdbId, indexed so that
                    label i describes row i of item_sim; defaults to the notebook's
                    movies_info table

    Nothing is returned; the header line and the recommendation table are printed.
    The selected movie is identified by its position, not by being the top-sorted row,
    so the title is right and the movie is left out of its own recommendations even
    when its self-similarity is not the unique maximum (all-zero vector, ties).
    '''
    if movies is None:
        movies = movies_info

    row_label = movie_idx - 1
    similarity = pd.DataFrame(item_sim).iloc[row_label, :]          # similarity of every movie to the selected one
    reco_item_df = pd.concat([similarity, movies], axis=1)          # aligned on the index, adds the movie's title
    reco_item_df.columns = ['similarity','movieId', 'title', 'tmdbId']

    selected_title = reco_item_df.loc[row_label, 'title']
    others = reco_item_df.drop(index=row_label).sort_values(by='similarity', ascending=False)

    print('Recommended movies for a new user (without rating history), currently looking at movie:', selected_title)
    print(others[:k])         # the k movies most similar to the selected one

    

In [31]:
make_recommendation_newuser(item_sim, movie_idx=2, k=5)

Recommended movies for a new user (without rating history), currently looking at movie: GoldenEye (1995)
     similarity  movieId                              title   tmdbId
549    0.991618      550  Die Hard: With a Vengeance (1995)   1572.0
232    0.991292      233                 Under Siege (1992)   8845.0
575    0.990004      576                 Cliffhanger (1993)   9350.0
577    0.987173      578              Demolition Man (1993)   9739.0
384    0.986590      385                   True Lies (1994)  36955.0
