In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import surprise
from surprise.reader import Reader
from surprise.dataset import Dataset
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
from tqdm import tqdm
import io
import csv

### Read the Sparse Data

In [2]:
sparseData = sp.load_npz('data/sparse_data.npz')

### Sparsity of Matrix

In [3]:
rows,cols = sparseData.shape
non_zero_elements = sparseData.count_nonzero()

print(f'Sparsity of matrix : {(1-(non_zero_elements/(rows*cols)))*100}')

Sparsity of matrix : 99.78658865580644


### Global Average

In [4]:
globalAverageRating = sparseData.sum()/sparseData.count_nonzero()

print(f'Global Average of Data: {globalAverageRating}')

Global Average of Data: 3.604289964420661


### Create a Function to get Average Rating

In [5]:
def get_average_rating(matrix, user=True):
    
    if user:
        average = matrix.sum(axis = 1)/(matrix != 0).sum(axis = 1)  # for users
    else:
        average = matrix.sum(axis = 0)/(matrix != 0).sum(axis = 0)  # for movies
        
    return average.reshape(-1, 1)

### Average Ratng per User

In [6]:
averageOfUser = get_average_rating(sparseData, True)

print(f'Average rating of user 6 is {averageOfUser[6][0, 0]}')

Average rating of user 6 is 3.4185303514377


  average = matrix.sum(axis = 1)/(matrix != 0).sum(axis = 1)  # for users


### Average per Movie

In [7]:
averageOfMovie = get_average_rating(sparseData, False)

print(f'Average rating of Movie 6 is {averageOfMovie[6][0,0]}')

Average rating of Movie 6 is 3.084396467124632


  average = matrix.sum(axis = 0)/(matrix != 0).sum(axis = 0)  # for movies


### Movie to Movie Similarity

In [8]:
m_msimilarity = cosine_similarity(sparseData.T, dense_output=False)

In [9]:
print(f'Movie to Movie similarity matrix shape: {m_msimilarity.shape}')

Movie to Movie similarity matrix shape: (17771, 17771)


In [10]:
rows, cols, ratings = sp.find(sparseData)

### Create a function to get sample matrix

In [11]:
def get_sample_matrix(matrix, n_users, n_movies):
    
    users, movies, rating = sp.find(matrix)
    
    users_uniq = np.unique(users)
    movies_uniq = np.unique(movies)
    
    np.random.seed(0)
    
    user_selected = np.random.choice(users_uniq, n_users)
    movie_selected = np.random.choice(movies_uniq, n_movies)
    
    mask = np.logical_and(np.isin(users, user_selected), np.isin(movies, movie_selected))
    
    return sp.csr_matrix((rating[mask],(users[mask], movies[mask])))

### Create a function to make dataframe from matrix

In [12]:
## Utiltiy function to make dataframe from matrix
def get_df_from_matrix(matrix):
    
    user, movie, rating = sp.find(matrix)
    
    df = {'user_id':user, 'movie_id':movie, 'rating':rating}
    
    return pd.DataFrame(df)

### Preapare a Sample Matrix

In [13]:
sample_matrix = get_sample_matrix(sparseData, 5000, 500)

print(f'Shape of Sample Matrix is: {sample_matrix.shape}')
print(f'Sparsity of Sample Matrix is: {1-sample_matrix.count_nonzero()/(sample_matrix.shape[0]*sample_matrix.shape[1])}')

Shape of Sample Matrix is: (2648254, 17754)
Sparsity of Sample Matrix is: 0.9999992732857004


In [14]:
print(f'Sample Matrix: {sample_matrix}')

Sample Matrix:   (267, 12911)	4
  (578, 3151)	3
  (578, 7125)	4
  (578, 9458)	3
  (578, 10418)	5
  (578, 12071)	4
  (578, 12075)	5
  (578, 12911)	4
  (578, 13856)	3
  (2003, 1145)	1
  (2003, 10027)	1
  (2003, 12299)	2
  (3432, 1804)	5
  (3432, 3151)	1
  (3432, 3290)	5
  (3432, 3742)	5
  (3432, 4029)	5
  (3432, 6195)	4
  (3432, 9472)	5
  (3432, 9625)	5
  (3432, 10276)	5
  (3432, 10729)	5
  (3432, 12582)	4
  (3432, 13546)	5
  (3432, 13856)	5
  :	:
  (2646462, 16793)	5
  (2646462, 16954)	4
  (2646462, 17189)	5
  (2646512, 1145)	4
  (2646512, 2452)	3
  (2646512, 8743)	4
  (2646512, 12582)	2
  (2646512, 13402)	3
  (2646512, 16954)	3
  (2646635, 3742)	5
  (2647746, 1145)	4
  (2647746, 1406)	3
  (2647746, 2452)	5
  (2647746, 4745)	3
  (2647746, 12911)	4
  (2647746, 14621)	5
  (2647746, 16954)	5
  (2648100, 3151)	3
  (2648100, 12582)	4
  (2648116, 13402)	4
  (2648253, 3290)	3
  (2648253, 4745)	3
  (2648253, 12582)	4
  (2648253, 14982)	3
  (2648253, 17127)	4


### Convert Sample Matrix to Dataframe

In [15]:
sample_df = get_df_from_matrix(sample_matrix)

print('Sample Dataframe is..')
print(sample_df.head())

Sample Dataframe is..
   user_id  movie_id  rating
0  1198173        54       3
1  1273033        54       2
2  1881088        54       3
3  1952829        54       3
4  1144190       109       2


In [16]:
# sample_df.to_csv('data/sample_rating.csv')
# sp.save_npz('data/sample_matrix.npz', sample_matrix)

# Train Test Split

In [17]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(sample_df, test_size=0.2, random_state=0)

In [18]:
reader = Reader(rating_scale=(1, 5))

train_data = Dataset.load_from_df(train_df[['user_id', 'movie_id', 'rating']], reader)
test_data = Dataset.load_from_df(test_df[['user_id', 'movie_id', 'rating']], reader)

trainset = train_data.build_full_trainset()
testset = test_data.build_full_trainset()
testset = testset.build_testset()

In [19]:
print(train_df.head(10))

       user_id  movie_id  rating
13695  1032298      7745       3
19383   792958     10906       4
5969   2518644      2452       5
18172  1995860     10162       3
22455   904391     12299       2
9607    413930      4745       4
23381   365293     12582       5
8555   1301819      3569       3
28414    47165     14621       3
7911    629859      3290       5


In [20]:
print(test_df.head(10))

       user_id  movie_id  rating
4131   1022254      2395       3
23594   719725     12582       4
28860   858589     14621       4
19615  1712855     10906       3
30966   717135     16793       4
18321   858589     10276       4
25620  1498624     12911       4
7418   2014479      3151       5
30207  1455755     15530       5
33407   516343     17189       5


# Surprise Model

In [21]:
from surprise.model_selection.validation import cross_validate
from surprise.prediction_algorithms.random_pred import NormalPredictor
from surprise.prediction_algorithms.baseline_only import BaselineOnly
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.prediction_algorithms.knns import KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise.prediction_algorithms.matrix_factorization import SVD, SVDpp, NMF
from surprise.prediction_algorithms.slope_one import SlopeOne
from surprise.prediction_algorithms.co_clustering import CoClustering
from surprise import accuracy


sim_options = {'name': 'cosine'}

benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NormalPredictor(), KNNBaseline(sim_options=sim_options), KNNBasic(sim_options=sim_options), KNNWithMeans(sim_options=sim_options), KNNWithZScore(sim_options=sim_options), BaselineOnly()]:#, NMF() CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, train_data, measures=['RMSE'], cv=3, verbose=False)
    algorithm.fit(trainset)
    rmse = accuracy.rmse(algorithm.test(testset), verbose=False)
    predictions = algorithm.test(testset)
    
    # Get results & append algorithm name
    results['extra_test'] = rmse
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix..

In [22]:
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse') 

Unnamed: 0_level_0,test_rmse,fit_time,test_time,extra_test
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SVDpp,0.957594,5.94339,0.350761,0.946693
BaselineOnly,0.967777,0.036974,0.069207,0.963642
SVD,0.973234,1.424823,0.133416,0.970545
KNNBaseline,0.979657,4.340167,3.085365,0.971751
KNNBasic,1.023521,4.300701,2.892733,1.022714
SlopeOne,1.033325,0.112441,0.141335,0.999597
KNNWithMeans,1.054333,4.104774,2.805058,1.017249
KNNWithZScore,1.058312,4.362696,2.886681,1.016851
NormalPredictor,1.456048,0.02779,0.082906,1.453744


# Regression Model

In [23]:
print(train_df.head(10))

       user_id  movie_id  rating
13695  1032298      7745       3
19383   792958     10906       4
5969   2518644      2452       5
18172  1995860     10162       3
22455   904391     12299       2
9607    413930      4745       4
23381   365293     12582       5
8555   1301819      3569       3
28414    47165     14621       3
7911    629859      3290       5


In [24]:
print(test_df.head(10))

       user_id  movie_id  rating
4131   1022254      2395       3
23594   719725     12582       4
28860   858589     14621       4
19615  1712855     10906       3
30966   717135     16793       4
18321   858589     10276       4
25620  1498624     12911       4
7418   2014479      3151       5
30207  1455755     15530       5
33407   516343     17189       5


# Movie to Movie Similarity

In [25]:
m2m_similarity = cosine_similarity(sample_matrix.T, dense_output = False)

print(m2m_similarity)

  (54, 16954)	0.006260858582095146
  (54, 14193)	0.04356068418690321
  (54, 11701)	0.014813595639006074
  (54, 11199)	0.034092865808321576
  (54, 9097)	0.016652025488231394
  (54, 8743)	0.009177735043308844
  (54, 5861)	0.03305245869744102
  (54, 4570)	0.010638899909514296
  (54, 3290)	0.013038412858810782
  (54, 1492)	0.07265392195447241
  (54, 1289)	0.019274180271486575
  (54, 564)	0.04836543252561524
  (54, 17189)	0.03137958578492142
  (54, 17446)	0.052119752405704295
  (54, 16149)	0.05768430838909743
  (54, 15717)	0.05625192270599867
  (54, 15530)	0.0156847067205133
  (54, 14425)	0.04249765015149823
  (54, 13407)	0.18481233109010922
  (54, 11245)	0.09274777915203367
  (54, 10111)	0.03608178183615071
  (54, 10027)	0.013828299047000897
  (54, 9472)	0.032913419928086204
  (54, 9240)	0.009256283823191798
  (54, 7757)	0.01974068740244427
  :	:
  (17701, 2763)	0.08349429836577887
  (17701, 2452)	0.06905162900903714
  (17701, 2395)	0.060959267885914464
  (17701, 2137)	0.11770525672905457


### Create a function to repare fwatured data

In [26]:
def feature_train_data(df, sample_matrix):
    
    df.reset_index(drop=True, inplace=True)
    u_ratings = []
    m_ratings = []
    movieGlobalAverage = []
    userGlobalAverage = []
    #print(df)
    for i in tqdm(range(len(df))):
        user_id = df['user_id'][i]
        movie_id = df['movie_id'][i]
        
        movieGlobalAverage.append(averageOfMovie[movie_id][0, 0])
        userGlobalAverage.append(averageOfUser[user_id][0, 0])
        
        
        similar_user_v = cosine_similarity(sample_matrix[user_id], sample_matrix).ravel()
        similar_users = similar_user_v.argsort()[::-1][1:]
        
        # Rating for the movie by similar users
        rating = sample_matrix[similar_users, movie_id].toarray().ravel()
        rating = rating[rating != 0][:5]
        rating = np.hstack((rating,[0, 0, 0, 0, 0]))
        rating = rating[:5]
        
        u_ratings.append(rating)
        
        similar_movie_v = m2m_similarity[movie_id].toarray().ravel()
        
        similar_movies = similar_movie_v.argsort()[::-1][1:]
        
        # Rating of user given to similar movies
        rating = sample_matrix[user_id, similar_movies].toarray().ravel()
        rating = rating[rating != 0][:5]
        
        rating = np.hstack((rating,[0, 0, 0, 0, 0]))
        rating = rating[:5]
        
        m_ratings.append(rating)
        #break
        
       
    m_ratings = np.array(m_ratings)
    u_ratings = np.array(u_ratings)
    
    df['sm1'] = m_ratings[:, 0]
    df['sm2'] = m_ratings[:, 1]
    df['sm3'] = m_ratings[:, 2]
    df['sm4'] = m_ratings[:, 3]
    df['sm5'] = m_ratings[:, 4]
    
    df['su1'] = u_ratings[:, 0]
    df['su2'] = u_ratings[:, 1]
    df['su3'] = u_ratings[:, 2]
    df['su4'] = u_ratings[:, 3]
    df['su5'] = u_ratings[:, 4]
    
    df['userGlobalAverage'] = userGlobalAverage
    df['movieGlobalAverage'] = movieGlobalAverage
        
    return df
        

### Featuring Train Data

In [27]:
try:
    train_featured = pd.read_csv('data/train_featured.csv')
    print('Train Data is Found and Loaded.')
except:
    print('Train Data File was missing.')
    print('Going For complete read.')
    train_featured = feature_train_data(train_df, sample_matrix)
    
    train_featured.to_csv('data/train_featured.csv')

train_featured.head()

Train Data is Found and Loaded.


Unnamed: 0.1,Unnamed: 0,user_id,movie_id,rating,sm1,sm2,sm3,sm4,sm5,su1,su2,su3,su4,su5,userGlobalAverage,movieGlobalAverage
0,0,1032298,7745,3,4,4,2,5,4,4,2,3,4,3,3.684647,4.082576
1,1,792958,10906,4,5,5,2,0,0,4,4,3,3,3,3.109756,3.496262
2,2,2518644,2452,5,4,5,3,3,4,5,5,4,5,5,3.625731,4.434708
3,3,1995860,10162,3,5,5,3,4,3,2,5,5,5,4,3.080985,3.985408
4,4,904391,12299,2,3,5,4,3,5,4,4,2,3,4,3.198358,3.646641


In [28]:
X_train, y_train = train_featured.iloc[:, 4:].values, train_featured.iloc[:, 3].values

### Featuring Test Data

In [29]:
try:
    test_featured = pd.read_csv('data/test_featured.csv')
    print('Test Data is found and Loaded')
except:
    print('Test data File was missing.')
    print('Going For Complete read')
    test_featured = feature_train_data(test_df, sample_matrix)
    
    test_featured.to_csv('data/test_featured.csv')

test_featured.head()

Test Data is found and Loaded


Unnamed: 0.1,Unnamed: 0,user_id,movie_id,rating,sm1,sm2,sm3,sm4,sm5,su1,su2,su3,su4,su5,userGlobalAverage,movieGlobalAverage
0,0,1022254,2395,3,4,3,4,5,5,4,3,3,4,4,3.845924,3.459608
1,1,719725,12582,4,4,4,4,5,4,3,4,5,4,4,3.723857,3.86175
2,2,858589,14621,4,3,3,2,2,4,4,5,5,3,5,3.444444,4.341956
3,3,1712855,10906,3,4,3,3,4,5,3,3,4,3,5,3.197652,3.496262
4,4,717135,16793,4,2,4,4,4,2,5,4,4,4,5,3.416462,3.925


In [30]:
X_test, y_test = test_featured.iloc[:, 4:].values, test_featured.iloc[:, 3].values

In [31]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train, y_train)

LinearRegression()

In [32]:
print(f'R2 for train data :{model.score(X_train, y_train)}')
print(f'R2 for test data :{model.score(X_test, y_test)}')

R2 for train data :0.3235282326870881
R2 for test data :0.3267819834698753


In [33]:
from sklearn.metrics import mean_squared_error

print(f'rmse for train data :{mean_squared_error(model.predict(X_train), y_train, squared=False)}')
print(f'rmse for test data :{mean_squared_error(model.predict(X_test), y_test, squared=False)}')

rmse for train data :0.8899744983534313
rmse for test data :0.8913186172259664


# Model Comparioson

In [34]:
surprise_rmse = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse') 
surprise_rmse

Unnamed: 0_level_0,test_rmse,fit_time,test_time,extra_test
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SVDpp,0.957594,5.94339,0.350761,0.946693
BaselineOnly,0.967777,0.036974,0.069207,0.963642
SVD,0.973234,1.424823,0.133416,0.970545
KNNBaseline,0.979657,4.340167,3.085365,0.971751
KNNBasic,1.023521,4.300701,2.892733,1.022714
SlopeOne,1.033325,0.112441,0.141335,0.999597
KNNWithMeans,1.054333,4.104774,2.805058,1.017249
KNNWithZScore,1.058312,4.362696,2.886681,1.016851
NormalPredictor,1.456048,0.02779,0.082906,1.453744


In [35]:
surprise_rmse.loc["SVDpp",["test_rmse"]]

test_rmse    0.957594
Name: SVDpp, dtype: float64

In [36]:
surprise_rmse.iloc[0,0]

0.9575940776389783

In [37]:
d = {"SVDpp"           : surprise_rmse.iloc[0,0],
     "BaselineOnly"    : surprise_rmse.iloc[1,0],
     "SVD"             : surprise_rmse.iloc[2,0],
     "KNNBaseline"     : surprise_rmse.iloc[3,0],
     "KNNBasic"        : surprise_rmse.iloc[4,0],
     "SlopeOne"        : surprise_rmse.iloc[5,0],
     "KNNWithZScore"   : surprise_rmse.iloc[6,0],
     "KNNWithMeans"    : surprise_rmse.iloc[7,0],
     "NormalPredictor" : surprise_rmse.iloc[8,0],
     "LinearRegression" : mean_squared_error(model.predict(X_test), y_test, squared=False)}

comparison_table = pd.DataFrame.from_dict(d, orient = "index")
comparison_table.columns = ["test_rmse"]
comparison_table["test_rmse"] = comparison_table["test_rmse"].round(4)
comparison_table = comparison_table.sort_values(["test_rmse"])
comparison_table

Unnamed: 0,test_rmse
LinearRegression,0.8913
SVDpp,0.9576
BaselineOnly,0.9678
SVD,0.9732
KNNBaseline,0.9797
KNNBasic,1.0235
SlopeOne,1.0333
KNNWithZScore,1.0543
KNNWithMeans,1.0583
NormalPredictor,1.456


# Check Users

In [38]:
uid = int(1032298)
mid = int(7745)

my_pred = []
# get a prediction fo specific usr and movie.
for i, algorithm in enumerate([SVD(), SVDpp(), SlopeOne(), NormalPredictor(), KNNBaseline(sim_options=sim_options), KNNBasic(sim_options=sim_options), KNNWithMeans(sim_options=sim_options), KNNWithZScore(sim_options=sim_options), BaselineOnly()]):#, NMF() CoClustering()]:
    algorithm.fit(trainset)
    pred = algorithm.predict(uid, mid, r_ui = 3 , verbose = True)
    algo_names = ["SVD", "SVDpp", "SlopeOne", "NormalPredictor", "KNNBaseline", "KNNBasic", "KNNWithMeans", "KNNWithZScore", "BaselineOnly"]
    df = [algo_names[i], pred]
    my_pred.append(df)
    
surprise_predictions = []
for i in my_pred:
    df = [i[0], i[1][0], i[1][1], i[1][2], i[1][3]]
    surprise_predictions.append(df)
surprise_predictions_user1 = pd.DataFrame(surprise_predictions)
surprise_predictions_user1.columns = ["model_name", "user_id", "movie_id", "actual_rating", "predicted_rating"]


user: 1032298    item: 7745       r_ui = 3.00   est = 3.78   {'was_impossible': False}
user: 1032298    item: 7745       r_ui = 3.00   est = 3.62   {'was_impossible': False}
user: 1032298    item: 7745       r_ui = 3.00   est = 3.69   {'was_impossible': False}
user: 1032298    item: 7745       r_ui = 3.00   est = 4.16   {'was_impossible': False}
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 1032298    item: 7745       r_ui = 3.00   est = 3.88   {'actual_k': 40, 'was_impossible': False}
Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 1032298    item: 7745       r_ui = 3.00   est = 4.00   {'actual_k': 40, 'was_impossible': False}
Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 1032298    item: 7745       r_ui = 3.00   est = 3.70   {'actual_k': 40, 'was_impossible': False}
Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 10

In [39]:
train_featured2 = train_featured
train_featured2["model_name"] = "Linear Regression"
train_featured2["predicted_rating"] = model.predict(X_train)
train_featured2.rename(columns = {"rating": "actual_rating"}, inplace = True) 
linear_prediction_user1 = train_featured2[["model_name", "user_id", "movie_id", "actual_rating", "predicted_rating"]]
linear_prediction_user1 = linear_prediction_user1[(linear_prediction_user1["user_id"] == 1032298) & (linear_prediction_user1["movie_id"] == 7745)]

In [40]:
user1 = linear_prediction_user1.append(surprise_predictions_user1)
user1.reset_index().iloc[:,1:]

Unnamed: 0,model_name,user_id,movie_id,actual_rating,predicted_rating
0,Linear Regression,1032298,7745,3,3.685386
1,SVD,1032298,7745,3,3.775326
2,SVDpp,1032298,7745,3,3.619591
3,SlopeOne,1032298,7745,3,3.691292
4,NormalPredictor,1032298,7745,3,4.162063
5,KNNBaseline,1032298,7745,3,3.879853
6,KNNBasic,1032298,7745,3,4.0
7,KNNWithMeans,1032298,7745,3,3.700082
8,KNNWithZScore,1032298,7745,3,3.707194
9,BaselineOnly,1032298,7745,3,3.964127


# Check Recommendation System

### Some Pre-Processing

In [41]:
with open('data/movie_titles.csv', 'r', encoding='ISO-8859-1') as file:
    lines = csv.reader(file, delimiter=',')
    data = []
    for line in tqdm(lines):
        row = {}
        row['movie_id'] = line[0]
        row['release_year'] = line[1]
        row['movie_name'] = ' '.join(line[2:])
        data.append(row)

## Movies data frame to store titles.

movies = pd.DataFrame(data)
movies.head()

17770it [00:00, 92953.93it/s]


Unnamed: 0,movie_id,release_year,movie_name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW


In [42]:
row, column, rating = sp.find(sample_matrix)

# Sort the data based on Popularity
uniq_movie = np.unique(column)
movies = movies.iloc[uniq_movie - 1]

movie_sim = cosine_similarity(sample_matrix.T, dense_output=False)

### Create a Function to Predict Rating

In [43]:
def predict_rating(movieid, userarray, similar_users):

        # returns: numeric rating

        user_global = userarray.sum()/np.count_nonzero(userarray)
        movie_global = sample_matrix[:, movieid].sum()/sample_matrix[:, movieid].count_nonzero()

        similar_movies = movie_sim[movieid].toarray().ravel().argsort()[::-1][1:]

        movie_ratings = sample_matrix[similar_users, movieid].toarray().ravel()
        movie_ratings = movie_ratings[movie_ratings != 0][:5]
        movie_ratings = np.hstack((movie_ratings, [0, 0, 0, 0, 0]))
        movie_ratings = movie_ratings[:5]

        user_ratings = userarray[similar_movies]
        user_ratings = user_ratings[user_ratings != 0][:5]
        user_ratings = np.hstack((user_ratings, [0, 0, 0, 0, 0]))
        user_ratings = user_ratings[:5]

        features = np.hstack((movie_ratings, user_ratings, [user_global, movie_global]))
        rating = model.predict([features])

        return rating[0]

### Create a Funtion to Get Recommendation

In [44]:
def recommendation(user_id, number):

        # user: An array which contains ratings given by user to some Movies.
        # number: How much results you want
        # return: a List of recommended Movies.
        
    movie_box = []
    if not user_id in row:
        
        return 'User is not in dataset'

    user = sample_matrix[user_id].toarray().ravel()

    similar_users = cosine_similarity([user], sample_matrix).ravel().argsort()[::-1][1:]

    for movie in uniq_movie:
        if user[movie] == 0:
            rating = predict_rating(movie, np.array(user), similar_users)
            movie_box.append((rating, movie))

    movie_box.sort(reverse=True)
    array_index = np.array([item[1] for item in movie_box])

    temp = movies.copy()
    temp['rating'] = [0]*len(temp)
    for item in movie_box:
        temp.loc[item[1] - 1, 'rating'] = round(item[0], 0)
    df = pd.DataFrame(temp.loc[array_index - 1].values.tolist()[:number])
    df.columns = ["movie_id", "release_year", "movie_name", "predicted_rating"]

    return df

In [45]:
recommendation(578, 10)

Unnamed: 0,movie_id,release_year,movie_name,predicted_rating
0,14791,2003,Trailer Park Boys: Season 3,5.0
1,8129,1971,Little Murders,5.0
2,2811,1990,Roger Waters: The Wall: Live in Berlin,5.0
3,5888,1986,Cheers: Season 5,5.0
4,11545,2001,The Blair Thumb,5.0
5,10390,2003,Midsomer Murders: Painted in Blood,5.0
6,6490,2004,Dream Theater: Images and Words: Live in Tokyo...,5.0
7,9148,1991,A Night at the Opera,5.0
8,14640,2002,Rugrats: Mysteries,4.0
9,7429,2000,The Most Terrible Time in My Life,4.0
