In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the movie catalogue and the user ratings from the CSV files that sit
# next to the notebook.
movies, ratings = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv'))

In [3]:
# Preview the first five rows of the movie catalogue.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Attach every rating to its movie (inner join on movieId), then discard the
# two columns that are not used further down: genres and timestamp.
df = (
    movies
    .merge(ratings, on='movieId')
    .drop(columns=['genres', 'timestamp'])
)

In [7]:
# Preview the first five rows of the merged movie/rating table.
df.head(n=5)

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [8]:
# Number of ratings contributed by each user, most active users first.
df.loc[:, 'userId'].value_counts()

414    2698
599    2478
474    2108
448    1864
274    1346
       ... 
53       20
207      20
431      20
442      20
189      20
Name: userId, Length: 610, dtype: int64

In [9]:
# Dimensions of the merged table as (rows, columns).
df.shape


(100836, 4)

In [10]:
# Per-movie summary: how many ratings each movie received and their mean.
grp = (
    df.groupby('movieId')['rating']
    .agg(n_ratings='count', avg_rating='mean')
)
grp


Unnamed: 0_level_0,n_ratings,avg_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,215,3.920930
2,110,3.431818
3,52,3.259615
4,7,2.357143
5,49,3.071429
...,...,...
193581,1,4.000000
193583,1,3.500000
193585,1,3.500000
193587,1,3.500000


In [11]:
# Keep only the movies that were rated more than 100 times.
grp = grp.loc[grp['n_ratings'].gt(100)]
grp

Unnamed: 0_level_0,n_ratings,avg_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,215,3.920930
2,110,3.431818
6,102,3.946078
10,132,3.496212
32,177,3.983051
...,...,...
48516,107,4.252336
58559,149,4.238255
60069,104,4.057692
68954,105,4.004762


In [12]:
# Restrict the ratings to the movies that remain in `grp` (its index holds
# the surviving movieIds), then preview the result.
df = df.loc[df['movieId'].isin(grp.index), :]
df.head(n=5)

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [13]:
# Dimensions of the ratings table, as (rows, columns), after keeping only the
# movies present in `grp`.
df.shape

(19788, 4)

In [14]:
# Reshape the long ratings table into a movie-by-user grid: one row per
# movieId, one column per userId, each cell holding the rating (NaN where the
# user did not rate the movie).
matrix = df.pivot_table(values='rating', index='movieId', columns='userId')
matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
6,4.0,,,,,4.0,,,,,...,,3.0,4.0,3.0,,,,,,5.0
10,,,,,,3.0,,2.0,,,...,,3.0,,,,,,4.0,4.0,
32,,,,2.0,,4.0,,3.0,,,...,,3.0,3.0,4.0,,4.0,,3.5,,4.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48516,,4.0,,,,,1.0,,,,...,5.0,,,,,3.5,,,,5.0
58559,,4.5,,,,,,,,4.5,...,,,,,,,,,,4.5
60069,,,,,,,,,,,...,5.0,,,,,4.0,,,,4.5
68954,,,,,,,,,,4.0,...,5.0,,,,,,,,,3.5


NaN values here mark movies that the user has not rated (i.e. has not seen).

In [15]:
# Replace the missing ratings with 0; the prediction step further down treats
# a rating of 0 as "not seen by this user".
matrix = matrix.fillna(value=0)

In [16]:
# Display the rating matrix after the missing entries were replaced with 0.
matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0
10,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0
32,0.0,0.0,0.0,2.0,0.0,4.0,0.0,3.0,0.0,0.0,...,0.0,3.0,3.0,4.0,0.0,4.0,0.0,3.5,0.0,4.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48516,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,5.0
58559,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
60069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.5
68954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5


Convert the `movieId` index of the matrix into a list, so that the row position of any movie can be looked up.

In [20]:
# movieIds in the same order as the rows of `matrix`; the position of an id in
# this list is the row number of that movie.
movie_ids = matrix.index.tolist()

In [21]:
# Display the list of movieIds taken from the index of `matrix`.
movie_ids

[1,
 2,
 6,
 10,
 32,
 34,
 39,
 47,
 50,
 110,
 111,
 150,
 153,
 161,
 165,
 185,
 208,
 223,
 231,
 253,
 260,
 292,
 293,
 296,
 316,
 318,
 329,
 344,
 349,
 356,
 357,
 364,
 367,
 377,
 380,
 434,
 454,
 457,
 480,
 500,
 527,
 539,
 541,
 586,
 587,
 588,
 589,
 590,
 592,
 593,
 595,
 597,
 608,
 648,
 733,
 736,
 778,
 780,
 858,
 924,
 1036,
 1073,
 1089,
 1097,
 1136,
 1193,
 1196,
 1197,
 1198,
 1200,
 1206,
 1208,
 1210,
 1213,
 1214,
 1221,
 1222,
 1240,
 1258,
 1265,
 1270,
 1291,
 1527,
 1580,
 1682,
 1704,
 1721,
 1732,
 1923,
 1968,
 2028,
 2115,
 2329,
 2571,
 2628,
 2683,
 2706,
 2716,
 2762,
 2858,
 2918,
 2959,
 3147,
 3578,
 3793,
 3996,
 4226,
 4306,
 4878,
 4886,
 4896,
 4963,
 4973,
 4993,
 4995,
 5349,
 5418,
 5445,
 5816,
 5952,
 5989,
 6377,
 6539,
 6874,
 7153,
 7361,
 7438,
 8961,
 33794,
 48516,
 58559,
 60069,
 68954,
 79132]

In [22]:
# Display the movie-by-user rating matrix that the similarity computation
# below takes as input.
matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0
10,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0
32,0.0,0.0,0.0,2.0,0.0,4.0,0.0,3.0,0.0,0.0,...,0.0,3.0,3.0,4.0,0.0,4.0,0.0,3.5,0.0,4.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48516,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,5.0
58559,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
60069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.5
68954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5


In [23]:
# Pairwise cosine similarity between the rows of `matrix` (one row per movie),
# which yields a square movie-by-movie array of similarity scores.
simScores = cosine_similarity(X=matrix)

In [16]:
# Single-column frame holding the ratings of user 2 for every movie.
rec = matrix[[2]]
rec

userId,2
movieId,Unnamed: 1_level_1
1,0.0
2,0.0
6,0.0
10,0.0
32,0.0
...,...
48516,4.0
58559,4.5
60069,0.0
68954,0.0


In [42]:
# Display the cosine-similarity array computed from `matrix`.
simScores

array([[1.        , 0.41056206, 0.37631587, ..., 0.33241025, 0.33664951,
        0.32396756],
       [0.41056206, 1.        , 0.29700932, ..., 0.29070681, 0.34536285,
        0.25457889],
       [0.37631587, 0.29700932, 1.        , ..., 0.18842483, 0.16419155,
        0.19689442],
       ...,
       [0.33241025, 0.29070681, 0.18842483, ..., 1.        , 0.68938263,
        0.56857582],
       [0.33664951, 0.34536285, 0.16419155, ..., 0.68938263, 1.        ,
        0.60617328],
       [0.32396756, 0.25457889, 0.19689442, ..., 0.56857582, 0.60617328,
        1.        ]])

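Create a function that predicts the rating a user would give to movies they have not seen yet, and recommends the movies whose predicted rating is high enough.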

In [71]:
from multiprocessing.sharedctypes import Value


def prediction(active_uid, matrix, *, n_targets=10, top_k=5, threshold=3.5,
               sim_scores=None, movies_df=None, seed=None):
    """Predict ratings for movies a user has not rated and print recommendations.

    A random sample of the user's unrated movies is drawn. For each sampled
    movie the predicted rating is the similarity-weighted average of the
    user's own ratings for the ``top_k`` rated movies most similar to it:

        sum(rating_i * similarity_i) / sum(similarity_i)

    Each predicted rating is printed; when it reaches ``threshold`` the movie
    title is printed on the following line.

    Parameters
    ----------
    active_uid :
        Column label of the user in ``matrix``.
    matrix : pandas.DataFrame
        Movie-by-user rating grid (rows: movies, columns: users) in which a
        value of 0 means "not rated".
    n_targets : int, default 10
        Maximum number of unrated movies to sample. Capped at the number of
        unrated movies, so users with few unrated movies no longer trigger a
        ``ValueError`` from the sampler.
    top_k : int, default 5
        Number of most-similar rated movies used for each prediction.
    threshold : float, default 3.5
        Minimum predicted rating for a movie to be recommended.
    sim_scores : array-like, optional
        Square movie-by-movie similarity array whose rows and columns follow
        the row order of ``matrix``. When omitted it is computed from
        ``matrix`` with ``cosine_similarity``; pass a precomputed array (for
        example the notebook's ``simScores``) to avoid recomputing it.
    movies_df : pandas.DataFrame, optional
        Table with ``movieId`` and ``title`` columns used to look up titles.
        Defaults to the notebook-level ``movies`` frame.
    seed : int, optional
        Seed for reproducible sampling. When omitted, NumPy's global random
        state is used, as before.

    Returns
    -------
    None
        Results are printed.

    Raises
    ------
    KeyError
        If ``active_uid`` is not a column of ``matrix``.
    """
    # Ratings of the active user, indexed like the rows of `matrix`.
    user_ratings = matrix.loc[:, active_uid]
    # Separate the movies the user has rated from the ones they have not.
    seen = user_ratings[user_ratings != 0]
    unseen_ids = user_ratings.index[user_ratings == 0]

    # Never ask the sampler for more items than exist.
    n_pick = min(n_targets, len(unseen_ids))
    if n_pick <= 0:
        return

    if sim_scores is None:
        # Derive the similarities from the matrix that was actually passed in,
        # so row positions always agree with `matrix.index`.
        sim_scores = cosine_similarity(matrix)
    sim_scores = np.asarray(sim_scores)
    if movies_df is None:
        movies_df = movies

    candidates = unseen_ids.to_numpy()
    if seed is None:
        targets = np.random.choice(candidates, n_pick, replace=False)
    else:
        targets = np.random.default_rng(seed).choice(candidates, n_pick, replace=False)

    for target_id in targets:
        # Row of the similarity array that belongs to the target movie.
        row_pos = matrix.index.get_loc(target_id)
        sims_to_seen = pd.Series(sim_scores[row_pos], index=matrix.index).loc[seen.index]
        # The rated movies most similar to the target.
        neighbours = sims_to_seen.nlargest(top_k)

        weight_total = neighbours.sum()
        if weight_total == 0 or np.isnan(weight_total):
            # No usable neighbours (nothing rated, or zero similarity to every
            # rated movie): the weighted average is undefined, so skip.
            continue

        predicted_rating = (seen.loc[neighbours.index] * neighbours).sum() / weight_total
        print(predicted_rating)

        # Recommendation
        if predicted_rating >= threshold:
            title_match = movies_df.loc[movies_df['movieId'] == target_id, 'title']
            # Fall back to the raw id when the catalogue has no title for it.
            print(title_match.iloc[0] if len(title_match) else target_id)

In [72]:
# Predict and recommend for user 6, using the zero-filled rating matrix.
prediction(active_uid=6, matrix=matrix)

3.7871352013185096
Indiana Jones and the Last Crusade (1989)
4.005591880785197
Ghostbusters (a.k.a. Ghost Busters) (1984)
4.609707050877835
Home Alone (1990)
3.1924335709554907
3.411401727974821
4.3944865755565194
Minority Report (2002)
4.416947038611319
Finding Nemo (2003)
2.5879746966898134
3.627287218113959
Sixth Sense, The (1999)
4.571891377472642
X-Men (2000)
