## RECOMMENDATION SYSTEM BASICS

#### types
#### collaborative filtering
#### content based
#### hybrid

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Movie.csv')
df

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
1,6,Toy Story (1995),5.0
2,8,Toy Story (1995),4.0
3,10,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5
...,...,...,...
8987,7087,GoldenEye (1995),3.0
8988,7088,GoldenEye (1995),1.0
8989,7105,GoldenEye (1995),2.0
8990,7113,GoldenEye (1995),3.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8992 entries, 0 to 8991
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   userId  8992 non-null   int64  
 1   movie   8992 non-null   object 
 2   rating  8992 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 210.9+ KB


In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
userId,8992.0,3521.880116,2012.497384,1.0,1779.0,3530.0,5213.25,7120.0
rating,8992.0,3.557162,0.967071,0.5,3.0,3.5,4.0,5.0


In [5]:
df[df.duplicated]

Unnamed: 0,userId,movie,rating


In [6]:
df.isnull().sum()

userId    0
movie     0
rating    0
dtype: int64

In [7]:
for col in df:
    print(col,df[col].unique(),df[col].nunique())

userId [   3    6    8 ... 7080 7087 7105] 4081
movie ['Toy Story (1995)' 'Jumanji (1995)' 'Grumpier Old Men (1995)'
 'Waiting to Exhale (1995)' 'Father of the Bride Part II (1995)'
 'Heat (1995)' 'Sabrina (1995)' 'Tom and Huck (1995)'
 'Sudden Death (1995)' 'GoldenEye (1995)'] 10
rating [4.  5.  4.5 3.  1.  3.5 1.5 2.  2.5 0.5] 10


In [8]:
df['movie'].value_counts()

movie
Toy Story (1995)                      2569
GoldenEye (1995)                      1548
Heat (1995)                           1260
Jumanji (1995)                        1155
Sabrina (1995)                         700
Grumpier Old Men (1995)                685
Father of the Bride Part II (1995)     657
Sudden Death (1995)                    202
Waiting to Exhale (1995)               138
Tom and Huck (1995)                     78
Name: count, dtype: int64

In [9]:
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
# reshape user user similarity matrix
print(df.userId.unique())
len(df.userId.unique())

[   3    6    8 ... 7080 7087 7105]


4081

In [11]:
user_df = df.pivot_table(index = 'userId',columns = 'movie',values = 'rating')
user_df

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,3.5,,,,,
2,,,4.0,,,,,,,
3,,,,,,,,,4.0,
4,,4.0,,3.0,,,,,,
5,,,,,3.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
7115,4.0,,,,,,,,,
7116,3.5,,,,,,,,4.0,
7117,,3.0,4.0,5.0,,3.0,1.0,,4.0,
7119,,,,,,,,,5.0,


In [12]:
# fill NAN with 0
user_df.fillna(0,inplace = True)
user_df.head()

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
4,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0


In [13]:
#calculating cosine similarities between users
from sklearn.metrics import pairwise_distances

In [14]:
user_sim = 1 -pairwise_distances(user_df.values,metric = 'cosine')
user_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.55337157],
       [0.        , 1.        , 0.        , ..., 0.45883147, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       ...,
       [0.        , 0.45883147, 0.45883147, ..., 1.        , 0.45883147,
        0.47607054],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       [0.55337157, 0.        , 0.62254302, ..., 0.47607054, 0.62254302,
        1.        ]])

In [15]:
# as same person 100 similarity

In [16]:
np.fill_diagonal(user_sim,0)
user_sim

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.55337157],
       [0.        , 0.        , 0.        , ..., 0.45883147, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.45883147, 1.        ,
        0.62254302],
       ...,
       [0.        , 0.45883147, 0.45883147, ..., 0.        , 0.45883147,
        0.47607054],
       [0.        , 0.        , 1.        , ..., 0.45883147, 0.        ,
        0.62254302],
       [0.55337157, 0.        , 0.62254302, ..., 0.47607054, 0.62254302,
        0.        ]])

In [17]:
sim_df = pd.DataFrame(user_sim)
sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080
0,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,1.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.553372
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.390567,0.707107,0.615457,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.458831,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.650945,0.000000,0.492366,1.000000,0.874157,...,0.000000,1.000000,0.000000,0.707107,0.000000,0.000000,0.752577,0.458831,1.000000,0.622543
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.615457,0.000000,0.388514,...,0.800000,0.000000,0.000000,0.000000,0.989949,0.000000,0.000000,0.619422,0.000000,0.000000
4,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,1.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.553372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4076,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.658505,0.000000,0.000000,0.000000
4077,0.000000,0.000000,0.752577,0.000000,0.000000,0.489886,0.000000,0.370543,0.752577,0.657870,...,0.000000,0.752577,0.000000,0.532152,0.000000,0.658505,0.000000,0.345306,0.752577,0.468511
4078,0.000000,0.458831,0.458831,0.619422,0.000000,0.701884,0.567775,0.889532,0.458831,0.568212,...,0.344124,0.458831,0.000000,0.324443,0.648886,0.000000,0.345306,0.000000,0.458831,0.476071
4079,0.000000,0.000000,1.000000,0.000000,0.000000,0.650945,0.000000,0.492366,1.000000,0.874157,...,0.000000,1.000000,0.000000,0.707107,0.000000,0.000000,0.752577,0.458831,0.000000,0.622543


In [17]:
# sim_df.index = df.userId.unique()
# sim_df.columns = df.userId.unique()

In [30]:
sim_df.index = user_df.index
sim_df.columns = user_df.index

In [19]:
sim_df.iloc[0:15,0:15]

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.514496,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.390567,0.707107,0.615457,0.0,0.0,0.437595,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.650945,0.0,0.492366,1.0,0.874157,0.58346,0.685994,0.789352,0.0,0.707107
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.615457,0.0,0.388514,0.262557,0.411597,0.0,0.8,0.424264
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.514496,0.0,0.0,0.0
6,0.0,0.390567,0.650945,0.0,0.0,0.0,0.73646,0.56088,0.650945,0.569028,0.83556,0.446544,0.913466,0.0,0.460287
7,0.0,0.707107,0.0,0.0,0.0,0.73646,0.0,0.435194,0.0,0.0,0.618853,0.0,0.434122,0.0,0.0
8,0.0,0.615457,0.492366,0.615457,0.0,0.56088,0.435194,0.0,0.492366,0.669519,0.71819,0.59108,0.38865,0.492366,0.609272
10,0.0,0.0,1.0,0.0,0.0,0.650945,0.0,0.492366,0.0,0.874157,0.58346,0.685994,0.789352,0.0,0.707107
11,0.0,0.0,0.874157,0.388514,0.0,0.569028,0.0,0.669519,0.874157,0.0,0.510036,0.849528,0.690018,0.485643,0.618123


In [31]:
sim_df.shape

(4081, 4081)

In [40]:
row_id = 1   #  user you want

cols = []
for j in sim_df.columns:
    if sim_df.loc[row_id, j] > 0.9:
        cols.append(j)

print("User ",row_id ," has >0.9 in columns:", cols)

User  1  has >0.9 in columns: [5, 119, 120, 128, 132, 142, 152, 255, 326, 343, 369, 395, 469, 493, 574, 630, 670, 698, 703, 706, 722, 791, 831, 893, 980, 1255, 1265, 1294, 1310, 1313, 1441, 1546, 1629, 1652, 1707, 1927, 1967, 2306, 2343, 2467, 2516, 2598, 2611, 2638, 2671, 2810, 2816, 2842, 2936, 2943, 2985, 2992, 2999, 3069, 3106, 3111, 3251, 3262, 3274, 3348, 3405, 3483, 3491, 3552, 3585, 3661, 3663, 3728, 3733, 3796, 3819, 3847, 3900, 3938, 3961, 3983, 3988, 4030, 4032, 4042, 4054, 4063, 4115, 4120, 4125, 4141, 4212, 4292, 4514, 4516, 4520, 4525, 4538, 4574, 4577, 4604, 4605, 4640, 4641, 4670, 4731, 4821, 4848, 4865, 4923, 5012, 5116, 5179, 5374, 5410, 5488, 5588, 5699, 5700, 5724, 5818, 5888, 5922, 6037, 6064, 6077, 6110, 6135, 6206, 6224, 6351, 6389, 6401, 6550, 6572, 6614, 6638, 6752, 6795, 6836, 6885, 6932, 6943, 6965, 7016, 7100, 7102, 7108]


In [41]:
len(cols)

143

In [42]:
user = 1
# Get columns where this row has a value (not NaN) here already replaced with 0 
moviesBy3 = user_df.loc[user].dropna().index.tolist()
moviesBy3

['Father of the Bride Part II (1995)',
 'GoldenEye (1995)',
 'Grumpier Old Men (1995)',
 'Heat (1995)',
 'Jumanji (1995)',
 'Sabrina (1995)',
 'Sudden Death (1995)',
 'Tom and Huck (1995)',
 'Toy Story (1995)',
 'Waiting to Exhale (1995)']

In [43]:
# rating greater than 3
moviesBy3Recommend = user_df.loc[user][user_df.loc[user] > 0].index.tolist()
moviesBy3Recommend

['Jumanji (1995)']

In [45]:
def recommend(row_id):
    cols = []
    for j in sim_df.columns:
        if sim_df.loc[row_id, j] > 0.9:
            cols.append(j)

    print("User ", row_id, " has >0.9 in columns:", cols)
    recommendlist = []
    for x in cols:
        # use x directly (x is a user label), and extend not overwrite
        recommendlist.extend(user_df.loc[x][user_df.loc[x] > 0].index.tolist())
    # dedupe
    recommendlist = list(dict.fromkeys(recommendlist))
    return recommendlist
user_input = int(input("Enter a userId: "))
print("Recommendations for user", user_input, ":", recommend(user_input))

Enter a userId:  1


User  1  has >0.9 in columns: [5, 119, 120, 128, 132, 142, 152, 255, 326, 343, 369, 395, 469, 493, 574, 630, 670, 698, 703, 706, 722, 791, 831, 893, 980, 1255, 1265, 1294, 1310, 1313, 1441, 1546, 1629, 1652, 1707, 1927, 1967, 2306, 2343, 2467, 2516, 2598, 2611, 2638, 2671, 2810, 2816, 2842, 2936, 2943, 2985, 2992, 2999, 3069, 3106, 3111, 3251, 3262, 3274, 3348, 3405, 3483, 3491, 3552, 3585, 3661, 3663, 3728, 3733, 3796, 3819, 3847, 3900, 3938, 3961, 3983, 3988, 4030, 4032, 4042, 4054, 4063, 4115, 4120, 4125, 4141, 4212, 4292, 4514, 4516, 4520, 4525, 4538, 4574, 4577, 4604, 4605, 4640, 4641, 4670, 4731, 4821, 4848, 4865, 4923, 5012, 5116, 5179, 5374, 5410, 5488, 5588, 5699, 5700, 5724, 5818, 5888, 5922, 6037, 6064, 6077, 6110, 6135, 6206, 6224, 6351, 6389, 6401, 6550, 6572, 6614, 6638, 6752, 6795, 6836, 6885, 6932, 6943, 6965, 7016, 7100, 7102, 7108]
Recommendations for user 1 : ['Jumanji (1995)', 'GoldenEye (1995)', 'Father of the Bride Part II (1995)', 'Sabrina (1995)']


In [None]:
# for row_id in sim_df.index:
#     cols = []
#     for j in sim_df.columns:
#         val = sim_df.loc[row_id, j]
#         if pd.notna(val) and val > 0.9:   # check NaN + condition
#             cols.append(j)
#     print("User",row_id, cols)

User 3 [11, 189, 199, 209, 221, 251, 271, 432, 578, 600, 624, 666, 749, 787, 900, 1015, 1071, 1119, 1136, 1140, 1167, 1315, 1376, 1457, 1570, 1966, 1984, 2031, 2056, 2065, 2253, 2394, 2536, 2574, 2660, 2948, 3029, 3567, 3624, 3858, 3956, 4065, 4084, 4135, 4180, 4389, 4401, 4434, 4630, 4648, 4713, 4726, 4729, 4824, 4877, 4889, 5084, 5102, 5138, 5270, 5348, 5462, 5483, 5579, 5619, 5736, 5749, 5912, 5921, 6040, 6067, 6085, 6157, 6238, 6293, 6326, 6342, 6407, 6410, 6417, 6449, 6460, 6534, 6543, 6553, 6578, 6721, 6827, 698, 703, 743, 760, 925, 1313, 1417, 1892, 1912, 2306, 2343, 2671, 3405, 4044, 4457, 4520, 4962, 6007, 7100, 1260, 5213, 6169, 6136, 2926, 101, 154, 497, 1442, 2475, 2922, 4444, 4807, 4952, 5334, 5585, 6425, 6568, 3285, 4705, 5201, 795, 1005, 1405, 1671, 2964, 3233, 3691, 4346, 4948, 5093, 5582, 5906, 6926, 6962, 6993]
User 6 [168, 667, 715, 744, 775, 776, 797, 809, 1555, 1761, 2206, 2384, 2405, 2745, 2899, 2979, 3073, 3492, 3565, 3586, 3595, 3981, 4369, 4565, 4868, 5444, 551