# Load the libraries

In [1]:
# importing libraries
import pandas as pd
import numpy as np

# Load the Dataset

In [2]:
# reading ratings file:
r_cols=['user_id','movie_id','rating','unix_timestamp']
ratings=pd.read_csv('ml-100k/u.data',sep='\t',names=r_cols,encoding='latin-1')

In [3]:
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [4]:
n_users=ratings.user_id.unique().shape[0]
n_items=ratings.movie_id.unique().shape[0]

In [5]:
print("The number of user:",n_users)
print("The number of n_items:",n_items)

The number of user: 943
The number of n_items: 1682


# Create pivot table for user and movie basedon ratings

In [6]:
datama=ratings.pivot_table(index='user_id',columns='movie_id',values='rating')

In [7]:
data_matrix=datama.replace(np.nan,0)

In [8]:
data_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Find Cosine Similatity for user and Item

In [9]:
from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

# Using formula for user and item we are calcuating the score value

In [10]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = ratings - mean_user_rating.to_numpy()[:, np.newaxis]
        pred = mean_user_rating.to_numpy()[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred


In [11]:
# prediction Table
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [12]:
user_prediction

array([[ 2.06532606,  0.73430275,  0.62992381, ...,  0.39359041,
         0.39304874,  0.3927712 ],
       [ 1.76308836,  0.38404019,  0.19617889, ..., -0.08837789,
        -0.0869183 , -0.08671183],
       [ 1.79590398,  0.32904733,  0.15882885, ..., -0.13699223,
        -0.13496852, -0.13476488],
       ...,
       [ 1.59151513,  0.27526889,  0.10219534, ..., -0.16735162,
        -0.16657451, -0.16641377],
       [ 1.81036267,  0.40479877,  0.27545013, ..., -0.00907358,
        -0.00846587, -0.00804858],
       [ 1.8384313 ,  0.47964837,  0.38496292, ...,  0.14686675,
         0.14629808,  0.14641455]])

# As per User based filtering ,first have to find similarity between the input user and others

In [13]:
#1. Select input user

input_item=34

In [14]:
#2. Convert the user_sim table into DataFrame

item_sim_table=pd.DataFrame(item_similarity)

In [15]:
item_sim_table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,2.220446e-16,5.976178e-01,6.697552e-01,0.545062,0.713286,0.883656,0.379021,0.518886,0.503712,0.726065,...,0.964613,1.0,1.000000,1.000000,0.964613,1.0,1.0,1.0,0.952817,0.952817
1,5.976178e-01,2.220446e-16,7.269308e-01,0.497429,0.681164,0.916437,0.616597,0.662998,0.744748,0.828918,...,1.000000,1.0,1.000000,1.000000,1.000000,1.0,1.0,1.0,0.921701,0.921701
2,6.697552e-01,7.269308e-01,1.110223e-16,0.675134,0.787043,0.893278,0.627079,0.799206,0.726331,0.841896,...,1.000000,1.0,1.000000,1.000000,0.967708,1.0,1.0,1.0,1.000000,0.903125
3,5.450621e-01,4.974292e-01,6.751336e-01,0.000000,0.665761,0.909692,0.510717,0.509764,0.580956,0.747439,...,1.000000,1.0,0.905978,0.905978,0.962391,1.0,1.0,1.0,0.943587,0.924782
4,7.132865e-01,6.811638e-01,7.870434e-01,0.665761,0.000000,0.962701,0.665231,0.740839,0.727552,0.944547,...,1.000000,1.0,1.000000,1.000000,1.000000,1.0,1.0,1.0,1.000000,0.905789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1.000000e+00,1.000000e+00,1.000000e+00,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.0,1.000000,1.000000,1.000000,0.0,0.0,0.0,1.000000,1.000000
1678,1.000000e+00,1.000000e+00,1.000000e+00,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.0,1.000000,1.000000,1.000000,0.0,0.0,0.0,1.000000,1.000000
1679,1.000000e+00,1.000000e+00,1.000000e+00,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.0,1.000000,1.000000,1.000000,0.0,0.0,0.0,1.000000,1.000000
1680,9.528169e-01,9.217006e-01,1.000000e+00,0.943587,1.000000,1.000000,0.948502,0.917967,0.942640,1.000000,...,1.000000,1.0,1.000000,1.000000,1.000000,1.0,1.0,1.0,0.000000,1.000000


In [16]:
#3. Find similarity user for 78 using cosine table

similar_input_item= item_sim_table[input_item].sort_values(ascending=True).head(5).index

In [17]:
similar_input_item

Index([34, 77, 246, 1030, 794], dtype='int64')

In [18]:
#4.Convert in to list

similar_item_input=list(similar_input_item)

In [19]:
#5. Using similar_user_input,can select movie id from ratings table

similar_item_userid_list=[]
for sim_item in similar_item_input:
    sim=list(ratings[ratings['user_id']==sim_item]['movie_id'])
    similar_item_userid_list.append(sim)

In [20]:
similar_item_userid_list

[[312,
  242,
  690,
  310,
  259,
  299,
  245,
  332,
  329,
  286,
  1024,
  324,
  294,
  292,
  990,
  289,
  898,
  899,
  288,
  991],
 [484,
  518,
  172,
  474,
  91,
  195,
  238,
  168,
  125,
  23,
  498,
  431,
  153,
  4,
  181,
  179,
  176,
  276,
  50,
  265,
  174,
  210,
  69,
  405,
  523,
  1028,
  52,
  144,
  778,
  89,
  483,
  357,
  100,
  133,
  201,
  511,
  97,
  56,
  268,
  250,
  215,
  209,
  127,
  641,
  246,
  15,
  175,
  455,
  833,
  31,
  252,
  25,
  132,
  1,
  154,
  96,
  173,
  28,
  519,
  527,
  121,
  156,
  199,
  192,
  636,
  98,
  191,
  134,
  222,
  183,
  42,
  228],
 [201,
  919,
  416,
  94,
  561,
  409,
  665,
  721,
  239,
  425,
  1218,
  68,
  82,
  8,
  406,
  816,
  728,
  184,
  121,
  596,
  80,
  423,
  356,
  202,
  578,
  98,
  294,
  1073,
  215,
  196,
  426,
  849,
  99,
  284,
  109,
  802,
  17,
  410,
  568,
  368,
  100,
  451,
  231,
  840,
  385,
  675,
  720,
  541,
  469,
  1101,
  1,
  588,
  895,
  719,
 

In [21]:
len(similar_item_userid_list)

5

In [22]:
#6. Convert all the list as single
import itertools
similar_item_userid_single_list=list(itertools.chain.from_iterable(similar_item_userid_list))

In [23]:
len(similar_item_userid_single_list)

326

In [24]:
#7. Unique userid from the list

Unique_userid_similar_item=set(similar_item_userid_single_list)

In [25]:
len(Unique_userid_similar_item)

279

In [26]:
#8. Input movie watched movie_list

input_item_watched_userid=list(ratings[ratings['movie_id']==input_item]['user_id'].values)

In [27]:
input_item_watched_userid

[286, 276, 94, 184, 1, 551, 297]

In [28]:
#9. Create a list which should have recom movieid to the input user

recom=[]
for per_id in Unique_userid_similar_item:
    if(per_id in input_item_watched_userid):
        pass
    else:
        recom.append(per_id)

In [29]:
len(recom)

274

In [30]:
sorted(recom)

[3,
 4,
 8,
 11,
 12,
 13,
 14,
 15,
 17,
 19,
 23,
 24,
 25,
 28,
 29,
 31,
 38,
 41,
 42,
 50,
 52,
 55,
 56,
 66,
 67,
 68,
 69,
 77,
 80,
 81,
 82,
 83,
 89,
 91,
 92,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 109,
 111,
 116,
 117,
 118,
 121,
 125,
 127,
 132,
 133,
 134,
 137,
 138,
 144,
 145,
 150,
 151,
 153,
 154,
 155,
 156,
 158,
 159,
 161,
 164,
 168,
 172,
 173,
 174,
 175,
 176,
 178,
 179,
 181,
 183,
 185,
 187,
 191,
 192,
 195,
 196,
 198,
 199,
 201,
 202,
 204,
 208,
 209,
 210,
 211,
 215,
 216,
 219,
 221,
 222,
 223,
 224,
 226,
 227,
 228,
 230,
 231,
 232,
 235,
 236,
 238,
 239,
 240,
 242,
 245,
 246,
 248,
 249,
 250,
 252,
 254,
 257,
 259,
 260,
 265,
 268,
 269,
 273,
 275,
 284,
 285,
 288,
 289,
 292,
 294,
 299,
 310,
 312,
 324,
 329,
 332,
 356,
 357,
 368,
 369,
 384,
 385,
 393,
 401,
 402,
 403,
 404,
 405,
 406,
 409,
 410,
 411,
 412,
 413,
 416,
 418,
 420,
 423,
 425,
 426,
 431,
 432,
 433,
 441,
 444,
 447,
 451,
 455,
 469,
 470,
 473,
 474,

In [31]:
# Cross Checking 
sorted(Unique_userid_similar_item)

[1,
 3,
 4,
 8,
 11,
 12,
 13,
 14,
 15,
 17,
 19,
 23,
 24,
 25,
 28,
 29,
 31,
 38,
 41,
 42,
 50,
 52,
 55,
 56,
 66,
 67,
 68,
 69,
 77,
 80,
 81,
 82,
 83,
 89,
 91,
 92,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 109,
 111,
 116,
 117,
 118,
 121,
 125,
 127,
 132,
 133,
 134,
 137,
 138,
 144,
 145,
 150,
 151,
 153,
 154,
 155,
 156,
 158,
 159,
 161,
 164,
 168,
 172,
 173,
 174,
 175,
 176,
 178,
 179,
 181,
 183,
 184,
 185,
 187,
 191,
 192,
 195,
 196,
 198,
 199,
 201,
 202,
 204,
 208,
 209,
 210,
 211,
 215,
 216,
 219,
 221,
 222,
 223,
 224,
 226,
 227,
 228,
 230,
 231,
 232,
 235,
 236,
 238,
 239,
 240,
 242,
 245,
 246,
 248,
 249,
 250,
 252,
 254,
 257,
 259,
 260,
 265,
 268,
 269,
 273,
 275,
 276,
 284,
 285,
 286,
 288,
 289,
 292,
 294,
 299,
 310,
 312,
 324,
 329,
 332,
 356,
 357,
 368,
 369,
 384,
 385,
 393,
 401,
 402,
 403,
 404,
 405,
 406,
 409,
 410,
 411,
 412,
 413,
 416,
 418,
 420,
 423,
 425,
 426,
 431,
 432,
 433,
 441,
 444,
 447,
 451,
 4

In [32]:
# Cross Checking
sorted(input_item_watched_userid)

[1, 94, 184, 276, 286, 297, 551]

In [33]:
# Checking the common movie list
list(set(Unique_userid_similar_item)&set(input_item_watched_userid))

[1, 276, 184, 94, 286]

In [34]:
item_pred=pd.DataFrame(item_prediction)

In [35]:
item_pred

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.446278,0.475473,0.505938,0.443633,0.512667,0.547939,0.446243,0.463059,0.474916,0.515821,...,0.580579,0.576202,0.582478,0.582478,0.575717,0.588155,0.588155,0.588155,0.573107,0.566696
2,0.108544,0.132957,0.125589,0.124932,0.131178,0.129005,0.110883,0.122223,0.109599,0.121525,...,0.135490,0.136546,0.134829,0.134829,0.134108,0.134458,0.134458,0.134458,0.136576,0.137111
3,0.085685,0.091690,0.087643,0.089966,0.089658,0.089985,0.083492,0.089725,0.085188,0.088331,...,0.089770,0.090506,0.086261,0.086261,0.089201,0.084659,0.084659,0.084659,0.089768,0.090845
4,0.053693,0.059604,0.058114,0.058364,0.059356,0.061472,0.053374,0.058615,0.055905,0.060601,...,0.061349,0.061686,0.061195,0.061195,0.060693,0.057937,0.057937,0.057937,0.061673,0.062281
5,0.224739,0.229171,0.263280,0.226387,0.259973,0.296529,0.232710,0.237109,0.258581,0.275076,...,0.297628,0.295990,0.299922,0.299922,0.298188,0.302051,0.302051,0.302051,0.293373,0.294309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.092574,0.113870,0.110211,0.112040,0.112768,0.123140,0.098578,0.110839,0.098858,0.118579,...,0.123829,0.124430,0.120776,0.120776,0.121360,0.125056,0.125056,0.125056,0.123470,0.124327
940,0.164358,0.184894,0.196502,0.164884,0.195860,0.209652,0.162840,0.165606,0.171761,0.194536,...,0.217536,0.215515,0.219136,0.219136,0.216173,0.218583,0.218583,0.218583,0.216582,0.216819
941,0.032300,0.045024,0.042924,0.043223,0.047493,0.051077,0.032761,0.042646,0.039399,0.047421,...,0.052762,0.053042,0.052692,0.052692,0.051514,0.053028,0.053028,0.053028,0.051910,0.052280
942,0.157779,0.174095,0.189000,0.163514,0.186140,0.194151,0.164910,0.156970,0.167038,0.181295,...,0.197537,0.194479,0.198479,0.198479,0.197969,0.199793,0.199793,0.199793,0.197394,0.200031


In [36]:
item_pred[input_item]

user_id
1      0.572414
2      0.139686
3      0.092378
4      0.062496
5      0.286815
         ...   
939    0.122120
940    0.217996
941    0.053046
942    0.198270
943    0.332126
Name: 34, Length: 943, dtype: float64

In [37]:
item_pred_Trans=item_pred.T

In [38]:
item_pred_Trans

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
0,0.446278,0.108544,0.085685,0.053693,0.224739,0.358069,0.782713,0.090678,0.044465,0.366060,...,0.300762,0.058182,0.268047,0.062958,0.148765,0.092574,0.164358,0.032300,0.157779,0.247672
1,0.475473,0.132957,0.091690,0.059604,0.229171,0.403467,0.807765,0.093242,0.050556,0.402074,...,0.315097,0.077715,0.312001,0.078982,0.186952,0.113870,0.184894,0.045024,0.174095,0.244892
2,0.505938,0.125589,0.087643,0.058114,0.263280,0.422446,0.879616,0.113380,0.051605,0.423936,...,0.351531,0.076278,0.279381,0.074514,0.179351,0.110211,0.196502,0.042924,0.189000,0.282630
3,0.443633,0.124932,0.089966,0.058364,0.226387,0.358540,0.749868,0.088327,0.047089,0.351163,...,0.286341,0.076032,0.301951,0.072409,0.183795,0.112040,0.164884,0.043223,0.163514,0.241440
4,0.512667,0.131178,0.089658,0.059356,0.259973,0.424767,0.856625,0.111532,0.052202,0.420332,...,0.347137,0.078739,0.308205,0.077618,0.188391,0.112768,0.195860,0.047493,0.186140,0.279338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0.588155,0.134458,0.084659,0.057937,0.302051,0.458549,0.957402,0.133863,0.055114,0.463085,...,0.385295,0.091365,0.316109,0.080072,0.210365,0.125056,0.218583,0.053028,0.199793,0.344104
1678,0.588155,0.134458,0.084659,0.057937,0.302051,0.458549,0.957402,0.133863,0.055114,0.463085,...,0.385295,0.091365,0.316109,0.080072,0.210365,0.125056,0.218583,0.053028,0.199793,0.344104
1679,0.588155,0.134458,0.084659,0.057937,0.302051,0.458549,0.957402,0.133863,0.055114,0.463085,...,0.385295,0.091365,0.316109,0.080072,0.210365,0.125056,0.218583,0.053028,0.199793,0.344104
1680,0.573107,0.136576,0.089768,0.061673,0.293373,0.450434,0.935085,0.128403,0.055457,0.452134,...,0.377542,0.089950,0.316951,0.080491,0.207785,0.123470,0.216582,0.051910,0.197394,0.330514


In [39]:
item_pred_Trans[34]

0       0.048955
1       0.051866
2       0.049287
3       0.051942
4       0.050116
          ...   
1677    0.045356
1678    0.045356
1679    0.045356
1680    0.048563
1681    0.048910
Name: 34, Length: 1682, dtype: float64

In [40]:
# From recomd list select hightest rated film which would like by the user. Based on item prediction

highest_Rated=[]
input_item_pre=pd.DataFrame(item_pred_Trans[input_item])
input_item_pred=input_item_pre.T
for re in recom:
    value=input_item_pred[re].values
    if(value>=1):
        highest_Rated.append(re)

In [41]:
#checking the common movie list
list(set(recom)&set(input_item_watched_userid))

[]

In [42]:
def itembased(input_user, item_similarity, item_pred, top_n=5, threshold=0.8):

    # user_id in MovieLens starts from 1; adjust for zero-based index
    user_idx = input_user - 1  

    # Get predictions for this user
    user_ratings = item_pred.iloc[user_idx]

    # Get items the user has already rated
    already_rated = data_matrix.iloc[user_idx]
    rated_items = already_rated[already_rated > 0].index.tolist()

    # Filter out rated items
    predictions = user_ratings.drop(index=rated_items)

    # Filter by threshold and get top N
    recommendations = predictions[predictions >= threshold].sort_values(ascending=False).head(top_n)

    return list(recommendations.index)


In [43]:
Recommended_user = itembased(5, item_similarity, item_pred, 5, 0.3)



In [44]:
len(Recommended_user)

5

In [45]:
Recommended_user

[1255, 1643, 1379, 1256, 1318]