In [1]:
# import library 
import pandas as pd 
import numpy as np

## Load Dataset ##

In [3]:
# Column names are supplied explicitly through `names`; the file is read as tab-separated.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

In [4]:
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [5]:
# Count the distinct user ids and movie ids that appear in the ratings table.
n_users = len(ratings['user_id'].unique())
n_items = len(ratings['movie_id'].unique())

In [7]:
# Report how many distinct users and movies appear in the ratings table.
print("The number of user:", n_users)
print("The number of n_items:", n_items)

# Reminder on .shape, which was used to obtain the counts above:
#   shape[0] --> row count
#   shape[1] --> column count

The number of user: 943
The number of n_items: 1682


## Create a pivot table of users and movies based on ratings ##

In [8]:
# One row per user_id, one column per movie_id; cells hold the rating (NaN where the user gave none).
datama = pd.pivot_table(
    ratings,
    values='rating',
    index='user_id',
    columns='movie_id',
)

In [9]:
datama

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [10]:
# Replace missing ratings (NaN) with 0 so the matrix is fully numeric
data_matrix = datama.fillna(0)

In [11]:
data_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Find Cosine Similarity for users and items ##

In [16]:
# Pairwise cosine comparison between users (rows of data_matrix) and between
# movies (rows of data_matrix.T).
# NOTE: despite the variable names, pairwise_distances(metric='cosine') returns
# cosine *distance* (1 - cosine similarity): 0 means identical direction and
# larger values mean less similar. The diagonal is therefore ~0, and "most
# similar" corresponds to the SMALLEST values in these matrices.
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [17]:
user_similarity

array([[1.33226763e-15, 8.33069016e-01, 9.52540457e-01, ...,
        8.51383057e-01, 8.20492117e-01, 6.01825261e-01],
       [8.33069016e-01, 0.00000000e+00, 8.89408675e-01, ...,
        8.38515222e-01, 8.27732187e-01, 8.94202122e-01],
       [9.52540457e-01, 8.89408675e-01, 0.00000000e+00, ...,
        8.98757435e-01, 8.66583851e-01, 9.73444131e-01],
       ...,
       [8.51383057e-01, 8.38515222e-01, 8.98757435e-01, ...,
        0.00000000e+00, 8.98358201e-01, 9.04880419e-01],
       [8.20492117e-01, 8.27732187e-01, 8.66583851e-01, ...,
        8.98358201e-01, 0.00000000e+00, 8.17535338e-01],
       [6.01825261e-01, 8.94202122e-01, 9.73444131e-01, ...,
        9.04880419e-01, 8.17535338e-01, 0.00000000e+00]])

## Using the prediction formula for users and items, we calculate the scores ##

In [20]:
def predict(ratings, similarity, type='user'):
    """Predict ratings as a weighted average over users or items.

    Parameters
    ----------
    ratings : numpy.ndarray or pandas.DataFrame, shape (n_users, n_items)
        Rating matrix with one row per user and one column per item.
    similarity : numpy.ndarray
        Square weight matrix: (n_users, n_users) when ``type='user'``,
        (n_items, n_items) when ``type='item'``. Larger weights contribute
        more to the prediction.
    type : {'user', 'item'}, default 'user'
        'user' -> each user's mean rating plus the weighted average of the
                  other users' deviations from their own means.
        'item' -> weighted average of the user's ratings over items.

    Returns
    -------
    Predicted ratings with shape (n_users, n_items).

    Raises
    ------
    ValueError
        If ``type`` is neither 'user' nor 'item'.
    """
    if type == 'user':
        # Convert to plain arrays first: indexing a pandas Series with
        # [:, np.newaxis] is not supported by current pandas, so a DataFrame
        # input would otherwise fail here.
        ratings_arr = np.asarray(ratings, dtype=float)
        # Column vector (n_users, 1) so it broadcasts against the rating matrix.
        mean_user_rating = np.asarray(ratings.mean(axis=1), dtype=float)[:, np.newaxis]
        ratings_diff = ratings_arr - mean_user_rating
        # Normalise each row by the total absolute weight of that user.
        weight_totals = np.array([np.abs(similarity).sum(axis=1)]).T
        pred = mean_user_rating + similarity.dot(ratings_diff) / weight_totals
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    else:
        raise ValueError("type must be 'user' or 'item', got %r" % (type,))
    return pred

In [None]:
# Prediction Table
# user_similarity / item_similarity hold cosine *distances* (0 = identical), as
# returned by pairwise_distances(metric='cosine'). predict() uses its second
# argument as weights, so convert distance to similarity (1 - distance) here;
# otherwise the least similar users/items would get the largest weight.
user_prediction = predict(data_matrix, 1 - user_similarity, type='user')
item_prediction = predict(data_matrix, 1 - item_similarity, type='item')

In [None]:
user_prediction

## User-based filtering: first we have to find the similarity between the input user and the other users ##

In [None]:
# Step 1: select input user
# This value is compared against ratings['user_id'] further down, so it is a
# user_id from the ratings table rather than a row position.
input_user = 34

In [None]:
# Step 2: convert the user_similarity table into a DataFrame
# Label rows and columns with the real user_ids (the index of data_matrix).
# Without labels the frame gets a 0-based RangeIndex, so user_sim_table[input_user]
# and the neighbour ids read from .index would point at the wrong users.
user_sim_table = pd.DataFrame(
    user_similarity,
    index=data_matrix.index,
    columns=data_matrix.index,
)

In [None]:
user_sim_table

In [None]:
# Step 3: Find the users most similar to the input user using the cosine table
# The table holds cosine distances, so the smallest values are the closest users.
# The input user's own entry (distance ~0) is dropped first so that it does not
# occupy one of the five neighbour slots.
similar_input_user = (
    user_sim_table[input_user]
    .drop(input_user)
    .sort_values(ascending=True)
    .head(5)
    .index
)

In [None]:
similar_input_user

In [None]:
# Step 4: Turn the index of neighbour ids into a plain Python list
similar_user_input = similar_input_user.tolist()

In [None]:
# Step 5: For every similar user, look up the movie ids they rated in the
# ratings table; the result is one list of movie ids per similar user.
similar_user_movieid_list = [
    ratings.loc[ratings['user_id'] == sim_user, 'movie_id'].tolist()
    for sim_user in similar_user_input
]

In [None]:
len(similar_user_movieid_list)

In [None]:
# Step 6: Flatten the per-user lists into a single list of movie ids
import itertools
similar_user_movieid_single_list = list(
    itertools.chain(*similar_user_movieid_list)
)

In [None]:
# Step 7: De-duplicate the movie ids gathered from the similar users
Unique_movieid_similar_user = {*similar_user_movieid_single_list}

In [None]:
len(Unique_movieid_similar_user)

In [None]:
# Step 8: Movie ids the input user has already rated
input_user_watched_movieid = list(
    ratings.loc[ratings['user_id'] == input_user, 'movie_id'].values
)

In [None]:
input_user_watched_movieid

In [None]:
# Step 9: Build the recommendation candidates: movies rated by the similar
# users that the input user has not rated yet.
recom = [
    per_id
    for per_id in Unique_movieid_similar_user
    if per_id not in input_user_watched_movieid
]

In [None]:
len(recom)

In [None]:
sorted(recom)

In [None]:
# Cross Checking
sorted(Unique_movieid_similar_user)

In [None]:
# Cross Checking
sorted(input_user_watched_movieid)

In [None]:
# Checking the common movie list
list(set(Unique_movieid_similar_user) &set(input_user_watched_movieid))

In [None]:
# Wrap the prediction array in a DataFrame labelled by user_id (rows) and
# movie_id (columns), taken from data_matrix. With the default 0-based labels,
# the later lookups by user_id / movie_id would hit the wrong row and column.
user_pred = pd.DataFrame(
    user_prediction,
    index=data_matrix.index,
    columns=data_matrix.columns,
)

In [None]:
user_pred

In [None]:
# Transposed view of the predictions, so that a single user can be selected as a column.
user_pred_Trans = user_pred.transpose()

In [None]:
user_pred_Trans

In [None]:
# Predicted ratings for the input user (one value per movie).
# The original referenced `user_user_Trans`, which is never defined; the
# transposed prediction table is `user_pred_Trans`.
gg = user_pred_Trans[input_user]

In [None]:
gg

In [None]:
# Single-column frame holding the selected user's predictions.
g = gg.to_frame()

In [None]:
g

In [None]:
# Transpose so the entries of g's index become columns that can be selected by label.
s = g.transpose()

In [None]:
s[187].values >=0.4

In [None]:
# From the recommendation candidates, keep the movies whose predicted rating
# for the input user is at least 1 (based on the user-based prediction table).
input_user_pre = user_pred_Trans[input_user].to_frame()
input_user_pred = input_user_pre.transpose()
highest_Rated = [
    candidate
    for candidate in recom
    if input_user_pred[candidate].values >= 1
]
        

In [None]:
len(highest_Rated)

In [None]:
# Load the movie metadata so that movie ids can be mapped to titles.
i_cols = ['movie_id', 'movie_title', 'release date', 'video release date', 'IMDb URL', 
          'unknown','Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime',
          'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
          'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
          ]
# The item file in the ml-100k folder is `u.item` (the original `ul.item` was a
# typo). It is read with latin-1, the same encoding used for u.data, because
# the titles are not valid UTF-8.
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

In [None]:
# Create the movie title list for the recommended movie ids.
# The columns are named 'movie_id' and 'movie_title' in i_cols (with
# underscores); the original lookups used 'movie id' / 'movie title' and would
# raise KeyError. Each entry stays an array of matching titles, as before.
movie_title = [
    items.loc[items['movie_id'] == movieid, 'movie_title'].values
    for movieid in highest_Rated
]

In [None]:
movie_title

In [None]:
# Convert each array of titles into a plain Python list, printing it along the way
movie_title_list = []
for titles in movie_title:
    print(titles)
    movie_title_list.append(list(titles))

In [None]:
# Flatten the per-movie title lists into one final list of recommended titles
import itertools
Final_Recommend_movie = list(itertools.chain(*movie_title_list))

In [None]:
Final_Recommend_movie

In [None]:
# Checking the common movie list
list(set(recom)&set(input_user_watched_movieid))
[]