In [21]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [3]:
# import dataset: read the ratings CSV into a DataFrame and preview the first rows
# NOTE(review): '/content/' is an absolute path (looks like a Colab upload
# directory - confirm); it has to be adjusted when running elsewhere.

movies = pd.read_csv('/content/ratings_small.csv')
movies.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In this dataset, the `timestamp` variable does not help in building this recommendation system, so we can remove it.

In [4]:
# remove the variable timestamp from the dataset
# `movies` is reassigned from itself, so on a second run of this cell the column
# is already gone; errors='ignore' makes the re-run a no-op instead of a KeyError.
movies = movies.drop(columns=['timestamp'], errors='ignore')

### Task 2: Exploring the dataset

In [5]:
# see the shape of the dataset: (number of rows, number of columns)
movies.shape

(100004, 3)

In [6]:
# check for missing values: count the NaN entries in each column

movies.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [10]:
# Total Movies and User: number of distinct movie ids and user ids in the ratings
data = {
    'Total Movies': [movies['movieId'].nunique()],
    'Total User': [movies['userId'].nunique()],
}
pd.DataFrame(data)

Unnamed: 0,Total Movies,Total User
0,9066,671


In [55]:
# which movies received the most ratings
# Count the ratings per movie, order movies by that count (largest first)
# and keep the first ten rows.
ratemovie = (
    movies.groupby('movieId')['rating']
    .count()
    .to_frame()
    .sort_values('rating', ascending=False)
    .reset_index()
    .head(10)
)
# The ids are handed to plotly as strings (presumably so the axis shows them
# as categories instead of a numeric scale).
listmovie = ratemovie['movieId'].astype(str).tolist()
fig = px.bar(
    ratemovie,
    x=listmovie,
    y="rating",
    labels={"x": "Movie ID", "rating": "Total Rating"},
    title="TOP 10 Movies with The Most Rating",
    template="plotly_dark",
)
fig.show()

In [54]:
# bar chart of the ten users who submitted the most ratings
# Count the ratings per user, order users by that count (largest first)
# and keep the first ten rows.
mostuser = (
    movies.groupby('userId')['rating']
    .count()
    .to_frame()
    .sort_values('rating', ascending=False)
    .reset_index()
    .head(10)
)
# The ids are handed to plotly as strings (presumably so the axis shows them
# as categories instead of a numeric scale).
listuser = mostuser['userId'].astype(str).tolist()
fig = px.bar(
    mostuser,
    x=listuser,
    y="rating",
    labels={"x": "User ID", "rating": "Total Rating"},
    title="TOP 10 Users Give The Most Rating",
    template="plotly_dark",
)
fig.update_traces(marker_color='#FEA3AA')
fig.show()

### Task 3: Creating user item matrix

Creating the user-item interaction matrix

In [56]:
# create the user item matrix using the ratings dataset - Hint: try using pivot function
# Rows are labelled by userId, columns by movieId, and each cell holds the rating;
# user/movie pairs that have no row in `movies` come out as NaN.
interaction_matrix = movies.pivot(index = 'userId', columns = 'movieId', values = 'rating')

In [57]:
# check the shape of the matrix: (number of users, number of movies)
interaction_matrix.shape

(671, 9066)

In [58]:
# check the head of the matrix and see what it looks like (first five users)
interaction_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,


In [59]:
# replace all the missing values with zero
# From here on a 0 cell means "no rating recorded"; the `> 0` filters in
# recommendations() depend on that convention.
interaction_matrix = interaction_matrix.fillna(0)

In [64]:
# preview the matrix again after the fill: the previously missing cells now hold 0.0
interaction_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Task 4: Finding similar users

![alt text](cosine.png "Cosine Similarity")

In [60]:
# import cosine_similarity from sklearn
from sklearn.metrics.pairwise import cosine_similarity

def similar_users(user_id, interactions_matrix):
    """Rank every other user by cosine similarity to ``user_id``.

    Parameters
    ----------
    user_id : label
        Row label of the target user in ``interactions_matrix``.
    interactions_matrix : pandas.DataFrame
        One row per user, one column per item. Expected to be free of NaN
        (the notebook fills missing ratings with 0 before calling this).

    Returns
    -------
    most_similar_users : list
        Row labels of all users except ``user_id``, most similar first.
        Users with equal similarity keep the row order of the matrix.
    similarity_score : list of float
        Similarity of each entry of ``most_similar_users``, in the same
        order. Scores are plain floats rather than 1x1 arrays.

    Raises
    ------
    KeyError
        If ``user_id`` is not a row label of ``interactions_matrix``.
    """
    # Compare the target row against all rows in one vectorised call instead
    # of one cosine_similarity call per user.
    target_row = interactions_matrix.loc[[user_id]].to_numpy()
    scores = cosine_similarity(target_row, interactions_matrix.to_numpy())[0]

    # Pair scores with the real index labels (ids need not be 1..N) and drop
    # the target user by id, so ids and scores can never get out of step even
    # when the target's self-similarity is not the top score.
    ranked = sorted(
        (
            (uid, float(score))
            for uid, score in zip(interactions_matrix.index, scores)
            if uid != user_id
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    most_similar_users = [uid for uid, _ in ranked]
    similarity_score = [score for _, score in ranked]

    return most_similar_users, similarity_score

In [71]:
# check the implemented function above: ids of the five users most similar to user 564

similar_users(564, interaction_matrix)[0][:5]

[306, 102, 518, 232, 119]

### Task 5: Creating similarity based collaborative recommendation system

In [76]:
def recommendations(user_id, num_of_movies, user_item_interactions):
    """Recommend movies rated by similar users that ``user_id`` has not rated.

    Similar users are visited from most to least similar. Each one contributes
    the movies they rated (value > 0) that have not been seen so far, until at
    least ``num_of_movies`` candidates are collected.

    Parameters
    ----------
    user_id : label
        Row label of the user to recommend for.
    num_of_movies : int
        Maximum number of movie ids to return.
    user_item_interactions : pandas.DataFrame
        User x movie matrix in which a value > 0 marks a rated movie.

    Returns
    -------
    list
        At most ``num_of_movies`` movie ids (column labels). Within a single
        similar user's contribution the order follows set iteration order,
        not rating value.

    Notes
    -----
    Prints a one-line header before computing the result.
    """
    print(f'The {num_of_movies} recommendation movies ID are:\n')

    def rated_movies(uid):
        # Column labels of every movie this user gave a rating above zero.
        # Reads the matrix passed in as an argument, not a notebook global.
        row = user_item_interactions.loc[uid]
        return set(row[row > 0].index)

    # find the most similar users to the user_id for which we want to recommend movies
    most_similar_users = similar_users(user_id, user_item_interactions)[0]

    # movies that must not be recommended: starts as the user's own ratings
    # and grows with everything already put on the list
    already_interacted = rated_movies(user_id)

    # named differently from the function so the local does not shadow it
    recommended = []

    for similar_user in most_similar_users:
        # stop as soon as enough candidates have been collected
        if len(recommended) >= num_of_movies:
            break

        similar_user_movie_ids = rated_movies(similar_user)

        # add the movies this neighbour rated that are still unseen
        recommended.extend(similar_user_movie_ids - already_interacted)

        # remember them so a later neighbour cannot add them again
        already_interacted |= similar_user_movie_ids

    return recommended[:num_of_movies]

In [77]:
# check the implemented function above: prints the header line, then returns
# up to five movie ids for user 564

recommendations(564, 5, interaction_matrix)

The 5 recommendation movies ID are:



[1537, 515, 1027, 2565, 524]

### Task 6: Conclusion

In [80]:
# recommend top three movies to the new user
# A new user has no ratings to compare against, so this falls back to the
# first three rows of `ratemovie`, i.e. the movies with the most ratings.
list(ratemovie.movieId[:3])


[356, 296, 318]