In [2]:
import pandas as pd
from scipy.spatial.distance import correlation
import numpy as np

In [3]:
movies_df = pd.read_csv("movies.csv", usecols=[0, 1])
movies_df


Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)


In [4]:
ratings_df = pd.read_csv("ratings.csv", usecols=[0, 1, 2])
ratings_df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [5]:
movie_info = pd.merge(movies_df, ratings_df, left_on='movieId', right_on='movieId')
movie_info

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5
...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),184,4.0
100832,193583,No Game No Life: Zero (2017),184,3.5
100833,193585,Flint (2017),184,3.5
100834,193587,Bungo Stray Dogs: Dead Apple (2018),184,3.5


In [6]:
movie_info = pd.DataFrame.sort_values(movie_info, ['userId', 'movieId'], ascending=[0, 1])
movie_info.head()

Unnamed: 0,movieId,title,userId,rating
214,1,Toy Story (1995),610,5.0
534,6,Heat (1995),610,5.0
954,16,Casino (1995),610,4.5
1678,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),610,4.5
2309,47,Seven (a.k.a. Se7en) (1995),610,5.0


In [8]:
num_users = max(movie_info.userId)
num_movies = max(movie_info.movieId)
print(num_users)
print(num_movies)

610
193609


In [11]:
movie_per_user = movie_info['userId'].value_counts()
movie_per_user.head()

userId
414    2698
599    2478
474    2108
448    1864
274    1346
Name: count, dtype: int64

In [12]:
users_per_movie = movie_info['title'].value_counts()
users_per_movie.head()

title
Forrest Gump (1994)                 329
Shawshank Redemption, The (1994)    317
Pulp Fiction (1994)                 307
Silence of the Lambs, The (1991)    279
Matrix, The (1999)                  278
Name: count, dtype: int64

In [15]:
def fav_movies(current_user, N):

    favourite_movies = pd.DataFrame.sort_values(movie_info[movie_info['userId'] == current_user], ['rating'], ascending=[0])[:N]
    return favourite_movies.title.tolist()


In [16]:
print(fav_movies(5, 3))

['Dances with Wolves (1990)', 'In the Name of the Father (1993)', "Schindler's List (1993)"]


In [18]:
user_movie_rating_matrix = pd.pivot_table(movie_info, values='rating', index=['userId'], columns='movieId')
user_movie_rating_matrix.head()


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [19]:
def similarity(user1, user2):

    user1 = np.array(user1) - np.nanmean(user1) 
    user2 = np.array(user2) - np.nanmean(user2)

    common_movie_ids = [i for i in range(len(user1)) if user1[i] > 0 and user2[i] > 0]
    if(len(common_movie_ids) == 0):
        return 0
    else:
        user1 = np.array([user1[i] for i in common_movie_ids])
        user2 = np.array([user2[i] for i in common_movie_ids])
        return correlation(user1, user2)


In [20]:
def nearest_neighbour_ratings(current_user, K):

    similarity_matrix = pd.DataFrame(index = user_movie_rating_matrix.index, 
                                    columns = ['similarity'])
    for i in user_movie_rating_matrix.index:
        similarity_matrix.loc[i] = similarity(user_movie_rating_matrix.loc[current_user],
                                             user_movie_rating_matrix.loc[i])

    similarity_matrix = pd.DataFrame.sort_values(similarity_matrix,
                                                ['similarity'], ascending= [0])

    nearest_neighbours = similarity_matrix[:K]

    neighbour_movie_ratings = user_movie_rating_matrix.loc[nearest_neighbours.index]

    predicted_movie_rating = pd.DataFrame(index = user_movie_rating_matrix.columns, columns = ['rating'])

    for i in user_movie_rating_matrix.columns:
        predicted_rating = np.nanmean(user_movie_rating_matrix.loc[current_user])

        for j in neighbour_movie_ratings.index:
            if(user_movie_rating_matrix.loc[j,i] > 0):
                predicted_rating += ((user_movie_rating_matrix.loc[j,i] -np.nanmean(user_movie_rating_matrix.loc[j])) *
                                                    nearest_neighbours.loc[j, 'similarity']) / nearest_neighbours['similarity'].sum()

        predicted_movie_rating.loc[i, 'rating'] = predicted_rating

    return predicted_movie_rating

In [23]:
def top_n_recommendations(current_user, N):
    predicted_movie_rating = nearest_neighbour_ratings(current_user, 10)
    movies_already_watched = list(user_movie_rating_matrix.loc[current_user]
                                  .loc[user_movie_rating_matrix.loc[current_user] > 0].index)
    
    predicted_movie_rating = predicted_movie_rating.drop(movies_already_watched)
    
    top_n_recommendations = pd.DataFrame.sort_values(predicted_movie_rating, ['rating'], ascending=[0])[:N]
    
    top_n_recommendation_titles = movies_df.loc[movies_df.movieId.isin(top_n_recommendations.index)]

    return list(top_n_recommendation_titles.title)

In [24]:
current_user = 140
print("User's favorite movies are : ", fav_movies(current_user, 5),
      "\nUser's top recommendations are: ", top_n_recommendations(current_user, 3))

  dist = 1.0 - uv / math.sqrt(uu * vv)


User's favorite movies are :  ['Magnificent Seven, The (1960)', 'Enemy at the Gates (2001)', 'Lawrence of Arabia (1962)', 'Presumed Innocent (1990)', 'Fargo (1996)'] 
User's top recommendations are:  ['Spider-Man 2 (2004)', 'Wolf of Wall Street, The (2013)', 'Coco (2017)']
