## Set up Ratings and Movies

In [25]:
import pandas as pd
import numpy as np
import scipy as sp
import re

# Ratings table: one row per (userId, movieId) rating event.
ratings_data_frame = pd.read_csv("./movies-dataset-small/ratings.csv")

# Movies table: only the first three CSV columns are read, and the first of
# them becomes the index (presumably movieId - it is matched against
# ratings' movieId later in the notebook).
movies_data_frame = pd.read_csv(
    "./movies-dataset-small/movies.csv",
    usecols=[0, 1, 2],
    header=0,
    index_col=0,
)

In [2]:
# (number of rows, number of columns) of the ratings table
ratings_data_frame.shape

(100836, 4)

In [3]:
# Preview the first few ratings rows
ratings_data_frame.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [26]:
# Extract the release year from the trailing "(YYYY)" of each title, e.g.
# "Excalibur (1981)" -> "1981". Anchoring on the parenthesised four-digit
# group at the end avoids picking up other numbers that appear earlier in a
# title. Titles without such a suffix get NaN. Years stay strings, as before.
years = movies_data_frame["title"].str.extract(r"\((\d{4})\)\s*$", expand=False)

# Plain column assignment keeps this cell idempotent: re-running it overwrites
# "year" instead of inserting a second, duplicate "year" column.
movies_data_frame["year"] = years

## Looking up metadata about a specific movie

In [41]:
def movieMetaData(movie_id):
    """Return the title stored in ``movies_data_frame`` for ``movie_id``.

    ``movie_id`` is looked up as an index label of ``movies_data_frame``;
    an unknown label raises ``KeyError``.
    """
    return movies_data_frame.at[movie_id, "title"]

## Finding a user's top N movies

In [42]:
def favouriteMovies(user_Id, N):
    """Return the ``N`` highest-rated movies of a user, with their titles.

    Parameters
    ----------
    user_Id : value matched against the ``userId`` column of
        ``ratings_data_frame``.
    N : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The user's rating rows sorted by ``rating`` (highest first), limited
        to ``N`` rows, with an extra ``title`` column resolved through
        ``movieMetaData``. Empty if the user has no ratings.
    """
    userRatings = ratings_data_frame[ratings_data_frame["userId"] == user_Id]
    topRatings = userRatings.sort_values("rating", ascending=False).iloc[:N]
    # assign() builds a new frame, so no column is written onto a slice of
    # ratings_data_frame (which would raise SettingWithCopyWarning).
    return topRatings.assign(title=topRatings["movieId"].apply(movieMetaData))

In [39]:
# Keep only the ratings whose movieId is present in the movies dataset index
ratings_data_frame = ratings_data_frame.loc[
    ratings_data_frame["movieId"].isin(movies_data_frame.index)
]

In [43]:
# Top 5 highest-rated movies of user 1
favouriteMovies(1, 5)

Unnamed: 0,userId,movieId,rating,timestamp,title
231,1,5060,5.0,964984002,M*A*S*H (a.k.a. MASH) (1970)
185,1,2872,5.0,964981680,Excalibur (1981)
89,1,1291,5.0,964981909,Indiana Jones and the Last Crusade (1989)
90,1,1298,5.0,964984086,Pink Floyd: The Wall (1982)
190,1,2948,5.0,964982191,From Russia with Love (1963)


## Constructing Rating Matrix

In [56]:
# For every distinct movie: how many ratings it received
usersPerMovie = ratings_data_frame["movieId"].value_counts()
# For every distinct user: how many movies they rated
moviesPerUser = ratings_data_frame["userId"].value_counts()

# How often each rating value occurs
ratings_data_frame["rating"].value_counts()

4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: rating, dtype: int64

In [46]:
# One row per user, one column per movie; each cell holds the (mean) rating
# the user gave that movie and is NaN where the user has not rated it.
userMovieRatingMatrix = ratings_data_frame.pivot_table(
    index=["userId"],
    columns=["movieId"],
    values="rating",
)

## Distance between userA and userB

In [62]:
from scipy.spatial.distance import hamming

def hammingDistanceBetweenUsers(userA, userB):
    """Return the Hamming distance between the rating rows of two users.

    Parameters
    ----------
    userA, userB : index labels of ``userMovieRatingMatrix`` (user ids).

    Returns
    -------
    float
        Fraction of movie columns in which the two rows differ, or ``nan``
        when a user is not in the matrix or the rows cannot be compared.

    Notes
    -----
    NOTE(review): the comparison is element-wise and NaN never equals NaN,
    so a movie left unrated by either user counts as a mismatch. That is the
    existing behaviour and is kept as-is.
    """
    try:
        # A row of the matrix is the same data as a column of its transpose,
        # without transposing the whole matrix on every call.
        userARatings = userMovieRatingMatrix.loc[userA]
        userBRatings = userMovieRatingMatrix.loc[userB]
        distance = hamming(userARatings, userBRatings)
    except (KeyError, ValueError, TypeError):
        # Best effort: unknown user or incomparable rows -> "no distance".
        # Deliberately narrower than a bare except so that an interrupt is
        # not swallowed while this runs once per user.
        distance = np.nan
    return distance

In [67]:
# Display the user x movie rating matrix (NaN where a user has not rated a movie)
userMovieRatingMatrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


## Find K nearest Neighbors

In [96]:
def findNearestNeighbors(user_id, k=10):
    """Return the ids of the ``k`` users closest to ``user_id``.

    Closeness is ``hammingDistanceBetweenUsers`` between ``user_id`` and every
    other user in ``userMovieRatingMatrix``.

    Parameters
    ----------
    user_id : user id whose neighbours are wanted; it is excluded from the
        candidates.
    k : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.Series
        ``userId`` values ordered by ascending distance (users with a NaN
        distance sort last), at most ``k`` entries.
    """
    allUsers = pd.DataFrame(userMovieRatingMatrix.index)
    # Work on an explicit copy so that adding the distance column does not
    # write onto a filtered view (SettingWithCopyWarning).
    otherUsers = allUsers[allUsers["userId"] != user_id].copy()
    otherUsers["distance"] = otherUsers["userId"].apply(
        lambda user: hammingDistanceBetweenUsers(user_id, user)
    )
    return otherUsers.sort_values("distance", ascending=True)["userId"].iloc[:k]

## Recommend Movies to user_Id

In [99]:
def recommend_movies_to_user(uid, N, k=10):
    """Recommend up to ``N`` movie titles to user ``uid``.

    The ``k`` nearest neighbours of ``uid`` are looked up with
    ``findNearestNeighbors``; every movie is scored by the mean rating those
    neighbours gave it, movies ``uid`` has already rated are removed, and the
    titles of the ``N`` best-scored remaining movies are returned.

    Parameters
    ----------
    uid : user id (index label of ``userMovieRatingMatrix``).
    N : int
        Maximum number of recommendations.
    k : int, default 10
        Number of neighbours to average over (10 matches the previous
        hard-wired behaviour).

    Returns
    -------
    pandas.Series
        Movie titles, best-scored first.

    Raises
    ------
    KeyError
        If ``uid`` is not a row of ``userMovieRatingMatrix``.
    """
    KNearestNeighbors = findNearestNeighbors(user_id=uid, k=k)
    NNRatings = userMovieRatingMatrix[userMovieRatingMatrix.index.isin(KNearestNeighbors)]
    # DataFrame.mean skips NaN by default and yields NaN for movies none of
    # the neighbours rated, without the "mean of empty slice" RuntimeWarning
    # that np.nanmean emits for such columns.
    avgRating = NNRatings.mean(axis=0).dropna()
    # Movies the active user has already rated must not be recommended again
    moviesAlreadyWatched = userMovieRatingMatrix.loc[uid].dropna().index
    avgRating = avgRating[~avgRating.index.isin(moviesAlreadyWatched)]
    topMoviesToRecommend = avgRating.sort_values(ascending=False).index[:N]
    return pd.Series(topMoviesToRecommend).apply(movieMetaData)

In [100]:
# Recommend 5 movies to user 1
recommend_movies_to_user(1, 5)

  results[i] = self.f(v)


0                 Other Sister, The (1999)
1                   Wild Bunch, The (1969)
2    Gigantic (A Tale of Two Johns) (2002)
3                         Room, The (2003)
4                       Day & Night (2010)
Name: movieId, dtype: object