In [1]:
import pandas as pd
import numpy as np

In [2]:
def cosim(X, Y):
    num = np.nansum(X*Y) #np.dot(X, Y) 
    denom = np.sqrt(np.nansum(X*X)*np.nansum(Y*Y)) # np.sqrt(np.dot(X, X)) * np.sqrt(np.dot(Y, Y))
    return num/denom

In [3]:
movies = pd.read_csv('./data/ml-latest-small/movies.csv', usecols=['movieId', 'title'])
movies.tail()

Unnamed: 0,movieId,title
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)
9741,193609,Andrew Dice Clay: Dice Rules (1991)


In [4]:
ratings = pd.read_csv('./data/ml-latest-small/ratings.csv', usecols=['movieId', 'userId', 'rating'])
ratings.tail()

Unnamed: 0,userId,movieId,rating
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0
100835,610,170875,3.0


In [5]:
movies['title_raw'] = movies['title']
movies['title'] = movies['title_raw'].str.extract(r'([^\()]+)', expand=False).str.lower().str.strip()
movies['year'] = movies['title_raw'].str.extract(r' \((\d+)\)')
movies.tail()

Unnamed: 0,movieId,title,title_raw,year
9737,193581,black butler: book of the atlantic,Black Butler: Book of the Atlantic (2017),2017
9738,193583,no game no life: zero,No Game No Life: Zero (2017),2017
9739,193585,flint,Flint (2017),2017
9740,193587,bungo stray dogs: dead apple,Bungo Stray Dogs: Dead Apple (2018),2018
9741,193609,andrew dice clay: dice rules,Andrew Dice Clay: Dice Rules (1991),1991


In [6]:
input_movies = ('Into the wild', \
                'Forrest Gump', \
                'Pulp Fiction', \
                'Lock stock and two smoking barrels', \
#                 'Confessions of a Dangerous Mind', \
                'City of God', \
                'Once')

input_movies = list(x.lower() for x in input_movies)
# input_movies

In [7]:
def find_movies_by_title(movies, movie_titles):
    return pd.DataFrame(movie_titles, columns=['title']).merge(movies, how='inner', on='title')
    
input_movies = find_movies_by_title(movies, input_movies)
input_movies

Unnamed: 0,title,movieId,title_raw,year
0,into the wild,55247,Into the Wild (2007),2007
1,forrest gump,356,Forrest Gump (1994),1994
2,pulp fiction,296,Pulp Fiction (1994),1994
3,city of god,6016,City of God (Cidade de Deus) (2002),2002
4,once,53123,Once (2006),2006


In [8]:
input_ratings = pd.Series([5] * len(input_movies), index=input_movies['movieId'])
# input_ratings

In [9]:
def find_users_id_by_movies(movies, ratings):
    m = movies.copy()
    m = m.merge(ratings, how='inner', on='movieId')
    m['cnt'] = 1
    s = m.groupby(by='userId')['cnt'].sum()

    # take only users who watched more than 30% of movies from the list
    return s[s.sort_values() > len(movies) * 0.3].index.tolist()

# find users (reference users) who watched all movies from the input list
reference_users_id = find_users_id_by_movies(input_movies, ratings)

In [10]:
def filter_ratings(reference_users_id, input_movies, ratings):
    return ratings[(ratings['userId'].isin(reference_users_id)) & \
                   (ratings['movieId'].isin(input_movies['movieId'])) & \
                   (ratings['rating'] >= 4)
                  ] # TODO: check results if this filter is removed

ratings_filtered = filter_ratings(reference_users_id, input_movies, ratings)
# ratings_filtered

In [11]:
df = movies[['movieId', 'title']]. \
    merge(ratings_filtered[['userId', 'movieId', 'rating']], on="movieId"). \
    pivot(columns='movieId', values='rating', index='userId'). \
    transpose()
df

userId,1,6,8,14,15,17,18,21,24,26,...,590,592,599,600,602,603,606,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
296,,,4.0,,4.0,5.0,4.0,,4.0,4.0,...,4.5,4.0,5.0,4.5,5.0,5.0,5.0,5.0,4.0,5.0
356,4.0,5.0,,4.0,5.0,5.0,4.5,4.5,4.5,,...,5.0,5.0,,4.0,,,4.0,,4.0,
6016,,,,,,4.5,,,,,...,,,,,,,4.0,,,5.0
53123,,,,,,,,,,,...,,,,,,,,,,
55247,,,,,,,,,,,...,,,,,,,4.0,,,


In [12]:
cosim_dict = {}
for reference_user in df.columns:
    cosim_dict[reference_user] = cosim(df[reference_user], input_ratings)

# TODO: use heapq
cosim_reference_users = pd.DataFrame(cosim_dict.values(), index=cosim_dict.keys(), columns=['cosim'])
cosim_reference_users.sort_values(inplace=True, by='cosim')
reference_users_id = cosim_reference_users.tail(20).index.tolist()

In [13]:
# prepare candidates
# movies that are not in user input and that have biggest amount of positive ratings from reference users
ratings_for_unseen_movies = ratings[(~ratings['movieId'].isin(input_movies['movieId'])) & \
                                    (ratings['userId'].isin(reference_users_id))]
candidate_movies_id = ratings_for_unseen_movies.groupby('movieId').\
                        agg({'userId': 'count'}).\
                        rename(columns={'userId': 'numberOfReviews'}).\
                        sort_values('numberOfReviews').\
                        tail(20).\
                        index.tolist()
# candidate_movies_id

In [14]:
# prediction: predict ratings for candidate movies
candidate_predictions = {}
for candidate_movie_id in candidate_movies_id:
    num = 0
    denom = 0

    for reference_user_id, cosim in cosim_reference_users.tail(100).to_dict()['cosim'].items():
        r = ratings[(ratings['userId'] == reference_user_id) & (ratings['movieId'] == candidate_movie_id)]['rating']
        rating = r.values[0] if len(r) else 0

        num += cosim * rating
        denom += cosim

    candidate_predictions[candidate_movie_id] = num / denom if denom > 0 else 0

# candidate_predictions = dict(sorted(candidate_predictions.items(), key=lambda item: item[1], reverse=True))
candidate_predictions = pd.DataFrame(\
                                     list(candidate_predictions.values()), \
                                     index=list(candidate_predictions.keys()), \
                                     columns=['predicted'])
movies[['title']].\
    merge(candidate_predictions, left_index=True, right_index=True, how='inner').\
    sort_values('predicted', ascending=False)

Unnamed: 0,title,predicted
318,i love trouble,3.477155
2571,teenage mutant ninja turtles ii: the secret of...,3.37825
593,cemetery man,3.335056
2959,billy elliot,3.310476
2858,autumn in new york,2.857958
47,mighty aphrodite,2.764292
50,georgia,2.761638
858,escape from new york,2.597284
260,quiz show,2.543235
4993,animal crackers,2.460527
