In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv')
links = pd.read_csv('data/links.csv')
tags = pd.read_csv('data/tags.csv')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


## All recommenders individually

### Popularity-based

In [20]:
def top_movies(number):
    ratings_new = pd.DataFrame(ratings.groupby('movieId')['rating'].mean())
    ratings_new['rating_count'] = ratings.groupby('movieId')['rating'].count()
    # 1. initialize the transformer (optionally, set parameters)
    my_min_max = StandardScaler()
    # 2. fit the transformer to the data
    my_min_max.fit(ratings_new)
    # 3. use the transformer to transform the data
    min_max_scaled_ratings_new = my_min_max.transform(ratings_new)
    # 4. reconvert the transformed data back to a DataFrame
    df_min_max_scaled_ratings_new = pd.DataFrame(min_max_scaled_ratings_new,
                 index=ratings_new.index,
                 columns=ratings_new.columns)
    df_min_max_scaled_ratings_new['score'] = df_min_max_scaled_ratings_new['rating'] * df_min_max_scaled_ratings_new['rating_count']
    top_number = df_min_max_scaled_ratings_new.sort_values(by = 'score', ascending = False).head(number).reset_index()
    top_number_with_titles = pd.merge(movies, top_number, how='inner', on = 'movieId')
    return top_number_with_titles.sort_values(by = 'score', ascending = False)['title'].to_list()

In [21]:
top_movies(10)

['Shawshank Redemption, The (1994)',
 'Forrest Gump (1994)',
 'Pulp Fiction (1994)',
 'Matrix, The (1999)',
 'Silence of the Lambs, The (1991)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Fight Club (1999)',
 "Schindler's List (1993)",
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Usual Suspects, The (1995)']

### Item-based

In [106]:
movies

Unnamed: 0,movieId,title,genres,title_wo_yr
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life: Zero
9739,193585,Flint (2017),Drama,Flint
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs: Dead Apple


In [22]:
def top_movies(title, number):
    movies['title_wo_yr'] = movies['title'].str[:-7]
    movies_crosstab = pd.pivot_table(data=ratings, values='rating', index='userId', columns='movieId')
    top_popular_movieID = int(movies['movieId'].loc[movies['title_wo_yr'] == title])
    top_ratings = movies_crosstab[top_popular_movieID]
    top_ratings[top_ratings.notna()]
    similar_to_movie_id = movies_crosstab.corrwith(top_ratings)
    corr_movie_id = pd.DataFrame(similar_to_movie_id, columns=['PearsonR'])
    corr_movie_id.dropna(inplace=True)
    rating = pd.DataFrame(ratings.groupby('movieId')['rating'].mean())
    rating['rating_count'] = ratings.groupby('movieId')['rating'].count()
    movie_id_corr_summary = corr_movie_id.join(rating['rating_count'])
    movie_id_corr_summary.drop(top_popular_movieID, inplace=True)
    top_number = movie_id_corr_summary[movie_id_corr_summary['rating_count']>=10].sort_values('PearsonR', ascending=False).head(number)
    top_number_with_titles = pd.merge(movies, top_number, how='inner', on = 'movieId')
    return top_number_with_titles['title'].to_list()

In [23]:
top_movies('Transformers', 10)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


['Disclosure (1994)',
 'Piano, The (1993)',
 'Renaissance Man (1994)',
 'Six Degrees of Separation (1993)',
 'All Dogs Go to Heaven 2 (1996)',
 'Friday the 13th (1980)',
 'Elizabeth (1998)',
 'Young Sherlock Holmes (1985)',
 'Milk (2008)',
 'Warrior (2011)']

### User-based

In [24]:
def movie_recom(user_id, n):
    movie_titles = movies[['movieId', 'title']]
    users_items = pd.pivot_table(data=ratings, 
                                 values='rating', 
                                 index='userId', 
                                 columns='movieId')
    users_items.fillna(0, inplace=True)
    user_similarities = pd.DataFrame(cosine_similarity(users_items),
                                 columns=users_items.index, 
                                 index=users_items.index)
    weights = (user_similarities.query("userId!=@user_id")[user_id] / sum(user_similarities.query("userId!=@user_id")[user_id]))
    not_visited_restaurants = users_items.loc[users_items.index!=user_id, users_items.loc[user_id,:]==0]
    weighted_averages = pd.DataFrame(not_visited_restaurants.T.dot(weights), columns=["predicted_rating"])
    recommendations = weighted_averages.merge(movie_titles, left_index=True, right_on="movieId")
    return recommendations.sort_values("predicted_rating", ascending=False).head(n)['title'].to_list()

In [25]:
movie_recom(602, 10)

['Aladdin (1992)',
 'Toy Story (1995)',
 'Matrix, The (1999)',
 'Beauty and the Beast (1991)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Godfather, The (1972)',
 'Fight Club (1999)',
 'Saving Private Ryan (1998)']

## All three together

In [101]:
def recommend_me_movies(*arg):
    # Popularity-based:
    if len(arg) == 1:
        number_poprec = arg[0]
        print(f'You have entered a single number. Here are your top {number_poprec} movie recommendations:')
        ratings_new = pd.DataFrame(ratings.groupby('movieId')['rating'].mean())
        ratings_new['rating_count'] = ratings.groupby('movieId')['rating'].count()
        # 1. initialize the transformer (optionally, set parameters)
        my_min_max = StandardScaler()
        # 2. fit the transformer to the data
        my_min_max.fit(ratings_new)
        # 3. use the transformer to transform the data
        min_max_scaled_ratings_new = my_min_max.transform(ratings_new)
        # 4. reconvert the transformed data back to a DataFrame
        df_min_max_scaled_ratings_new = pd.DataFrame(min_max_scaled_ratings_new,
                                                     index=ratings_new.index,
                                                     columns=ratings_new.columns)
        df_min_max_scaled_ratings_new['score'] = df_min_max_scaled_ratings_new['rating'] * df_min_max_scaled_ratings_new['rating_count']
        top_number = df_min_max_scaled_ratings_new.sort_values(by = 'score', ascending = False).head(number_poprec).reset_index()
        top_number_with_titles = pd.merge(movies, top_number, how='inner', on = 'movieId')
        return top_number_with_titles.sort_values(by = 'score', ascending = False)['title'].to_list()
    # User-based:
    elif (len(arg) == 2) and (str(arg[0]).isnumeric()):
        user_id = arg[0]
        number_userrec = arg[1]
        print(f'You have entered two numbers. Here are your top {number_userrec} movie recommendations according to user {user_id}´s ratings:')
        movie_titles = movies[['movieId', 'title']]
        users_items = pd.pivot_table(data=ratings, 
                                     values='rating', 
                                     index='userId', 
                                     columns='movieId')
        users_items.fillna(0, inplace=True)
        user_similarities = pd.DataFrame(cosine_similarity(users_items),
                                         columns=users_items.index, 
                                         index=users_items.index)
        weights = (user_similarities.query("userId!=@user_id")[user_id] / sum(user_similarities.query("userId!=@user_id")[user_id]))
        not_visited_restaurants = users_items.loc[users_items.index!=user_id, users_items.loc[user_id,:]==0]
        weighted_averages = pd.DataFrame(not_visited_restaurants.T.dot(weights), columns=["predicted_rating"])
        recommendations = weighted_averages.merge(movie_titles, left_index=True, right_on="movieId")
        return recommendations.sort_values("predicted_rating", ascending=False).head(number_userrec)['title'].to_list()
    # Item-based:
    else:
        title = arg[0]
        number_itemrec = arg[1]
        print(f'You have entered a movie title and a number. Here are your top {number_itemrec} movie recommendations according to user {title}´s ratings:')
        movies['title_wo_yr'] = movies['title'].str[:-7]
        movies_crosstab = pd.pivot_table(data=ratings, values='rating', index='userId', columns='movieId')
        top_popular_movieID = int(movies['movieId'].loc[movies['title_wo_yr'] == title])
        top_ratings = movies_crosstab[top_popular_movieID]
        top_ratings[top_ratings.notna()]
        similar_to_movie_id = movies_crosstab.corrwith(top_ratings)
        corr_movie_id = pd.DataFrame(similar_to_movie_id, columns=['PearsonR'])
        corr_movie_id.dropna(inplace=True)
        rating = pd.DataFrame(ratings.groupby('movieId')['rating'].mean())
        rating['rating_count'] = ratings.groupby('movieId')['rating'].count()
        movie_id_corr_summary = corr_movie_id.join(rating['rating_count'])
        movie_id_corr_summary.drop(top_popular_movieID, inplace=True)
        top_number = movie_id_corr_summary[movie_id_corr_summary['rating_count']>=10].sort_values('PearsonR', ascending=False).head(number_itemrec)
        top_number_with_titles = pd.merge(movies, top_number, how='inner', on = 'movieId')
        return top_number_with_titles['title'].to_list()

In [111]:
recommend_me_movies('Terminator', 10)

You have entered a movie title and a number. Here are your top 10 movie recommendations according to user Terminator, The´s ratings:


['Forget Paris (1995)',
 'M (1931)',
 'Body Heat (1981)',
 'Harvey (1950)',
 'Hostel (2005)',
 'Princess and the Frog, The (2009)',
 'Warrior (2011)',
 'Louis C.K.: Live at the Beacon Theater (2011)',
 'Hotel Transylvania (2012)',
 'Horrible Bosses 2 (2014)']