# Movielens data manipulation

Load movilens datasets to provide movie Recommendation based on movie id.


# Import dependencies


In [1]:
import pandas as pd
import numpy as np
import datetime
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

# Load data


In [2]:
genome_scores_data = pd.read_csv('../../movielens/genome-scores.csv')
genome_scores_data.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


In [3]:
movies_data = pd.read_csv('../../movielens/movies.csv')
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_data = pd.read_csv('../../movielens/ratings.csv')
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


# Prepare data for movie matrix

Create 3 dataframes and calculate 3 cosine similarities for each of them. Then they are mixed to obtain the movie similarity matrix.


# mov_tags_df

Data is pivoted to compare movies through tags.


In [5]:
scores_pivot = genome_scores_data.pivot_table(index=["movieId"], columns=["tagId"], values="relevance").reset_index() # group tags for movie
del genome_scores_data

scores_pivot.head()

tagId,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.02875,0.02375,0.0625,0.07575,0.14075,0.14675,0.0635,0.20375,0.202,...,0.0405,0.01425,0.0305,0.035,0.14125,0.05775,0.039,0.02975,0.08475,0.022
1,2,0.04125,0.0405,0.06275,0.08275,0.091,0.06125,0.06925,0.096,0.0765,...,0.0525,0.01575,0.0125,0.02,0.12225,0.03275,0.021,0.011,0.10525,0.01975
2,3,0.04675,0.0555,0.02925,0.087,0.0475,0.04775,0.046,0.14275,0.0285,...,0.06275,0.0195,0.02225,0.023,0.122,0.03475,0.017,0.018,0.091,0.01775
3,4,0.03425,0.038,0.0405,0.031,0.065,0.03575,0.029,0.0865,0.032,...,0.05325,0.028,0.01675,0.03875,0.182,0.0705,0.01625,0.01425,0.0885,0.015
4,5,0.043,0.05325,0.038,0.041,0.054,0.06725,0.02775,0.0765,0.0215,...,0.0535,0.0205,0.01425,0.0255,0.19225,0.02675,0.01625,0.013,0.087,0.016


# Join "scores_pivot" and "movies_data"

These two dataframes are joined to get all movieIds.


In [6]:
mov_tags_df = movies_data.merge(scores_pivot, left_on='movieId', right_on='movieId', how='left') # get all movieIds
del scores_pivot

mov_tags_df = mov_tags_df.fillna(0) # fill null values
mov_tags_df = mov_tags_df.drop(['title', 'genres'], axis = 1) # drop clumns not used

mov_tags_df.head()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.02875,0.02375,0.0625,0.07575,0.14075,0.14675,0.0635,0.20375,0.202,...,0.0405,0.01425,0.0305,0.035,0.14125,0.05775,0.039,0.02975,0.08475,0.022
1,2,0.04125,0.0405,0.06275,0.08275,0.091,0.06125,0.06925,0.096,0.0765,...,0.0525,0.01575,0.0125,0.02,0.12225,0.03275,0.021,0.011,0.10525,0.01975
2,3,0.04675,0.0555,0.02925,0.087,0.0475,0.04775,0.046,0.14275,0.0285,...,0.06275,0.0195,0.02225,0.023,0.122,0.03475,0.017,0.018,0.091,0.01775
3,4,0.03425,0.038,0.0405,0.031,0.065,0.03575,0.029,0.0865,0.032,...,0.05325,0.028,0.01675,0.03875,0.182,0.0705,0.01625,0.01425,0.0885,0.015
4,5,0.043,0.05325,0.038,0.041,0.054,0.06725,0.02775,0.0765,0.0215,...,0.0535,0.0205,0.01425,0.0255,0.19225,0.02675,0.01625,0.013,0.087,0.016


# mov_genres_df

Split genres field for each movies and create columns for each genres.

Define a function to split genres column and check it if it exists or not.


In [7]:
def set_genres(genres,col):
    '''Split genres column and check if it exists or not.'''
    if genres in col.split('|'): return 1
    else: return 0

In [8]:
mov_genres_df = pd.read_csv('../../movielens/movies.csv')

# split genres field for each movies and create columns for each genres
mov_genres_df["Action"] = mov_genres_df.apply(lambda x: set_genres("Action",x['genres']), axis=1)
mov_genres_df["Adventure"] = mov_genres_df.apply(lambda x: set_genres("Adventure",x['genres']), axis=1)
mov_genres_df["Animation"] = mov_genres_df.apply(lambda x: set_genres("Animation",x['genres']), axis=1)
mov_genres_df["Children"] = mov_genres_df.apply(lambda x: set_genres("Children",x['genres']), axis=1)
mov_genres_df["Comedy"] = mov_genres_df.apply(lambda x: set_genres("Comedy",x['genres']), axis=1)
mov_genres_df["Crime"] = mov_genres_df.apply(lambda x: set_genres("Crime",x['genres']), axis=1)
mov_genres_df["Documentary"] = mov_genres_df.apply(lambda x: set_genres("Documentary",x['genres']), axis=1)
mov_genres_df["Drama"] = mov_genres_df.apply(lambda x: set_genres("Drama",x['genres']), axis=1)
mov_genres_df["Fantasy"] = mov_genres_df.apply(lambda x: set_genres("Fantasy",x['genres']), axis=1)
mov_genres_df["Film-Noir"] = mov_genres_df.apply(lambda x: set_genres("Film-Noir",x['genres']), axis=1)
mov_genres_df["Horror"] = mov_genres_df.apply(lambda x: set_genres("Horror",x['genres']), axis=1)
mov_genres_df["Musical"] = mov_genres_df.apply(lambda x: set_genres("Musical",x['genres']), axis=1)
mov_genres_df["Mystery"] = mov_genres_df.apply(lambda x: set_genres("Mystery",x['genres']), axis=1)
mov_genres_df["Romance"] = mov_genres_df.apply(lambda x: set_genres("Romance",x['genres']), axis=1)
mov_genres_df["Sci-Fi"] = mov_genres_df.apply(lambda x: set_genres("Sci-Fi",x['genres']), axis=1)
mov_genres_df["Thriller"] = mov_genres_df.apply(lambda x: set_genres("Thriller",x['genres']), axis=1)
mov_genres_df["War"] = mov_genres_df.apply(lambda x: set_genres("War",x['genres']), axis=1)
mov_genres_df["Western"] = mov_genres_df.apply(lambda x: set_genres("Western",x['genres']), axis=1)
mov_genres_df["(no genres listed)"] = mov_genres_df.apply(lambda x: set_genres("(no genres listed)",x['genres']), axis=1)

mov_genres_df.drop(['title','genres'], axis = 1, inplace=True) # drop columns which are no needed anymore

mov_genres_df.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# mov_rating_df

Load movies, extract the year value from the title column and create a new one with this value.


In [9]:
import unicodedata

def set_year(title):
    '''Extract year information from "title" field.'''
    year = title.strip()[-5:-1]
    if year.isdigit(): return int(year)
    else: return 1800


movies = pd.read_csv('../../movielens/movies.csv')
movies = movies_data.drop('genres', axis = 1)
movies['year'] = movies.apply(lambda x: set_year(x['title']), axis=1)

movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story (1995),1995
1,2,Jumanji (1995),1995
2,3,Grumpier Old Men (1995),1995
3,4,Waiting to Exhale (1995),1995
4,5,Father of the Bride Part II (1995),1995


# Tag the year

Movies are labeled with a number that define the year range of the production.


In [10]:
def set_year_group(year):
    '''Groups the years through 0-6.'''
    if (year < 1900): return 0
    elif (1900 <= year <= 1975): return 1
    elif (1976 <= year <= 1995): return 2
    elif (1996 <= year <= 2003): return 3
    elif (2004 <= year <= 2009): return 4
    elif (2005 <= year <= 2015): return 5
    elif (2016 <= year): return 6
    else: return 0
    
movies['year_group'] = movies.apply(lambda x: set_year_group(x['year']), axis=1)
movies.drop(['title', 'year'], axis = 1, inplace=True)

agg_movies_rat = ratings_data.groupby(['movieId']).agg({'rating': [np.size, np.mean]}).reset_index() # group years and rating counts to reduce the scale and increase the similarity calculation
# del ratings_data
agg_movies_rat.columns = ['movieId', 'rating_counts', 'rating_mean']

agg_movies_rat.head()

Unnamed: 0,movieId,rating_counts,rating_mean
0,1,57309,3.893708
1,2,24228,3.251527
2,3,11804,3.142028
3,4,2523,2.853547
4,5,11714,3.058434


# Calculate the mean

Calculate the mean and counts the ratings for each movies and then merge it with movies df.


In [11]:
def set_rating_group(rating_counts):
    '''Group rating counts.'''
    if (rating_counts <= 1): return 0
    elif (2 <= rating_counts <= 10): return 1
    elif (11 <= rating_counts <= 100): return 2
    elif (101 <= rating_counts <= 1000): return 3
    elif (1001 <= rating_counts <= 5000): return 4
    elif (5001 <= rating_counts): return 5
    else: return 0
    
agg_movies_rat['rating_group'] = agg_movies_rat.apply(lambda x: set_rating_group(x['rating_counts']), axis=1)
agg_movies_rat.drop('rating_counts', axis = 1, inplace=True)

mov_rating_df = movies.merge(agg_movies_rat, left_on='movieId', right_on='movieId', how='left')
del movies, agg_movies_rat
mov_rating_df = mov_rating_df.fillna(0)

mov_rating_df.head()

Unnamed: 0,movieId,year_group,rating_mean,rating_group
0,1,2,3.893708,5.0
1,2,2,3.251527,5.0
2,3,2,3.142028,5.0
3,4,2,2.853547,4.0
4,5,2,3.058434,5.0


# Data reduction

To reduce memory usage, only movies produced after 2016 are considered. This df is used to remove not considered movies from previous dfs.


In [12]:
rm_index = mov_rating_df[mov_rating_df['year_group'] > 5]
rm_index = rm_index.drop(['year_group', 'rating_mean', 'rating_group'], axis = 1)

rm_index.head()

Unnamed: 0,movieId
25056,122888
25057,122890
25059,122896
25060,122898
25063,122904


In [13]:
movies_data = movies_data.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
# movies_data.to_csv('./data/output/movies_data.csv', index=False)

In [14]:
mov_tags_df = mov_tags_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
print(mov_tags_df.head())

mov_tags_df = mov_tags_df.set_index('movieId')
mov_tags_df.head()

   movieId        1        2        3        4        5        6        7  \
0   122888  0.01925  0.01600  0.03275  0.07650  0.08075  0.09075  0.03450   
1   122890  0.03300  0.02700  0.04175  0.07325  0.04750  0.01975  0.01325   
2   122896  0.03300  0.03125  0.14525  0.04350  0.03200  0.01875  0.01475   
3   122898  0.06375  0.05600  0.05850  0.06600  0.10400  0.03225  0.03425   
4   122904  0.03325  0.02500  0.04975  0.05950  0.05575  0.04075  0.04075   

         8        9  ...     1119    1120     1121     1122     1123    1124  \
0  0.04650  0.00950  ...  0.03600  0.0190  0.01575  0.01200  0.14600  0.0515   
1  0.06675  0.02700  ...  0.06100  0.0800  0.00875  0.00750  0.11300  0.0920   
2  0.04900  0.01375  ...  0.04650  0.0180  0.01375  0.01075  0.17900  0.0455   
3  0.14525  0.04825  ...  0.07400  0.0730  0.02425  0.02325  0.20725  0.0875   
4  0.19100  0.04175  ...  0.03775  0.0265  0.02350  0.03800  0.32800  0.1840   

      1125     1126     1127     1128  
0  0.01675  0.02

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
122888,0.01925,0.016,0.03275,0.0765,0.08075,0.09075,0.0345,0.0465,0.0095,0.18925,...,0.036,0.019,0.01575,0.012,0.146,0.0515,0.01675,0.0205,0.115,0.0235
122890,0.033,0.027,0.04175,0.07325,0.0475,0.01975,0.01325,0.06675,0.027,0.0565,...,0.061,0.08,0.00875,0.0075,0.113,0.092,0.03425,0.01125,0.206,0.0315
122896,0.033,0.03125,0.14525,0.0435,0.032,0.01875,0.01475,0.049,0.01375,0.049,...,0.0465,0.018,0.01375,0.01075,0.179,0.0455,0.013,0.0115,0.2755,0.02925
122898,0.06375,0.056,0.0585,0.066,0.104,0.03225,0.03425,0.14525,0.04825,0.03375,...,0.074,0.073,0.02425,0.02325,0.20725,0.0875,0.03975,0.02425,0.14825,0.02975
122904,0.03325,0.025,0.04975,0.0595,0.05575,0.04075,0.04075,0.191,0.04175,0.06725,...,0.03775,0.0265,0.0235,0.038,0.328,0.184,0.08525,0.02275,0.2085,0.04175


In [15]:
mov_genres_df = mov_genres_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
print(mov_genres_df.head())

mov_genres_df = mov_genres_df.set_index('movieId')
mov_genres_df.head()

   movieId  Action  Adventure  Animation  Children  Comedy  Crime  \
0   122888       0          0          0         0       0      0   
1   122890       1          1          0         0       0      0   
2   122896       0          0          0         0       0      0   
3   122898       1          1          0         0       0      0   
4   122904       1          1          0         0       1      0   

   Documentary  Drama  Fantasy  Film-Noir  Horror  Musical  Mystery  Romance  \
0            0      0        0          0       0        0        0        0   
1            0      0        1          0       0        0        0        0   
2            0      0        0          0       0        0        0        0   
3            0      0        0          0       0        0        0        0   
4            0      0        0          0       0        0        0        0   

   Sci-Fi  Thriller  War  Western  (no genres listed)  
0       0         0    0        0               

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
122888,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
122890,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
122896,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
122898,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
122904,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [16]:
mov_rating_df = mov_rating_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
print(mov_rating_df.head())

mov_rating_df = mov_rating_df.set_index('movieId')
mov_rating_df.head()

   movieId  year_group  rating_mean  rating_group
0   122888           6     2.793296           3.0
1   122890           6     3.243009           4.0
2   122896           6     3.235033           4.0
3   122898           6     2.884520           4.0
4   122904           6     3.852662           5.0


Unnamed: 0_level_0,year_group,rating_mean,rating_group
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
122888,6,2.793296,3.0
122890,6,3.243009,4.0
122896,6,3.235033,4.0
122898,6,2.88452,4.0
122904,6,3.852662,5.0


# movies matrix

Create 3 different datasets to calculate the cosine similarity.


In [17]:
cos_tag_factor = 0.5
cos_tag = cosine_similarity(mov_tags_df.values)*cos_tag_factor

cos_genres_factor = 0.25
cos_genres = cosine_similarity(mov_genres_df.values)*cos_genres_factor
mov_genres_df.to_csv('../data/output/movie_genres.csv', index=False)
del mov_genres_df

cos_rating_factor = 0.25
cos_rating = cosine_similarity(mov_rating_df.values)*cos_rating_factor
mov_rating_df.to_csv('../data/output/movie_rating.csv', index=False)
del mov_rating_df

cos = cos_tag+cos_genres+cos_rating

cols = mov_tags_df.index.values
inx = mov_tags_df.index
movies_sim = pd.DataFrame(cos, columns=cols, index=inx)
mov_tags_df.to_csv('../data/output/movie_tags.csv', index=False)
del mov_tags_df, cols, inx
movies_sim.to_csv('../data/output/movies_similarity_matrix.csv', index=False)
movies_sim.head()

Unnamed: 0_level_0,122888,122890,122896,122898,122904,122906,122908,122910,122912,122914,...,209051,209073,209079,209085,209133,209143,209145,209151,209157,209163
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
122888,1.0,0.574753,0.824236,0.594556,0.550992,0.606402,0.246724,0.599941,0.598183,0.603414,...,0.476729,0.222801,0.227611,0.238202,0.490558,0.219416,0.219416,0.476729,0.22357,0.222801
122890,0.574753,1.0,0.636572,0.809419,0.72137,0.79545,0.407192,0.797518,0.794279,0.778593,...,0.215534,0.213292,0.215539,0.230801,0.231292,0.203987,0.203987,0.215534,0.208916,0.213292
122896,0.824236,0.636572,1.0,0.626959,0.571997,0.621988,0.24055,0.62687,0.617356,0.608148,...,0.465496,0.213229,0.215515,0.230764,0.481292,0.20403,0.20403,0.465496,0.208941,0.213229
122898,0.594556,0.809419,0.626959,1.0,0.878718,0.955421,0.49143,0.976895,0.946481,0.938037,...,0.21361,0.210217,0.214268,0.22891,0.231079,0.205771,0.205771,0.21361,0.209887,0.210217
122904,0.550992,0.72137,0.571997,0.878718,1.0,0.884492,0.448759,0.890132,0.899533,0.899028,...,0.204504,0.20415,0.203516,0.222748,0.22101,0.188078,0.188078,0.204504,0.193921,0.292538


# Create movie_similarity df

Define a function to return the 5 most similar movies based on a movieId.


In [18]:
def get_similar(movieId):
    df = movies_sim.loc[movies_sim.index == movieId].reset_index(). \
            melt(id_vars='movieId', var_name='sim_movieId', value_name='relevance'). \
            sort_values('relevance', axis=0, ascending=False)[1:6]
    return df
    
movies_similarity = pd.DataFrame(columns=['movieId','sim_movieId','relevance'])

#  Find the 5 most similar movies


In [19]:
for x in movies_sim.index.tolist():
    movies_similarity = movies_similarity.append(get_similar(x))

movies_similarity.to_csv('../data/output/movies_similarity.csv', index=False)    
# movies_similarity = movies_similarity.drop(['sim_movieId'], axis = 1)
movies_similarity.head()

Unnamed: 0,movieId,sim_movieId,relevance
18,122888,135426,0.871181
65,122888,151593,0.85817
3334,122888,183909,0.846288
4727,122888,191869,0.842831
4589,122888,190949,0.841166


# Try movie Recommendation

In [20]:
def movie_recommender(movieId):
    df = movies_sim.loc[movies_sim.index == movieId].reset_index(). \
            melt(id_vars='movieId', var_name='sim_moveId', value_name='relevance'). \
            sort_values('relevance', axis=0, ascending=False)[1:6]
    df['sim_moveId'] = df['sim_moveId'].astype(int)
    sim_df = movies_data.merge(df, left_on='movieId', right_on='sim_moveId', how='inner'). \
                sort_values('relevance', axis=0, ascending=False). \
                loc[: , ['movieId_y','title','genres']]. \
                rename(columns={ 'movieId_y': "movieId" })
    return sim_df

# Recommendation for Deadpool, Action|Adventure|Comedy|Sci-Fi


In [21]:
movie_recommender(122904)

Unnamed: 0,movieId,title,genres
3,122904,Deadpool 2 (2018),Action|Comedy|Sci-Fi
2,122904,Thor: Ragnarok (2017),Action|Adventure|Sci-Fi
0,122904,Avengers: Infinity War - Part I (2018),Action|Adventure|Sci-Fi
1,122904,Avengers: Infinity War - Part II (2019),Action|Adventure|Sci-Fi
4,122904,Ant-Man and the Wasp (2018),Action|Adventure|Comedy|Fantasy|Sci-Fi


# Recommendation for GODZILLA: City on the Edge of Battle, Action|Animation|Sci-Fi

In [22]:
movie_recommender(191235)

Unnamed: 0,movieId,title,genres
3,191235,Reign of the Supermen (2019),Action|Animation|Sci-Fi
0,191235,Teen Titans: The Judas Contract (2017),Action|Animation|Sci-Fi
1,191235,Starship Troopers: Traitor of Mars (2017),Action|Animation|Sci-Fi
2,191235,Blade Runner: Black Out 2022 (2017),Action|Animation|Sci-Fi
4,191235,Justice League vs. the Fatal Five (2019),Action|Animation|Sci-Fi


# Prepare movies dataset with year and tmdbId


In [23]:
movies_df = movies_data.drop('genres', axis = 1)

agg_rating_avg = ratings_data.groupby(['movieId']).agg({'rating': np.mean}).reset_index() #calculate mean of ratings for each movies
agg_rating_avg.columns = ['movieId', 'rating_mean']
movies_df = movies_df.merge(agg_rating_avg, left_on='movieId', right_on='movieId', how='left')

movies_df.head()

Unnamed: 0,movieId,title,rating_mean
0,122888,Ben-hur (2016),2.793296
1,122890,Warcraft (2016),3.243009
2,122896,Pirates of the Caribbean: Dead Men Tell No Tal...,3.235033
3,122898,Justice League (2017),2.88452
4,122904,Deadpool (2016),3.852662


## Extract and delete year from title and save it in a new column

In [24]:
def del_year(title):
    '''Delete year information from "title" field.'''
    new_title = title.strip()[:-7]
    return new_title
    
movies_df['year'] = movies_df.apply(lambda x: set_year(x['title']), axis=1)
movies_df['title'] = movies_df.apply(lambda x: del_year(x['title']), axis=1)
movies_df = movies_df.set_index('movieId')

movies_df.head()

Unnamed: 0_level_0,title,rating_mean,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
122888,Ben-hur,2.793296,2016
122890,Warcraft,3.243009,2016
122896,Pirates of the Caribbean: Dead Men Tell No Tales,3.235033,2017
122898,Justice League,2.88452,2017
122904,Deadpool,3.852662,2016


## Add tmdbId column

In [25]:
links_df = pd.read_csv('../../movielens/links.csv')
links_df = links_df.set_index('movieId')
links_df = links_df.drop(['imdbId'], axis = 1)

links_df.head()

Unnamed: 0_level_0,tmdbId
movieId,Unnamed: 1_level_1
1,862.0
2,8844.0
3,15602.0
4,31357.0
5,11862.0


## Merge movies_df and links, delete 'imdbId' column and convert 'tmdbId' from float to int

In [30]:
movies_df = movies_df.join(links_df, how='left')
del(links_df)
movies_df = movies_df.dropna()
movies_df['tmdbId'] = movies_df['tmdbId'].astype(int) # convert value to int from float

movies_df.head()

Unnamed: 0_level_0,title,rating_mean,year,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
122888,Ben-hur,2.793296,2016,271969
122890,Warcraft,3.243009,2016,68735
122896,Pirates of the Caribbean: Dead Men Tell No Tales,3.235033,2017,166426
122898,Justice League,2.88452,2017,141052
122904,Deadpool,3.852662,2016,293660


# Create genres dataset

In [31]:
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
    "(no genres listed)"
]
genres_df = pd.DataFrame(genres, columns=['genre'])

genres_df.head()

Unnamed: 0,genre
0,Action
1,Adventure
2,Animation
3,Children
4,Comedy


# Create movies_genres dataset with multiple rows for each movies based on genres

In [32]:
movies_genres_df = movies_data.drop('title', axis = 1)

def get_movie_genres(movieId):
    '''Split genres field.'''
    movie = movies_genres_df[movies_genres_df['movieId']==movieId]
    genres = movie['genres'].tolist()
    df = pd.DataFrame([b for a in [i.split('|') for i in genres] for b in a], columns=['genres'])
    df.insert(loc=0, column='movieId', value=movieId)
    return df

movies_genres=pd.DataFrame(columns=['movieId','genres']) # extract genres for each movie

for x in movies_genres_df['movieId'].tolist():
    movies_genres=movies_genres.append(get_movie_genres(x))

movies_genres.head()

Unnamed: 0,movieId,genres
0,122888,(no genres listed)
0,122890,Action
1,122890,Adventure
2,122890,Fantasy
0,122896,(no genres listed)


# Get recommendation movies with their information

In [33]:
movies_genres_df = movies_df.merge(movies_genres_df, left_on='movieId', right_on='movieId', how='left')
movies_genres_df.to_csv('../data/output/movies.csv', index=False)

movies_genres_df.head()

Unnamed: 0,movieId,title,rating_mean,year,tmdbId,genres
0,122888,Ben-hur,2.793296,2016,271969,(no genres listed)
1,122890,Warcraft,3.243009,2016,68735,Action|Adventure|Fantasy
2,122896,Pirates of the Caribbean: Dead Men Tell No Tales,3.235033,2017,166426,(no genres listed)
3,122898,Justice League,2.88452,2017,141052,Action|Adventure|Sci-Fi
4,122904,Deadpool,3.852662,2016,293660,Action|Adventure|Comedy|Sci-Fi


In [34]:
id = 122904

def get_movie_info(id):
    '''Get information of a movie with 'movieId'.'''
    query = 'movieId == {}'.format(id)
    movie = movies_genres_df.query(query)
    return movie


get_movie_info(id)

Unnamed: 0,movieId,title,rating_mean,year,tmdbId,genres
4,122904,Deadpool,3.852662,2016,293660,Action|Adventure|Comedy|Sci-Fi


In [35]:
movies_similarity.head()

Unnamed: 0,movieId,sim_movieId,relevance
18,122888,135426,0.871181
65,122888,151593,0.85817
3334,122888,183909,0.846288
4727,122888,191869,0.842831
4589,122888,190949,0.841166


In [36]:
def get_recommendation_movies(id):
    '''Get the recommended movie for a specific movie with 'movieId'.'''
    query = 'movieId == {}'.format(id)
    recommended_movies = movies_similarity.query(query)
    recommended_movies = recommended_movies.drop(['movieId'], axis = 1)
    for index, row in recommended_movies.iterrows():
        df = get_movie_info(row['sim_movieId'])
        df = df.assign(relevance = row['relevance'])
        print(df)

get_recommendation_movies(id)

      movieId       title  rating_mean  year  tmdbId                genres  \
3933   187593  Deadpool 2     3.781831  2018  383498  Action|Comedy|Sci-Fi   

      relevance  
3933   0.917823  
    movieId           title  rating_mean  year  tmdbId  \
10   122916  Thor: Ragnarok     3.888957  2017  284053   

                     genres  relevance  
10  Action|Adventure|Sci-Fi    0.91347  
   movieId                            title  rating_mean  year  tmdbId  \
8   122912  Avengers: Infinity War - Part I     3.906317  2018  299536   

                    genres  relevance  
8  Action|Adventure|Sci-Fi   0.899533  
   movieId                             title  rating_mean  year  tmdbId  \
9   122914  Avengers: Infinity War - Part II     3.897284  2019  299534   

                    genres  relevance  
9  Action|Adventure|Sci-Fi   0.899028  
      movieId                 title  rating_mean  year  tmdbId  \
4015   188301  Ant-Man and the Wasp     3.552015  2018  363088   

               