In [1]:
# load movilens data
import pandas as pd
import numpy as np
import datetime
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

genome_scores_data = pd.read_csv('./data/movielens/genome-scores.csv')
movies_data = pd.read_csv('./data/movielens/movies.csv')
ratings_data = pd.read_csv('./data/movielens/ratings.csv')

In [2]:
genome_scores_data.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.029
1,1,2,0.02375
2,1,3,0.05425
3,1,4,0.06875
4,1,5,0.16


In [3]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [5]:
# mov_tag_df
scores_pivot = genome_scores_data.pivot_table(index = ["movieId"],columns = ["tagId"],values = "relevance").reset_index() # compare movie throgh the tags
del genome_scores_data
scores_pivot.head()

tagId,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.029,0.02375,0.05425,0.06875,0.16,0.19525,0.076,0.252,0.2275,...,0.03775,0.0225,0.04075,0.03175,0.1295,0.0455,0.02,0.0385,0.09125,0.02225
1,2,0.03625,0.03625,0.08275,0.08175,0.102,0.069,0.05775,0.101,0.08225,...,0.04775,0.0205,0.0165,0.0245,0.1305,0.027,0.01825,0.01225,0.09925,0.0185
2,3,0.0415,0.0495,0.03,0.09525,0.04525,0.05925,0.04,0.1415,0.04075,...,0.058,0.02375,0.0355,0.02125,0.12775,0.0325,0.01625,0.02125,0.09525,0.0175
3,4,0.0335,0.03675,0.04275,0.02625,0.0525,0.03025,0.02425,0.07475,0.0375,...,0.049,0.03275,0.02125,0.03675,0.15925,0.05225,0.015,0.016,0.09175,0.015
4,5,0.0405,0.05175,0.036,0.04625,0.055,0.08,0.0215,0.07375,0.02825,...,0.05375,0.02625,0.0205,0.02125,0.17725,0.0205,0.015,0.0155,0.08875,0.01575


In [6]:
#join
mov_tag_df = movies_data.merge(scores_pivot, left_on='movieId', right_on='movieId', how='left') # get all movieIds
del scores_pivot

mov_tag_df = mov_tag_df.fillna(0) # fill null values
mov_tag_df = mov_tag_df.drop(['title','genres'], axis = 1) # drop clumns not used

mov_tag_df.head()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.029,0.02375,0.05425,0.06875,0.16,0.19525,0.076,0.252,0.2275,...,0.03775,0.0225,0.04075,0.03175,0.1295,0.0455,0.02,0.0385,0.09125,0.02225
1,2,0.03625,0.03625,0.08275,0.08175,0.102,0.069,0.05775,0.101,0.08225,...,0.04775,0.0205,0.0165,0.0245,0.1305,0.027,0.01825,0.01225,0.09925,0.0185
2,3,0.0415,0.0495,0.03,0.09525,0.04525,0.05925,0.04,0.1415,0.04075,...,0.058,0.02375,0.0355,0.02125,0.12775,0.0325,0.01625,0.02125,0.09525,0.0175
3,4,0.0335,0.03675,0.04275,0.02625,0.0525,0.03025,0.02425,0.07475,0.0375,...,0.049,0.03275,0.02125,0.03675,0.15925,0.05225,0.015,0.016,0.09175,0.015
4,5,0.0405,0.05175,0.036,0.04625,0.055,0.08,0.0215,0.07375,0.02825,...,0.05375,0.02625,0.0205,0.02125,0.17725,0.0205,0.015,0.0155,0.08875,0.01575


In [7]:
# mov_genres_df
def set_genres(genres,col):
    '''Split genres column and check if it exists or not.'''
    if genres in col.split('|'): return 1
    else: return 0

In [8]:
mov_genres_df = pd.read_csv('./data/movielens/movies.csv')
# split genres field for each movies and create columns for each genres
mov_genres_df["Action"] = mov_genres_df.apply(lambda x: set_genres("Action",x['genres']), axis=1)
mov_genres_df["Adventure"] = mov_genres_df.apply(lambda x: set_genres("Adventure",x['genres']), axis=1)
mov_genres_df["Animation"] = mov_genres_df.apply(lambda x: set_genres("Animation",x['genres']), axis=1)
mov_genres_df["Children"] = mov_genres_df.apply(lambda x: set_genres("Children",x['genres']), axis=1)
mov_genres_df["Comedy"] = mov_genres_df.apply(lambda x: set_genres("Comedy",x['genres']), axis=1)
mov_genres_df["Crime"] = mov_genres_df.apply(lambda x: set_genres("Crime",x['genres']), axis=1)
mov_genres_df["Documentary"] = mov_genres_df.apply(lambda x: set_genres("Documentary",x['genres']), axis=1)
mov_genres_df["Drama"] = mov_genres_df.apply(lambda x: set_genres("Drama",x['genres']), axis=1)
mov_genres_df["Fantasy"] = mov_genres_df.apply(lambda x: set_genres("Fantasy",x['genres']), axis=1)
mov_genres_df["Film-Noir"] = mov_genres_df.apply(lambda x: set_genres("Film-Noir",x['genres']), axis=1)
mov_genres_df["Horror"] = mov_genres_df.apply(lambda x: set_genres("Horror",x['genres']), axis=1)
mov_genres_df["Musical"] = mov_genres_df.apply(lambda x: set_genres("Musical",x['genres']), axis=1)
mov_genres_df["Mystery"] = mov_genres_df.apply(lambda x: set_genres("Mystery",x['genres']), axis=1)
mov_genres_df["Romance"] = mov_genres_df.apply(lambda x: set_genres("Romance",x['genres']), axis=1)
mov_genres_df["Sci-Fi"] = mov_genres_df.apply(lambda x: set_genres("Sci-Fi",x['genres']), axis=1)
mov_genres_df["Thriller"] = mov_genres_df.apply(lambda x: set_genres("Thriller",x['genres']), axis=1)
mov_genres_df["War"] = mov_genres_df.apply(lambda x: set_genres("War",x['genres']), axis=1)
mov_genres_df["Western"] = mov_genres_df.apply(lambda x: set_genres("Western",x['genres']), axis=1)
mov_genres_df["(no genres listed)"] = mov_genres_df.apply(lambda x: set_genres("(no genres listed)",x['genres']), axis=1)

# drop columns which are no needed anymore
mov_genres_df.drop(['title','genres'], axis = 1, inplace=True)

mov_genres_df.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# mov_rating_df
import unicodedata

def set_year(title):
    '''Extract year information from "title" field.'''
    year = title.strip()[-5:-1]
    if year.isdigit(): return int(year)
    else: return 1800


movies = pd.read_csv('./data/movielens/movies.csv')

movies = movies_data.drop('genres', axis = 1)

movies['year'] = movies.apply(lambda x: set_year(x['title']), axis=1)

movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story (1995),1995
1,2,Jumanji (1995),1995
2,3,Grumpier Old Men (1995),1995
3,4,Waiting to Exhale (1995),1995
4,5,Father of the Bride Part II (1995),1995


In [10]:
def set_year_group(year):
    '''Groups the years through 0-6.'''
    if (year < 1900): return 0
    elif (1900 <= year <= 1975): return 1
    elif (1976 <= year <= 1995): return 2
    elif (1996 <= year <= 2003): return 3
    elif (2004 <= year <= 2009): return 4
    elif (2005 <= year <= 2015): return 5
    elif (2016 <= year): return 6
    else: return 0
    
movies['year_group'] = movies.apply(lambda x: set_year_group(x['year']), axis=1)

movies.drop(['title','year'], axis = 1, inplace=True)

# group years and rating counts to reduce the scale and increase the similarity calculation
agg_movies_rat = ratings_data.groupby(['movieId']).agg({'rating': [np.size, np.mean]}).reset_index()
# del ratings_data

agg_movies_rat.columns = ['movieId','rating_counts', 'rating_mean']

agg_movies_rat.head()

Unnamed: 0,movieId,rating_counts,rating_mean
0,1,68469.0,3.886649
1,2,27143.0,3.246583
2,3,15585.0,3.173981
3,4,2989.0,2.87454
4,5,15474.0,3.077291


In [11]:
def set_rating_group(rating_counts):
    '''Group rating counts.'''
    if (rating_counts <= 1): return 0
    elif (2 <= rating_counts <= 10): return 1
    elif (11 <= rating_counts <= 100): return 2
    elif (101 <= rating_counts <= 1000): return 3
    elif (1001 <= rating_counts <= 5000): return 4
    elif (5001 <= rating_counts): return 5
    else: return 0
    
agg_movies_rat['rating_group'] = agg_movies_rat.apply(lambda x: set_rating_group(x['rating_counts']), axis=1)

agg_movies_rat.drop('rating_counts', axis = 1, inplace=True)

mov_rating_df = movies.merge(agg_movies_rat, left_on='movieId', right_on='movieId', how='left')
del movies, agg_movies_rat

mov_rating_df = mov_rating_df.fillna(0)

mov_rating_df.head()

Unnamed: 0,movieId,year_group,rating_mean,rating_group
0,1,2,3.886649,5.0
1,2,2,3.246583,5.0
2,3,2,3.173981,5.0
3,4,2,2.87454,4.0
4,5,2,3.077291,5.0


In [12]:
# deletes all movies with year <= 2003 due to memory limit

# print(mov_rating_df.head())
rm_index = mov_rating_df[mov_rating_df['year_group'] > 5]
rm_index.drop(['year_group','rating_mean','rating_group'], axis = 1, inplace=True)
rm_index.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,movieId
27548,122888
27549,122890
27551,122894
27552,122896
27553,122898


In [13]:
movies_data = movies_data.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
movies_data.to_csv('./data/output/movies_data.csv', index=False)

In [14]:
mov_tag_df = mov_tag_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
print(mov_tag_df.head())
mov_tag_df = mov_tag_df.set_index('movieId')
mov_tag_df.head()

   movieId        1        2        3        4        5        6        7  \
0   122888  0.01950  0.01650  0.02700  0.06500  0.08825  0.16450  0.03100   
1   122890  0.02975  0.02525  0.04775  0.07525  0.03925  0.02750  0.01075   
2   122894  0.00000  0.00000  0.00000  0.00000  0.00000  0.00000  0.00000   
3   122896  0.02650  0.02525  0.16525  0.03000  0.02550  0.01800  0.01075   
4   122898  0.06450  0.05800  0.05275  0.06075  0.08975  0.03025  0.03150   

         8        9  ...     1119     1120     1121     1122     1123  \
0  0.03375  0.01150  ...  0.04375  0.02250  0.01575  0.01150  0.15225   
1  0.06475  0.02875  ...  0.06200  0.06850  0.01275  0.00775  0.11300   
2  0.00000  0.00000  ...  0.00000  0.00000  0.00000  0.00000  0.00000   
3  0.05075  0.01450  ...  0.04350  0.02325  0.02300  0.01200  0.20550   
4  0.14200  0.05675  ...  0.06025  0.08525  0.03425  0.02600  0.21100   

      1124     1125     1126     1127     1128  
0  0.05250  0.01500  0.01625  0.12300  0.02200  


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
122888,0.0195,0.0165,0.027,0.065,0.08825,0.1645,0.031,0.03375,0.0115,0.20925,...,0.04375,0.0225,0.01575,0.0115,0.15225,0.0525,0.015,0.01625,0.123,0.022
122890,0.02975,0.02525,0.04775,0.07525,0.03925,0.0275,0.01075,0.06475,0.02875,0.0635,...,0.062,0.0685,0.01275,0.00775,0.113,0.103,0.04,0.0125,0.17275,0.02975
122894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
122896,0.0265,0.02525,0.16525,0.03,0.0255,0.018,0.01075,0.05075,0.0145,0.05025,...,0.0435,0.02325,0.023,0.012,0.2055,0.0625,0.01475,0.01425,0.24,0.02475
122898,0.0645,0.058,0.05275,0.06075,0.08975,0.03025,0.0315,0.142,0.05675,0.02825,...,0.06025,0.08525,0.03425,0.026,0.211,0.07925,0.034,0.03025,0.1585,0.0305


In [15]:
mov_genres_df = mov_genres_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
print(mov_genres_df.head())
mov_genres_df = mov_genres_df.set_index('movieId')
mov_genres_df.head()

   movieId  Action  Adventure  Animation  Children  Comedy  Crime  \
0   122888       0          0          0         0       0      0   
1   122890       1          1          0         0       0      0   
2   122894       1          1          0         0       0      0   
3   122896       0          0          0         0       0      0   
4   122898       1          1          0         0       0      0   

   Documentary  Drama  Fantasy  Film-Noir  Horror  Musical  Mystery  Romance  \
0            0      0        0          0       0        0        0        0   
1            0      0        1          0       0        0        0        0   
2            0      0        1          0       0        0        0        0   
3            0      0        0          0       0        0        0        0   
4            0      0        0          0       0        0        0        0   

   Sci-Fi  Thriller  War  Western  (no genres listed)  
0       0         0    0        0               

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
122888,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
122890,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
122894,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
122896,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
122898,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [16]:
mov_rating_df = mov_rating_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
print(mov_rating_df.head())
mov_rating_df = mov_rating_df.set_index('movieId')
# mov_rating_df.drop(rm_index_p, inplace=True)
mov_rating_df.head()

   movieId  year_group  rating_mean  rating_group
0   122888           6     2.849162           3.0
1   122890           6     3.250538           4.0
2   122894           6     2.451613           2.0
3   122896           6     3.253650           4.0
4   122898           6     2.952278           3.0


Unnamed: 0_level_0,year_group,rating_mean,rating_group
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
122888,6,2.849162,3.0
122890,6,3.250538,4.0
122894,6,2.451613,2.0
122896,6,3.25365,4.0
122898,6,2.952278,3.0


In [17]:
# rm_index.drop(['year_group','rating_mean','rating_group'], axis = 1, inplace=True)
# print(rm_index)

In [18]:
# # mov_tag_df = pd.merge(mov_tag_df, mov_rating_df, how='inner', on=['movieId'])
# rm_index.drop(['year_group','rating_mean','rating_group'], axis = 1, inplace=True)

# mov_tag_df = mov_tag_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
# mov_tag_df.head()

# rm_index.drop(['year_group','rating_mean','rating_group'], axis = 1, inplace=True)

In [19]:
# mov_tag_df = mov_tag_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
# mov_tag_df = mov_tag_df.set_index('movieId')

In [20]:
# movies matrix

# set "movieId" field as index in the dfs
# mov_tag_df = mov_tag_df.merge(rm_index_id, left_on='movieId', right_on='movieId', how='right')
# mov_tag_df = mov_tag_df.set_index('movieId')

# mov_tag_df = mov_tag_df.merge(rm_index_id, left_on='movieId', right_on='movieId', how='right')
# mov_genres_df = mov_genres_df.set_index('movieId')

# mov_tag_df = mov_tag_df.merge(rm_index_id, left_on='movieId', right_on='movieId', how='right')
# mov_rating_df = mov_rating_df.set_index('movieId')

#cosine similarity for mov_tag_df
cos_tag = cosine_similarity(mov_tag_df.values)*0.5

#cosine similarity for mov_genres_df
cos_genres = cosine_similarity(mov_genres_df.values)*0.25
del mov_genres_df

#cosine similarity for mov_rating_df
cos_rating = cosine_similarity(mov_rating_df.values)*0.25
del mov_rating_df

#mix
cos = cos_tag+cos_genres+cos_rating

cols = mov_tag_df.index.values
inx = mov_tag_df.index
movies_sim = pd.DataFrame(cos, columns=cols, index=inx)
movies_sim.to_csv('./data/output/movies_sim.csv', index=False)
movies_sim.head()

del mov_tag_df, cols, inx

In [21]:
# create movie_similarity df

def get_similar(movieId):
    df = movies_sim.loc[movies_sim.index == movieId].reset_index(). \
            melt(id_vars='movieId', var_name='sim_moveId', value_name='relevance'). \
            sort_values('relevance', axis=0, ascending=False)[1:6]
    return df
    
# create empty df
movies_similarity = pd.DataFrame(columns=['movieId','sim_moveId','relevance'])

In [22]:
#  for each movie find the 5 most similar movies

for x in movies_sim.index.tolist():
    movies_similarity = movies_similarity.append(get_similar(x))
    
movies_similarity.head()

movies_similarity.to_csv('./data/output/movies_similarity.csv', index=False)

In [23]:
# check recommendations

def movie_recommender(movieId):
    df = movies_sim.loc[movies_sim.index == movieId].reset_index(). \
            melt(id_vars='movieId', var_name='sim_moveId', value_name='relevance'). \
            sort_values('relevance', axis=0, ascending=False)[1:6]
    df['sim_moveId'] = df['sim_moveId'].astype(int)
    sim_df = movies_data.merge(df, left_on='movieId', right_on='sim_moveId', how='inner'). \
                sort_values('relevance', axis=0, ascending=False). \
                loc[: , ['movieId_y','title','genres']]. \
                rename(columns={ 'movieId_y': "movieId" })
    return sim_df

In [24]:
# recommendation for Deadpool (2016),Action|Adventure|Comedy|Sci-Fi
movie_recommender(122904)

Unnamed: 0,movieId,title,genres
2,122904,Thor: Ragnarok (2017),Action|Adventure|Sci-Fi
4,122904,Deadpool 2 (2018),Action|Comedy|Sci-Fi
1,122904,Avengers: Infinity War - Part I (2018),Action|Adventure|Sci-Fi
3,122904,Guardians of the Galaxy 2 (2017),Action|Adventure|Sci-Fi
0,122904,Black Panther (2017),Action|Adventure|Sci-Fi


In [25]:
# recommendation for GODZILLA: City on the Edge of Battle (2018),Action|Animation|Sci-Fi
movie_recommender(191235)

Unnamed: 0,movieId,title,genres
0,191235,PSYCHO-PASS: The Movie (2016),Action|Animation|Sci-Fi
3,191235,Blade Runner: Black Out 2022 (2017),Action|Animation|Sci-Fi
1,191235,Teen Titans: The Judas Contract (2017),Action|Animation|Sci-Fi
2,191235,Starship Troopers: Traitor of Mars (2017),Action|Animation|Sci-Fi
4,191235,Mazinger Z: Infinity (2017),Action|Animation|Sci-Fi


In [26]:
# prepare users dataset with unique userdIds
users_df = pd.DataFrame(ratings_data['userId'].unique(), columns=['userId'])
users_df.head()

Unnamed: 0,userId
0,1
1,2
2,3
3,4
4,5


In [27]:
# prepare movies dataset
movies_df = movies_data.drop('genres', axis = 1)

#calculate mean of ratings for each movies
agg_rating_avg = ratings_data.groupby(['movieId']).agg({'rating': np.mean}).reset_index()

agg_rating_avg.columns = ['movieId', 'rating_mean']

#merge
movies_df = movies_df.merge(agg_rating_avg, left_on='movieId', right_on='movieId', how='left')
print(movies_df)

      movieId                                              title  rating_mean
0      122888                                     Ben-hur (2016)     2.849162
1      122890                                    Warcraft (2016)     3.250538
2      122894                                    Avatar 2 (2016)     2.451613
3      122896  Pirates of the Caribbean: Dead Men Tell No Tal...     3.253650
4      122898                              Justice League (2017)     2.952278
...       ...                                                ...          ...
4966   193866                       Tales from the Hood 2 (2018)     1.000000
4967   193872                             Room Laundering (2018)     4.000000
4968   193880                           Her Name Was Mumu (2016)     2.000000
4969   193882                                       Flora (2017)     2.000000
4970   193886                                        Leal (2018)     3.250000

[4971 rows x 3 columns]


In [28]:
# create genres dataset
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
    "(no genres listed)"
]

genres_df = pd.DataFrame(genres, columns=['genre'])
genres_df.to_csv('./data/output/genres.csv', index=False)
genres_df.head()

Unnamed: 0,genre
0,Action
1,Adventure
2,Animation
3,Children
4,Comedy


In [29]:
# create users_movies dataset
users_movies_df = ratings_data.drop('timestamp', axis = 1)
print(users_movies_df)
users_movies_df = users_movies_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right') # remove in excess movies

# set userId back in int64
users_movies_df = users_movies_df.fillna(0)
users_movies_df['userId'] = users_movies_df['userId'].astype(int)
display(users_movies_df.dtypes)

print(users_movies_df)

          userId  movieId  rating
0              1      307     3.5
1              1      481     3.5
2              1     1091     1.5
3              1     1257     4.5
4              1     1449     4.5
...          ...      ...     ...
27753439  283228     8542     4.5
27753440  283228     8712     4.5
27753441  283228    34405     4.5
27753442  283228    44761     4.5
27753443  283228    54286     4.5

[27753444 rows x 3 columns]


userId       int64
movieId      int64
rating     float64
dtype: object

        userId  movieId  rating
0         2745   122888     2.0
1         4871   122888     2.5
2         5176   122888     3.0
3         8446   122888     1.5
4        10714   122888     2.0
...        ...      ...     ...
350156  232524   193872     4.0
350157   81710   193880     2.0
350158   33330   193882     2.0
350159  206009   193886     2.5
350160  226715   193886     4.0

[350161 rows x 3 columns]


In [30]:
# create movies_genres dataset with multiple rows for each movies based on genres
movies_genres_df = movies_data.drop('title', axis = 1)

def get_movie_genres(movieId):
    '''Split genres field.'''
    movie = movies_genres_df[movies_genres_df['movieId']==movieId]
    genres = movie['genres'].tolist()
    df = pd.DataFrame([b for a in [i.split('|') for i in genres] for b in a], columns=['genres'])
    df.insert(loc=0, column='movieId', value=movieId)
    return df

# extract genres for each movie
movies_genres=pd.DataFrame(columns=['movieId','genres'])

for x in movies_genres_df['movieId'].tolist():
    movies_genres=movies_genres.append(get_movie_genres(x))

movies_genres_df.to_csv('./data/output/movies_genres.csv', index=False)
print(movies_genres)

   movieId              genres
0   122888  (no genres listed)
0   122890              Action
1   122890           Adventure
2   122890             Fantasy
0   122894              Action
..     ...                 ...
2   193882              Horror
3   193882              Sci-Fi
0   193886              Action
1   193886               Crime
2   193886               Drama

[8270 rows x 2 columns]


In [31]:
# create users_genres dataset to get a relation between users and genres
user_genres_df = ratings_data.merge(movies_data, left_on='movieId', right_on='movieId', how='right')

#drop columns that will not be used
user_genres_df.drop(['movieId','rating','timestamp','title'], axis = 1, inplace=True)

# set userId back in int64
user_genres_df = user_genres_df.fillna(0)
user_genres_df['userId'] = user_genres_df['userId'].astype(int)
display(user_genres_df.dtypes)

print(user_genres_df)

userId     int64
genres    object
dtype: object

        userId                         genres
0         2745             (no genres listed)
1         4871             (no genres listed)
2         5176             (no genres listed)
3         8446             (no genres listed)
4        10714             (no genres listed)
...        ...                            ...
350156  232524                          Drama
350157   81710                          Drama
350158   33330  Adventure|Drama|Horror|Sci-Fi
350159  206009             Action|Crime|Drama
350160  226715             Action|Crime|Drama

[350161 rows x 2 columns]


In [32]:
# def get_favorite_genre(userId):
#     '''Obtain favorite movie genre for each users.'''
#     user = user_genres_df[user_genres_df['userId']==userId]
#     genres = user['genres'].tolist()
#     movie_list = [b for a in [i.split('|') for i in genres] for b in a]
#     # print(movie_list)
#     counter = Counter(movie_list)
#     # print(type(counter))
#     return counter.most_common(1)
#     # return counter.most_common(1)

# # extract favorite movie genre for each user
# users_genres = pd.DataFrame(columns=['userId','genre'])

# for x in users_df['userId'].tolist():
#     users_genres = users_genres.append(pd.DataFrame([[x,get_favorite_genre(x)]], columns=['userId','genre']))

# users_genres.head()