In [1]:
# load movilens data
import pandas as pd
import numpy as np
import datetime
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

genome_scores_data = pd.read_csv('./data/genome-scores.csv')
movies_data = pd.read_csv('./data/movies.csv')
ratings_data = pd.read_csv('./data/ratings.csv')

In [2]:
genome_scores_data.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.029
1,1,2,0.02375
2,1,3,0.05425
3,1,4,0.06875
4,1,5,0.16


In [3]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [5]:
# mov_tag_df
scores_pivot = genome_scores_data.pivot_table(index = ["movieId"],columns = ["tagId"],values = "relevance").reset_index() # compare movie throgh the tags
del genome_scores_data
scores_pivot.head()

tagId,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.029,0.02375,0.05425,0.06875,0.16,0.19525,0.076,0.252,0.2275,...,0.03775,0.0225,0.04075,0.03175,0.1295,0.0455,0.02,0.0385,0.09125,0.02225
1,2,0.03625,0.03625,0.08275,0.08175,0.102,0.069,0.05775,0.101,0.08225,...,0.04775,0.0205,0.0165,0.0245,0.1305,0.027,0.01825,0.01225,0.09925,0.0185
2,3,0.0415,0.0495,0.03,0.09525,0.04525,0.05925,0.04,0.1415,0.04075,...,0.058,0.02375,0.0355,0.02125,0.12775,0.0325,0.01625,0.02125,0.09525,0.0175
3,4,0.0335,0.03675,0.04275,0.02625,0.0525,0.03025,0.02425,0.07475,0.0375,...,0.049,0.03275,0.02125,0.03675,0.15925,0.05225,0.015,0.016,0.09175,0.015
4,5,0.0405,0.05175,0.036,0.04625,0.055,0.08,0.0215,0.07375,0.02825,...,0.05375,0.02625,0.0205,0.02125,0.17725,0.0205,0.015,0.0155,0.08875,0.01575


In [6]:
#join
mov_tag_df = movies_data.merge(scores_pivot, left_on='movieId', right_on='movieId', how='left') # get all movieIds
del scores_pivot

mov_tag_df = mov_tag_df.fillna(0) # fill null values
mov_tag_df = mov_tag_df.drop(['title','genres'], axis = 1) # drop clumns not used

mov_tag_df.head()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.029,0.02375,0.05425,0.06875,0.16,0.19525,0.076,0.252,0.2275,...,0.03775,0.0225,0.04075,0.03175,0.1295,0.0455,0.02,0.0385,0.09125,0.02225
1,2,0.03625,0.03625,0.08275,0.08175,0.102,0.069,0.05775,0.101,0.08225,...,0.04775,0.0205,0.0165,0.0245,0.1305,0.027,0.01825,0.01225,0.09925,0.0185
2,3,0.0415,0.0495,0.03,0.09525,0.04525,0.05925,0.04,0.1415,0.04075,...,0.058,0.02375,0.0355,0.02125,0.12775,0.0325,0.01625,0.02125,0.09525,0.0175
3,4,0.0335,0.03675,0.04275,0.02625,0.0525,0.03025,0.02425,0.07475,0.0375,...,0.049,0.03275,0.02125,0.03675,0.15925,0.05225,0.015,0.016,0.09175,0.015
4,5,0.0405,0.05175,0.036,0.04625,0.055,0.08,0.0215,0.07375,0.02825,...,0.05375,0.02625,0.0205,0.02125,0.17725,0.0205,0.015,0.0155,0.08875,0.01575


In [7]:
# mov_genres_df
def set_genres(genres,col):
    '''Split genres column and check if it exists or not.'''
    if genres in col.split('|'): return 1
    else: return 0

In [8]:
mov_genres_df = pd.read_csv('./data/movies.csv')
# split genres field for each movies and create columns for each genres
mov_genres_df["Action"] = mov_genres_df.apply(lambda x: set_genres("Action",x['genres']), axis=1)
mov_genres_df["Adventure"] = mov_genres_df.apply(lambda x: set_genres("Adventure",x['genres']), axis=1)
mov_genres_df["Animation"] = mov_genres_df.apply(lambda x: set_genres("Animation",x['genres']), axis=1)
mov_genres_df["Children"] = mov_genres_df.apply(lambda x: set_genres("Children",x['genres']), axis=1)
mov_genres_df["Comedy"] = mov_genres_df.apply(lambda x: set_genres("Comedy",x['genres']), axis=1)
mov_genres_df["Crime"] = mov_genres_df.apply(lambda x: set_genres("Crime",x['genres']), axis=1)
mov_genres_df["Documentary"] = mov_genres_df.apply(lambda x: set_genres("Documentary",x['genres']), axis=1)
mov_genres_df["Drama"] = mov_genres_df.apply(lambda x: set_genres("Drama",x['genres']), axis=1)
mov_genres_df["Fantasy"] = mov_genres_df.apply(lambda x: set_genres("Fantasy",x['genres']), axis=1)
mov_genres_df["Film-Noir"] = mov_genres_df.apply(lambda x: set_genres("Film-Noir",x['genres']), axis=1)
mov_genres_df["Horror"] = mov_genres_df.apply(lambda x: set_genres("Horror",x['genres']), axis=1)
mov_genres_df["Musical"] = mov_genres_df.apply(lambda x: set_genres("Musical",x['genres']), axis=1)
mov_genres_df["Mystery"] = mov_genres_df.apply(lambda x: set_genres("Mystery",x['genres']), axis=1)
mov_genres_df["Romance"] = mov_genres_df.apply(lambda x: set_genres("Romance",x['genres']), axis=1)
mov_genres_df["Sci-Fi"] = mov_genres_df.apply(lambda x: set_genres("Sci-Fi",x['genres']), axis=1)
mov_genres_df["Thriller"] = mov_genres_df.apply(lambda x: set_genres("Thriller",x['genres']), axis=1)
mov_genres_df["War"] = mov_genres_df.apply(lambda x: set_genres("War",x['genres']), axis=1)
mov_genres_df["Western"] = mov_genres_df.apply(lambda x: set_genres("Western",x['genres']), axis=1)
mov_genres_df["(no genres listed)"] = mov_genres_df.apply(lambda x: set_genres("(no genres listed)",x['genres']), axis=1)

# drop columns which are no needed anymore
mov_genres_df.drop(['title','genres'], axis = 1, inplace=True)

mov_genres_df.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# mov_rating_df
import unicodedata

def set_year(title):
    '''Extract year information from "title" field.'''
    year = title.strip()[-5:-1]
    if year.isdigit(): return int(year)
    else: return 1800


movies = pd.read_csv('./data/movies.csv')

movies = movies_data.drop('genres', axis = 1)

movies['year'] = movies.apply(lambda x: set_year(x['title']), axis=1)

movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story (1995),1995
1,2,Jumanji (1995),1995
2,3,Grumpier Old Men (1995),1995
3,4,Waiting to Exhale (1995),1995
4,5,Father of the Bride Part II (1995),1995


In [10]:
def set_year_group(year):
    '''Groups the years through 0-5.'''
    if (year < 1900): return 0
    elif (1900 <= year <= 1975): return 1
    elif (1976 <= year <= 1995): return 2
    elif (1996 <= year <= 2003): return 3
    elif (2004 <= year <= 2009): return 4
    elif (2010 <= year): return 5
    else: return 0
    
movies['year_group'] = movies.apply(lambda x: set_year_group(x['year']), axis=1)

movies.drop(['title','year'], axis = 1, inplace=True)

# group years and rating counts to reduce the scale and increase the similarity calculation
agg_movies_rat = ratings_data.groupby(['movieId']).agg({'rating': [np.size, np.mean]}).reset_index()
del ratings_data

agg_movies_rat.columns = ['movieId','rating_counts', 'rating_mean']

agg_movies_rat.head()

Unnamed: 0,movieId,rating_counts,rating_mean
0,1,68469.0,3.886649
1,2,27143.0,3.246583
2,3,15585.0,3.173981
3,4,2989.0,2.87454
4,5,15474.0,3.077291


In [11]:
def set_rating_group(rating_counts):
    '''Group rating counts.'''
    if (rating_counts <= 1): return 0
    elif (2 <= rating_counts <= 10): return 1
    elif (11 <= rating_counts <= 100): return 2
    elif (101 <= rating_counts <= 1000): return 3
    elif (1001 <= rating_counts <= 5000): return 4
    elif (5001 <= rating_counts): return 5
    else: return 0
    
agg_movies_rat['rating_group'] = agg_movies_rat.apply(lambda x: set_rating_group(x['rating_counts']), axis=1)

agg_movies_rat.drop('rating_counts', axis = 1, inplace=True)

mov_rating_df = movies.merge(agg_movies_rat, left_on='movieId', right_on='movieId', how='left')
del movies, agg_movies_rat

mov_rating_df = mov_rating_df.fillna(0)

mov_rating_df.head()

Unnamed: 0,movieId,year_group,rating_mean,rating_group
0,1,2,3.886649,5.0
1,2,2,3.246583,5.0
2,3,2,3.173981,5.0
3,4,2,2.87454,4.0
4,5,2,3.077291,5.0


In [12]:
# deletes all movies with year <= 2003 due to memory limit

# print(mov_rating_df.head())
rm_index = mov_rating_df[mov_rating_df['year_group'] > 4]
rm_index.drop(['year_group','rating_mean','rating_group'], axis = 1, inplace=True)
rm_index.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,movieId
14721,73268
14731,73319
14732,73321
14804,73744
14844,73929


In [13]:
movies_data = movies_data.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
movies_data.to_csv('./data/movies_data.csv')

In [14]:
mov_tag_df = mov_tag_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
print(mov_tag_df.head())
mov_tag_df = mov_tag_df.set_index('movieId')
mov_tag_df.head()

   movieId        1       2        3        4        5        6        7  \
0    73268  0.02350  0.0205  0.11250  0.04500  0.06550  0.06225  0.02150   
1    73319  0.03400  0.0305  0.03325  0.05650  0.03800  0.02600  0.01125   
2    73321  0.02975  0.0255  0.06225  0.03825  0.05175  0.04500  0.02350   
3    73744  0.00000  0.0000  0.00000  0.00000  0.00000  0.00000  0.00000   
4    73929  0.03275  0.0280  0.01825  0.02725  0.03650  0.01700  0.01350   

         8        9  ...     1119     1120     1121     1122     1123  \
0  0.10575  0.07600  ...  0.04125  0.01850  0.01275  0.01025  0.16325   
1  0.03400  0.01025  ...  0.03275  0.02475  0.00850  0.01375  0.25450   
2  0.10325  0.02500  ...  0.07950  0.03900  0.01225  0.02425  0.15700   
3  0.00000  0.00000  ...  0.00000  0.00000  0.00000  0.00000  0.00000   
4  0.04925  0.02650  ...  0.05925  0.02025  0.01350  0.01075  0.34825   

      1124     1125     1126     1127     1128  
0  0.04375  0.02300  0.00975  0.55525  0.18725  
1  0.0

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
73268,0.0235,0.0205,0.1125,0.045,0.0655,0.06225,0.0215,0.10575,0.076,0.11025,...,0.04125,0.0185,0.01275,0.01025,0.16325,0.04375,0.023,0.00975,0.55525,0.18725
73319,0.034,0.0305,0.03325,0.0565,0.038,0.026,0.01125,0.034,0.01025,0.0235,...,0.03275,0.02475,0.0085,0.01375,0.2545,0.07175,0.012,0.0055,0.106,0.0165
73321,0.02975,0.0255,0.06225,0.03825,0.05175,0.045,0.0235,0.10325,0.025,0.087,...,0.0795,0.039,0.01225,0.02425,0.157,0.13,0.01675,0.0125,0.179,0.04475
73744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73929,0.03275,0.028,0.01825,0.02725,0.0365,0.017,0.0135,0.04925,0.0265,0.014,...,0.05925,0.02025,0.0135,0.01075,0.34825,0.071,0.02225,0.006,0.449,0.2285


In [15]:
mov_genres_df = mov_genres_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
print(mov_genres_df.head())
mov_genres_df = mov_genres_df.set_index('movieId')
mov_genres_df.head()

   movieId  Action  Adventure  Animation  Children  Comedy  Crime  \
0    73268       1          0          0         0       0      0   
1    73319       0          0          0         0       1      0   
2    73321       1          1          0         0       0      0   
3    73744       0          0          0         0       0      0   
4    73929       1          0          0         0       0      0   

   Documentary  Drama  Fantasy  Film-Noir  Horror  Musical  Mystery  Romance  \
0            0      1        0          0       1        0        0        0   
1            0      0        0          0       0        0        0        1   
2            0      1        0          0       0        0        0        0   
3            0      1        0          0       0        1        0        1   
4            0      0        1          0       1        0        0        0   

   Sci-Fi  Thriller  War  Western  (no genres listed)  
0       0         1    0        0               

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
73268,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0
73319,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
73321,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
73744,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0
73929,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0


In [16]:
mov_rating_df = mov_rating_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
print(mov_rating_df.head())
mov_rating_df = mov_rating_df.set_index('movieId')
# mov_rating_df.drop(rm_index_p, inplace=True)
mov_rating_df.head()

   movieId  year_group  rating_mean  rating_group
0    73268           5     3.211313           4.0
1    73319           5     3.312629           3.0
2    73321           5     3.455823           5.0
3    73744           5     2.807692           2.0
4    73929           5     2.625275           3.0


Unnamed: 0_level_0,year_group,rating_mean,rating_group
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
73268,5,3.211313,4.0
73319,5,3.312629,3.0
73321,5,3.455823,5.0
73744,5,2.807692,2.0
73929,5,2.625275,3.0


In [17]:
# rm_index.drop(['year_group','rating_mean','rating_group'], axis = 1, inplace=True)
# print(rm_index)

In [18]:
# # mov_tag_df = pd.merge(mov_tag_df, mov_rating_df, how='inner', on=['movieId'])
# rm_index.drop(['year_group','rating_mean','rating_group'], axis = 1, inplace=True)

# mov_tag_df = mov_tag_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
# mov_tag_df.head()

# rm_index.drop(['year_group','rating_mean','rating_group'], axis = 1, inplace=True)

In [19]:
# mov_tag_df = mov_tag_df.merge(rm_index, left_on='movieId', right_on='movieId', how='right')
# mov_tag_df = mov_tag_df.set_index('movieId')

In [20]:
# movies matrix

# set "movieId" field as index in the dfs
# mov_tag_df = mov_tag_df.merge(rm_index_id, left_on='movieId', right_on='movieId', how='right')
# mov_tag_df = mov_tag_df.set_index('movieId')

# mov_tag_df = mov_tag_df.merge(rm_index_id, left_on='movieId', right_on='movieId', how='right')
# mov_genres_df = mov_genres_df.set_index('movieId')

# mov_tag_df = mov_tag_df.merge(rm_index_id, left_on='movieId', right_on='movieId', how='right')
# mov_rating_df = mov_rating_df.set_index('movieId')

#cosine similarity for mov_tag_df
cos_tag = cosine_similarity(mov_tag_df.values)*0.5

#cosine similarity for mov_genres_df
cos_genres = cosine_similarity(mov_genres_df.values)*0.25
del mov_genres_df

#cosine similarity for mov_rating_df
cos_rating = cosine_similarity(mov_rating_df.values)*0.25
del mov_rating_df

#mix
cos = cos_tag+cos_genres+cos_rating

cols = mov_tag_df.index.values
inx = mov_tag_df.index
movies_sim = pd.DataFrame(cos, columns=cols, index=inx)
movies_sim.to_csv('./data/movies_sim.csv')
movies_sim.head()

del mov_tag_df, cols, inx

In [21]:
# create movie_similarity df

def get_similar(movieId):
    df = movies_sim.loc[movies_sim.index == movieId].reset_index(). \
            melt(id_vars='movieId', var_name='sim_moveId', value_name='relevance'). \
            sort_values('relevance', axis=0, ascending=False)[1:6]
    return df
    
# create empty df
movies_similarity = pd.DataFrame(columns=['movieId','sim_moveId','relevance'])

In [22]:
#  for each movie find the 5 most similar movies

for x in movies_sim.index.tolist():
    movies_similarity = movies_similarity.append(get_similar(x))
    
movies_similarity.head()

movies_similarity.to_csv('./data/movies_similarity.csv')

In [23]:
# check recommendations

def movie_recommender(movieId):
    df = movies_sim.loc[movies_sim.index == movieId].reset_index(). \
            melt(id_vars='movieId', var_name='sim_moveId', value_name='relevance'). \
            sort_values('relevance', axis=0, ascending=False)[1:6]
    df['sim_moveId'] = df['sim_moveId'].astype(int)
    sim_df = movies_data.merge(df, left_on='movieId', right_on='sim_moveId', how='inner'). \
                sort_values('relevance', axis=0, ascending=False). \
                loc[: , ['movieId_y','title','genres']]. \
                rename(columns={ 'movieId_y': "movieId" })
    return sim_df

In [24]:
#get recommendation for Inception

movie_recommender(79132)

Unnamed: 0,movieId,title,genres
0,79132,Source Code (2011),Action|Drama|Mystery|Sci-Fi|Thriller
2,79132,Predestination (2014),Action|Mystery|Sci-Fi|Thriller
3,79132,Sherlock: The Abominable Bride (2016),Action|Crime|Drama|Mystery|Thriller
1,79132,Coherence (2013),Drama|Mystery|Sci-Fi|Thriller
4,79132,Black Mirror: White Christmas (2014),Drama|Horror|Mystery|Sci-Fi|Thriller


In [27]:
movie_recommender(86911)

Unnamed: 0,movieId,title,genres
0,86911,"Hangover Part III, The (2013)",Comedy
3,86911,Ted 2 (2015),Comedy
4,86911,Neighbors 2: Sorority Rising (2016),Comedy
2,86911,Dumb and Dumber To (2014),Comedy
1,86911,Anchorman 2: The Legend Continues (2013),Comedy
