In [1]:
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv('data/movie_to_recommender.csv')
genome_df = pd.read_csv('data/genome_clean.csv', low_memory = False)

df.head(3)

Unnamed: 0,movieId,title,genres,action,adventure,animation,children,comedy,crime,documentary,...,decade_1920,decade_1930,decade_1940,decade_1950,decade_1960,decade_1970,decade_1980,decade_1990,decade_2000,decade_2010
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [50]:
df.columns

Index(['movieId', 'title', 'action', 'adventure', 'animation', 'children',
       'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'horror', 'imax',
       'musical', 'mystery', 'no_genres', 'film_noir', 'romance', 'scifi',
       'thriller', 'war', 'western', 'num_genres', 'avg_rating', 'num_rating',
       'decade_1890', 'decade_1900', 'decade_1910', 'decade_1920',
       'decade_1930', 'decade_1940', 'decade_1950', 'decade_1960',
       'decade_1970', 'decade_1980', 'decade_1990', 'decade_2000',
       'decade_2010'],
      dtype='object')

In [4]:
df = df.drop(columns = 'genres')

In [5]:
genome_df.head(3)

Unnamed: 0,tagId,tag,movieId,relevance
0,1,7,1,0.025
1,1,7,2,0.03975
2,1,7,3,0.0435


### Genome Transformation

In [7]:
genome_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11709768 entries, 0 to 11709767
Data columns (total 4 columns):
tagId        int64
tag          object
movieId      int64
relevance    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 357.4+ MB


In [8]:
genome_pivot_df = pd.pivot_table(genome_df, values = 'relevance' , index = 'movieId', columns = 'tag')

In [9]:
genome_pivot_df.head(3)

tag,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.025,0.025,0.05775,0.09675,0.14675,0.217,0.067,0.26275,0.262,0.032,...,0.0395,0.018,0.04575,0.03275,0.125,0.0415,0.01925,0.03625,0.07775,0.023
2,0.03975,0.04375,0.03775,0.048,0.11025,0.0725,0.04775,0.10975,0.09925,0.0205,...,0.04175,0.01925,0.01725,0.02425,0.1255,0.0225,0.0155,0.01475,0.09025,0.01875
3,0.0435,0.05475,0.028,0.077,0.054,0.0685,0.056,0.185,0.04925,0.02675,...,0.0415,0.02675,0.02775,0.03425,0.1555,0.03675,0.017,0.0195,0.097,0.0185


In [22]:
# left joining the genome features to the movies dataframe
df_rec = pd.merge(left = df, 
                right = genome_pivot_df, 
                left_on=df['movieId'], 
                right_on = genome_pivot_df.index,
                how = 'left').drop('key_0', axis = 1)
df_rec.head(3)

Unnamed: 0,movieId,title,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0.0395,0.018,0.04575,0.03275,0.125,0.0415,0.01925,0.03625,0.07775,0.023
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0.04175,0.01925,0.01725,0.02425,0.1255,0.0225,0.0155,0.01475,0.09025,0.01875
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0.0415,0.02675,0.02775,0.03425,0.1555,0.03675,0.017,0.0195,0.097,0.0185


In [23]:
# Exporting data for PCA segementation
df_rec.to_csv('data/movies_to_pca.csv', index = False)

In [24]:
# Tranforming the data, getting ready for pairwise distance
df_rec2 = df_rec
df_rec2.index = df_rec2['title']
df_rec2 = df_rec2.drop(columns = ['title','movieId'])
df_rec2.head(3)

Unnamed: 0_level_0,action_x,adventure_x,animation_x,children_x,comedy_x,crime_x,documentary_x,drama_x,fantasy_x,horror_x,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),0,1,1,1,1,0,0,0,1,0,...,0.0395,0.018,0.04575,0.03275,0.125,0.0415,0.01925,0.03625,0.07775,0.023
Jumanji (1995),0,1,0,1,0,0,0,0,1,0,...,0.04175,0.01925,0.01725,0.02425,0.1255,0.0225,0.0155,0.01475,0.09025,0.01875
Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,...,0.0415,0.02675,0.02775,0.03425,0.1555,0.03675,0.017,0.0195,0.097,0.0185


### Creating Recommendation Matrix

In [13]:
# transforming the data to sparse matrix
df_sparse = sparse.csr_matrix(df_rec.fillna(0))

In [14]:
recommender = pairwise_distances(df_sparse, metric = 'cosine')

In [15]:
# Checking the shape to see the alignment of movies with other movies
recommender.shape

(27278, 27278)

In [32]:
# Creating a dataframe for the recommender
recommender_df = pd.DataFrame(recommender, index = df_rec2.index, columns = df_rec2.index)
recommender_df.head()

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Ants in the Pants (2000),Werner - Gekotzt wird später (2003),Brother Bear 2 (2006),No More School (2000),Forklift Driver Klaus: The First Day on the Job (2001),Kein Bund für's Leben (2007),"Feuer, Eis & Dosenbier (2002)",The Pirates (2014),Rentun Ruusu (2001),Innocence (2014)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),0.0,0.16387,0.266495,0.30795,0.382137,0.356708,0.420415,0.44038,0.589286,0.465547,...,0.908978,0.908978,0.908965,0.908982,0.908978,0.908982,0.908982,0.908986,0.908985,0.908974
Jumanji (1995),0.16387,0.0,0.238749,0.275358,0.298849,0.352793,0.339898,0.307241,0.430818,0.355202,...,0.76325,0.76325,0.763239,0.763253,0.76325,0.763253,0.763253,0.763256,0.763255,0.763245
Grumpier Old Men (1995),0.266495,0.238749,0.0,0.126189,0.10794,0.226992,0.146397,0.223945,0.291172,0.240743,...,0.575016,0.575018,0.575011,0.57502,0.575018,0.57502,0.57502,0.575026,0.575024,0.575017
Waiting to Exhale (1995),0.30795,0.275358,0.126189,0.0,0.163698,0.229895,0.108277,0.174824,0.257257,0.215557,...,0.482157,0.482158,0.482149,0.482161,0.482158,0.482161,0.482161,0.482166,0.482165,0.482156
Father of the Bride Part II (1995),0.382137,0.298849,0.10794,0.163698,0.0,0.271653,0.10537,0.165924,0.199733,0.197438,...,0.400671,0.400671,0.400668,0.400672,0.400671,0.400672,0.400672,0.400677,0.400676,0.400671


In [74]:
# Creating a formula to search through the recommendation engine and provide the top 10 picks

def top_ten_recommendations(search, df):
    for title in df[df['title'].str.lower().str.contains(search.lower())]['title'].values:
        print(title)
        print(" ")
        print(recommender_df[title].sort_values()[1:11])
        print(" ")
        print(" ")

In [75]:
print('Average Rating', df[df['title'] == 'Ip Man (2008)']['avg_rating']);

Average Rating 13327    3.973447
Name: avg_rating, dtype: float64


In [77]:
top_ten_recommendations('napoleon', df_rec)

Napoleon and Samantha (1972)
 
title
Male and Female (1919)                                                                           1.258250e-07
Born to Win (1971)                                                                               1.395214e-07
Penitentiary (1979)                                                                              1.444057e-07
Sacco and Vanzetti (Sacco e Vanzetti) (1971)                                                     1.465266e-07
Greaser's Palace (1972)                                                                          1.497694e-07
Empire of Passion (a.k.a. In the Realm of Passion) (a.k.a. Phantom Love) (Ai No Borei) (1978)    1.544952e-07
Prince of Central Park, The (1999)                                                               1.545055e-07
Running Free (1999)                                                                              1.861363e-07
Earthling, The (1980)                                                              