In [74]:
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer

 - Movies Genres Feature Engineering
 - Genome Transformations
 - Combining the data
 - Recommendation matrix

In [77]:
# Load the cleaned movie metadata and the cleaned tag-genome scores
movies_df, genome_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('ml-20m/movies_cleaned.csv', 'ml-20m/genome_clean.csv')
)
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


#### Movies Genres Feature Engineering

In [107]:
# Create a decade feature for every movie, instead of a year

# Floor a year down to the first year of its decade
def to_decade(num):
    """Return the decade that contains the year `num`, as a string.

    Parameters
    ----------
    num : int, float or numeric str
        A calendar year, e.g. 1995, 1995.0 or '1995'.

    Returns
    -------
    str
        The first year of the decade, e.g. '1990'. Missing values (None / NaN)
        are returned unchanged so they do not turn into a bogus decade label.

    Raises
    ------
    ValueError
        If `num` is a string that cannot be read as a number.
    """
    if pd.isna(num):
        return num
    # Floor numerically instead of slicing the first three characters, which
    # is only correct for four-digit integer years.
    return str(int(float(num)) // 10 * 10)

# Label every movie with the decade derived from its release year
movies_df['decade'] = [to_decade(release_year) for release_year in movies_df['year']]
movies_df.head(3)

Unnamed: 0,movieId,title,genres,year,decade
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,1990
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,1990
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,1990


In [123]:
# Creating dummy variables from decade.
# The dtype is pinned so the indicators are 0/1 integers on every pandas version;
# pandas >= 2.0 returns boolean columns by default, which would no longer match
# the numeric features this frame is combined with later on.
movies_df = pd.get_dummies(movies_df, columns=['decade'], drop_first=False, dtype='uint8')
movies_df.head(3)

Unnamed: 0,movieId,title,genres,year,decade_1890,decade_1900,decade_1910,decade_1920,decade_1930,decade_1940,decade_1950,decade_1960,decade_1970,decade_1980,decade_1990,decade_2000,decade_2010
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,0,0,0,0,0,0,0,0,0,0,1,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,0,0,0,0,0,0,0,0,0,0,1,0,0


In [125]:
# The raw year is no longer needed now that the decade indicators exist
movies_df = movies_df.drop(columns = ['year'])

In [126]:
# Create count features from the tokens of each movie's genres string

# Instantiate the count vectorizer.
# NOTE(review): with the word analyzer and no custom tokenizer, hyphenated and
# multi-word genres are split into separate tokens (the resulting columns include
# 'sci'/'fi', 'film'/'noir' and 'no'/'genres'/'listed'), and a few year-like
# tokens such as '1970' also appear - confirm the genres column is clean.
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None)

# Apply the count vectorizer to the movie genres column
cat_vector = vectorizer.fit_transform(movies_df['genres']).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back only on older releases that do not provide it.
if hasattr(vectorizer, 'get_feature_names_out'):
    genre_tokens = vectorizer.get_feature_names_out()
else:
    genre_tokens = vectorizer.get_feature_names()

# Wrap the token counts in a data frame, one column per token
cat_vector_df = pd.DataFrame(cat_vector, columns = genre_tokens)
cat_vector_df.shape

(27278, 34)

In [130]:
# Join the genre token counts to the movies row by row (both frames share the same
# row positions). Both sides carry a 'genres' column, so the merge suffixes them:
# the original genres text ('genres_x') and the helper join key ('key_0') are
# discarded, while the token count column survives as 'genres_y'.
movies_vecrt_df = (
    pd.merge(movies_df, cat_vector_df, on = movies_df.index)
    .drop(columns = ['key_0', 'genres_x'])
)


In [134]:
# Preview the first three rows of the merged movie / genre-token frame
movies_vecrt_df.head(3)

Unnamed: 0,movieId,title,decade_1890,decade_1900,decade_1910,decade_1920,decade_1930,decade_1940,decade_1950,decade_1960,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,1,Toy Story (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [135]:
# List every column to check which genre tokens the vectorizer produced
movies_vecrt_df.columns

Index(['movieId', 'title', 'decade_1890', 'decade_1900', 'decade_1910',
       'decade_1920', 'decade_1930', 'decade_1940', 'decade_1950',
       'decade_1960', 'decade_1970', 'decade_1980', 'decade_1990',
       'decade_2000', 'decade_2010', '1970', '1990', '1991', '1993', '2002',
       '2003', '2008', '2010', '2011', '2014', 'action', 'adventure',
       'animation', 'children', 'comedy', 'crime', 'documentary', 'drama',
       'fantasy', 'fi', 'film', 'genres_y', 'horror', 'imax', 'listed',
       'musical', 'mystery', 'no', 'noir', 'romance', 'sci', 'thriller', 'war',
       'western'],
      dtype='object')

In [139]:
# Rename address scf-fi movies as the category was split up

movies_vecrt_df = movies_vecrt_df.drop(columns = 'fi', axis = 1)
movies_vecrt_df = movies_vecrt_df.rename(columns = {'sci': 'scifi'})

In [141]:
# Preview the frame again after the genre column clean-up
movies_vecrt_df.head(3)

Unnamed: 0,movieId,title,decade_1890,decade_1900,decade_1910,decade_1920,decade_1930,decade_1940,decade_1950,decade_1960,...,listed,musical,mystery,no,noir,romance,scifi,thriller,war,western
0,1,Toy Story (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Genome Transformations

In [142]:
# Preview the first two rows of the tag-genome scores
genome_df.head(2)

Unnamed: 0,tagId,tag,movieId,relevance
0,1,7,1,0.025
1,1,7,2,0.03975


In [143]:
# Summarise the genome frame: row count, column dtypes and memory footprint
genome_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11709768 entries, 0 to 11709767
Data columns (total 4 columns):
tagId        int64
tag          object
movieId      int64
relevance    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 357.4+ MB


In [144]:
# Reshape the long genome table to wide form: one row per movieId, one column per
# tag, each cell holding the relevance score (averaged by pivot_table's default
# aggregation if a movie/tag pair appears more than once).
genome_pivot_df = genome_df.pivot_table(index = 'movieId',
                                        columns = 'tag',
                                        values = 'relevance')

In [145]:
# Preview the first three movies of the wide tag-relevance table
genome_pivot_df.head(3)

tag,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,19th century,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.025,0.025,0.05775,0.09675,0.14675,0.217,0.067,0.26275,0.262,0.032,...,0.0395,0.018,0.04575,0.03275,0.125,0.0415,0.01925,0.03625,0.07775,0.023
2,0.03975,0.04375,0.03775,0.048,0.11025,0.0725,0.04775,0.10975,0.09925,0.0205,...,0.04175,0.01925,0.01725,0.02425,0.1255,0.0225,0.0155,0.01475,0.09025,0.01875
3,0.0435,0.05475,0.028,0.077,0.054,0.0685,0.056,0.185,0.04925,0.02675,...,0.0415,0.02675,0.02775,0.03425,0.1555,0.03675,0.017,0.0195,0.097,0.0185


In [146]:
# Left join the genome features onto the movies frame so every movie is kept,
# including those without any genome scores.
# The key is the frame's own movieId column matched against the pivot's movieId
# index, rather than a column borrowed from another frame that only lines up
# by row position.
df = pd.merge(left = movies_vecrt_df,
              right = genome_pivot_df,
              left_on = 'movieId',
              right_index = True,
              how = 'left')

In [147]:
# Key every row by its movie title and discard the identifier column,
# leaving only feature columns in the frame
df = df.set_index('title').drop(columns = ['movieId'])
df.head()

Unnamed: 0_level_0,decade_1890,decade_1900,decade_1910,decade_1920,decade_1930,decade_1940,decade_1950,decade_1960,decade_1970,decade_1980,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),0,0,0,0,0,0,0,0,0,0,...,0.0395,0.018,0.04575,0.03275,0.125,0.0415,0.01925,0.03625,0.07775,0.023
Jumanji (1995),0,0,0,0,0,0,0,0,0,0,...,0.04175,0.01925,0.01725,0.02425,0.1255,0.0225,0.0155,0.01475,0.09025,0.01875
Grumpier Old Men (1995),0,0,0,0,0,0,0,0,0,0,...,0.0415,0.02675,0.02775,0.03425,0.1555,0.03675,0.017,0.0195,0.097,0.0185
Waiting to Exhale (1995),0,0,0,0,0,0,0,0,0,0,...,0.0575,0.03375,0.02275,0.03975,0.18525,0.05925,0.015,0.01525,0.0645,0.013
Father of the Bride Part II (1995),0,0,0,0,0,0,0,0,0,0,...,0.0425,0.02825,0.0215,0.026,0.14275,0.02075,0.0165,0.01675,0.1075,0.01825


In [148]:
# Exporting data for segmentation analysis
# NOTE(review): the title now lives in the index and movieId has been dropped, so
# index = False writes the feature rows without any movie identifier - confirm
# that the consumer of this file does not need one.
df.to_csv('ml-20m/movie_features.csv',index = False)

### Creating Recommendation Matrix

In [149]:
# Transform the feature frame to a sparse CSR matrix.
# Missing values are replaced with 0 first; they come from the left join, which
# leaves the tag columns empty for movies that have no genome scores.
df_sparse = sparse.csr_matrix(df.fillna(0))

In [150]:
# Cosine distance between every pair of movies (0 = same direction, larger = less alike).
# NOTE(review): the result is a dense movies x movies array - roughly 6 GB for
# 27,278 movies if stored as float64 - so this cell needs a lot of memory.
recommender = pairwise_distances(df_sparse, metric = 'cosine')

In [151]:
# Check the shape: the matrix should be square, with one row and one column per movie
recommender.shape

(27278, 27278)

In [152]:
# Create a data frame for the recommender, labelling both axes with the movie
# titles so distances can be looked up by name
recommender_df = pd.DataFrame(recommender, index = df.index, columns = df.index)
recommender_df.head()

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Ants in the Pants (2000),Werner - Gekotzt wird später (2003),Brother Bear 2 (2006),No More School (2000),Forklift Driver Klaus: The First Day on the Job (2001),Kein Bund für's Leben (2007),"Feuer, Eis & Dosenbier (2002)",The Pirates (2014),Rentun Ruusu (2001),Innocence (2014)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),0.0,0.222372,0.317415,0.377871,0.335517,0.403743,0.364206,0.284261,0.480431,0.383067,...,0.935196,0.870393,0.770884,0.920632,0.935196,0.920632,0.920632,0.920632,1.0,0.887757
Jumanji (1995),0.222372,0.0,0.352592,0.408555,0.344576,0.495507,0.393309,0.216954,0.38263,0.356511,...,1.0,1.0,0.823329,1.0,1.0,1.0,1.0,0.897999,1.0,0.855749
Grumpier Old Men (1995),0.317415,0.352592,0.0,0.22701,0.146934,0.388472,0.200862,0.320446,0.401178,0.363687,...,0.777392,0.888696,0.921296,0.863681,0.888696,0.863681,0.863681,1.0,1.0,1.0
Waiting to Exhale (1995),0.377871,0.408555,0.22701,0.0,0.264884,0.429322,0.177504,0.314381,0.446998,0.442683,...,0.775617,0.887808,0.920668,0.862594,0.887808,0.862594,0.862594,1.0,1.0,1.0
Father of the Bride Part II (1995),0.335517,0.344576,0.146934,0.264884,0.0,0.483961,0.217024,0.352655,0.427506,0.421202,...,0.901292,0.901292,0.930203,0.879108,0.901292,0.879108,0.879108,1.0,1.0,1.0


In [162]:
# Search the recommendation matrix and print the closest movies for each matching title

def top_ten_recommendations(search, n = 10):
    """Print the `n` most similar movies for every title that contains `search`.

    Parameters
    ----------
    search : str
        Text to look for in the movie titles. It is matched literally (not as a
        regular expression) and without regard to case.
    n : int, default 10
        Number of recommendations printed for each matching title.

    Returns
    -------
    None
        Results are printed. The function reads the module-level `movies_df`
        (titles) and `recommender_df` (title-by-title distance matrix).
    """
    # regex=False: titles contain characters such as '(' and '*' that would otherwise
    # be interpreted as a pattern; na=False: a missing title never matches.
    matches = movies_df['title'].str.lower().str.contains(search.lower(),
                                                          regex = False,
                                                          na = False)
    for title in movies_df.loc[matches, 'title'].values:
        print(title)
        print(" ")
        distances = recommender_df[title]
        # A title shared by several movies selects several columns; use the first one
        if isinstance(distances, pd.DataFrame):
            distances = distances.iloc[:, 0]
        # Remove the movie itself by label instead of skipping the first sorted entry:
        # movies with identical feature vectors also sit at distance 0, so the movie
        # itself is not guaranteed to sort first.
        print(distances.drop(labels = title).sort_values().iloc[:n])
        print(" ")
        print(" ")

In [160]:
# Example: recommendations for every title containing "ip man"
top_ten_recommendations('ip man')

Ip Man (2008)
 
title
Jet Li's Fearless (Huo Yuan Jia) (2006)                    0.053770
Fist of Legend (Jing wu ying xiong) (1994)                 0.081450
Ip Man 2 (2010)                                            0.105720
Mongol (2007)                                              0.109614
Once Upon a Time in China (Wong Fei Hung) (1991)           0.114487
Dragon: The Bruce Lee Story (1993)                         0.118179
Last Samurai, The (2003)                                   0.121621
Hero (Ying xiong) (2002)                                   0.126130
Gladiator (2000)                                           0.130256
Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)    0.135393
Name: Ip Man (2008), dtype: float64
 
 
Ip Man 2 (2010)
 
title
Jet Li's Fearless (Huo Yuan Jia) (2006)                      0.097311
Ip Man (2008)                                                0.105720
Fist of Legend (Jing wu ying xiong) (1994)                   0.108415
Legend of Drunken Master