In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
!ls ml-latest-small

README.txt  links.csv   movies.csv  ratings.csv tags.csv


In [6]:
df_links = pd.read_csv('ml-latest-small/links.csv')
df_movies = pd.read_csv('ml-latest-small/movies.csv')
df_ratings = pd.read_csv('ml-latest-small/ratings.csv')
df_tags = pd.read_csv('ml-latest-small/tags.csv')

In [7]:
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [8]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


#### Convert movie lookup to dictionary

In [229]:
np.asarray(df_movies['movieId'])

array([     1,      2,      3, ..., 193585, 193587, 193609])

In [265]:
movies_dict = dict(zip(np.asarray(df_movies['movieId']),np.asarray(df_movies['title'])))
movies_dict

{1: 'Toy Story (1995)',
 2: 'Jumanji (1995)',
 3: 'Grumpier Old Men (1995)',
 4: 'Waiting to Exhale (1995)',
 5: 'Father of the Bride Part II (1995)',
 6: 'Heat (1995)',
 7: 'Sabrina (1995)',
 8: 'Tom and Huck (1995)',
 9: 'Sudden Death (1995)',
 10: 'GoldenEye (1995)',
 11: 'American President, The (1995)',
 12: 'Dracula: Dead and Loving It (1995)',
 13: 'Balto (1995)',
 14: 'Nixon (1995)',
 15: 'Cutthroat Island (1995)',
 16: 'Casino (1995)',
 17: 'Sense and Sensibility (1995)',
 18: 'Four Rooms (1995)',
 19: 'Ace Ventura: When Nature Calls (1995)',
 20: 'Money Train (1995)',
 21: 'Get Shorty (1995)',
 22: 'Copycat (1995)',
 23: 'Assassins (1995)',
 24: 'Powder (1995)',
 25: 'Leaving Las Vegas (1995)',
 26: 'Othello (1995)',
 27: 'Now and Then (1995)',
 28: 'Persuasion (1995)',
 29: 'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
 30: 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 31: 'Dangerous Minds (1995)',
 32: 'Twelve Monkeys (a.k.a. 12 Monkeys) (199

In [261]:
movies_dict.values()

dict_values([array([     1,      2,      3, ..., 193585, 193587, 193609]), array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       ..., 'Flint (2017)', 'Bungo Stray Dogs: Dead Apple (2018)',
       'Andrew Dice Clay: Dice Rules (1991)'], dtype=object)])

In [9]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [14]:
df_ratings['movieId'].value_counts()

356       329
318       317
296       307
593       279
2571      278
         ... 
5986        1
100304      1
34800       1
83976       1
8196        1
Name: movieId, Length: 9724, dtype: int64

In [15]:
df = df_ratings[['userId','movieId','rating']]
df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


### Compute similarity

In [211]:
from sklearn.impute import SimpleImputer 

def get_sim_matrix(df_matrix):
    
    # impute missing values with mean
    
    imputer=SimpleImputer(missing_values=np.nan,strategy='mean')
    
    df_matrix_normed = pd.DataFrame(imputer.fit_transform(df_matrix.T)).T
    df_matrix_normed.columns=df_matrix.columns
    df_matrix_normed.index=df_matrix.index
    df_matrix_normed.head()
    
    # obtain similarity
    sim_matrix = np.dot(df_matrix_normed, df_matrix_normed.T)
    df_ms_normed.index=df_ms.index
    
    # convert to dataframe
    df_sim = pd.DataFrame(sim_matrix, index = df_matrix.index, columns = df_matrix.index )
    
    return df_sim
    

### Compute user-user similarity

In [213]:
df_matrix_user = df.pivot_table(index=['userId'],columns=['movieId'],values='rating')
df_matrix_user.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [214]:
df_user_sim = get_sim_matrix(df_matrix_user)
df_user_sim.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,185538.526531,167638.616974,103425.057858,150975.513889,154396.897727,148326.590229,137151.45355,151770.93938,138454.217766,139202.314778,...,187912.156815,144042.613346,148936.488449,147752.83931,136305.329195,155291.841723,160761.470266,133101.379904,138850.489515,156615.304944
2,167638.616974,151604.463734,93521.494253,136507.157088,139611.637931,134129.728531,124018.830762,137233.324652,125194.677661,125874.483867,...,169916.706982,130250.400128,134680.718763,133607.76,123257.275862,140419.291325,145358.573022,120329.72002,125554.907269,141618.156132
3,103425.057858,93521.494253,57864.380342,84216.462963,86130.693473,82753.393802,76514.166667,84664.474359,77239.130435,77658.428571,...,104830.801599,80356.85812,83099.820499,82429.6,76043.622259,86619.978803,89678.677979,74232.965318,77461.801802,87377.390085
4,150975.513889,136507.157088,84216.462963,123301.901235,125720.737374,120793.607219,111702.125731,123584.716312,112741.062802,113367.837302,...,153013.30363,117310.022222,121303.617179,120316.324444,111006.702363,126461.318834,130905.045752,108339.176695,113066.12012,127528.364183
5,154396.897727,139611.637931,86130.693473,125720.737374,128624.0,123535.667632,114223.178828,126392.665377,115304.347826,115927.3,...,156494.729073,119962.987879,124047.456666,123053.4,113517.139243,129329.763636,133879.091881,110824.09884,115638.402948,130427.286692


### Compute item-item similarity

In [215]:
df_matrix_item = df.pivot_table(index=['movieId'],columns=['userId'],values='rating')
df_matrix_item.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [216]:
df_item_sim = get_sim_matrix(df_matrix_item)
df_item_sim.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,9527.109086,8223.810941,7806.716033,5638.5701,7351.99402,9441.842385,7623.808183,6879.459884,7476.181395,8361.143199,...,8371.186047,7175.302326,9567.069767,9567.069767,8371.186047,9567.069767,8371.186047,8371.186047,8371.186047,9567.069767
2,8223.810941,7268.938017,6836.696678,4934.13961,6438.689935,8265.856506,6672.321128,6019.403409,6543.525568,7319.637913,...,7326.931818,6280.227273,8373.636364,8373.636364,7326.931818,8373.636364,7326.931818,7326.931818,7326.931818,8373.636364
3,7806.716033,6836.696678,6538.05159,4687.980082,6117.578984,7857.729591,6343.768875,5720.776442,6218.21274,6950.99552,...,6959.278846,5965.096154,7953.461538,7953.461538,6959.278846,7953.461538,6959.278846,6959.278846,6959.278846,7953.461538
4,5638.5701,4934.13961,4687.980082,3393.591837,4417.959184,5673.966387,4580.174603,4133.919643,4493.303571,5028.623377,...,5032.5,4313.571429,5751.428571,5751.428571,5032.5,5751.428571,5032.5,5032.5,5032.5,5751.428571
5,7351.99402,6438.689935,6117.578984,4417.959184,5794.040816,7397.407913,5977.384921,5390.714286,5856.580357,6553.360119,...,6557.5,5620.714286,7494.285714,7494.285714,6557.5,7494.285714,6557.5,6557.5,6557.5,7494.285714


#### Find similar movies

In [287]:
def get_top_matches(similar_movies):
    return list(map(lambda x: movies_dict[x],np.asarray(similar_movies.sort_values()[:20].index)))

In [288]:
top_matches = df_item_sim.apply(get_top_matches)

In [295]:
df_top_matches = pd.DataFrame(top_matches,columns=['movie_list'])
df_top_matches.head()

Unnamed: 0_level_0,movie_list
movieId,Unnamed: 1_level_1
1,"[Reptilicus (1961), Alone in the Dark (2005), ..."
2,"[Baby Boy (2001), Killer Shrews, The (1959), F..."
3,"[Wasp Woman, The (1959), Brothers Solomon, The..."
4,"[Haunted House 2, A (2014), Derailed (2002), G..."
5,"[I Know Who Killed Me (2007), Risen (2016), Br..."


In [309]:
import re
df_movies[df_movies['title'].str.contains('star wars',flags=re.IGNORECASE)]

Unnamed: 0,movieId,title,genres
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
898,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
911,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
1979,2628,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Sci-Fi
3832,5378,Star Wars: Episode II - Attack of the Clones (...,Action|Adventure|Sci-Fi|IMAX
5896,33493,Star Wars: Episode III - Revenge of the Sith (...,Action|Adventure|Sci-Fi
6823,61160,Star Wars: The Clone Wars (2008),Action|Adventure|Animation|Sci-Fi
7367,79006,Empire of Dreams: The Story of the 'Star Wars'...,Documentary
8683,122886,Star Wars: Episode VII - The Force Awakens (2015),Action|Adventure|Fantasy|Sci-Fi|IMAX
8908,135216,The Star Wars Holiday Special (1978),Adventure|Children|Comedy|Sci-Fi


In [311]:
df_top_m = pd.DataFrame(df_top_matches.movie_list.values.tolist(), 
                        index = list(map(lambda x: movies_dict[x],df_top_matches.index)))
df_top_m.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Toy Story (1995),Reptilicus (1961),Alone in the Dark (2005),Pokemon 4 Ever (a.k.a. Pokémon 4: The Movie) (...,Captain America (1979),Uncle Nino (2003),Captain America II: Death Too Soon (1979),Anaconda: The Offspring (2008),Glitter (2001),The Star Wars Holiday Special (1978),Legionnaire (1998),The Gracefield Incident (2015),Ben-hur (2016),Baby Boy (2001),While the City Sleeps (1956),Born to Be Wild (1995),Oblivion 2: Backlash (1996),Daddy Day Camp (2007),"Wasp Woman, The (1959)",Indestructible Man (1956),The Pumaman (1980)
Jumanji (1995),Baby Boy (2001),"Killer Shrews, The (1959)",Fullmetal Alchemist 2018 (2017),"Follow Me, Boys! (1966)",Begotten (1990),3 Ninjas Knuckle Up (1995),Derailed (2002),Lionheart (1990),The Pumaman (1980),Captain America II: Death Too Soon (1979),Unforgiven (2013),Amer (2009),Case 39 (2009),In the Name of the King: A Dungeon Siege Tale ...,Horrors of Spider Island (Ein Toter Hing im Ne...,The Beast of Hollow Mountain (1956),Idaho Transfer (1973),Captain America (1979),Saving Christmas (2014),Pokémon Heroes (2003)


In [313]:
df_top_m.loc['Star Wars: Episode VII - The Force Awakens (2015)']

0                                       War Room (2015)
1            The Butterfly Effect 3: Revelations (2009)
2                                       Old Dogs (2009)
3     Boudu Saved From Drowning (Boudu sauvé des eau...
4                               Pearl Jam Twenty (2011)
5                                     Unforgiven (2013)
6                                 Carnival Magic (1981)
7            Carabineers, The (Carabiniers, Les) (1963)
8                          While the City Sleeps (1956)
9                        Anaconda: The Offspring (2008)
10                                        Sorrow (2015)
11                                       Satanic (2016)
12                                Pokémon Heroes (2003)
13                               Wasp Woman, The (1959)
14                                Don't Look Now (1973)
15              Journey 2: The Mysterious Island (2012)
16                   Joe Dirt 2: Beautiful Loser (2015)
17    Maria Bamford: The Special Special Special

In [244]:
np.asarray(row.sort_values()[:20].index)

array([  4371,   3933, 184641,   7312,  26717,   4750,  72424,  26696,
         5700, 102749, 144352,  83601,  76030,  57326,   4051, 125221,
       145724, 102735, 141994,   6371])

### Create small dataset for validation

In [114]:
# data processing - clean up NaNs
df_ms = df_matrix[:5].iloc[:,np.arange(5)]
df_ms

movieId,1,2,3,4,5
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4.0,,4.0,,
2,,,,,
3,,,,,
4,,,,,
5,4.0,,,,


### Valdating impute on small dateset

In [162]:
from sklearn.impute import SimpleImputer 
imputer=SimpleImputer(missing_values=np.nan,strategy='mean')

In [170]:
def impute_values(row):
    
    row_mean = np.around(np.mean(row),2)
    #print("mean:",row_mean)
    
    row_imputed = row.apply(lambda x: row_mean if np.isnan(x) else x)
    #print("row imputed:\n",row_imputed)
    
    return row_imputed

In [171]:
df_ms.apply(impute_values,axis=1)

movieId,1,2,3,4,5
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4.0,5.0,4.0,4.33,4.33
2,3.0,2.5,2.0,2.5,2.5
3,2.0,3.0,5.0,3.0,2.0
4,2.0,3.0,2.0,2.0,1.0
5,4.0,1.0,2.0,3.0,5.0


In [180]:
df_ms_normed = pd.DataFrame(imputer.fit_transform(df_ms.T)).T
df_ms_normed.columns=df_ms.columns
df_ms_normed.index=df_ms.index
df_ms_normed

movieId,1,2,3,4,5
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4.0,5.0,4.0,4.333333,4.333333
2,3.0,2.5,2.0,2.5,2.5
3,2.0,3.0,5.0,3.0,2.0
4,2.0,3.0,2.0,2.0,1.0
5,4.0,1.0,2.0,3.0,5.0


### Apply normalization on the larger dataset

#### Using proprietary impute method

In [179]:
import time

start = time.time()

temp = df_matrix.apply(impute_values,axis=1)
temp.head()

# run your code
end = time.time()

elapsed = end - start

print("Elapsed Time:",elapsed)

Elapsed Time: 6.812793970108032


#### Using impute

In [178]:
import time

start = time.time()

df_matrix_normed = pd.DataFrame(imputer.fit_transform(df_matrix.T)).T
df_matrix_normed.columns=df_matrix.columns
df_matrix_normed.index=df_matrix.index
df_matrix_normed.head()

# run your code
end = time.time()

elapsed = end - start

print("Elapsed Time:",elapsed)

Elapsed Time: 0.23023104667663574
