In [1]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Reload imported modules automatically so edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use the fully-qualified option key: the bare 'precision' pattern matches more
# than one option on recent pandas releases and raises OptionError there.
pd.set_option('display.precision', 2)

from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Directory containing the CSV files; get_data() below reads from it.
data_path = '../ml-latest'
# List the CSV files that are available in that directory.
all_files = [name for name in os.listdir(data_path) if name.endswith('.csv')]
print(all_files)

['genome-tags.csv', 'movies.csv', 'genome-scores.csv', 'tags.csv', 'ratings.csv', 'movies_clean.csv', 'links.csv']


In [4]:
def get_data(fn):
    """Read ``<data_path>/<fn>.csv`` into a DataFrame and print its row count.

    Parameters
    ----------
    fn : str
        File name without the ``.csv`` extension, relative to ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    csv_path = os.path.join(data_path, '{}.csv'.format(fn))
    frame = pd.read_csv(csv_path)
    print('{} total: {} rows.'.format(fn, len(frame)))
    return frame

In [5]:
movies = get_data('movies_clean')
# Discard the 'Unnamed: 0' / 'index' columns if present (presumably index
# columns left over from an earlier CSV export -- confirm).
movies = movies.loc[:, ~movies.columns.isin(['Unnamed: 0', 'index'])]
movies.head()

movies_clean total: 45843 rows.


Unnamed: 0,movieId,title,genres,year,Sci-Fi,Thriller,Adventure,Drama,Musical,Fantasy,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
0,1,toy story,[0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1],1995,0,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,jumanji,[0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0],1995,0,0,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,grumpier old men,[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1],1995,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,waiting to exhale,[0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1],1995,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,father of the bride part ii,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1],1995,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.19,0.0,0.0,0.0,0.0,0.0


In [6]:
# Row count of the loaded frame, taken before the dropna further down.
n_movies = movies.shape[0]

In [7]:
# Every column that is not one of the metadata columns listed here is used as
# a feature ("tag") column for the similarity computation.
tags = [
    column
    for column in movies.columns
    if column not in ('Unnamed: 0', 'level_0', 'index', 'movieId',
                      'title', 'genres', 'year', 'total_ratings')
]

## Similarity

In [8]:
# Drop movies with a missing value in any tag column, then renumber the index
# 0..n-1 so that index labels equal row positions in the matrices built from
# this frame. Reassigning instead of inplace=True keeps the cell re-runnable.
movies = movies.dropna(subset=tags).reset_index(drop=True)

In [9]:
from scipy import sparse
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# ndarray and works on old and new pandas alike.
movies_sparse = sparse.csr_matrix(movies[tags].values)

In [10]:
# Hand-rolled cosine similarity on the sparse matrix, used in place of
# sklearn's cosine_similarity (original note: "sklearn dies").

from sklearn.preprocessing import normalize


def cosine_similarities(m):
    """Return the pairwise cosine similarities between the rows of ``m``.

    ``m`` is transposed so each original row becomes a column, every column is
    scaled by ``normalize`` along axis 0, and the product of the normalised
    matrix with its own transpose is returned (one row/column per row of ``m``).
    """
    features_by_item = m.T.tocsc()
    unit_columns = normalize(features_by_item, axis=0)
    return unit_columns.T * unit_columns


similarities = cosine_similarities(movies_sparse)

In [11]:
# Zero the diagonal so a movie is never returned as its own best match.
# setdiag does this in a single call instead of one sparse item assignment
# per row from a Python loop.
similarities.setdiag(0)

In [12]:
# Rows of `similarities` (and the value argmax() returns) are positional,
# following the row order of `movies` when the sparse matrix was built.
# movies.index holds labels, which stop matching positions as soon as dropna
# removes a row, so the idx lookups are keyed on position instead.
movie_id_to_idx_lookup = {
    movie_id: position
    for position, movie_id in enumerate(movies.movieId.values)
}
movie_idx_to_id_lookup = dict(enumerate(movies.movieId.values))
movie_id_to_title_lookup = dict(zip(movies.movieId.values, movies.title.values))

In [13]:
def get_similarities_for_movie(movie_id):
    """Print the single most similar movie for ``movie_id``.

    Looks up the movie's row in ``similarities``, takes the column with the
    highest score, and prints both titles/ids together with that score.
    Raises ``KeyError`` if ``movie_id`` is not in the lookup tables.
    """
    row = similarities[movie_id_to_idx_lookup[movie_id]]
    best_match_id = movie_idx_to_id_lookup[row.argmax()]
    best_score = row.max()
    message = 'For {} ({}) we suggest {} ({}) with similarity rating {:.2f}.'
    print(message.format(
        movie_id_to_title_lookup[movie_id],
        movie_id,
        movie_id_to_title_lookup[best_match_id],
        best_match_id,
        best_score,
    ))

In [14]:
def get_similarities_for_movie_title(title):
    """Print the most similar movie for the movie whose title equals ``title``.

    The match is an exact string comparison against ``movies.title``; when
    several rows share the title, the first one is used.

    Raises
    ------
    IndexError
        If no movie has exactly this title (same exception type as before,
        now with a message naming the title).
    """
    matching_ids = movies.loc[movies.title == title, 'movieId']
    if matching_ids.empty:
        raise IndexError('No movie with title {!r} found.'.format(title))
    get_similarities_for_movie(matching_ids.iloc[0])

In [15]:
# Show the row for movieId 1089.
movies.loc[movies.movieId == 1089]

Unnamed: 0,movieId,title,genres,year,Sci-Fi,Thriller,Adventure,Drama,Musical,Fantasy,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
1067,1089,reservoir dogs,[0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0],1992,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.0,0.0


In [16]:
# Resolve the matrix row from the movie id instead of hard-coding 1067, the
# index value displayed for movie 1089 in the previous cell.
print(similarities[movie_id_to_idx_lookup[1089]].argmax())

2874


In [17]:
# NOTE(review): 31443 is a hard-coded row position and differs from the argmax
# (2874) printed by the cell above -- confirm which row was meant to be shown.
movies.iloc[31443]

movieId                                                  136990
title                                       the perfect student
genres                [0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0]
year                                                       2011
Sci-Fi                                                        0
Thriller                                                      1
Adventure                                                     0
Drama                                                         0
Musical                                                       0
Fantasy                                                       0
Western                                                       0
Crime                                                         1
(no genres listed)                                            0
Romance                                                       0
Children                                                      0
Mystery                                 

In [29]:
# Example: look up the best match by exact title string.
get_similarities_for_movie_title('in the mood for love')

For in the mood for love (4144) we suggest jane eyre (6983) with similarity rating 0.81.
