In [56]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('precision',2)

from sklearn.metrics.pairwise import cosine_similarity

In [58]:
data_path = '../ml-latest'
all_files = [i for i in os.listdir(data_path) if i.endswith('.csv')]
print(all_files)

['genome-tags.csv', 'movies.csv', 'genome-scores.csv', 'tags.csv', 'ratings.csv', 'movies_clean.csv', 'links.csv']


In [59]:
def get_data(fn):
    df = pd.read_csv(os.path.join(data_path, '{}.csv'.format(fn)))
    print('{} total: {} rows.'.format(fn, len(df)))
    return df

In [86]:
movies = get_data('movies_clean')
movies = movies[[i for i in movies.columns if i not in ['Unnamed: 0', 'index']]]

# movies = movies.loc[(movies.total_ratings >= 10000) & (movies.year >= 1990) & (movies.mean_rating >= 4)].reset_index()
movies = movies.loc[movies.total_ratings >= 100].reset_index()

movies_clean total: 45843 rows.


In [87]:
n_movies = len(movies)
print('Dataset contains {} movies'.format(n_movies))

Dataset contains 9976 movies


In [88]:
tags = [i for i in movies.columns if i not in ['Unnamed: 0', 'level_0', 'index', 'movieId', 'title', 'genres', 'year', 'total_ratings']]

In [89]:
#remove all tags that don't have a movie associated to them
for tag in tags:
    if movies[tag].sum() == 0:
        movies = movies.drop(tag, axis=1)

In [90]:
movies.dropna(subset=tags, inplace=True)

In [91]:
# reduced tags list
tags = [i for i in movies.columns if i not in ['Unnamed: 0', 'level_0', 'index', 'movieId', 'title', 'genres', 'year', 'total_ratings']]

In [92]:
movies.head()

Unnamed: 0,index,movieId,title,genres,year,Sci-Fi,Thriller,Adventure,Drama,Musical,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
0,0,1,toy story,[0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1],1995,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,jumanji,[0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0],1995,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,3,grumpier old men,[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1],1995,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,4,waiting to exhale,[0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1],1995,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,5,father of the bride part ii,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1],1995,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.19,0.0,0.0,0.0,0.0,0.0


## K-means clustering

In [93]:
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

In [94]:
scaler = StandardScaler()
data = scaler.fit_transform(movies[tags])

In [95]:
# cluster data
cluster = KMeans(n_clusters = 20)
cluster_pred = cluster.fit_predict(data)

In [96]:
# perform t-sne for better plotting
tsne = TSNE(n_components=2)
tsne_fit = tsne.fit_transform(data)

In [97]:
# scatter plot of the movies, colored by their cluster and plotted according to tsne
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

In [98]:
trace = go.Scatter(
    x = tsne_fit[:,0],
    y = tsne_fit[:,1],
    mode='markers',
    name='Lines, Marers and Text',
    marker=dict(color=cluster_pred,
               colorscale='Portland',
               showscale=True)
)
data = [trace]
iplot(data, filename='scatter')

## similartiy

In [12]:
from scipy import sparse
movies_sparse = sparse.csr_matrix(movies[tags].as_matrix())

In [13]:
# own cosine similarity method because sklearn dies

from sklearn.preprocessing import normalize
def cosine_similarities(m):
    m = m.T
    col_normed_m = normalize(m.tocsc(), axis=0)
    return col_normed_m.T * col_normed_m

similarities = cosine_similarities(movies_sparse)

In [14]:
# set diagonal to 0 since we don't want to return the movie itself
for i in range(similarities.shape[0]):
    similarities[i,i] = 0

In [15]:
movie_id_to_idx_lookup = dict(zip(movies.movieId.values, movies.index.values))
movie_idx_to_id_lookup = dict(zip(movies.index.values, movies.movieId.values))
movie_id_to_title_lookup = dict(zip(movies.movieId.values, movies.title.values))

In [23]:
def get_similarities_for_movie(movie_id):
    movie_idx = movie_id_to_idx_lookup[movie_id]
    row = similarities[movie_idx]
    similar_idx = row.argmax()
    similar_id = movie_idx_to_id_lookup[similar_idx]
    similar_score = row.max()
    print('For {} ({}, rated {:.2f}) we suggest {} ({}, rated {:.2f}) with similarity rating {:.2f}.'.format(movie_id_to_title_lookup[movie_id],
                                                                                 movie_id, movies.iloc[movie_idx].mean_rating,
                                                                                 movie_id_to_title_lookup[similar_id],
                                                                                 similar_id, movies.iloc[similar_idx].mean_rating,
                                                                                 similar_score))
    return similar_id, similar_score

In [17]:
def get_similarities_for_movie_title(title):
    movie_id = movies[movies.title==title].movieId.iloc[0]
    get_similarities_for_movie(movie_id)

In [26]:
get_similarities_for_movie_title('annie hall')

For annie hall (1230, rated 4.05) we suggest saving private ryan (2028, rated 4.05) with similarity rating 0.87.


In [31]:
[i for i in movies.title if 'schindler' in i]

["schindler's list"]

In [36]:
cosine_similarity(movies.iloc[118][tags].as_matrix(), movies.iloc[343][tags].as_matrix())



array([[0.63849069]])

In [37]:
cosine_similarity(movies.iloc[227][tags].as_matrix(), movies.iloc[343][tags].as_matrix())



array([[0.87203722]])

In [33]:
movies[movies.title.isin(['annie hall', 'saving private ryan', "schindler's list"])].T

Unnamed: 0,118,227,343
index,523,1203,1945
movieId,527,1230,2028
title,schindler's list,annie hall,saving private ryan
genres,[1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0],[0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0],[1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0]
year,1993,1977,1998
Drama,1,0,1
Children,0,0,0
Fantasy,0,0,0
Crime,0,0,0
Sci-Fi,0,0,0
