In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
pd.__version__

'0.20.1'

In [2]:
# Execute the companion notebooks. They presumably define getTags / getMovies /
# getRatings used below — TODO confirm, their source is not visible from here.
%run 'movies.ipynb'
%run 'tags.ipynb'
%run 'ratings.ipynb'

# Load the three datasets into notebook-level names used by the later cells.
Tags = getTags()
allMovies = getMovies()
Ratings = getRatings()

verbose ==> False


In [3]:
def scale_year(df, col):
    '''
    Min-max scale one column of ``df`` into the range [0, 1].

    Despite the name, nothing here is specific to years: the notebook also
    uses it for ``Avg_rating``. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to scale.
    col : label
        Name of the numeric column to scale.

    Returns
    -------
    pandas.Series
        ``(value - min) / (max - min)`` for every row. Missing values stay
        missing. If every value in the column is identical the result is
        0.0 for each row instead of NaN.
    '''
    values = df[col]
    lowest = values.min()
    spread = values.max() - lowest

    if spread == 0:
        # A constant column would evaluate 0 / 0 and turn into all-NaN,
        # which then poisons any similarity computed on the features.
        return (values - lowest).astype(float)

    return (values - lowest) / spread

In [4]:
def getAverageRatings(Ratings):
    '''
    Compute the mean rating of every movie.

    Parameters
    ----------
    Ratings : pandas.DataFrame
        Must contain a ``movieId`` column and a numeric ``rating`` column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``movieId`` (ascending), with columns
        ``movieId`` and ``Avg_rating`` and a fresh 0..n-1 index.
    '''
    # Let pandas aggregate in one vectorized pass instead of looping over
    # the groups in Python and collecting tuples by hand.
    return (
        Ratings.groupby('movieId')['rating']
        .mean()
        .rename('Avg_rating')
        .reset_index()
    )

In [5]:
def dropFeatures(df, features):
    '''
    Return a copy of ``df`` without the columns listed in ``features``.

    The input frame is left untouched.
    '''
    trimmed = df.drop(labels=features, axis=1)
    return trimmed

In [6]:
def get_source_and_target_movies(allMovies, random_state=None):
    '''
    Build the comparable feature table and pick one random movie from it.

    Parameters
    ----------
    allMovies : pandas.DataFrame
        Movie table containing at least ``title`` and ``movieId`` columns;
        every other column is treated as a feature.
    random_state : int or numpy.random.RandomState, optional
        Seed forwarded to ``DataFrame.sample`` so the pick can be
        reproduced. The default (None) keeps the pick random.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_all_movies, sourceMovie)``: the feature table of all movies
        and a one-row frame with the sampled movie whose similar movies are
        to be found. Both are indexed by ``title`` and have ``movieId``
        removed. ``allMovies`` itself is not modified.
    '''
    # list out features which can not be compared.
    features_not_to_consider = ['movieId']

    # index by title for identification and remove the non-comparable
    # features once, so the sampled row is shaped exactly like the table
    df_all_movies = allMovies.set_index('title').drop(features_not_to_consider, axis=1)

    # pick a sample from all movies
    sourceMovie = df_all_movies.sample(1, random_state=random_state)

    return df_all_movies, sourceMovie


In [7]:
# Attach every movie's mean rating. The left join keeps movies that have no
# ratings at all (their Avg_rating is NaN).
avgRatings = getAverageRatings(Ratings)
allMovies = allMovies.merge(avgRatings, how='left', on="movieId")
allMovies.head(2)

Unnamed: 0,animation,fantasy,imax,thriller,children,crime,adventure,year,film-noir,movieId,mystery,title,sci-fi,western,horror,dra,drama,romancecioccolata,musical,action,documentary,romance,war,comedy,Avg_rating
0,1,1,0,0,1,0,1,1995,0,1,0,Toy Story (1995),0,0,0,0,0,0,0,0,0,0,0,1,3.87247
1,0,1,0,0,1,0,1,1995,0,2,0,Jumanji (1995),0,0,0,0,0,0,0,0,0,0,0,0,3.401869


In [9]:
# Bring the two non-binary features onto a 0..1 scale (min-max, see scale_year)
# before sampling, so the feature table and the sampled row are both scaled.
allMovies['year'] = scale_year(allMovies, 'year')
allMovies['Avg_rating'] = scale_year(allMovies, 'Avg_rating')
# sim_from: feature table of all movies; sim_to: the one sampled movie.
sim_from, sim_to = get_source_and_target_movies(allMovies=allMovies)
# NOTE(review): despite its name, `movie_name` holds the movieId. The lookup is
# by title and takes the first match, so duplicate titles would report the
# first movie's id.
movie_name = allMovies[allMovies['title'] == sim_to.index.values[0]].movieId.values[0]
print('You want to find movies similar to movie with id <<%s>> and name << %s >>' % (movie_name, sim_to.index.values[0]))

You want to find movies similar to movie with id <<562>> and name << Welcome to the Dollhouse (1995) >>


In [10]:
# Show the sampled movie's feature row (indexed by title, movieId removed).
sim_to.head()

Unnamed: 0_level_0,animation,fantasy,imax,thriller,children,crime,adventure,year,film-noir,mystery,sci-fi,western,horror,dra,drama,romancecioccolata,musical,action,documentary,romance,war,comedy,Avg_rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Welcome to the Dollhouse (1995),0,0,0,0,0,0,0,0.983051,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0.679167


In [11]:
# Execute the companion notebook; find_similar_movies is presumably defined
# there — TODO confirm, its source is not visible from here.
%run 'rec_engine_content_based_non_personalized.ipynb'

# sim_to is the single sampled movie, sim_from the feature table of all movies
# (both returned by get_source_and_target_movies).
similar_movies = find_similar_movies(sim_to=sim_to, sim_from=sim_from);

In [12]:
# Tabulate the (title, similarity) results and rank them, best match first.
sim_df = pd.DataFrame(
    data=similar_movies, columns=['title', 'sim_index']
).sort_values(by='sim_index', ascending=False)
sim_df.head(30)

Unnamed: 0,title,sim_index
503,Welcome to the Dollhouse (1995),1.0
113,Flirting With Disaster (1996),0.841548578816
155,Living in Oblivion (1995),0.841546659865
460,"Ref, The (1994)",0.841533725789
64,Friday (1995),0.84149000688
390,Dazed and Confused (1993),0.841473580051
278,Stuart Saves His Family (1995),0.841457767416
122,"Birdcage, The (1996)",0.841330671014
419,"Hudsucker Proxy, The (1994)",0.841260047677
398,Fear of a Black Hat (1994),0.841208885358
