In [1]:
import pandas as pd
import numpy as np
import re
import math

# Dataframe with IMDB Movie feature Dataset containing Netflix Movie ID
df = pd.read_csv('omdb.txt', sep="|")
# Dataframe with Netflix data containing userid rating and movie id
df_u= pd.read_csv('netflix-user-movie.txt', sep=",")
df_u.columns = ['movieid','userid','rating', 'date']
df_u.head()
#filter null rows.

Unnamed: 0,movieid,userid,rating,date
0,1,822109,5,2005-05-13
1,1,885013,4,2005-10-19
2,1,30878,4,2005-12-26
3,1,823519,3,2004-05-03
4,1,893988,3,2005-11-17


In [2]:
userRating_dict = {}
userMoviesCount_dict = {}
userMoviesAvgRating_dict = {}
userMoviesLiked = {}

#Iterate the dataframe and build aggregate ratings and movie counts for each user
for row in df_u.itertuples():
    userRating_dict[row.userid] = userRating_dict.get(row.userid,0) + row.rating
    userMoviesCount_dict[row.userid] = userMoviesCount_dict.get(row.userid,0) + 1
 
#Compute the average rating
for userid in userRating_dict:
    userMoviesAvgRating_dict[userid] = userRating_dict[userid]/userMoviesCount_dict[userid]



In [None]:
#list to add to the dataframe
like_list = []
userMoviesTitle = []
#Iterate the dataframe and find if the movie is liked by the user or not
for row in df_u.itertuples():
    like_list.extend('1' if row.rating >= userMoviesAvgRating_dict[row.userid] else '0')
    userMoviesTitle.extend(df.loc[df['id'] == row.movieid]['Title'])

#Add a new likedmovie to the dataframe
df_u = df_u.assign(likedmovie = like_list)
df_u = df_u.assign(Title = userMoviesTitle)
df_u.head()

In [4]:
# Get a dataframe of the top rated movies for user 822109
df_filtered_822109 = df_u[(df_u.userid == 822109) & (df_u.likedmovie == '1')]

In [5]:
#Data frame containing the movies which user 822109 liked
df_filtered_822109.head()

Unnamed: 0,movieid,userid,rating,date,likedmovie
0,1,822109,5,2005-05-13,1
3013481,571,822109,5,2005-04-25,1
3215550,607,822109,5,2005-04-07,1
4854997,985,822109,5,2005-04-25,1
5644197,1144,822109,5,2005-05-21,1


In [6]:
# Iterate df_filtered_822109 and find movies similar to the movie list

#Cleansing the IMDB Data which contains netflix movie id
#filter null rows.
df = df[df.Title != 'null']
#pick first year for Series.
df['Year'] = df['Year'].str.replace('–[0-9]*$','')
#Correct Rated column
df.Rated = df.Rated.str.replace('UNRATED','NOT RATED')
df.Rated = df.Rated.str.replace('NR','NOT RATED')
df.Rated = df.Rated.str.replace('Not Rated','NOT RATED')
df.Rated = df.Rated.str.replace('Unrated','NOT RATED')

#remove commas from imdbvotes
df.imdbVotes = df.imdbVotes.str.replace(',','')

# classify runtime by hours
df.Runtime = df.Runtime.str.replace(' min','')
df.Runtime = df.Runtime.str.replace(',','')
df.Runtime = df.Runtime.str.replace('1 h 30','90')
df.Runtime = df.Runtime.str.replace('3 h 48','223')
df.Runtime = df.Runtime.str.replace('1 h 20','80')
#df.Runtime = df.Runtime.str.replace(NaN,'0')

df.Runtime = pd.to_numeric(df.Runtime, errors='coerce')
minRuntime = min(df.Runtime)
maxRuntime = max(df.Runtime)

df['Runtime'] = df['Runtime'].apply(lambda x: 4*((x - minRuntime)/(maxRuntime - minRuntime)) + 1)
df['Runtime'] = df['Runtime'].apply(lambda x: 1 if x < 1 else x)
df['Runtime'] = df['Runtime'].apply(lambda x: 2 if x > 1 and x < 2 else x)
df['Runtime'] = df['Runtime'].apply(lambda x: 3 if x > 2 and x < 3 else x)
df['Runtime'] = df['Runtime'].apply(lambda x: 4 if x > 3 and x < 4 else x)
df['Runtime'] = df['Runtime'].apply(lambda x: 5 if x > 4 and x < 5 else x)

df.imdbVotes = pd.to_numeric(df.imdbVotes, errors='coerce')
minImdbVotes = min(df.imdbVotes)
maxImdbVotes = max(df.imdbVotes)
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: 4*((x - minImdbVotes)/(maxImdbVotes - minImdbVotes)) + 1)
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: 1 if x < 1 else x)
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: 2 if x > 1 and x < 2 else x)
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: 3 if x > 2 and x < 3 else x)
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: 4 if x > 3 and x < 4 else x)
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: 5 if x > 4 and x < 5 else x)

df.imdbRating = pd.to_numeric(df.imdbRating, errors='coerce')
minImdbRating = min(df.imdbRating)
maxImdbRating = max(df.imdbRating)
df['imdbRating'] = df['imdbRating'].apply(lambda x: 4*((x - minImdbRating)/(maxImdbRating - minImdbRating)) + 1)
df['imdbRating'] = df['imdbRating'].apply(lambda x: 1 if x < 1 else x)
df['imdbRating'] = df['imdbRating'].apply(lambda x: 2 if x > 1 and x < 2 else x)
df['imdbRating'] = df['imdbRating'].apply(lambda x: 3 if x > 2 and x < 3 else x)
df['imdbRating'] = df['imdbRating'].apply(lambda x: 4 if x > 3 and x < 4 else x)
df['imdbRating'] = df['imdbRating'].apply(lambda x: 5 if x > 4 and x < 5 else x)

df.Year = pd.to_numeric(df.Year, errors='coerce')
minYear = min(df.Year)
maxYear = max(df.Year)
df['Year'] = df['Year'].apply(lambda x: 4*((x - minYear)/(maxYear - minYear)) + 1)
df['Year'] = df['Year'].apply(lambda x: 1 if x < 1 else x)
df['Year'] = df['Year'].apply(lambda x: 2 if x > 1 and x < 2 else x)
df['Year'] = df['Year'].apply(lambda x: 3 if x > 2 and x < 3 else x)
df['Year'] = df['Year'].apply(lambda x: 4 if x > 3 and x < 4 else x)
df['Year'] = df['Year'].apply(lambda x: 5 if x > 4 and x < 5 else x)

df = df[df['Genre'].notnull()]
df = df[df['Writer'].notnull()]
df = df[df['Actors'].notnull()]
df = df[df['Director'].notnull()]
df = df[df['Country'].notnull()]
df = df[df['Language'].notnull()]


In [7]:
#Number of IMDB Dataframe records
len(df)

9156

In [28]:
import graphlab as gl
#Below line is required when running first time
#graphlab.get_dependencies()

df.to_csv('normalized.csv', sep='|')

movieSFrame = gl.SFrame.read_csv("normalized.csv", sep='|')
#'Runtime','imdbRating','Year','imdbVotes'
knn_model = gl.nearest_neighbors.create(movieSFrame,features=['Genre','Writer','Actors','Director','Country','Language','Rated','Awards'],label='id')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[long,long,str,long,str,str,str,str,str,str,str,str,float,float,str,str,float,str,str,str,str,long]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


Defaulting to brute force instead of ball tree because there are multiple distance components.


In [29]:
#df_filtered_822109[df_filtered_822109['userid'] == 822109]

movieSFrame['id'].astype(str)
movie_571 = movieSFrame[movieSFrame['id'] == '571']
knn_model.query(movie_571)
movie_607 = movieSFrame[movieSFrame['id'] == '607']
knn_model.query(movie_607)
movie_985 = movieSFrame[movieSFrame['id'] == '985']
knn_model.query(movie_985)



ToolkitError: Input dataset either has no rows or no columns. A non-empty SFrame is required.

In [13]:
movieSFrame[movieSFrame['Title'] == 'My Bloody Valentine']

X1,id,Title,Year,Rated,Released,Writer,Actors
23,24,My Bloody Valentine,1,R,11 Feb 1981,"Stephen A. Miller (story concept), John Beaird ...","Paul Kelman, Lori Hallier, Neil Affleck, ..."

Plot,Awards,Director,Genre,imdbRating,Runtime,Language
A decades-old folk tale surrounding a deranged ...,,George Mihalka,"Horror, Mystery, Thriller",4.0,2.0,English

Country,imdbVotes,imdbID,Type,totalSeasons,Response,Metascore
Canada,2.0,tt0082782,movie,,True,


In [None]:
movieSFrame['id'].astype(str)
movie_1 = movieSFrame[movieSFrame['id'].contains('1')]
knn_model.query(movie_1)

In [38]:
df.get_value(3,'Title')

KeyError: 3L

In [40]:
df.head()
df.loc[df['id'] == 3]['Title']

2    Character
Name: Title, dtype: object