## Generalised Recommender based on popular movies by 'Genre'

#### Author: C.K. Jagannath

In [1]:
# Importing the required modules
import pandas as pd
import numpy as np
from pandas import DataFrame as df

In [2]:
# Reads movie information(movieId, imdbId, tmdbId)
movie_imdbid_tmdbid = pd.read_csv('ml-25m/links.csv')

In [3]:
# Reads rating data(contains userId, movieId, rating)
movie_ratings = pd.read_csv('ml-25m/ratings.csv')

In [4]:
# This step is done to count no. of ratings for each movie
# To do this, we first group the 'movie_ratings' dataframe based on 'movieId' column and turn it into a frame with column 'votes'
df_votes = movie_ratings.groupby('movieId')['movieId'].count().to_frame(name='votes').reset_index()

In [5]:
# This step is done to get the average rating for each movie
# To do this, we first group the 'movie_ratings' df based on 'movieId' column and obtain 'rating' column to perform mean and turn it into a frame 
df_avgrating = movie_ratings.groupby('movieId')['rating'].mean().apply(lambda x: round(x,3)).to_frame(name='avg_rating').reset_index()

In [6]:
# Here we merge the 'votes' df and 'avg_rating' df based on 'movieId'
df_votes_avgrating_merged = df.merge(df_votes,df_avgrating,on='movieId')

In [7]:
# Here we merge the obtained df with 'votes' and 'avg_rating' to the df which contains movie information i.e. 'ids'
df_merged_moviedata = df.merge(movie_imdbid_tmdbid,df_votes_avgrating_merged,how='left',on='movieId')

In [8]:
# Reading movie-titles & genres data
movie_name_genres = pd.read_csv('ml-25m/movies.csv')

In [9]:
# Merging titles & genres with 'ids' df based on 'movieId'
merged_names_ids = pd.merge(movie_name_genres,movie_imdbid_tmdbid,on='movieId')

In [10]:
# This gives us the required df to work on, which contains whole information including votes and avg_rating
merged_moviedata = md = df.merge(merged_names_ids,df_merged_moviedata,on=['movieId','tmdbId','imdbId'])

In [11]:
# Modifying the 'genres' column in such a way that 'no genres listed' will be [] and others will be converted to list of genres
md['genres'] = md['genres'].apply(lambda x: x.split('|') if x != '(no genres listed)' else [])

In [12]:
vote_counts = md[md['votes'].notnull()]['votes'].astype(int) # getting only the rows of the df with non-null in 'votes' column(i.e only movies with atleast a vote)
vote_averages = md[md['avg_rating'].notnull()]['avg_rating'] # getting only the rows of the df with non-null in 'avg_rating' column(i.e only movies with atleast a vote)
C = round(vote_averages.mean(), 1)    
m = int(vote_counts.quantile(0.95))
#C is mean votes across whole report and m is min votes required to be qualified for building top movies chart i.e movie should be in 95th percentile

In [13]:
C, m

(3.1, 1503)

In [14]:
# getting movies with min. votes greater than C
qualified = md[(md['votes'] >= m) & (md['votes'].notnull()) & (md['avg_rating'].notnull())][['movieId', 'title', 'genres', 'imdbId', 'tmdbId', 'votes', 'avg_rating']]

In [15]:
# Therefore 2953 movies qualify to be on our top movies chart

In [16]:
def weighted_rating(x):
    v = x['votes']
    R = x['avg_rating']
    return (v/(v+m) * R) + (m/(m+v) * C)  # IMDB's weighted rating formula
# C, m are same as earlier defined and v is votes for a particular movie and R is its corresponding average rating 

In [17]:
#getting weighted rating for each movie
qualified['wei_rating'] = qualified.apply(weighted_rating, axis=1)
qualified['avg_rating'] = qualified['avg_rating'].apply(lambda x: round(x, 1))

In [18]:
# getting the top 250 movies with highest weighted rating in descending order
qualified = qualified.sort_values('wei_rating', ascending=False).head(250)

In [19]:
qualified.head(15)

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,votes,avg_rating,wei_rating
314,318,"Shawshank Redemption, The (1994)","[Crime, Drama]",111161,278.0,81482.0,4.4,4.390201
840,858,"Godfather, The (1972)","[Crime, Drama]",68646,238.0,52498.0,4.3,4.289933
49,50,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]",114814,629.0,55366.0,4.3,4.252708
522,527,Schindler's List (1993),"[Drama, War]",108052,424.0,60411.0,4.2,4.220132
1190,1221,"Godfather: Part II, The (1974)","[Crime, Drama]",71562,240.0,34188.0,4.3,4.213066
2867,2959,Fight Club (1999),"[Action, Crime, Drama, Thriller]",137523,550.0,58773.0,4.2,4.199873
1164,1193,One Flew Over the Cuckoo's Nest (1975),[Drama],73486,510.0,36058.0,4.2,4.174223
292,296,Pulp Fiction (1994),"[Comedy, Crime, Drama, Thriller]",110912,680.0,79672.0,4.2,4.168837
883,904,Rear Window (1954),"[Mystery, Thriller]",47396,567.0,20162.0,4.2,4.159052
734,750,Dr. Strangelove or: How I Learned to Stop Worr...,"[Comedy, War]",57012,935.0,26714.0,4.2,4.156555


In [20]:
md['genres']

0        [Adventure, Animation, Children, Comedy, Fantasy]
1                           [Adventure, Children, Fantasy]
2                                        [Comedy, Romance]
3                                 [Comedy, Drama, Romance]
4                                                 [Comedy]
                               ...                        
62418                                              [Drama]
62419                                        [Documentary]
62420                                      [Comedy, Drama]
62421                                                   []
62422                           [Action, Adventure, Drama]
Name: genres, Length: 62423, dtype: object

#### This first takes each row from 'genres' column, converts them into a series, then it is stacked so that the corresponding columns are removed and added as an extra level as index. Then we remove that unnecessary extra level using reset_index on level 1

In [21]:
s = md.apply(lambda x: pd.Series(x['genres'], dtype=object),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'

In [22]:
s.head(10)

0    Adventure
0    Animation
0     Children
0       Comedy
0      Fantasy
1    Adventure
1     Children
1      Fantasy
2       Comedy
2      Romance
Name: genre, dtype: object

In [23]:
# we add the 's' df to our df and removing 'genres' column
gen_md = md.drop('genres', axis=1).join(s)
gen_md.dropna(inplace = True)

In [24]:
gen_md

Unnamed: 0,movieId,title,imdbId,tmdbId,votes,avg_rating,genre
0,1,Toy Story (1995),114709,862.0,57309.0,3.894,Adventure
0,1,Toy Story (1995),114709,862.0,57309.0,3.894,Animation
0,1,Toy Story (1995),114709,862.0,57309.0,3.894,Children
0,1,Toy Story (1995),114709,862.0,57309.0,3.894,Comedy
0,1,Toy Story (1995),114709,862.0,57309.0,3.894,Fantasy
...,...,...,...,...,...,...,...
62420,209163,Bad Poems (2018),6755366,553036.0,1.0,4.500,Comedy
62420,209163,Bad Poems (2018),6755366,553036.0,1.0,4.500,Drama
62422,209171,Women of Devil's Island (1962),55323,79513.0,1.0,3.000,Action
62422,209171,Women of Devil's Island (1962),55323,79513.0,1.0,3.000,Adventure


### This is a function for filtering movie chart based on genre specified

In [25]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['votes'].notnull()]['votes'].astype('int')
    vote_averages = df[df['avg_rating'].notnull()]['avg_rating']
    C = round(vote_averages.mean(), 1)
    m = int(vote_counts.quantile(percentile))
    
    qualified = df[(df['votes'] >= m) & (df['votes'].notnull()) & (df['avg_rating'].notnull())][['movieId', 'title', 'imdbId', 'tmdbId', 'votes', 'avg_rating']]
    qualified['wei_rating'] = qualified.apply(weighted_rating, axis=1)
    qualified['avg_rating'] = qualified['avg_rating'].apply(lambda x: round(x, 1))
    
    qualified['wei_rating'] = qualified.apply(lambda x: (x['votes']/(x['votes']+m) * x['avg_rating']) + (m/(m+x['votes']) * C), axis=1)
    qualified = qualified.sort_values('wei_rating', ascending=False).head(250)
    return qualified

In [33]:
build_chart('Musical').head(15)

Unnamed: 0,movieId,title,imdbId,tmdbId,votes,avg_rating,wei_rating
878,899,Singin' in the Rain (1952),45152,872.0,10895.0,4.1,4.031115
1223,1256,Duck Soup (1933),23969,3063.0,5556.0,4.1,3.974176
898,919,"Wizard of Oz, The (1939)",32138,630.0,24658.0,3.9,3.875271
893,914,My Fair Lady (1964),58385,11113.0,9692.0,3.9,3.84034
2767,2859,Stop Making Sense (1984),88178,24128.0,2186.0,4.1,3.836905
359,364,"Lion King, The (1994)",110357,8587.0,42745.0,3.8,3.787587
13133,66934,Dr. Horrible's Sing-Along Blog (2008),1227926,14301.0,4322.0,3.9,3.779024
1189,1220,"Blues Brothers, The (1980)",80455,525.0,20460.0,3.8,3.774638
1012,1035,"Sound of Music, The (1965)",59742,15121.0,15248.0,3.8,3.766454
924,945,Top Hat (1935),27125,3080.0,2029.0,4.0,3.753615
