In [10]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import scipy
import seaborn as sns
import sklearn
from sklearn import *
import statsmodels as sm

np.random.seed(1337)

%matplotlib inline

sns.set(font_scale=1.3)
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))

pd.options.display.max_colwidth = 30
pd.options.display.float_format = '{:,.3f}'.format

## Load the data

In [2]:
# Read the two CSV inputs, movie table first and training ratings second.
# Both paths are relative to the directory the notebook is run from.
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ('data/movies_clean.csv', 'data/ratings_train.csv')
)

## Compute scores

In [3]:
# Per-movie rating statistics, indexed by movie_id: number of ratings, their
# mean, and their standard deviation. ratings_std comes out as NaN for movies
# that have only a single rating.
movie_scores = (
    ratings
    .groupby(['movie_id'])['rating']
    .agg(['count', 'mean', 'std'])
    .rename(columns={
        'count': 'ratings_cnt',
        'mean': 'ratings_avg',
        'std': 'ratings_std',
    })
)
# Show ten random rows as a quick sanity check.
movie_scores.sample(10)

Unnamed: 0_level_0,ratings_cnt,ratings_avg,ratings_std
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
93911,1,3.0,
94044,7,3.214286,1.253566
127831,1,3.5,
66927,4,4.0,0.912871
70521,93,3.655914,0.969588
33358,131,3.660305,0.912029
99679,4,2.75,1.658312
104633,1,3.5,
333,8780,3.348349,1.065052
91978,208,3.197115,0.817733


In [11]:
# Attach the per-movie rating statistics to the movie table and derive `age`.
#
# This cell reassigns `movies`, so it must be safe to run more than once.
# Merging into a frame that already carries the score columns would produce
# suffixed duplicates (ratings_cnt_x / ratings_cnt_y, ...) and break every
# later `movies.ratings_cnt` lookup. Columns added by a previous run of this
# cell are therefore dropped first; errors='ignore' makes that a no-op on a
# fresh kernel.
#
# how='inner' is the pd.merge default, spelled out here on purpose: movies
# without any row in movie_scores are removed from `movies`.
REFERENCE_YEAR = 2018  # year against which a movie's age is measured

movies = pd.merge(
    movies.drop(columns=[*movie_scores.columns, 'age'], errors='ignore'),
    movie_scores,
    how='inner',
    left_on='movie_id',
    right_index=True,
)
movies['age'] = REFERENCE_YEAR - movies.year
# Transposed view of five random rows, so the many columns read top to bottom.
movies.sample(5).T

Unnamed: 0,18874,18904,26623,13520,14045
movie_id,93911,94044,127831,66927,70521
title,Village People Radio Show ...,Terraferma (2011),Tintin and I (2003),Tokyo.Sora (2002),Lost in Austen (2008)
year,2007.00,2011.00,2003.00,2002.00,2008.00
parsed_genres,['documentary'],['drama'],['documentary'],['drama'],"['drama', 'fantasy', 'roma..."
genre_thriller,0,0,0,0,0
genre_animation,0,0,0,0,0
genre_crime,0,0,0,0,0
genre_filmnoir,0,0,0,0,0
genre_documentary,1,0,1,0,0
genre_western,0,0,0,0,0


## Some charts

### Top rated horror movies

In [None]:
# Rank horror movies that have more than 20 ratings by a hand-weighted score:
# average rating (+1.00), rating spread (-0.50) and number of ratings (+0.20).
#
# The printed quantiles (80% to 100% in 5% steps) show how ratings-per-movie
# is distributed among horror titles, as context for the >20 cutoff.
# NOTE(review): ratings_cnt enters the score unscaled. Counts can run into the
# hundreds or thousands while averages are a few points, so the count term
# likely dominates this ranking -- confirm that is intended.
is_horror = movies.genre_horror == 1
print(movies[is_horror].ratings_cnt.quantile([0.8 + i*0.05 for i in range(5)]))

d = (
    movies[is_horror & (movies.ratings_cnt > 20)]
    .assign(score=lambda df: (
        + 1.00 * df.ratings_avg
        - 0.50 * df.ratings_std
        + 0.20 * df.ratings_cnt))
    .sort_values(by='score', ascending=False)
)
# Ten best-scoring titles, restricted to the columns relevant for the ranking.
d[['movie_id', 'title', 'year', 'parsed_genres', 'ratings_cnt', 'ratings_avg', 'ratings_std', 'score']].head(10)

### Controversial horror movies

In [None]:
# Rank horror movies that have more than 20 ratings by how much their ratings
# disagree. The score is driven by the standard deviation (weight 100.0); the
# average contributes only slightly (0.10) and the rating count carries zero
# weight.
#
# The printed quantiles (80% to 100% in 5% steps) show how ratings-per-movie
# is distributed among horror titles, as context for the >20 cutoff.
is_horror = movies.genre_horror == 1
print(movies[is_horror].ratings_cnt.quantile([0.8 + i*0.05 for i in range(5)]))

d = (
    movies[is_horror & (movies.ratings_cnt > 20)]
    .assign(score=lambda df: (
        + 0.10 * df.ratings_avg
        + 100.0 * df.ratings_std
        + 0.00 * df.ratings_cnt))
    .sort_values(by='score', ascending=False)
)
# Ten highest-scoring titles, restricted to the columns relevant for the ranking.
d[['movie_id', 'title', 'year', 'parsed_genres', 'ratings_cnt', 'ratings_avg', 'ratings_std', 'score']].head(10)