In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import scipy
import seaborn as sns
import sklearn
from sklearn import *
import statsmodels as sm

# Seed NumPy's global random number generator so that random draws made
# through it are repeatable from one run of the notebook to the next.
np.random.seed(1337)

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Global seaborn styling. The calls are order-dependent: sns.set() applies
# seaborn's theme with fonts scaled by 1.3, and the two calls after it then
# replace the grid style and the colour palette chosen by that theme.
sns.set(font_scale=1.3)
sns.set_style('whitegrid')
sns.set_palette(sns.color_palette('muted'))

## Load the data

In [2]:
# Read the two input tables from CSV, in this order: the cleaned movie
# table first, then the ratings (train) table.
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in ('data/movies_clean.csv', 'data/ratings_train.csv')
)

## Compute scores

In [3]:
# Per-movie rating statistics, indexed by movie_id:
#   ratings_cnt - number of ratings
#   ratings_avg - mean rating
#   ratings_std - standard deviation of the ratings (NaN when a movie has
#                 a single rating)
movie_scores = (
    ratings
    .groupby(['movie_id'])['rating']
    .agg(['count', 'mean', 'std'])
    .rename(columns={
        'count': 'ratings_cnt',
        'mean': 'ratings_avg',
        'std': 'ratings_std',
    })
)

# Show a random handful of rows as a sanity check.
movie_scores.sample(10)

Unnamed: 0_level_0,ratings_cnt,ratings_avg,ratings_std
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2706,85,3.4,1.017115
58309,1,4.5,
6578,1,3.5,
6710,4,3.625,0.478714
362,25,3.56,1.092779
55190,1,3.0,
65188,3,3.5,1.0
2166,3,2.333333,1.154701
94266,3,2.5,1.732051
78316,3,2.833333,1.607275


In [4]:
# Year that movie ages are measured against.
REFERENCE_YEAR = 2018

# Attach the per-movie rating statistics to the movie table. This is an
# inner join on movie_id, so movies without any rating are dropped.
#
# The cell rebinds `movies`, so it must be safe to run more than once:
# columns this cell derives (the ratings_* statistics and `age`) are removed
# before merging. Without that, a second run would merge two frames that
# both carry ratings_* columns, pandas would suffix them with _x/_y, and
# later lookups such as movies.ratings_cnt would fail. On a fresh kernel
# none of these columns exist yet and the drop is a no-op.
derived_columns = list(movie_scores.columns) + ['age']
movies = pd.merge(
    movies.drop(columns=derived_columns, errors='ignore'),
    movie_scores,
    left_on='movie_id',
    right_index=True,
)

# Age of each movie in years, relative to REFERENCE_YEAR.
movies['age'] = REFERENCE_YEAR - movies.year

# Show a random handful of rows as a sanity check.
movies.sample(5)

Unnamed: 0,movie_id,title,year,parsed_genres,genre_scifi,genre_imax,genre_western,genre_comedy,genre_fantasy,genre_action,...,genre_filmnoir,genre_crime,genre_drama,genre_thriller,genre_mystery,genre_musical,ratings_cnt,ratings_avg,ratings_std,age
6520,53993,Evan Almighty (2007),2007.0,"['comedy', 'fantasy']",0,0,0,1,1,0,...,0,0,0,0,0,0,8,2.75,1.224745,11.0
8735,127134,A Walk in the Woods (2015),2015.0,"['adventure', 'comedy', 'drama']",0,0,0,1,0,0,...,0,0,1,0,0,0,1,3.5,,3.0
5566,26743,Only Yesterday (Omohide poro poro) (1991),1991.0,"['animation', 'drama']",0,0,0,0,0,0,...,0,0,1,0,0,0,1,2.5,,27.0
7433,80862,Catfish (2010),2010.0,"['documentary', 'mystery']",0,0,0,0,0,0,...,0,0,0,0,1,0,2,4.0,0.707107,8.0
4903,7348,Spartan (2004),2004.0,['thriller'],0,0,0,0,0,0,...,0,0,0,1,0,0,3,3.5,0.0,14.0


## Some charts

### Top rated horror movies

In [5]:
# Upper quantiles (80th to 100th percentile, in 5-point steps) of the number
# of ratings per horror movie, printed as context for the cutoff used below.
print(
    movies.loc[movies.genre_horror == 1, 'ratings_cnt']
    .quantile([0.8 + i*0.05 for i in range(5)])
)

# Horror movies with more than 20 ratings, ranked by a weighted score that
# rewards a high average and many ratings and penalises disagreement (std).
# NOTE(review): with a 0.20 weight the count term is at least 4.2 for every
# movie kept here, so it likely dominates the ordering - confirm intended.
d = (
    movies
    .loc[lambda df: (df.genre_horror == 1) & (df.ratings_cnt > 20)]
    .assign(score=lambda df: (
        + 1.00 * df.ratings_avg
        - 0.50 * df.ratings_std
        + 0.20 * df.ratings_cnt))
    .sort_values(by='score', ascending=False)
)

# Ten best-scoring movies, restricted to the columns of interest.
top_columns = ['movie_id', 'title', 'year', 'parsed_genres',
               'ratings_cnt', 'ratings_avg', 'ratings_std', 'score']
d.head(10)[top_columns]

0.80      7.0
0.85     10.0
0.90     15.0
0.95     27.0
1.00    217.0
Name: ratings_cnt, dtype: float64


Unnamed: 0,movie_id,title,year,parsed_genres,ratings_cnt,ratings_avg,ratings_std,score
510,593,"Silence of the Lambs, The (1991)",1991.0,"['crime', 'horror', 'thriller']",217,4.175115,0.882824,47.133703
2078,2762,"Sixth Sense, The (1999)",1999.0,"['drama', 'horror', 'mystery']",149,3.822148,0.974461,33.134917
915,1214,Alien (1979),1979.0,"['horror', 'scifi']",112,3.991071,0.910356,25.935893
902,1200,Aliens (1986),1986.0,"['action', 'adventure', 'horror', 'scifi']",99,3.974747,0.988128,23.280684
957,1258,"Shining, The (1980)",1980.0,['horror'],88,4.096591,0.742232,21.325475
217,253,Interview with the Vampire: The Vampire Chroni...,1994.0,"['drama', 'horror']",89,3.398876,0.959935,20.718909
1067,1387,Jaws (1975),1975.0,"['action', 'horror']",71,3.922535,0.966022,17.639524
1972,2617,"Mummy, The (1999)",1999.0,"['action', 'adventure', 'comedy', 'fantasy', '...",75,3.166667,1.128061,17.602636
920,1219,Psycho (1960),1960.0,"['crime', 'horror']",69,4.007246,0.884893,17.3648
1083,1407,Scream (1996),1996.0,"['comedy', 'horror', 'mystery', 'thriller']",61,3.172131,1.117606,14.813328


### Controversial horror movies

In [6]:
# Upper quantiles (80th to 100th percentile, in 5-point steps) of the number
# of ratings per horror movie, printed as context for the cutoff used below.
print(
    movies.loc[movies.genre_horror == 1, 'ratings_cnt']
    .quantile([0.8 + i*0.05 for i in range(5)])
)

# Horror movies with more than 20 ratings, ranked by how much raters
# disagree: the standard deviation carries almost all of the weight, the
# average acts as a small tie-breaker, and the rating count has zero weight
# (its term is kept so the three weights stay visible and tunable).
d = (
    movies
    .loc[lambda df: (df.genre_horror == 1) & (df.ratings_cnt > 20)]
    .assign(score=lambda df: (
        + 0.10 * df.ratings_avg
        + 100.0 * df.ratings_std
        + 0.00 * df.ratings_cnt))
    .sort_values(by='score', ascending=False)
)

# Ten highest-scoring movies, restricted to the columns of interest.
top_columns = ['movie_id', 'title', 'year', 'parsed_genres',
               'ratings_cnt', 'ratings_avg', 'ratings_std', 'score']
d.head(10)[top_columns]

0.80      7.0
0.85     10.0
0.90     15.0
0.95     27.0
1.00    217.0
Name: ratings_cnt, dtype: float64


Unnamed: 0,movie_id,title,year,parsed_genres,ratings_cnt,ratings_avg,ratings_std,score
5371,8957,Saw (2004),2004.0,"['horror', 'mystery', 'thriller']",23,3.217391,1.321381,132.459827
2035,2710,"Blair Witch Project, The (1999)",1999.0,"['drama', 'horror', 'thriller']",50,3.01,1.307318,131.032849
6453,52281,Grindhouse (2007),2007.0,"['action', 'crime', 'horror', 'scifi', 'thrill...",21,3.404762,1.189738,119.314263
1972,2617,"Mummy, The (1999)",1999.0,"['action', 'adventure', 'comedy', 'fantasy', '...",75,3.166667,1.128061,113.122807
2828,3785,Scary Movie (2000),2000.0,"['comedy', 'horror']",40,2.7625,1.12653,112.92928
3090,4148,Hannibal (2001),2001.0,"['horror', 'thriller']",32,2.984375,1.125112,112.809638
1662,2232,Cube (1997),1997.0,"['horror', 'mystery', 'scifi', 'thriller']",24,3.3125,1.121068,112.438051
1083,1407,Scream (1996),1996.0,"['comedy', 'horror', 'mystery', 'thriller']",61,3.172131,1.117606,112.077837
3461,4720,"Others, The (2001)",2001.0,"['drama', 'horror', 'mystery', 'thriller']",47,3.648936,1.107852,111.150082
235,273,Mary Shelley's Frankenstein (Frankenstein) (1994),1994.0,"['drama', 'horror', 'scifi']",25,2.92,1.105667,110.858722
