# Observational Studies

To draw causal conclusions about the impact of our features on movie success, we apply the standard tools of observational studies: propensity score matching, followed by regression on the matched sample.

## Packages

In [1]:
# Global packages
import pandas as pd
import numpy as np
# Statistical package
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
import statistics
# Matching package
from psmpy import PsmPy
from psmpy.functions import cohenD
from psmpy.plotting import *
# Custom helpers
import feature_and_regression as feat_and_reg
%load_ext autoreload
%autoreload 2
import warnings
# Suppress every warning category for the rest of the session so cell outputs stay readable.
# NOTE(review): this blanket filter also hides any warning raised while fitting the models
# further down — confirm that is intended.
warnings.simplefilter(action='ignore')

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,


## Load Data

In [2]:
# First gather the initial regression dataframe.
# The helper comes from the project module feature_and_regression and is called with an
# empty dict. NOTE(review): presumably a mapping of optional settings — confirm in the helper.

raw_regression_df = feat_and_reg.get_raw_regression_df({})

In [3]:
# Turn the raw frame into the feature matrix plus its companion outputs
# (target, binary_target, num_votes), then display the feature matrix.
# NOTE(review): the meaning of the empty list and of bad_movies=True is defined by the
# helper in feature_and_regression — confirm there.
processed_df, target, binary_target, num_votes = feat_and_reg.format_regression_df(
    raw_regression_df,
    [],
    bad_movies=True,
)
processed_df

Unnamed: 0_level_0,Western Europe,Asia,Africa and Middle-East,Eastern Europe and Russia,Central and South America,actor_number,gender_ratio,has_famous_actor,action,adventure,...,horror,animation,children,adult,fantasy,genre,title_length,combinned_movie_num,num_directors,combinned_movie_success
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
975900,0.0,0.0,0.0,0.0,0.0,17.0,-0.294118,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.165379,14.0,1.0,1
28463795,1.0,0.0,0.0,0.0,0.0,4.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.539345,1.0,1.0,0
261236,1.0,0.0,0.0,0.0,0.0,3.0,-0.333333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.870103,1.0,1.0,0
10408933,0.0,0.0,0.0,0.0,0.0,4.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.165379,18.0,1.0,1
24229100,0.0,1.0,0.0,0.0,0.0,5.0,-0.200000,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.984274,2.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28373912,0.0,0.0,0.0,1.0,0.0,20.0,-0.700000,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.539345,4.0,1.0,1
1918494,1.0,0.0,0.0,0.0,0.0,24.0,-0.636364,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.165379,6.0,1.0,1
664006,0.0,0.0,0.0,0.0,0.0,8.0,-0.250000,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.165379,33.0,1.0,1
24209227,0.0,0.0,0.0,0.0,0.0,8.0,-0.428571,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.984274,29.0,1.0,1


## Propensity Score

### Horror

In [16]:
# Fit the propensity model with 'horror' as the treatment indicator.
# reset_index() turns the movie_id index into a regular column so it can be passed as indx.
psm = PsmPy(
    processed_df.reset_index(),
    treatment='horror',
    indx='movie_id',
    exclude=[],
)
psm.logistic_ps(balance=True)

In [17]:
# Nearest-neighbour matching on the propensity logit, without replacement and with no caliper.
psm.knn_matched(matcher='propensity_logit', replacement=False, caliper=None)

In [18]:
# Collect every movie id that appears in either column of the matching table.
matched_ids = (
    set(psm.matched_ids["movie_id"])
    | set(psm.matched_ids["matched_ID"])
)

In [19]:
# Restrict the data to the matched sample and print the summary of a logistic regression on it.
# One boolean mask is built once and reused so features and target stay row-aligned.
matched_mask = processed_df.index.isin(matched_ids)
matched_df = processed_df[matched_mask]
matched_targets = binary_target[matched_mask]
# Let the project helper choose the covariates for the outcome model.
features_matched = feat_and_reg.forward_selection(matched_df, matched_targets, log_reg=True)
# Always keep the treatment in the model so its coefficient appears in the summary.
# This section matches on 'horror'; the cell previously forced 'action' in here, which is
# inconsistent with the Fantasy / Genre / Thriller sections that each force their own treatment.
if 'horror' not in features_matched:
    features_matched.append('horror')
model_matched = sm.Logit(matched_targets, sm.add_constant(matched_df[features_matched])).fit(disp = False)
print(model_matched.summary())

Optimization terminated successfully.
         Current function value: 0.618542
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.620156
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621977
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.619638
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.622027
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.616040
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.622173
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.622244
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.618599
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.621269
  

         Current function value: 0.549805
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.549461
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.549783
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.548355
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.549454
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.548444
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.549967
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.545593
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.545089
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.546553
         Iterations 6
Optimization termi

Optimization terminated successfully.
         Current function value: 0.539256
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.538877
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.538932
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.539270
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.538521
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.538540
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.538500
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.539114
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.538196
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.537945
  

### Fantasy

In [31]:
# Fit the propensity model with 'fantasy' as the treatment indicator.
# reset_index() turns the movie_id index into a regular column so it can be passed as indx.
psm = PsmPy(
    processed_df.reset_index(),
    treatment='fantasy',
    indx='movie_id',
    exclude=[],
)
psm.logistic_ps(balance=True)

In [32]:
# Match on the propensity logit, then fit and print a logistic regression on the matched sample.
psm.knn_matched(matcher='propensity_logit', replacement=False, caliper=None)
matched_ids = set(psm.matched_ids["movie_id"]) | set(psm.matched_ids["matched_ID"])
# One boolean mask selects the matched rows in both the features and the target.
matched_mask = processed_df.index.isin(matched_ids)
matched_df = processed_df[matched_mask]
matched_targets = binary_target[matched_mask]
features_matched = feat_and_reg.forward_selection(matched_df, matched_targets, log_reg=True)
# Keep the treatment column in the model even when forward selection did not pick it.
if 'fantasy' not in features_matched:
    features_matched.append('fantasy')
design_matrix = sm.add_constant(matched_df[features_matched])
model_matched = sm.Logit(matched_targets, design_matrix).fit(disp=False)
print(model_matched.summary())

Optimization terminated successfully.
         Current function value: 0.658349
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.657516
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.660289
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.658224
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.660298
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.649155
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.660244
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.657537
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.656719
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.660021
  

Optimization terminated successfully.
         Current function value: 0.575840
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.576673
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.576638
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.576172
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.575294
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.575759
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.576490
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.576467
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.575716
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.575971
  

Optimization terminated successfully.
         Current function value: 0.568728
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.568052
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.569157
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.569143
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.568049
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.567281
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.568003
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.567761
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.567315
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.567778
  

### Genre

In [33]:
# Fit the propensity model with 'genre' as the treatment indicator.
# reset_index() turns the movie_id index into a regular column so it can be passed as indx.
psm = PsmPy(
    processed_df.reset_index(),
    treatment='genre',
    indx='movie_id',
    exclude=[],
)
psm.logistic_ps(balance=True)

In [34]:
# Match on the propensity logit, then fit and print a logistic regression on the matched sample.
psm.knn_matched(matcher='propensity_logit', replacement=False, caliper=None)
matched_ids = set(psm.matched_ids["movie_id"]) | set(psm.matched_ids["matched_ID"])
# One boolean mask selects the matched rows in both the features and the target.
matched_mask = processed_df.index.isin(matched_ids)
matched_df = processed_df[matched_mask]
matched_targets = binary_target[matched_mask]
features_matched = feat_and_reg.forward_selection(matched_df, matched_targets, log_reg=True)
# Keep the treatment column in the model even when forward selection did not pick it.
if 'genre' not in features_matched:
    features_matched.append('genre')
design_matrix = sm.add_constant(matched_df[features_matched])
model_matched = sm.Logit(matched_targets, design_matrix).fit(disp=False)
print(model_matched.summary())

Optimization terminated successfully.
         Current function value: 0.685385
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688393
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.691441
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.689657
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.690849
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.687975
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.690483
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692530
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.690975
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.691658
  

Optimization terminated successfully.
         Current function value: 0.630591
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.623837
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.626595
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.625264
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.626335
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.624477
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.627185
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.626242
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.626964
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.626907
  

Optimization terminated successfully.
         Current function value: 0.614879
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.614136
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.614868
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.614418
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.614321
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.614737
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.614158
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.614850
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.614365
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.614877
  

### Thriller

In [35]:
# Fit the propensity model with 'thriller' as the treatment indicator.
# reset_index() turns the movie_id index into a regular column so it can be passed as indx.
psm = PsmPy(
    processed_df.reset_index(),
    treatment='thriller',
    indx='movie_id',
    exclude=[],
)
psm.logistic_ps(balance=True)

In [36]:
# Match on the propensity logit, then fit and print a logistic regression on the matched sample.
psm.knn_matched(matcher='propensity_logit', replacement=False, caliper=None)
matched_ids = set(psm.matched_ids["movie_id"]) | set(psm.matched_ids["matched_ID"])
# One boolean mask selects the matched rows in both the features and the target.
matched_mask = processed_df.index.isin(matched_ids)
matched_df = processed_df[matched_mask]
matched_targets = binary_target[matched_mask]
features_matched = feat_and_reg.forward_selection(matched_df, matched_targets, log_reg=True)
# Keep the treatment column in the model even when forward selection did not pick it.
if 'thriller' not in features_matched:
    features_matched.append('thriller')
design_matrix = sm.add_constant(matched_df[features_matched])
model_matched = sm.Logit(matched_targets, design_matrix).fit(disp=False)
print(model_matched.summary())

Optimization terminated successfully.
         Current function value: 0.686090
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.686929
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688384
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687419
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688432
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.681649
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688373
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687312
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.684610
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.683763
  

Optimization terminated successfully.
         Current function value: 0.626299
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.625889
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.626435
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.622791
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.622284
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.623856
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.623306
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.623839
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.623642
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.623835
  

Optimization terminated successfully.
         Current function value: 0.616705
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.615763
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.616447
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.616572
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.616819
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.616899
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.615678
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.615142
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.615680
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.614856
  

Optimization terminated successfully.
         Current function value: 0.611650
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611671
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611539
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611545
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611593
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611597
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611730
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611451
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611468
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611345
  