In [41]:
import src.models.question1 as q1

In [54]:
# Load the winners and nominees of the Oscars.
# The cells below read the 'winner', 'release', 'numVotes' and 'averageRating'
# columns of this frame and use its index labels as movie identifiers.
oscar_movies_df = q1.load_oscar_movies()

### Causal effect of winning an Oscar

We will now study the causal effect of winning an Oscar on the average rating.  
To do so, we take the Oscar nominees that did not win as the control group, and the winners as the treated group.  
This choice ensures that we compare movies of a similar level of quality (we do not compare a very well-crafted movie with a small quick-win production).  

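To study the causal effect, we need to construct a graph of candidate pairs between control and treated movies.  
We only pair movies that share the same release year (so that both movies in a pair were made in the same year).  
We do so because the technical means available to produce movies have greatly changed over time, which could otherwise introduce a hidden-variable (confounding) bias.  
Each candidate pair is weighted by the absolute difference in number of votes, and a minimum-weight matching then selects the final pairs.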

In [55]:
import networkx as nx
import pandas as pd

# Treated group: rows flagged as winners. Control group: rows flagged as non-winners.
control_df: pd.DataFrame = oscar_movies_df[oscar_movies_df['winner'] == 0]
treated_df: pd.DataFrame = oscar_movies_df[oscar_movies_df['winner'] == 1]

# Index the treated movies by release year once, so that each control movie is
# only compared with the treated movies of its own year instead of scanning the
# whole control x treated cross product with nested `iterrows()` calls.
# Rows are appended in DataFrame order, so edges are inserted in the same order
# as a plain nested loop over both frames would insert them.
treated_by_year = {}
for treated_id, year, treated_votes in zip(
    treated_df.index, treated_df['release'], treated_df['numVotes']
):
    if pd.isna(year):
        # A missing release year never equals another year: such a movie cannot be paired.
        continue
    treated_by_year.setdefault(year, []).append((treated_id, treated_votes))

# Graph whose nodes are movie index labels and whose edges link a control movie
# to a treated movie released the same year, weighted by how far apart their
# vote counts are.
G = nx.Graph()

for control_id, year, control_votes in zip(
    control_df.index, control_df['release'], control_df['numVotes']
):
    for treated_id, treated_votes in treated_by_year.get(year, ()):
        diff_nb_votes = abs(control_votes - treated_votes)

        G.add_edge(control_id, treated_id, weight=diff_nb_votes)

# Set of (node, node) pairs; which side of a pair is the treated movie is not
# guaranteed, so consumers must check membership themselves.
matching = nx.min_weight_matching(G)

print(f"We have {len(matching)} matches")
print(matching)

We have 53 matches
{(90, 91), (202, 204), (3, 4), (18, 20), (84, 81), (147, 149), (14, 16), (40, 44), (8, 9), (178, 176), (97, 96), (152, 151), (74, 73), (104, 105), (225, 224), (75, 78), (115, 114), (230, 232), (109, 107), (214, 211), (174, 172), (283, 268), (166, 168), (185, 187), (160, 161), (254, 253), (51, 52), (133, 134), (183, 182), (282, 269), (112, 111), (144, 146), (129, 127), (245, 265), (121, 123), (30, 32), (140, 142), (88, 86), (136, 135), (99, 101), (35, 37), (154, 156), (26, 28), (243, 249), (191, 193), (242, 238), (196, 195), (279, 275), (2, 12), (45, 47), (68, 67), (63, 62), (119, 118)}


We may now compute the average treatment effect to estimate the effect of winning an Oscar on the average rating.

In [57]:
def is_treated(node_id):
    """Return True when `node_id` is an index label of `treated_df` (an Oscar winner)."""
    return node_id in treated_df.index


# Without any matched pair the average is undefined; fail with an explicit
# message instead of an opaque ZeroDivisionError on the final division.
if len(matching) == 0:
    raise ValueError("No matched pairs: the Average Treatment Effect cannot be computed")

# Only the rating column is needed, so look ratings up directly rather than
# materialising a full row per movie.
average_rating = oscar_movies_df['averageRating']

ATE = 0

for pair in matching:
    # The matching does not say which element of a pair is the treated movie,
    # so decide it from membership in the treated frame.
    if is_treated(pair[1]):
        control_id, treated_id = pair[0], pair[1]
    else:
        control_id, treated_id = pair[1], pair[0]

    # Per-pair effect: rating of the winner minus rating of its matched nominee.
    ATE += average_rating.loc[treated_id] - average_rating.loc[control_id]

ATE /= len(matching)

print(f"The Average Treatment Effect is {ATE:.2f} rating units")

The Average Treatment Effect is 0.22 rating units
