In [41]:
import src.models.question1 as q1

In [42]:
# Load the winners and nominees of the Oscars
winners, nominees = q1.load_oscar_winners_nominees()

### Causal effect of winning an oscar

We will now study the causal effect of winning an oscar on the average rating.  
To do so, we consider the control group to be the oscar nominees.  
This choice is made so that we compare movies of similar level of qualities (we do not compare a very well crafted movie with a small quick win production).  

To study the causal, we need to construct a graph of pairs between control and treated.  
We will create pairs based on the released year of the movie (such that movies in a pair are done in the same year).  
We do so because the technical abilities to produce movies has greatly change over time and could introduce a hidden variable bias.

In [51]:
import networkx as nx

G = nx.Graph()

for i in range(10):
    for j in range(5):
        G.add_edge(i, j)

print(G.edges(data=True))
        

[(0, 0, {}), (0, 1, {}), (0, 2, {}), (0, 3, {}), (0, 4, {}), (0, 5, {}), (0, 6, {}), (0, 7, {}), (0, 8, {}), (0, 9, {}), (1, 1, {}), (1, 2, {}), (1, 3, {}), (1, 4, {}), (1, 5, {}), (1, 6, {}), (1, 7, {}), (1, 8, {}), (1, 9, {}), (2, 2, {}), (2, 3, {}), (2, 4, {}), (2, 5, {}), (2, 6, {}), (2, 7, {}), (2, 8, {}), (2, 9, {}), (3, 3, {}), (3, 4, {}), (3, 5, {}), (3, 6, {}), (3, 7, {}), (3, 8, {}), (3, 9, {}), (4, 4, {}), (4, 5, {}), (4, 6, {}), (4, 7, {}), (4, 8, {}), (4, 9, {})]


In [43]:
import networkx as nx
import pandas as pd

control_df: pd.DataFrame = nominees.reset_index(drop=True)
treated_df: pd.DataFrame = winners.reset_index(drop=True)

print(len(control_df))
print(len(treated_df))

G = nx.Graph()

for control_id, control_row in control_df.iterrows():
    for treated_id, treated_row in treated_df.iterrows():
        if control_row['release'] == treated_row['release']:
            diff_nb_votes = abs(control_row['numVotes'] - treated_row['numVotes'])
            #print(f"Adding edge between {control_id} and {treated_id} with weight {diff_nb_votes}")

            if treated_id > 60:
                print(f"Adding edge between {control_id} and {treated_id} with weight {diff_nb_votes}")

            G.add_weighted_edges_from([(control_id, treated_id, diff_nb_votes)])

print(G.edges(data=True))

matching = nx.min_weight_matching(G)

print(f"We have {len(matching)} matches")
print(matching)

231
53
[(0, 1, {'weight': 156752}), (0, 8, {'weight': 435759}), (0, 9, {'weight': 65156}), (0, 10, {'weight': 65532}), (1, 1, {'weight': 21222}), (1, 2, {'weight': 8541}), (1, 3, {'weight': 11984}), (2, 4, {'weight': 31365}), (2, 5, {'weight': 31416}), (2, 6, {'weight': 11668}), (2, 7, {'weight': 12345}), (3, 11, {'weight': 3105}), (3, 12, {'weight': 8135}), (3, 13, {'weight': 37393}), (4, 14, {'weight': 13303}), (4, 15, {'weight': 10154}), (4, 16, {'weight': 115644}), (5, 21, {'weight': 128422}), (5, 22, {'weight': 128328}), (5, 23, {'weight': 133945}), (6, 24, {'weight': 34403}), (6, 25, {'weight': 12122}), (6, 26, {'weight': 20288}), (6, 27, {'weight': 78284}), (7, 28, {'weight': 95528}), (7, 29, {'weight': 5866}), (7, 30, {'weight': 9593}), (7, 31, {'weight': 26879}), (8, 32, {'weight': 38566}), (8, 33, {'weight': 39810}), (8, 34, {'weight': 98159}), (8, 35, {'weight': 6797}), (9, 36, {'weight': 137331}), (9, 37, {'weight': 161130}), (9, 38, {'weight': 140451}), (9, 39, {'weight': 

We may now cmopute the average treatement effect to estimate the effect of winning an oscar on the average rating.

In [22]:
ATE = 0

for pair in matching:
    
    ATE += treated_row['averageRating'] - control_row['averageRating']

ATE /= len(matching)

print(f"The Average Treatment Effect is {ATE:.2f} rating units")

KeyError: 107