# Propensity Score Matching

In [104]:
import pandas as pd
import numpy as np

# Load the cleaned movie table and preview it with its original column labels.
movies_df = pd.read_csv("../data/output_data/clean_movies_sent_fem_star.csv")

print(movies_df.head(5))

# Column labels containing '-', '/' or ' ' cannot be written as bare names in a
# model formula, so each label listed here has those characters replaced by '_'.
# Only the labels listed are renamed; 'Box Office' is not in the list and keeps
# its original name in this cell.
labels_to_sanitize = [
    'Film-Noir',
    'Sci-Fi',
    'M/PG',
    'NC-17',
    'Not Rated',
    'PG-13',
    'TV-MA',
    'IMDB Rating',
    'IMDB Votes',
    'TMDB Rating',
    'Vote Count',
]
separator_to_underscore = str.maketrans("-/ ", "___")
movies_df = movies_df.rename(
    columns={label: label.translate(separator_to_underscore) for label in labels_to_sanitize}
)

                      Title  Year  Runtime  IMDB Rating  Metascore  \
0             Four Brothers  2005      109          6.8       49.0   
1  The Adventures of Tintin  2011      107          7.3       68.0   
2             Green Lantern  2011      114          5.5       39.0   
3           The Beastmaster  1982      118          6.2       18.0   
4        Kong: Skull Island  2017      118          6.7       62.0   

   IMDB Votes   Box Office     Budget    Revenue  TMDB Rating  ...  \
0      164844   74494381.0   45000000   92374674        6.800  ...   
1      249784   77591831.0  130000000  373993951        6.905  ...   
2      304201  116601172.0  200000000  219851172        5.174  ...   
3       26055   14056528.0    8000000   14056528        6.300  ...   
4      362341  168052812.0  185000000  566652812        6.547  ...   

   English_Language  Other_Language  Oscars_Won  Oscars_Nominated  Wins  \
0                 1               0           0                 0     5   
1       

In [105]:
# 'Box Office' is a continuous dollar amount, so a crosstab against it would
# create one column per distinct value. Summarise its distribution within each
# Female_Lead group instead (count, mean, std, quartiles).
movies_df.groupby('Female_Lead')['Box Office'].describe()

In [106]:
# List the column labels as they stand after the rename, so later cells can
# refer to the underscore-style names.
print(movies_df.columns)

Index(['Title', 'Year', 'Runtime', 'IMDB_Rating', 'Metascore', 'IMDB_Votes',
       'Box Office', 'Budget', 'Revenue', 'TMDB_Rating', 'Vote_Count', 'Month',
       'Day', 'Action', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film_Noir',
       'History', 'Horror', 'Music', 'Musical', 'Mystery', 'Romance', 'Sci_Fi',
       'Sport', 'Thriller', 'War', 'Western', 'Approved', 'G', 'GP', 'M',
       'M_PG', 'NC_17', 'Not_Rated', 'PG', 'PG_13', 'R', 'TV_MA', 'Unrated',
       'Top_Production_Company', 'Top_Director', 'Top_Writer', 'Domestic',
       'International', 'English_Language', 'Other_Language', 'Oscars_Won',
       'Oscars_Nominated', 'Wins', 'Nominations', 'Descr_Sentiment',
       'Tagline_Sentiment', 'Female_Lead', 'starpower'],
      dtype='object')

In [107]:
from sklearn.preprocessing import StandardScaler

# Continuous / count-like covariates that get standardised. The 0/1 indicator
# columns, Budget, Revenue, Box Office and IMDB_Rating are not in this list and
# are left on their original scale.
cols_to_scale = [
    'Year', 'Runtime', 'Metascore', 'IMDB_Votes', 'TMDB_Rating',
    'Vote_Count', 'Month', 'Day',
    'Oscars_Won', 'Oscars_Nominated', 'Wins', 'Nominations',
    'Descr_Sentiment', 'Tagline_Sentiment', 'starpower',
]

# Fit the scaler on the selected columns and overwrite them with the
# standardised values (this replaces the raw values in movies_df).
scaler = StandardScaler()
unscaled_block = movies_df[cols_to_scale]
scaled_block = scaler.fit_transform(unscaled_block)
movies_df[cols_to_scale] = scaled_block

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# Candidate regressors: every float/int column except the treatment indicator.
X = (
    movies_df
    .drop(columns=["Female_Lead"])
    .select_dtypes(include=[float, int])
)

# One variance inflation factor per column of X, computed against all the
# other columns of the same matrix.
design_matrix = X.values
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, column_pos)
        for column_pos in range(design_matrix.shape[1])
    ],
})

# Check for high VIF values (only rows above 10 are shown).
high_vif_rows = vif_data["VIF"] > 10
print(vif_data[high_vif_rows])

             feature           VIF
7            Revenue  1.159141e+01
34          Approved  5.883213e+12
35                 G  3.336000e+14
36                GP  4.094181e+13
37                 M  3.216857e+14
38              M_PG  1.916425e+14
39             NC_17  9.191020e+13
40         Not_Rated  1.047349e+14
41                PG  1.047349e+14
42             PG_13  6.004800e+14
43                 R  8.417943e+13
44             TV_MA  2.038750e+12
45           Unrated  5.700759e+13
49          Domestic  2.251800e+15
50     International  9.007199e+15
51  English_Language  4.094181e+14
52    Other_Language  2.094698e+14

In [109]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Propensity model: binomial GLM of Female_Lead on the covariates below.
#
# Other_Language, International, Musical and Approved are left out (one from
# each group of categorical indicator columns).
#
# This is the version WITHOUT the ex-post variables. The alternative version
# that includes them adds these terms to the same formula:
#   Metascore, IMDB_Votes, TMDB_Rating, Vote_Count,
#   Oscars_Won, Oscars_Nominated, Wins, Nominations
release_terms = ("Year", "Runtime", "Budget", "Month", "Day")
genre_terms = (
    "Action", "Adventure", "Animation", "Biography", "Comedy", "Crime",
    "Documentary", "Drama", "Family", "Fantasy", "Film_Noir", "History",
    "Horror", "Music", "Mystery", "Romance", "Sci_Fi", "Sport", "Thriller",
    "War", "Western",
)
rating_terms = (
    "G", "GP", "M", "M_PG", "NC_17", "Not_Rated", "PG", "PG_13", "R",
    "TV_MA", "Unrated",
)
production_terms = (
    "Top_Production_Company", "Top_Director", "Top_Writer", "Domestic",
    "English_Language",
)
text_and_cast_terms = ("Descr_Sentiment", "Tagline_Sentiment", "starpower")

formula = "Female_Lead ~ " + " + ".join(
    release_terms + genre_terms + rating_terms + production_terms + text_and_cast_terms
)

binomial_family = sm.families.Binomial()
prop_model = smf.glm(formula=formula, data=movies_df, family=binomial_family).fit()

print(prop_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:            Female_Lead   No. Observations:                 2816
Model:                            GLM   Df Residuals:                     2770
Model Family:                Binomial   Df Model:                           45
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1175.3
Date:                Tue, 29 Jul 2025   Deviance:                       2350.6
Time:                        15:35:11   Pearson chi2:                 2.76e+03
No. Iterations:                    22   Pseudo R-squ. (CS):            0.08372
Covariance Type:            nonrobust                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -0

In [110]:
# Store each movie's fitted value from the propensity model (the model's
# predicted Female_Lead value for that row) in a new 'ps' column, then show
# the frame.
movies_df['ps'] = prop_model.predict(exog=movies_df)
movies_df

A propensity score closer to 0 indicates a lower estimated likelihood that the movie has a female lead, while a score closer to 1 indicates a higher estimated likelihood.

In [111]:
def nearest_opposite_group_match(treatment, scores):
    """Nearest-neighbour match (with replacement) on a score across treatment groups.

    For every row, find the row with a *different* treatment value whose score
    is closest in absolute terms, and return that row's index label.

    Parameters
    ----------
    treatment : pd.Series
        Group indicator per row (here: Female_Lead).
    scores : pd.Series
        Score per row (here: the propensity score), sharing ``treatment``'s
        index. The index is assumed to be unique with numeric labels.

    Returns
    -------
    pd.Series
        Float series on the same index holding the matched row's label. A row
        stays NaN when its own score is missing or when no row from another
        group has a usable score.

    Notes
    -----
    Ties go to the earliest candidate in row order, and candidates with a
    missing score are ignored, which is what ``Series.idxmin`` does in a
    row-by-row search. A dense (group size x candidate pool size) distance
    matrix is built per group, so memory grows with the product of the two.
    """
    matches = pd.Series(np.nan, index=scores.index, dtype=float)
    has_score = scores.notna()

    for level in treatment.dropna().unique():
        own = scores[(treatment == level) & has_score]
        pool = scores[(treatment != level) & has_score]
        if own.empty or pool.empty:
            continue

        # Rows: members of this group; columns: candidates from the other group(s).
        gaps = np.abs(own.to_numpy()[:, None] - pool.to_numpy()[None, :])

        # argmin picks the first minimum per row, i.e. the earliest candidate on ties.
        matches.loc[own.index] = pool.index.to_numpy()[gaps.argmin(axis=1)]

    return matches


# For each movie, record the index label of the movie in the opposite
# Female_Lead group whose propensity score is closest to its own.
movies_df['match'] = nearest_opposite_group_match(movies_df['Female_Lead'], movies_df['ps'])

In [112]:
# movies_df[45:55]

In [113]:
# movies_df.iloc[621]  # Display the row with index 2597
# See an example where Hunger Games: Mockingjay Part 2 is matched with 2 Fast 2 Furious

## Looking at Box Office as the Outcome Variable

In [None]:
# Give the outcome column an underscore-style name like the other columns.
movies_df = movies_df.rename(columns={'Box Office': 'Box_Office'})

# Expose the row label as a 'movie_idx' column so the 'match' labels can be
# joined against it. The guard keeps the cell safe to re-run: without it a
# second run would add a second 'movie_idx' column.
if 'movie_idx' not in movies_df.columns:
    movies_df = movies_df.reset_index().rename(columns={'index': 'movie_idx'})

# Treated units: movies with a female lead.
female_lead_df = movies_df[movies_df['Female_Lead'] == 1]

# Attach each female-lead movie's matched movie side by side; columns of the
# female-lead movie get the '.female' suffix, those of its match get '.male'.
matched_df = female_lead_df.merge(movies_df, left_on='match', right_on='movie_idx', how='left', suffixes=('.female', '.male'))
print(matched_df.head())

# Compare the two groups on the same set of pairs: a pair is used only when
# both movies have a Box_Office value. Box Office loads as a float column,
# which suggests it contains missing values (TODO confirm); averaging each side
# separately would then silently use different rows for the two means.
box_office_pairs = matched_df[['Box_Office.female', 'Box_Office.male']].dropna()
mean_fem = box_office_pairs['Box_Office.female'].mean()
mean_male = box_office_pairs['Box_Office.male'].mean()

print(f"Mean Box Office for Female Lead: {mean_fem}")
print(f"Mean Box Office for Male Lead: {mean_male}")

   movie_idx.female                  Title.female  Year.female  \
0                11  Maleficent: Mistress of Evil     1.012890   
1                17                       Species    -0.690102   
2                18            A Cinderella Story    -0.051480   
3                21   The Hunchback of Notre Dame    -0.619144   
4                29               Double Jeopardy    -0.406270   

   Runtime.female  IMDB_Rating.female  Metascore.female  IMDB_Votes.female  \
0        0.277090                 6.6         -0.875368          -0.333726   
1       -0.256919                 5.9         -0.532970          -0.468345   
2       -0.888021                 5.9         -1.902561          -0.435065   
3       -1.082206                 7.0          0.893687          -0.149703   
4       -0.402558                 6.5         -0.989500          -0.429025   

   Box_Office.female  Budget.female  Revenue.female  ...  Oscars_Won.male  \
0        113929605.0      185000000       491730089  ... 

## Looking at IMDb Rating as the Outcome Variable

In [None]:
# Average IMDb rating on each side of the matched pairs: the female-lead
# movies ('.female' columns) versus the movies matched to them ('.male').
female_side_ratings = matched_df['IMDB_Rating.female']
male_side_ratings = matched_df['IMDB_Rating.male']

mean_fem, mean_male = female_side_ratings.mean(), male_side_ratings.mean()

print(f"Mean IMDb Rating for Female Lead: {mean_fem}")
print(f"Mean IMDb Rating for Male Lead: {mean_male}")

Mean IMDb Rating for Female Lead: 6.329303278688524
Mean IMDb Rating for Male Lead: 6.600819672131147