In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [19]:
data = pd.read_csv("/content/drive/MyDrive/ADA/congress_115_116.csv")
congress=data[ (data.party!='ID')]#We remove the independant politician


# Observational Studies

As we have handpicked our own dataset from the quotebank, we want to know what are the possible limitations that are due to Covariates on the outcome. To do so an observational study will be made.

In theory, only the political party of the speaker should have an influence on the result;In practise, some covariates may change the outcome too. 


The "treatment" variable is set to be the party of the speaker (1 if republican R  and 0 if democrat D) and the observed covariates are the age , party and the gender of the speaker.

In [25]:
obs_congress= congress[["party" ,"gender","date_of_birth","state"]]
obs_congress['year_of_birth']=pd.DatetimeIndex(obs_congress['date_of_birth']).year 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [26]:
obs_congress.head(2)

Unnamed: 0,party,gender,date_of_birth,state,year_of_birth
0,R,M,1954-09-16,LA,1954
1,D,F,1946-05-27,NC,1946


In [28]:
obs_features = ['party' ,'gender','year_of_birth','state']
obs_congress= pd.get_dummies(obs_congress[obs_features])

In [29]:
obs_congress.head(2)

Unnamed: 0,year_of_birth,party_D,party_R,gender_F,gender_M,state_AK,state_AL,state_AR,state_AZ,state_CA,state_CO,state_CT,state_DE,state_FL,state_GA,state_HI,state_IA,state_ID,state_IL,state_IN,state_KS,state_KY,state_LA,state_MA,state_MD,state_ME,state_MI,state_MN,state_MO,state_MS,state_MT,state_NC,state_ND,state_NE,state_NH,state_NJ,state_NM,state_NV,state_NY,state_OH,state_OK,state_OR,state_PA,state_RI,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
0,1954,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1946,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [30]:
# We normalize continuous variables mainly the date of birth 

obs_congress['year_of_birth']=(obs_congress['year_of_birth'] - obs_congress['year_of_birth'].mean())/obs_congress['year_of_birth'].std()

In [31]:
all_columns =  list(obs_congress.columns) #"+".join

In [32]:
all_columns.remove("party_D")
all_columns.remove("party_R")

In [33]:
model="party_R~" + ' + '.join(all_columns)

In [34]:
model

'party_R~year_of_birth + gender_F + gender_M + state_AK + state_AL + state_AR + state_AZ + state_CA + state_CO + state_CT + state_DE + state_FL + state_GA + state_HI + state_IA + state_ID + state_IL + state_IN + state_KS + state_KY + state_LA + state_MA + state_MD + state_ME + state_MI + state_MN + state_MO + state_MS + state_MT + state_NC + state_ND + state_NE + state_NH + state_NJ + state_NM + state_NV + state_NY + state_OH + state_OK + state_OR + state_PA + state_RI + state_SC + state_SD + state_TN + state_TX + state_UT + state_VA + state_VT + state_WA + state_WI + state_WV + state_WY'

In [35]:
mod = smf.logit(formula=model, data=obs_congress)

res = mod.fit()

# Extract the estimated propensity scores
obs_congress['Propensity_score'] = res.predict()

print(res.summary())

         Current function value: 0.415979
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:                party_R   No. Observations:                  386
Model:                          Logit   Df Residuals:                      334
Method:                           MLE   Df Model:                           51
Date:                Fri, 12 Nov 2021   Pseudo R-squ.:                  0.3996
Time:                        17:58:55   Log-Likelihood:                -160.57
converged:                      False   LL-Null:                       -267.43
Covariance Type:            nonrobust   LLR p-value:                 8.321e-22
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -0.4147        nan        nan        nan         nan         nan
year_of_birth     0.5421      0.156      3.478      0.00

  bse_ = np.sqrt(np.diag(self.cov_params()))


# Summary of observed covariate analysis

We clearly see that age and gender are not covariate of concern (low coefficients:  0.5421 for age and -1.1399 for women). Indeed, both parties have politicians that tend to be old and have a similar proportion of women.

However, we can see that states play a big role  in determening the political party of the candidate.  A state with a lot of democrats like Rhode Island for example has a coefficient of -51.47 ! While on the other side of the spectrum a republican state like Indiana has 34.54. 
Some states are in the middle and have a split repartition like Florida with a coefficient of 0.18. 

Therefore we should be cautious about our analysis as states are covariate to take into account when interpreting the final results!