In [2]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [3]:
# Disabled reference example, kept as a bare string literal rather than live code:
# it shows a MixedLM fit of "Weight ~ Time" grouped by "Pig" on the "dietox"
# dataset. Nothing inside the quotes is executed; the cell's only effect is to
# evaluate to (and display) the string itself.
'''
data = sm.datasets.get_rdataset("dietox", "geepack").data
print(data)
md = smf.mixedlm("Weight ~ Time", data, groups=data["Pig"])
mdf = md.fit()
print(mdf.summary())
'''

'\ndata = sm.datasets.get_rdataset("dietox", "geepack").data\nprint(data)\nmd = smf.mixedlm("Weight ~ Time", data, groups=data["Pig"])\nmdf = md.fit()\nprint(mdf.summary())\n'

## Encode data

In [46]:
# Columns removed from the encoded frames before the models are fitted.
drop_cols = [
    'primary_status',
    'general_status',
    'primary_runoff_status',
    'twitter_handle',
]

# Columns to label-encode for the Democratic candidates.
# Order matters: the model cells build their predictor list with `[2:]`,
# i.e. everything after 'candidate' and 'won_primary'.
encode_dem_cols = [
    'candidate', 'won_primary',
    'state', 'district', 'office_type', 'race_type', 'race', 'veteran', 'race_primary',
    'lgbtq', 'elected_official', 'self_funder', 'stem', 'obama_alum',
    'dem_party_support', 'emily_endorsed', 'guns_sense_candidate',
    'biden_endorsed', 'warren_endorsed', 'sanders_endorsed',
    'our_revolution_endorsed', 'justice_dems_endorsed', 'pccc_endorsed',
    'indivisible_endorsed', 'wfp_endorsed', 'votevets_endorsed',
    'no_labels_support',
]

# Columns to label-encode for the Republican candidates (same `[2:]` convention).
encode_rep_cols = [
    'candidate', 'won_primary',
    'state', 'district', 'office_type', 'race_type', 'race_primary_election_date',
    'rep_party_support', 'trump_endorsed', 'bannon_endorsed',
    'great_america_endorsed', 'nra_endorsed', 'right_to_life_endorsed',
    'susan_b_anthony_endorsed', 'club_for_growth_endorsed', 'koch_support',
    'house_freedom_support', 'tea_party_endorsed', 'main_street_endorsed',
    'chamber_endorsed', 'no_labels_support',
]

# Load each party's candidate file and keep only rows that have a primary_pctg value.
dem_df = (
    pd.read_csv('../data/dem_candidates.csv', encoding="ISO-8859-1")
    .dropna(subset=['primary_pctg'])
)
rep_df = (
    pd.read_csv('../data/rep_candidates.csv', encoding="ISO-8859-1")
    .dropna(subset=['primary_pctg'])
)

# Single encoder instance shared by label_encode_data; it is refit for every column.
label_encoder = LabelEncoder()

def label_encode_data(df, encode_cols):
    """Return a copy of ``df`` with the given columns replaced by integer labels.

    Each column named in ``encode_cols`` is cast to ``str`` and passed through
    the module-level ``label_encoder.fit_transform``. Because of the string
    cast, missing values are stringified as well (e.g. NaN becomes ``'nan'``)
    and therefore receive a label of their own.

    The encoder is refit for every column, so after the call ``label_encoder``
    only holds the mapping of the last column processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode. It is NOT modified; the
        encoding is applied to a copy so the raw data stays available and
        re-running the cell always starts from the same input.
    encode_cols : iterable of str
        Names of the columns to label-encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``, where every column in
        ``encode_cols`` holds the encoder's integer codes.
    """
    # Work on a copy: assigning into `df` directly would overwrite the caller's
    # frame and make the "raw" and "encoded" variables aliases of one object.
    encoded = df.copy()
    for col in encode_cols:
        encoded[col] = label_encoder.fit_transform(encoded[col].values.astype(str))

    return encoded

# Label-encode the categorical columns of each party's frame.
dem_df_enc = label_encode_data(df=dem_df, encode_cols=encode_dem_cols)
rep_df_enc = label_encode_data(df=rep_df, encode_cols=encode_rep_cols)

# Persist the encoded frames; index=False keeps the row index out of the files.
dem_df_enc.to_csv(path_or_buf='../data/encoded_dem_data.csv', index=False)
rep_df_enc.to_csv(path_or_buf='../data/encoded_rep_data.csv', index=False)


## Democratic - mixedlm

In [49]:
# Modelling frame for the Democratic candidates: the encoded data without drop_cols.
dem_df = dem_df_enc.drop(drop_cols, axis=1)
# Predictors are every encoded column after the first two entries
# ('candidate' is used as the grouping variable, 'won_primary' as the response).
dem_dep_cols = " + ".join(encode_dem_cols[2:])


# The formula uses dem_dep_cols, the only predictor string this cell defines,
# so the cell runs on a fresh kernel (Restart & Run All).
# With no re_formula given, MixedLM fits a random intercept per `groups` value.
model = smf.mixedlm("won_primary ~ " + dem_dep_cols, dem_df, groups=dem_df["candidate"]).fit()
print(model.summary())

              Mixed Linear Model Regression Results
Model:               MixedLM    Dependent Variable:    won_primary
No. Observations:    857        Method:                REML       
No. Groups:          790        Scale:                 0.0167     
Min. group size:     1          Likelihood:            -614.5427  
Max. group size:     2          Converged:             Yes        
Mean group size:     1.1                                          
------------------------------------------------------------------
                        Coef.  Std.Err.   z    P>|z| [0.025 0.975]
------------------------------------------------------------------
Intercept               -0.987    0.277 -3.563 0.000 -1.530 -0.444
state                    0.000    0.005  0.022 0.983 -0.009  0.010
district                 0.000    0.001  0.219 0.826 -0.002  0.002
office_type              0.082    0.106  0.774 0.439 -0.126  0.290
race_type               -0.008    0.056 -0.151 0.880 -0.119  0.102
race      

In [50]:
# Same Democratic predictors and grouping, now with primary_pctg as the response.
model = smf.mixedlm(
    formula="primary_pctg ~ " + dem_dep_cols,
    data=dem_df,
    groups=dem_df["candidate"],
).fit()
print(model.summary())

                Mixed Linear Model Regression Results
Model:                 MixedLM    Dependent Variable:    primary_pctg
No. Observations:      857        Method:                REML        
No. Groups:            790        Scale:                 0.5801      
Min. group size:       1          Likelihood:            -3757.0390  
Max. group size:       2          Converged:             Yes         
Mean group size:       1.1                                           
---------------------------------------------------------------------
                         Coef.  Std.Err.   z    P>|z|  [0.025  0.975]
---------------------------------------------------------------------
Intercept               -71.389   13.974 -5.109 0.000 -98.777 -44.002
state                     0.266    0.250  1.067 0.286  -0.223   0.755
district                 -0.107    0.048 -2.209 0.027  -0.201  -0.012
office_type              19.696    5.420  3.634 0.000   9.072  30.320
race_type                 1.864    0

## Republican - mixedlm

In [51]:
# Modelling frame for the Republican candidates: the encoded data without drop_cols.
rep_df = rep_df_enc.drop(drop_cols, axis=1)
# Predictors are every encoded column after 'candidate' and 'won_primary'.
rep_dep_cols = " + ".join(encode_rep_cols[2:])

# Mixed linear model of won_primary, grouped by the encoded candidate label.
model = smf.mixedlm(
    formula="won_primary ~ " + rep_dep_cols,
    data=rep_df,
    groups=rep_df["candidate"],
).fit()
print(model.summary())

                Mixed Linear Model Regression Results
Model:                MixedLM     Dependent Variable:     won_primary
No. Observations:     773         Method:                 REML       
No. Groups:           755         Scale:                  0.0048     
Min. group size:      1           Likelihood:             -476.9141  
Max. group size:      8           Converged:              Yes        
Mean group size:      1.0                                            
---------------------------------------------------------------------
                           Coef.  Std.Err.   z    P>|z| [0.025 0.975]
---------------------------------------------------------------------
Intercept                  -0.623    0.185 -3.357 0.001 -0.986 -0.259
state                       0.001    0.002  0.512 0.609 -0.002  0.004
district                   -0.000    0.001 -0.567 0.571 -0.001  0.001
office_type                 0.095    0.066  1.446 0.148 -0.034  0.224
race_type                  -0.006   

In [52]:
# Same Republican predictors and grouping, now with primary_pctg as the response.
model = smf.mixedlm(
    formula="primary_pctg ~ " + rep_dep_cols,
    data=rep_df,
    groups=rep_df["candidate"],
).fit()
print(model.summary())

                 Mixed Linear Model Regression Results
Model:                  MixedLM     Dependent Variable:     primary_pctg
No. Observations:       773         Method:                 REML        
No. Groups:             755         Scale:                  117.7749    
Min. group size:        1           Likelihood:             -3623.2014  
Max. group size:        8           Converged:              Yes         
Mean group size:        1.0                                             
------------------------------------------------------------------------
                            Coef.  Std.Err.   z    P>|z|  [0.025  0.975]
------------------------------------------------------------------------
Intercept                  -57.776   13.210 -4.374 0.000 -83.668 -31.885
state                        0.116    0.142  0.819 0.413  -0.162   0.394
district                    -0.029    0.043 -0.659 0.510  -0.114   0.057
office_type                  9.452    5.176  1.826 0.068  -0.694  19.