In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [14]:
# Read one endorsement table per party, then discard every row whose
# 'primary_pctg' is missing.
dem_df = pd.read_csv('../data/dem_num_endorsements.csv', encoding="ISO-8859-1")
dem_df = dem_df.dropna(subset=['primary_pctg'])

rep_df = pd.read_csv('../data/rep_num_endorsements.csv', encoding="ISO-8859-1")
rep_df = rep_df.dropna(subset=['primary_pctg'])

# Columns removed from the feature matrix before fitting (passed to
# run_random_forest as `drop_cols`).
# NOTE(review): an earlier comment here said the race-primary date column is
# dropped as well, but no such column appears in either list - confirm intent.
rep_drop_cols = [
    'won_primary',
    'primary_status',
    'general_status',
    'primary_pctg',
    'candidate',
    'primary_runoff_status',
    'num_non_endorsements',
    'num_endorsements',
]

# The Democratic list is the Republican list plus two extra columns; `+` builds
# a separate list object, so later edits to one list do not affect the other.
dem_drop_cols = rep_drop_cols + ['partisan_lean', 'district']

def run_random_forest(df, drop_cols, seed=None):
    """Fit a random forest on a random split of `df` and print what it learned.

    Each row is independently assigned to the training partition with
    probability 0.8; the remaining rows form the test partition. A
    RandomForestRegressor (100 trees, max_depth=2, random_state=0) is fit on
    the training rows to predict the 'primary_pctg' and 'won_primary' columns
    jointly. The function then prints the value of `regr.score` on the test
    partition followed by the feature importances, sorted in descending order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'primary_pctg', 'won_primary' and every name in
        `drop_cols`. All columns left after dropping are used as features.
        NOTE(review): assumes the remaining columns and 'won_primary' are
        numeric, since they are handed straight to the regressor - confirm.
    drop_cols : list of str
        Columns to exclude from the feature matrix.
    seed : int or None, optional
        Seed for the train/test row assignment. With the default (None) the
        mask is drawn from NumPy's global random state, exactly as before, so
        the split differs between runs unless the caller seeded NumPy. Pass an
        integer to make the split, and therefore the output, repeatable.

    Returns
    -------
    None
        Results are printed only, so calling this as the last expression of a
        notebook cell does not produce an additional Out[] display.
    """
    # Draw one uniform number per row; rows below 0.8 go to the training set.
    if seed is None:
        draws = np.random.rand(len(df))
    else:
        draws = np.random.RandomState(seed).rand(len(df))
    msk = draws < 0.8
    train = df[msk]
    test = df[~msk]

    # Both targets are fit together as one multi-output regression.
    target_cols = ['primary_pctg', 'won_primary']

    train_x = train.drop(drop_cols, axis=1)
    train_y = train[target_cols]

    test_x = test.drop(drop_cols, axis=1)
    test_y = test[target_cols]

    regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)
    regr.fit(train_x, train_y)

    # The score used to be computed and then thrown away (a bare expression
    # inside a function displays nothing); report it so the importances below
    # can be judged against how well the model actually generalises.
    print("Test-set score (R^2):", regr.score(test_x, test_y))

    feature_importances = pd.DataFrame(
        regr.feature_importances_,
        index=train_x.columns,
        columns=['importance'],
    ).sort_values('importance', ascending=False)
    print(feature_importances)


In [15]:
# 'partisan_lean' and 'district' are already part of dem_drop_cols where the
# list is defined, so nothing needs to be appended here.
# NOTE(review): the saved output below still lists both columns as features,
# so it appears to predate the current drop list - re-run this cell to refresh.

print("DEMOCRATIC: ")
run_random_forest(df=dem_df, drop_cols=dem_drop_cols)
print("\n")

DEMOCRATIC: 
                         importance
partisan_lean              0.634910
guns_sense_candidate       0.185643
emily_endorsed             0.079028
state                      0.038771
district                   0.019106
elected_official           0.017243
race_primary               0.012536
dem_party_support          0.009377
veteran                    0.001694
race                       0.001692
lgbtq                      0.000000
sanders_endorsed           0.000000
votevets_endorsed          0.000000
wfp_endorsed               0.000000
indivisible_endorsed       0.000000
pccc_endorsed              0.000000
justice_dems_endorsed      0.000000
our_revolution_endorsed    0.000000
warren_endorsed            0.000000
biden_endorsed             0.000000
office_type                0.000000
race_type                  0.000000
obama_alum                 0.000000
stem                       0.000000
self_funder                0.000000
no_labels_support          0.000000




In [16]:
# Same analysis for the Republican table, using its own drop list.
print("REPUBLICAN: ")
run_random_forest(df=rep_df, drop_cols=rep_drop_cols)

REPUBLICAN: 
                            importance
right_to_life_endorsed        0.662182
district                      0.092004
susan_b_anthony_endorsed      0.090403
koch_support                  0.068866
rep_party_support             0.040039
club_for_growth_endorsed      0.014544
main_street_endorsed          0.011855
tea_party_endorsed            0.011239
house_freedom_support         0.006127
state                         0.002163
race_primary_election_date    0.000577
chamber_endorsed              0.000000
nra_endorsed                  0.000000
great_america_endorsed        0.000000
bannon_endorsed               0.000000
trump_endorsed                0.000000
race_type                     0.000000
office_type                   0.000000
no_labels_support             0.000000
