In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [2]:
dem_df = pd.read_csv('../data/dem_num_endorsements.csv', encoding="ISO-8859-1").dropna(subset=['primary_pctg'])
rep_df = pd.read_csv('../data/rep_num_endorsements.csv', encoding="ISO-8859-1").dropna(subset=['primary_pctg'])

dem_drop_cols = ['won_primary', 'primary_status', 'general_status', 'primary_pctg', 'candidate', 'primary_runoff_status', 
'num_non_endorsements', 'num_endorsements', 'partisan_lean', 'district']

rep_drop_cols = ['won_primary', 'primary_status', 'general_status', 'primary_pctg', 'candidate', 'primary_runoff_status', 
'num_non_endorsements', 'num_endorsements', 'district']
# drop race primary (date), candidate (name irrelevant)

def run_random_forest(df, drop_cols):
    msk = np.random.rand(len(df)) < 0.8
    train = df[msk]
    test = df[~msk]

    train_x = train.drop(drop_cols, axis=1)
    train_y = train[['primary_pctg', 'won_primary']]
    train_y_primary_pctg = train['primary_pctg']
    train_y_won_primary = train['won_primary']


    test_x = test.drop(drop_cols, axis=1) 
    test_y = test[['primary_pctg', 'won_primary']]
    test_y_primary_pctg = test['primary_pctg']
    test_y_won_primary = test['won_primary']

    regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)
    regr.fit(train_x, train_y)

    regr.score(test_x, test_y)
    feature_importances = pd.DataFrame(regr.feature_importances_, index = train_x.columns, columns=['importance']).sort_values('importance',ascending=False)
    print(feature_importances)


In [3]:
print("DEMOCRATIC: ")
run_random_forest(dem_df, dem_drop_cols)
print("\n")

DEMOCRATIC: 
                         importance
guns_sense_candidate       0.447298
emily_endorsed             0.418751
our_revolution_endorsed    0.103279
self_funder                0.011989
state                      0.007128
elected_official           0.004242
race_primary               0.003770
dem_party_support          0.001242
office_type                0.000725
pccc_endorsed              0.000633
lgbtq                      0.000299
wfp_endorsed               0.000255
obama_alum                 0.000204
veteran                    0.000186
justice_dems_endorsed      0.000000
votevets_endorsed          0.000000
indivisible_endorsed       0.000000
biden_endorsed             0.000000
race_type                  0.000000
sanders_endorsed           0.000000
warren_endorsed            0.000000
stem                       0.000000
race                       0.000000
no_labels_support          0.000000




In [4]:
print("REPUBLICAN: ")
run_random_forest(rep_df, rep_drop_cols)

REPUBLICAN: 
                            importance
right_to_life_endorsed        0.690991
susan_b_anthony_endorsed      0.142733
tea_party_endorsed            0.043197
club_for_growth_endorsed      0.042913
koch_support                  0.028068
rep_party_support             0.027327
house_freedom_support         0.005442
office_type                   0.005324
state                         0.005275
main_street_endorsed          0.004654
race_primary_election_date    0.003820
bannon_endorsed               0.000257
nra_endorsed                  0.000000
great_america_endorsed        0.000000
trump_endorsed                0.000000
race_type                     0.000000
chamber_endorsed              0.000000
no_labels_support             0.000000
