In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [20]:
dem_df = pd.read_csv('../data/dem_num_endorsements.csv', encoding="ISO-8859-1").dropna(subset=['primary_pctg'])
rep_df = pd.read_csv('../data/rep_num_endorsements.csv', encoding="ISO-8859-1").dropna(subset=['primary_pctg'])

dem_drop_cols = ['won_primary', 'primary_status', 'general_status', 'primary_pctg', 'candidate', 'primary_runoff_status', 
'num_non_endorsements', 'num_endorsements', 'partisan_lean', 'district']

rep_drop_cols = ['won_primary', 'primary_status', 'general_status', 'primary_pctg', 'candidate', 'primary_runoff_status', 
'num_non_endorsements', 'num_endorsements', 'district']
# drop race primary (date), candidate (name irrelevant)

def run_random_forest(df, drop_cols):
    msk = np.random.rand(len(df)) < 0.8
    train = df[msk]
    test = df[~msk]

    train_x = train.drop(drop_cols, axis=1)
    train_y = train[['primary_pctg', 'won_primary']]
    train_y_primary_pctg = train['primary_pctg']
    train_y_won_primary = train['won_primary']


    test_x = test.drop(drop_cols, axis=1) 
    test_y = test[['primary_pctg', 'won_primary']]
    test_y_primary_pctg = test['primary_pctg']
    test_y_won_primary = test['won_primary']

    regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)
    regr.fit(train_x, train_y)

    regr.score(test_x, test_y)
    feature_importances = pd.DataFrame(regr.feature_importances_, index = train_x.columns, columns=['importance']).sort_values('importance',ascending=False)
    print(feature_importances)


In [21]:
print("DEMOCRATIC: ")
run_random_forest(dem_df, dem_drop_cols)
print("\n")

DEMOCRATIC: 
                         importance
emily_endorsed             0.557558
guns_sense_candidate       0.326716
our_revolution_endorsed    0.039344
dem_party_support          0.026236
indivisible_endorsed       0.020909
state                      0.017041
justice_dems_endorsed      0.005043
race_primary               0.002108
self_funder                0.001008
votevets_endorsed          0.000952
lgbtq                      0.000870
elected_official           0.000847
pccc_endorsed              0.000602
sanders_endorsed           0.000572
biden_endorsed             0.000192
wfp_endorsed               0.000000
race_type                  0.000000
warren_endorsed            0.000000
race                       0.000000
office_type                0.000000
veteran                    0.000000
obama_alum                 0.000000
stem                       0.000000
no_labels_support          0.000000




In [22]:
print("REPUBLICAN: ")
run_random_forest(rep_df, rep_drop_cols)

REPUBLICAN: 
                            importance
right_to_life_endorsed        0.676079
tea_party_endorsed            0.078608
susan_b_anthony_endorsed      0.073852
rep_party_support             0.072686
club_for_growth_endorsed      0.031822
main_street_endorsed          0.021863
koch_support                  0.018043
state                         0.016761
office_type                   0.005638
race_primary_election_date    0.004646
nra_endorsed                  0.000000
great_america_endorsed        0.000000
bannon_endorsed               0.000000
trump_endorsed                0.000000
house_freedom_support         0.000000
race_type                     0.000000
chamber_endorsed              0.000000
no_labels_support             0.000000
