In [23]:
import os

import arcpy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import FuncFormatter
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

%matplotlib inline

# Root folder of the SF 2016 election project. Set the SF_ELECTION_2016_DIR
# environment variable to point the notebook at a different location; when it
# is unset, the Windows path the notebook was originally written against is used.
PATH = os.environ.get(
    'SF_ELECTION_2016_DIR',
    'C:\\Users\\Charles\\Documents\\ArcGIS\\SF_election_2016',
)

In [106]:
# Folder holding one delimited .txt result file per contest.
DERIVED_DIR = os.path.join(PATH, 'derived_data')

# os.listdir() returns names in arbitrary order, so sort the listing: otherwise
# both the file dropped by the [:-1] slice below and the column order of
# join_df could differ from one run (or machine) to the next.
source_files = sorted(
    x for x in os.listdir(DERIVED_DIR)
    if x.endswith('.txt') and x != 'SF_2010_pop_block.txt'
)

# Contest files that are left out of the join (matched by exact file name).
exclude = [
    '110 - U.S. Representative, District 12.txt',
    '115 - U.S. Representative, District 13.txt',
    '120 - U.S. Representative, District 14.txt',
    '140 - Board of Supervisors, District 1.txt',
    '145 - Board of Supervisors, District 3.txt',
    '155 - Board of Supervisors, District 7.txt',
    '160 - Board of Supervisors, District 9.txt',
    '165 - Board of Supervisors, District 11.txt',
    '185 - BART Director, District 7.txt',
    '190 - BART Director, District 9.txt',
]


def _contest_shares(filename):
    """Load one contest file and return per-precinct vote shares.

    The file is read from DERIVED_DIR, every column from the 8th onward (plus
    'ballots_cast') is summed per precinct, the 'under_vote' and 'over_vote'
    totals are dropped, and each remaining vote column is divided by that
    precinct's 'ballots_cast'. 'ballots_cast' itself is kept as a raw sum.

    All columns except 'precinctid' are suffixed with a contest tag built from
    the file name: the first 6 characters (e.g. '110 - ') and the '.txt'
    extension are stripped, the rest is lower-cased and spaces become '_'.

    Parameters
    ----------
    filename : str
        Name of a contest file inside DERIVED_DIR.

    Returns
    -------
    pandas.DataFrame
        One row per precinct; first column is 'precinctid'.
    """
    contest = filename[6:-4].lower().replace(' ', '_')
    raw = pd.read_csv(os.path.join(DERIVED_DIR, filename))

    # The column slice is taken before the rename; renaming does not move columns.
    agg_dict = {x: 'sum' for x in raw.columns[7:]}
    agg_dict['ballots_cast'] = 'sum'

    totals = (
        raw.rename(columns={'precinct_id': 'precinctid'})
        .groupby('precinctid')
        .agg(agg_dict)
        .reset_index()
        .drop(['under_vote', 'over_vote'], axis='columns')
    )

    # Convert vote counts to shares of ballots cast (computed once, used twice).
    share_cols = [x for x in totals.columns if x not in ('precinctid', 'ballots_cast')]
    totals[share_cols] = totals[share_cols].div(totals['ballots_cast'], axis=0)

    # reset_index() puts 'precinctid' first, so columns[1:] is everything else.
    return totals.rename(columns={x: x + '_' + contest for x in totals.columns[1:]})


join_df = None
# NOTE(review): the [:-1] slice skips the last file of the listing; the reason
# is not visible in this notebook -- confirm that it is intentional.
for f in source_files[:-1]:
    if f in exclude:
        continue
    df = _contest_shares(f)
    if join_df is None:
        join_df = df
    else:
        # Left join: only precincts present in the first loaded contest are
        # kept, and a precinct with no row in this contest gets NaN here.
        join_df = join_df.merge(
            df,
            how='left',
            on='precinctid',
            suffixes=['_main', '_' + f[6:-4].lower().replace(' ', '_')],
        )

In [107]:
# Start from every joined column except the precinct key.
feature_df = join_df.drop('precinctid', axis='columns').copy()

# Clean the feature data
# 1) Remove every U.S. Representative and State Assembly column.
unwanted_cols = [
    col for col in feature_df.columns
    if 'u.s._representative' in col or 'state_assembly' in col
]
feature_df = feature_df.drop(unwanted_cols, axis='columns')

# 2) Keep only the rows that have a value in both of these columns.
feature_df = feature_df.dropna(
    subset=[
        'yes_local_measure_u',
        'dean_preston_board_of_supervisors,_district_5',
    ]
)

In [108]:
# Boolean label per row: True where the Dean Preston column is strictly greater
# than the London Breed column. A comparison involving NaN evaluates to False.
preston_col = feature_df['dean_preston_board_of_supervisors,_district_5']
breed_col = feature_df['london_breed_board_of_supervisors,_district_5']
classes = preston_col.gt(breed_col)

In [109]:
# Keep only the feature columns whose random-forest importance reaches the mean
# importance (SelectFromModel's default threshold for this kind of estimator).
#
# Calling fit_transform directly on RandomForestClassifier was deprecated in
# scikit-learn 0.17 and removed in 0.19; SelectFromModel is the supported way
# to do the same importance-based selection.
#
# random_state is fixed so the selected columns are reproducible across runs.
#
# NOTE(review): feature_df still contains the two candidate columns that
# `classes` is derived from, so they leak the label into the features --
# consider dropping them before fitting.
SelectFromModel(
    RandomForestClassifier(class_weight='balanced', random_state=0)
).fit_transform(feature_df.values, classes)



array([[ 0.03639847,  0.        ,  0.29693487, ...,  0.09386973,
         0.44061303,  0.40229885],
       [ 0.0259366 ,  0.00144092,  0.40634006, ...,  0.1037464 ,
         0.49279539,  0.35878963],
       [ 0.02180233,  0.00145349,  0.41860465, ...,  0.11046512,
         0.53343023,  0.32848837],
       ..., 
       [ 0.0155587 ,  0.        ,  0.50353607, ...,  0.08910891,
         0.61951909,  0.24611033],
       [ 0.02025316,  0.        ,  0.51392405, ...,  0.09493671,
         0.61518987,  0.24556962],
       [ 0.01330109,  0.00362757,  0.51511487, ...,  0.07255139,
         0.61305925,  0.23941959]])

In [105]:
# Names of the feature_df columns that contain at least one missing value.
# Uses a vectorized column-wise any() as a boolean mask instead of applying the
# Python builtin any() to each column and going through np.where().
feature_df.columns[feature_df.isnull().any(axis=0).values]

Index([u'writein_bart_director,_district_9',
       u'gwyneth_borden_bart_director,_district_9',
       u'bevan_dufty_bart_director,_district_9',
       u'ballots_cast_bart_director,_district_9',
       u'michael_petrelis_bart_director,_district_9'],
      dtype='object')

In [77]:
# Show the rows that have no value in the London Breed (District 5) column.
breed_missing = feature_df['london_breed_board_of_supervisors,_district_5'].isnull()
feature_df[breed_missing]

Unnamed: 0,writein_bernard_bernie_sanders__tu_president_and_vice_president,gary_johnson__bill_weld_president_and_vice_president,writein_jerry_white__niles_niemuth_president_and_vice_president,jill_stein__ajamu_baraka_president_and_vice_president,writein_evan_mcmullin__nathan_johnso_president_and_vice_president,hillary_clinton__tim_kaine_president_and_vice_president,writein_president_and_vice_president,writein_laurence_kotlikoff__edward_l_president_and_vice_president,ballots_cast_president_and_vice_president,gloria_estela_la_riva__dennis_j_bank_president_and_vice_president,...,no_local_measure_v,ballots_cast_local_measure_w,yes_local_measure_w,no_local_measure_w,ballots_cast_local_measure_x,yes_local_measure_x,no_local_measure_x,ballots_cast_district_measure_rr,yes_district_measure_rr,no_district_measure_rr
0,0.004484,0.020179,0.0,0.020179,0.004484,0.804933,0.006726,0.000000,446,0.008969,...,0.403587,446,0.585202,0.307175,446,0.573991,0.302691,446,0.692825,0.186099
1,0.003704,0.022222,0.0,0.019753,0.001235,0.813580,0.003704,0.000000,810,0.007407,...,0.401235,810,0.549383,0.362963,810,0.543210,0.355556,810,0.709877,0.196296
2,0.003841,0.016645,0.0,0.025608,0.000000,0.819462,0.014085,0.000000,781,0.003841,...,0.402049,781,0.537772,0.340589,781,0.591549,0.279129,781,0.704225,0.189501
3,0.005242,0.015727,0.0,0.028834,0.000000,0.794233,0.002621,0.000000,763,0.006553,...,0.408912,763,0.555701,0.331586,763,0.589777,0.277851,763,0.720839,0.176933
4,0.008333,0.015000,0.0,0.038333,0.001667,0.786667,0.006667,0.000000,600,0.003333,...,0.410000,600,0.520000,0.346667,600,0.561667,0.281667,600,0.663333,0.211667
5,0.000000,0.009881,0.0,0.025692,0.000000,0.798419,0.007905,0.000000,506,0.003953,...,0.397233,506,0.567194,0.320158,506,0.610672,0.235178,506,0.681818,0.211462
6,0.004032,0.013441,0.0,0.029570,0.000000,0.836022,0.004032,0.000000,744,0.002688,...,0.426075,744,0.573925,0.311828,744,0.577957,0.276882,744,0.701613,0.184140
7,0.003241,0.017828,0.0,0.027553,0.000000,0.769854,0.006483,0.000000,617,0.004862,...,0.458671,617,0.551053,0.335494,617,0.552674,0.306321,617,0.677472,0.231767
8,0.008368,0.016736,0.0,0.034868,0.000000,0.789400,0.000000,0.000000,717,0.008368,...,0.443515,717,0.518828,0.331939,717,0.584379,0.241283,717,0.677824,0.200837
9,0.005386,0.017953,0.0,0.019749,0.000000,0.833034,0.005386,0.000000,557,0.008977,...,0.436266,557,0.522442,0.339318,557,0.576302,0.247756,557,0.664273,0.226212
