In [1]:
import pandas as pd
import numpy as np
import arcpy

arcpy.env.workspace = 'C:\Users\Charles\Documents\ArcGIS\san_francisco'

OVERWRITE = True

### Split the Excel file into CSVs

In [2]:
dfs = pd.read_excel('20161206_vote_all_districts.xlsx', sheetname=None, header=3)

In [3]:
dfs.values()[0].head()

Unnamed: 0,PrecinctName,ReportingType,PrecinctID,Precincts,Registration,Ballots Cast,Turnout (%),Yes,No,Under Vote,Over Vote
0,Pct 1101,Election Day,1101,1,586,159,27.13,54,96,9,0
1,Pct 1101,VBM,1101,1,586,287,48.98,107,163,13,0
2,Pct 1102,Election Day,1102,1,996,291,29.22,102,172,16,0
3,Pct 1102,VBM,1102,1,996,519,52.11,201,290,26,0
4,Pct 1103,Election Day,1103,1,971,263,27.09,96,152,15,0


In [4]:
dfs.keys()

[u'265 - State Proposition 65',
 u'215 - State Proposition 55',
 u'155 - Board of Supervisors, Dis',
 u'190 - BART Director, District 9',
 u'340 - Local Measure M',
 u'210 - State Proposition 54',
 u'180 - Member, Community College',
 u'225 - State Proposition 57',
 u'330 - Local Measure K',
 u'240 - State Proposition 60',
 u'280 - School Measure A',
 u'105 - U.S. Senator',
 u'100 - President and Vice Presid',
 u'255 - State Proposition 63',
 u'195 - State Proposition 51',
 u'170 - Superior Court Judge, Sea',
 u'125 - State Senate, District 11',
 u'230 - State Proposition 58',
 u'175 - Member, Board of Educatio',
 u'245 - State Proposition 61',
 u'335 - Local Measure L',
 u'120 - U.S. Representative, Dist',
 u'400 - District Measure RR',
 u'235 - State Proposition 59',
 u'360 - Local Measure Q',
 u'350 - Local Measure O',
 u'200 - State Proposition 52',
 u'320 - Local Measure I',
 u'135 - State Assembly, District ',
 u'130 - State Assembly, District ',
 u'185 - BART Director, District 

In [5]:
dfs['Contents'].head()

Unnamed: 0,Table of Contents - Worksheets,Unnamed: 1
0,,
1,100.0,President and Vice President
2,105.0,U.S. Senator
3,110.0,"U.S. Representative, District 12"
4,115.0,"U.S. Representative, District 13"


In [6]:
# Since the sheet names get cut off, we can fix them using the Contents tab
contents = dfs['Contents'].copy()
contents.columns = ['key', 'name']
contents = contents.iloc[1:]
contents['key'] = contents['key'].astype(int)
contents = contents.set_index('key').to_dict(orient='index')

In [7]:
dfs_fixed_names = {}
for cut_name in dfs:
    if cut_name == 'Contents':
        continue
    key = int(cut_name[:3])
    if key in contents:
        prefix = cut_name[:6]
        postfix = contents[key]['name']
        dfs_fixed_names[prefix + postfix] = dfs[cut_name]

In [8]:
for name, df in dfs_fixed_names.items():
    print(name)
    if name == 'Contents':
        continue
    df.columns = [''.join([y for y in x.replace('(%)', 'Percent').replace(' ', '_') if y.isalnum() or y == '_'])
                  for x in df.columns]
    df = df[df['PrecinctName'].apply(lambda x: str(x).startswith('Pct '))]
    df.to_csv('SF_2016_vote/{}.txt'.format(name), encoding='utf-8', index=False)

265 - State Proposition 65
215 - State Proposition 55
225 - State Proposition 57
190 - BART Director, District 9
180 - Member, Community College Board
340 - Local Measure M
210 - State Proposition 54
330 - Local Measure K
250 - State Proposition 62
310 - Local Measure G
175 - Member, Board of Education
170 - Superior Court Judge, Seat 7
240 - State Proposition 60
280 - School Measure A
205 - State Proposition 53
255 - State Proposition 63
195 - State Proposition 51
100 - President and Vice President
125 - State Senate, District 11
230 - State Proposition 58
400 - District Measure RR
335 - Local Measure L
245 - State Proposition 61
235 - State Proposition 59
360 - Local Measure Q
350 - Local Measure O
200 - State Proposition 52
320 - Local Measure I
185 - BART Director, District 7
220 - State Proposition 56
115 - U.S. Representative, District 13
130 - State Assembly, District 17
135 - State Assembly, District 19
275 - State Proposition 67
105 - U.S. Senator
145 - Board of Supervisors, D

### Create geodatabase from CSVs

In [30]:
if OVERWRITE:
    arcpy.Delete_management('san_francisco_vote_2016.gdb')
arcpy.CreateFileGDB_management('.', 'san_francisco_vote_2016.gdb')
arcpy.TableToGeodatabase_conversion(
    ['SF_2016_vote/{}.txt'.format(x) for x in dfs_fixed_names.keys()],
    'san_francisco_vote_2016.gdb')

<Result 'C:\\Users\\Charles\\Documents\\ArcGIS\\san_francisco\\san_francisco_vote_2016.gdb\\san_francisco_vote_2016.gdb'>

### Clean field names and types

In [32]:
arcpy.env.workspace = 'C:\Users\Charles\Documents\ArcGIS\san_francisco\san_francisco_vote_2016.gdb'
table_names = arcpy.ListTables()

NO_TOUCH = ['OBJECTID', 'PrecinctName', 'ReportingType', 'PrecinctId']
FLOAT = ['Turnout_Percent']

for df_name, df in sorted(dfs_fixed_names.items(), key=lambda x: x[0]):
    if df_name == 'Contents':
        continue
    table_name = [x for x in table_names if x[1:4] == df_name[:3]][0]
    full_field_names = df.columns.values
    cut_fields = arcpy.ListFields(table_name)
    for ix, field in enumerate(cut_fields):
        if field.name not in NO_TOUCH:  # We know that all of the no touch columns have their full names, too, so this is fine
            original_name = field.name
            temp_name = field.name[:5] + '_temp'
            if original_name in full_field_names:
                full_name = original_name
            else:
                full_name = full_field_names[ix-1]  # -1 is for OBJECTID
            arcpy.AddField_management(table_name, temp_name, 'FLOAT' if full_name in FLOAT else 'LONG')
            arcpy.CalculateField_management(table_name, temp_name, u'!{}!'.format(original_name), "PYTHON_9.3")
            arcpy.DeleteField_management(table_name, original_name)
            arcpy.AlterField_management(table_name, temp_name, full_name[:31])  # Two birds one stone, also 31 is max length
    print('{} complete'.format(table_name))
    

T105___U_S__Senator complete
T110___U_S__Representative__District_12 complete
T115___U_S__Representative__District_13 complete
T120___U_S__Representative__District_14 complete
T125___State_Senate__District_11 complete
T130___State_Assembly__District_17 complete
T135___State_Assembly__District_19 complete
T140___Board_of_Supervisors__District_1 complete
T145___Board_of_Supervisors__District_3 complete
T150___Board_of_Supervisors__District_5 complete
T155___Board_of_Supervisors__District_7 complete
T160___Board_of_Supervisors__District_9 complete
T165___Board_of_Supervisors__District_11 complete
T170___Superior_Court_Judge__Seat_7 complete
T175___Member__Board_of_Education complete
T180___Member__Community_College_Board complete
T185___BART_Director__District_7 complete
T190___BART_Director__District_9 complete
T195___State_Proposition_51 complete
T200___State_Proposition_52 complete
T205___State_Proposition_53 complete
T210___State_Proposition_54 complete
T215___State_Proposition_55 com