In [4]:
%%capture
# Imports
import pandas as pd
import numpy as np
import json
from os import listdir
from os.path import isfile, join
import warnings
warnings.filterwarnings('ignore')

In [5]:
%%capture
# Constants
dataPath = '/Users/dylansmith/Documents/ActiveProjects/PoliticalAnalysis/data'
party_abbrev = {'REP': 'republican', 'DEM': 'democrat', 'IPD':'independent','LIB':'libertarian'}
office = {'house': 'US House','senate': 'US Senate', 'president':'President'}
pollOut = ['year', 'state_po','FIPS', 'office', 'DISTRICT', 
           'special', 'candidate', 'party', 'candidatevotes', 'totalvotes']

## Political Party Dataset

The political parties for each country were scored against a list of different ideals by experts and rated across the years (website:https://www.v-dem.net/en/data/data/v-party-dataset/).  

In [28]:
RENAME_COLS = {'v2paenname':'party',
               'v2pashname':'party_name_short',
               'country_name':'country_name',
               'year':'year',
               'v2xpa_illiberal':'illiberalism',
               'v2xpa_popul':'populism',
              'v2pagovsup':'government_support',
              'v2paanteli': 'anti_elitism',
              'v2papeople': 'people_centrism',
              'v2paopresp': 'political_opponents_respect',
              'v2paplur': 'political_pluralism',
              'v2paminor':'observes_minority_rights',
              'v2paviol': 'rejection_of_political_violence',
              'v2paimmig': 'supports_immigration',
              'v2palgbt': 'lgbt_social_equality',
              'v2paculsup': 'opposes_cultural_superiority',
              'v2parelig': 'invokes_religious_principles',
              'v2pagender': 'gender_equality_representation',
              'v2pawomlab':'working_women',
              'v2pariglef': 'economic_right_leaning_scale',
              'v2pawelf': 'suppors_welfare',
              'v2paclient': 'clientelism',
               'v2pasalie_0':'importance_anti_elitism',
               'v2pasalie_1':'importance_people_centrism',
               'v2pasalie_2':'importance_political_pluralism',
               'v2pasalie_3':'importance_observes_minority_rights',
               'v2pasalie_4':'importance_supports_immigration',
               'v2pasalie_5':'importance_lgbt_social_equality',
               'v2pasalie_6':'importance_opposes_cultural_superiority',
               'v2pasalie_7':'importance_invokes_religious_principles',
               'v2pasalie_8':'importance_gender_equality_representation',
               'v2pasalie_9':'importance_suppors_welfare',
               'v2pasalie_10':'importance_economic_issues',
               'v2pasalie_11':'importance_clientelism',
               'v2pasalie_12':'importance_envirnomental_protection',
               'v2pasalie_13':'importance_farmers_issues',
              'v2pasalie_14':'importance_leader',
              'v2pasalie_15':'importance_anti_corruption',
              'v2pasalie_16':'importance_intimidation',
              'v2pagroup_0':'main_support_not_observable',
              'v2pagroup_1':'main_support_aristocracy',
              'v2pagroup_2':'main_support_agrarian_elites',
              'v2pagroup_3':'main_support_business_elites',
              'v2pagroup_4':'main_support_military',
              'v2pagroup_5':'main_support_racial',
              'v2pagroup_6':'main_support_religious',
              'v2pagroup_7':'main_support_local_elites',
              'v2pagroup_8':'main_support_working_class_urban',
              'v2pagroup_9':'main_support_middle_class_urban',
              'v2pagroup_10':'main_support_working_class_rural',
              'v2pagroup_11':'main_support_middle_class_rural',
              'v2pagroup_12':'main_support_separatists',
              'v2pagroup_13':'main_support_woman',
              'v2paactcom':'local_organizational_strength',
              'v2paactcom':'affiliate_organizations',
              'v2padisa':'internal_cohesion',
              'v2paind':'leader_personalization_of_party',
              'ep_antielite_salience':'importance_anti_establishment_rhetoric',
              'ep_corrupt_salience':'importance_reducing_political_corruption',
              'ep_members_vs_leadership':'leader_controls_party_policy',
              'ep_people_vs_elite':'people_over_elite_decisions',
              'ep_type_populism':'populism_type',
              'ep_type_populist_values':'populist_values_type',
              'ep_v8_popul_rhetoric':'favors_populist_rhetoric',
              'ep_v9_popul_saliency':'importance_populism',
              'ep_galtan':'authoritarian_position',
              'ep_galtan_salience':'importance_libertarian',
              'ep_v6_lib_cons':'social_conservative_values',
               'ep_v7_lib_cons_saliency':'importance_social_values'}

ORDINAL_REPLACE = {'populism_type': {1: 'strongly_pluralist', 2:'moderately_pluralist', 
                                    3:'moderately_populist', 4:'strongly_populist'},
                   'populist_values_type':{1: 'pluralist_liberal', 2: 'pluralist_conservative',
                                          3: 'populist_liberal', 4: 'populist_conservative'}}

In [32]:
# Get the Presidential Polling data
df_parties = pd.read_csv(dataPath + '/raw/v_dem_pol_parties/V-Dem-CPD-Party-V1.csv',
                    index_col = False,
                    encoding='utf-8')

df_parties  = df_parties[df_parties['country_name'] == 'United States of America']
df_parties.head()

Unnamed: 0,v2paenname,v2paorname,v2pashname,v2paid,pf_party_id,party_gaps,pf_url,country_name,histname,country_id,...,ep_galtan_salience,GPS_ID,ep_members_vs_leadership,ep_v6_lib_cons,ep_people_vs_elite,ep_v7_lib_cons_saliency,ep_type_populism,ep_type_populist_values,ep_v8_popul_rhetoric,ep_v9_popul_saliency
1436,Democratic Party,,Dem,432,432,,https://partyfacts.herokuapp.com/data/partycod...,United States of America,United States of America,20,...,,,,,,,,,,
1437,Democratic Party,,Dem,432,432,,https://partyfacts.herokuapp.com/data/partycod...,United States of America,United States of America,20,...,,,,,,,,,,
1438,Democratic Party,,Dem,432,432,,https://partyfacts.herokuapp.com/data/partycod...,United States of America,United States of America,20,...,,,,,,,,,,
1439,Democratic Party,,Dem,432,432,,https://partyfacts.herokuapp.com/data/partycod...,United States of America,United States of America,20,...,,,,,,,,,,
1440,Democratic Party,,Dem,432,432,,https://partyfacts.herokuapp.com/data/partycod...,United States of America,United States of America,20,...,,,,,,,,,,


In [33]:
# Select the columns to use and rename them based upon the english definitions
df_pol_out = df_parties[list(RENAME_COLS.keys())]
df_pol_out = df_pol_out.rename(RENAME_COLS, axis = 1)

# Replace the Ordinal Values
for key in ORDINAL_REPLACE.keys():
    df_pol_out[key].replace(to_replace = ORDINAL_REPLACE[key], inplace = True)

df_pol_out['party'] = df_pol_out['party'].apply(lambda x: x.split(' ')[0].lower())
df_pol_out.head()

Unnamed: 0,party,party_name_short,country_name,year,illiberalism,populism,government_support,anti_elitism,people_centrism,political_opponents_respect,...,leader_controls_party_policy,people_over_elite_decisions,populism_type,populist_values_type,favors_populist_rhetoric,importance_populism,authoritarian_position,importance_libertarian,social_conservative_values,importance_social_values
1436,democratic,Dem,United States of America,1900,,,,,,,...,,,,,,,,,,
1437,democratic,Dem,United States of America,1902,,,,,,,...,,,,,,,,,,
1438,democratic,Dem,United States of America,1904,,,,,,,...,,,,,,,,,,
1439,democratic,Dem,United States of America,1906,,,,,,,...,,,,,,,,,,
1440,democratic,Dem,United States of America,1908,,,,,,,...,,,,,,,,,,


In [34]:
df_pol_out.to_csv(dataPath + '/processed/vdem_political_parties.tsv.gz',
                    compression = 'gzip',
                    mode = 'w',
                    sep='\t',
                    index = False,
                    encoding='utf-8',
                    line_terminator = '\n')

## Democracy Quality Dataset
This represents the democratic freedom within each government in the world.  In order to analyze with the US data, ingest and output only the relevant features.

In [4]:
# Get the Presidential Polling data
df_dqual = pd.read_csv(dataPath + '/raw/v_dem_demo/V-Dem-CY-Full+Others-v10.tsv.gz',
                    index_col = False,
                    compression = 'gzip',
                    sep='\t',
                    encoding='utf-8')

# Filter on the USA column and delete the dataframe from memory
df_usa = df_dqual[df_dqual['country_text_id'] == 'USA']
del df_dqual
df_usa.head()

Unnamed: 0,country_name,country_text_id,country_id,year,historical_date,project,historical,histname,codingstart,codingend,...,e_miurbpop,e_pefeliex,e_peinfmor,e_pelifeex,e_pematmor,e_wb_pop,e_civil_war,e_miinteco,e_miinterc,e_pt_coup
2865,United States of America,USA,20,1789,1789-12-31,1,1,United States of America,1088,2019,...,,,,,,,,0.0,0.0,
2866,United States of America,USA,20,1790,1790-12-31,1,1,United States of America,1088,2019,...,,,,,,,,0.0,1.0,
2867,United States of America,USA,20,1791,1791-12-31,1,1,United States of America,1088,2019,...,,,,,,,,0.0,1.0,
2868,United States of America,USA,20,1792,1792-12-31,1,1,United States of America,1088,2019,...,,,,,,,,0.0,1.0,
2869,United States of America,USA,20,1793,1793-12-31,1,1,United States of America,1088,2019,...,,,,,,,,0.0,1.0,


In [6]:
# Drop all null columns and write to file
df_usa = df_usa.dropna(axis=1,how='all')        
        
df_usa.to_csv(dataPath + '/processed/vdem.tsv.gz',
                    compression = 'gzip',
                    mode = 'w',
                    sep='\t',
                    index = False,
                    encoding='utf-8',
                    line_terminator = '\n')

# Voting Data Extract and Formatting

The below data was downloaded from Harvard's Dataverse and compressed using gzip to optimize storage space.  The below process ingests the data and formats it under one structure to be combined together for further feature creation and analysis.  This data represents county/district level voting results for the past 50 years.  Data coverage is listed below:

  - President Returns -> County Level for the past 5 elections (2000 - 2016)
  - House Returns -> District Level returns for the past 50 + years (1978 - 2018)
  - Senate Returns -> County Level returns for the past 50+ years (1978 - 2018) 
  
 2020 Results: https://engaging-data.com/county-electoral-map-land-vs-population/

In [13]:
# Get the Presidential Polling data
df_pres = pd.read_csv(dataPath + '/raw/voting/President/2000-2016_Pres_election.tsv.gz',
                    index_col = False,
                    compression = 'gzip',
                    sep='\t',
                    encoding='latin-1')

df_pres.head()

Unnamed: 0,year,state,state_po,county,FIPS,office,candidate,party,candidatevotes,totalvotes,version
0,2000,Alabama,AL,Autauga,1001.0,President,Al Gore,democrat,4942.0,17208,20190722
1,2000,Alabama,AL,Autauga,1001.0,President,George W. Bush,republican,11993.0,17208,20190722
2,2000,Alabama,AL,Autauga,1001.0,President,Ralph Nader,green,160.0,17208,20190722
3,2000,Alabama,AL,Autauga,1001.0,President,Other,,113.0,17208,20190722
4,2000,Alabama,AL,Baldwin,1003.0,President,Al Gore,democrat,13997.0,56480,20190722


In [14]:
df_pres = df_pres[df_pres['FIPS'].notnull()]
df_pres['FIPS'] = df_pres['FIPS'].astype(int).astype(str).str.pad(width = 5, side = 'left', fillchar = '0')
df_pres.insert(4, 'DISTRICT', 0)
df_pres.insert(5, 'special', False)
df_pres = df_pres[pollOut]

In [15]:
# Get the House Polling Data
df_house = pd.read_csv(dataPath + '/raw/voting/House/76-2018_House.tsv.gz',
                    index_col = False,
                    compression = 'gzip',
                    sep='\t',
                    encoding='latin-1')

df_house.head()

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,runoff,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
0,1976,Alabama,AL,1,63,41,US House,1,gen,False,False,Bill Davenport,democrat,False,total,58906,157170,False,20171005
1,1976,Alabama,AL,1,63,41,US House,1,gen,False,False,Jack Edwards,republican,False,total,98257,157170,False,20171005
2,1976,Alabama,AL,1,63,41,US House,1,gen,False,False,,,True,total,7,157170,False,20171005
3,1976,Alabama,AL,1,63,41,US House,2,gen,False,False,J. Carole Keahey,democrat,False,total,66288,156362,False,20171005
4,1976,Alabama,AL,1,63,41,US House,2,gen,False,False,,,True,total,5,156362,False,20171005


In [16]:
# Get the County Per District
df_lkup = pd.read_csv(dataPath + '/raw/voting/cd116/cd116.txt',
                    index_col = False,
                    encoding='latin-1',
                    dtype = {'BLOCKID':np.str})
df_lkup['BLOCKID'] = df_lkup['BLOCKID'].str[:5]
df_lkup.drop_duplicates(inplace = True)

df_lkup['state_fips'] = df_lkup['BLOCKID'].astype(int) // 1000
df_lkup.rename(columns={'BLOCKID':'FIPS','CD116': 'DISTRICT'}, inplace = True)
df_lkup.head()

Unnamed: 0,FIPS,DISTRICT,state_fips
0,1129,1,1
1818,1099,1,1
2673,1097,1,1
4373,1025,1,1
4823,1053,1,1


In [17]:
# Format House data for concatenation with other polling results
df_house = df_house[(df_house['writein'] == False) & (df_house['party'] != 'NA')]
df_house.rename(columns = {'district':'DISTRICT'}, inplace = True)

df_house = df_house.merge(df_lkup, on = ['state_fips','DISTRICT'], how = 'left')
df_house = df_house[pollOut]
df_house.head()

Unnamed: 0,year,state_po,FIPS,office,DISTRICT,special,candidate,party,candidatevotes,totalvotes
0,1976,AL,1129,US House,1,False,Bill Davenport,democrat,58906,157170
1,1976,AL,1099,US House,1,False,Bill Davenport,democrat,58906,157170
2,1976,AL,1097,US House,1,False,Bill Davenport,democrat,58906,157170
3,1976,AL,1025,US House,1,False,Bill Davenport,democrat,58906,157170
4,1976,AL,1053,US House,1,False,Bill Davenport,democrat,58906,157170


In [14]:
# Now Ingest the Senate file
df_sen = pd.read_csv(dataPath + '/raw/voting/Senate/election_results.tsv.gz',
                    index_col = False,
                    compression = 'gzip',
                    sep = '\t',
                    encoding='latin-1')

df_sen.head()

Unnamed: 0,year,state,state_po,state_fips,state_cen,state_ic,office,district,stage,runoff,special,candidate,party,writein,mode,candidatevotes,totalvotes,unofficial,version
0,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,,False,Sam Steiger,republican,False,total,321236,741210,False,20171011.0
1,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,,False,Wm. Mathews Feighan,independent,False,total,1565,741210,False,20171011.0
2,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,,False,Dennis DeConcini,democrat,False,total,400334,741210,False,20171011.0
3,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,,False,Allan Norwitz,libertarian,False,total,7310,741210,False,20171011.0
4,1976,Arizona,AZ,4,86,61,US Senate,statewide,gen,,False,Bob Field,independent,False,total,10765,741210,False,20171011.0


In [19]:
df_sen = df_sen[(df_sen['writein'] == False) & (df_sen['party'] != 'NA')]
df_sen.rename(columns = {'district':'DISTRICT'}, inplace = True)

df_sen['state_fips'] = (df_sen['state_fips']*1000).astype(str)\
                                          .str.pad(width = 5, side = 'left', fillchar = '0')
df_sen.rename(columns = {'state_fips':'FIPS'}, inplace = True)

df_sen = df_sen[pollOut]
df_sen.head()

Unnamed: 0,year,state_po,FIPS,office,DISTRICT,special,candidate,party,candidatevotes,totalvotes
0,1976,AZ,4000,US Senate,statewide,False,Sam Steiger,republican,321236,741210
1,1976,AZ,4000,US Senate,statewide,False,Wm. Mathews Feighan,independent,1565,741210
2,1976,AZ,4000,US Senate,statewide,False,Dennis DeConcini,democrat,400334,741210
3,1976,AZ,4000,US Senate,statewide,False,Allan Norwitz,libertarian,7310,741210
4,1976,AZ,4000,US Senate,statewide,False,Bob Field,independent,10765,741210


#### 2020 Data Formatting

Added to include the 2020 results from a new dataset.

NYT: https://static01.nyt.com/elections-assets/2020/data/api/2020-11-03/state-page/arizona.json
Election Scraper: https://github.com/alex/nyt-2020-election-scraper

In [20]:
with open(dataPath + '/raw/voting/President/2020_results.json') as f:
    data = json.load(f)

In [21]:
df_2020 = pd.DataFrame()
for race in data['data']['races']:
    candidates = {}
    for person in race['candidates']:
        candidates[person['candidate_key']] =  {'name': person['name_display'],
                              'party': person['party_id']}
    state = race['state_id'] 
    office = race['office'] 
    county_data = []
    for county in race['counties']:
        fips, name, total_votes =  county['fips'], county['name'], county['votes']
        for cand, votes in county['results'].items():
            results = {'FIPS': fips, 'county_name': name, 'totalvotes': total_votes, 'DISTRICT': 'statewide',
                      'office': office, 'candidatevotes': votes,'year' : 2020, 'special': 'False',
                      'state_po': state, 'candidate': candidates[cand]['name'], 'party': candidates[cand]['party']}
            county_data.append(results)
    df = pd.DataFrame(county_data)
    df_2020 = pd.concat([df_2020, df[pollOut]])
    
df_2020.head()

Unnamed: 0,year,state_po,FIPS,office,DISTRICT,special,candidate,party,candidatevotes,totalvotes
0,2020,AK,2901,President,statewide,False,Donald J. Trump,republican,3511,7360
1,2020,AK,2901,President,statewide,False,Joseph R. Biden Jr.,democrat,3477,7360
2,2020,AK,2901,President,statewide,False,Jo Jorgensen,libertarian,216,7360
3,2020,AK,2901,President,statewide,False,Jesse Ventura,green,50,7360
4,2020,AK,2901,President,statewide,False,Write-ins,write-ins,46,7360


In [22]:
# Combine all datasets into one dataframe and write back to the processed dataset.
# Get the the election results from NY times for the actual FIPS related data.
df_polling = pd.concat([df_sen, df_house, df_pres, df_2020])
df_polling.to_csv(dataPath + '/processed/voting.tsv.gz',
                    compression = 'gzip',
                    mode = 'w',
                    sep='\t',
                    index = False,
                    encoding='utf-8',
                    line_terminator = '\n')

df_polling.head()

Unnamed: 0,year,state_po,FIPS,office,DISTRICT,special,candidate,party,candidatevotes,totalvotes
0,1976,AZ,4000,US Senate,statewide,False,Sam Steiger,republican,321236.0,741210
1,1976,AZ,4000,US Senate,statewide,False,Wm. Mathews Feighan,independent,1565.0,741210
2,1976,AZ,4000,US Senate,statewide,False,Dennis DeConcini,democrat,400334.0,741210
3,1976,AZ,4000,US Senate,statewide,False,Allan Norwitz,libertarian,7310.0,741210
4,1976,AZ,4000,US Senate,statewide,False,Bob Field,independent,10765.0,741210


# GDP Data 

This data estimates different industries gdp output from each county and also tracks the YoY change from 2000 onwards

In [37]:
# Constants
industries = ['All industry total','Accommodation and food services',
 'Administrative and support and waste management and remediation services',
 'Agriculture, forestry, fishing and hunting', 'All industry total (percent change)',
 'Arts, entertainment, and recreation','Construction','Durable goods manufacturing','Educational services',
 'Finance and insurance','Government and government enterprises','Health care and social assistance',
 'Information', 'Management of companies and enterprises', 'Manufacturing',
 'Mining, quarrying, and oil and gas extraction', 'Nondurable goods manufacturing', 'Private industries',
 'Professional, scientific, and technical services','Real estate and rental and leasing', 'Retail trade',
 'Trade', 'Transportation and utilities', 'Transportation and warehousing', 'Utilities', 'Wholesale trade']


In [39]:
def gdp_reformat(df, typ):
    # Replace GeoFIPS with a string, drop unnecessary columns 
    df['GeoFIPS'] = df['GeoFIPS'].str.replace('"','').str.strip()
    df['Description'] = df['Description'].str.strip()
    df.rename(columns = {'GeoFIPS': 'FIPS'}, inplace = True)
    df = df[df['FIPS'].astype(int) != 0]
    df.drop(columns = ['TableName','Region','GeoName','IndustryClassification','Unit'], inplace = True)

    for yr in range(2002,2019):
        df = df.replace({str(yr): {'(D)': None, '(NA)': None}})
        df[str(yr)] = df[str(yr)].astype('float')

    df = pd.melt(df, id_vars = ['FIPS','LineCode','Description'], value_vars = [str(YR) for YR in range(2002,2019)])
    df.rename(columns = {'variable':'YR'}, inplace = True)
    df.replace({'All industry total (percent change)': 'All industry total'},inplace = True)
    df.insert(3,'UNIT',typ)
    return df

In [40]:
df_gdp = pd.read_csv(dataPath + '/raw/indicators/gdp_usd_all_areas.tsv.gz',
                index_col = False,
                compression = 'gzip',
                sep='\t',
                encoding='latin-1')

df_gdp_chg = pd.read_csv(dataPath + '/raw/indicators/gdp_change_all_areas.tsv.gz',
                index_col = False,
                compression = 'gzip',
                sep='\t',
                encoding='latin-1')

df_gdp.head()

Unnamed: 0,GeoFIPS,GeoName,Region,TableName,LineCode,IndustryClassification,Description,Unit,2001,2002,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,"""00000""",United States *,,CAGDP2,1,...,All industry total,Thousands of dollars,10581822000,10936418000,...,14448932000,14992052000,15542582000,16197007000,16784851000,17527258000,18224780000,18715040000,19519424000,20580223000
1,"""00000""",United States *,,CAGDP2,2,...,Private industries,Thousands of dollars,9188887407,9462019658,...,12403879944,12884088643,13405519970,14037519000,14572341002,15255889000,15883937000,16326092000,17065802000,18035586000
2,"""00000""",United States *,,CAGDP2,3,11,"Agriculture, forestry, fishing and hunting",Thousands of dollars,99835618,95628974,...,129967839,146299048,180944850,179573000,215600643,201003000,180655000,164281000,174579000,166464000
3,"""00000""",United States *,,CAGDP2,6,21,"Mining, quarrying, and oil and gas extraction",Thousands of dollars,123924346,112417015,...,275840872,305838102,356315451,358798000,386518621,416375000,259863000,215621000,287324000,346645000
4,"""00000""",United States *,,CAGDP2,10,22,Utilities,Thousands of dollars,181332474,177617765,...,258241011,278837008,287459067,279651000,286340248,298076000,299214000,302355000,315114000,325852000


In [41]:
df_gdp = gdp_reformat(df_gdp, 'USD')
df_gdp_chg = gdp_reformat(df_gdp_chg, 'PCT_CHG')
df_gdp_chg.head()

Unnamed: 0,FIPS,LineCode,Description,UNIT,YR,value
0,1001,1,All industry total,PCT_CHG,2002,2.2
1,1001,2,Private industries,PCT_CHG,2002,0.91
2,1001,3,"Agriculture, forestry, fishing and hunting",PCT_CHG,2002,-0.36
3,1001,6,"Mining, quarrying, and oil and gas extraction",PCT_CHG,2002,-0.07
4,1001,10,Utilities,PCT_CHG,2002,3.25


In [53]:
# Merge the two files for one output
df_gdp_out = df_gdp.merge(df_gdp_chg, on = ['FIPS','YR','Description','LineCode'], how = 'left')
df_gdp_out.drop(columns = ['UNIT_x','UNIT_y','LineCode'], inplace = True)
df_gdp_out.rename(columns = {'value_y': 'PCT_CHG','value_x':'USD'}, inplace = True)

# Filter the extraneous industries out of the df
df_gdp_out['YR'] = df_gdp_out['YR'].astype(int)
df_gdp_out = df_gdp_out[(df_gdp_out['YR'] % 2 == 0) & (df_gdp_out['Description'].isin(industries))]

# Get the breakdown of each industry within a county as well as normalize the total county output within the year
df_gdp_out['PCT_TOTL'] = df_gdp_out.groupby(['FIPS','YR'])['USD'].apply(lambda x: x * 100 / x.max())

# Normalize each counties total based upon the country + year
df_gdp_out.loc[df_gdp['Description'] == 'All industry total', 'USD'] = \
    df_gdp_out[df_gdp_out['Description'] == 'All industry total'].groupby('YR')['USD'].apply(lambda x: (x - x.mean()) / x.std())

# Get the Four year ago USD output from each county and calculate the percent change
df_gdp_ago = df_gdp_out[['FIPS','Description','YR','USD']]
df_gdp_ago.rename(columns = {'USD':'FOUR_YR_PCT_CHG'}, inplace = True)
df_gdp_ago['YR'] = df_gdp_ago['YR'] + 4
df_gdp_out = df_gdp_out.merge(df_gdp_ago, on = ['FIPS','YR','Description'], how = 'left')
df_gdp_out['FOUR_YR_PCT_CHG'] = (df_gdp_out['USD'] - df_gdp_out['FOUR_YR_PCT_CHG'] ) / df_gdp_out['FOUR_YR_PCT_CHG'] * 100

df_gdp_out.head()

Unnamed: 0,FIPS,Description,YR,USD,PCT_CHG,PCT_TOTL,FOUR_YR_PCT_CHG
0,1000,All industry total,2002,1.328426,,100.0,
1,1000,Private industries,2002,106583900.0,,83.404035,
2,1000,"Agriculture, forestry, fishing and hunting",2002,1588009.0,,1.242648,
3,1000,"Mining, quarrying, and oil and gas extraction",2002,1207862.0,,0.945176,
4,1000,Utilities,2002,3442155.0,,2.693554,


In [54]:
# Create the final Feature dataset per year
df_gdp_final = pd.pivot_table(df_gdp_out, index = ['YR','FIPS'], columns = 'Description')
df_gdp_final.columns = ['%s_%s' % (ind, val) for (ind, val) in df_gdp_final.columns]
df_gdp_final.reset_index(inplace = True)

df_gdp_final.head()

Unnamed: 0,YR,FIPS,FOUR_YR_PCT_CHG_Accommodation and food services,FOUR_YR_PCT_CHG_Administrative and support and waste management and remediation services,"FOUR_YR_PCT_CHG_Agriculture, forestry, fishing and hunting",FOUR_YR_PCT_CHG_All industry total,"FOUR_YR_PCT_CHG_Arts, entertainment, and recreation",FOUR_YR_PCT_CHG_Construction,FOUR_YR_PCT_CHG_Durable goods manufacturing,FOUR_YR_PCT_CHG_Educational services,...,USD_Nondurable goods manufacturing,USD_Private industries,"USD_Professional, scientific, and technical services",USD_Real estate and rental and leasing,USD_Retail trade,USD_Trade,USD_Transportation and utilities,USD_Transportation and warehousing,USD_Utilities,USD_Wholesale trade
0,2002,1000,,,,,,,,,...,9434185.0,106583943.0,6697631.0,13597001.0,9832119.0,17088178.0,6693176.0,3251020.0,3442155.0,7256058.0
1,2002,1001,,,,,,,,,...,,673655.0,15695.0,104335.0,83624.0,102404.0,55406.0,12819.0,42587.0,18780.0
2,2002,1003,,,,,,,,,...,79188.0,2880474.0,105693.0,722074.0,351028.0,476434.0,65616.0,51082.0,14533.0,125405.0
3,2002,1005,,,,,,,,,...,85670.0,550980.0,,60325.0,41044.0,,53574.0,43220.0,10354.0,
4,2002,1007,,,,,,,,,...,,173548.0,6004.0,38025.0,16323.0,,9399.0,7256.0,2143.0,


In [55]:
# Write the data into Google Drive
df_gdp_final.to_csv(dataPath + '/processed/gdp.tsv.gz',
                    compression = 'gzip',
                    mode = 'w',
                    sep='\t',
                    index = False,
                    encoding='utf-8',
                    line_terminator = '\n')

# Demographic Data From the Census

In [1]:
# Constants
CENSUS_FIELDS = {'Asian Alone': 'AA','American Indian Alaska': 'IA', 
                 'White Alone':'WA','Black Alone':'BA',
                "Native Hawaiian Pacific": 'NA','Two Or More Races': 
                 'TOM',"Hispanic": 'H'}
AGE_CODES = {1:"0_to_4",2:"5_to_9",3:"10_to_14",4:"15_to_19",5:"20_to_24",
          6:"25_to_29",7:"30_to_34",8:"35_to_39",9:"40_to_44",10:"45_to_49",
          11:"50_to_54",12:"55_to_59",13:"60_to_64",14:"65_to_69",15:"70_to_74",
          16:"75_to_79",17:"80_to_84", 18:"85_to_Older", 0:'Total_Population'}

In [2]:
def formatDemographicData(df, yr):
    # Reformat a few of the fields to match the geographic needs and reformat the Year column.
    df.replace({'X':np.nan}, inplace = True)
    df = df.apply(pd.to_numeric, errors='ignore')
    pd.to_numeric(df['WA_MALE'])
    df.insert(0, 'FIPS', (df['STATE']*1000 + df['COUNTY']).astype(str).str.pad(width = 5, side = 'left', fillchar = '0'))
    df['AGEGRP'] = df['AGEGRP'].replace(AGE_CODES)
    df['YEAR'] = df['YEAR'].apply(lambda x: x + (yr - 3))
    df = df[(df['YEAR'] >= yr) & (~df['AGEGRP'].isin(['0_to_4','5_to_9','10_to_14'])) & (df['YEAR'] % 2 == 0)]

    # Get the total population to decompose each group into a percent
    df_out = df[['YEAR','FIPS','TOT_POP']][df['AGEGRP'] == 'Total_Population']
    df_out.rename(columns = {'TOT_POP':'TOT_POP_CNTY'},inplace = True)
    df = df.merge(df_out, on = ['FIPS', 'YEAR'], how = 'left')
    df['TOT_POP_CNTY'] = df['TOT_POP_CNTY'].astype(float)
    
    # Get state aggregations
    df_state = df[df.columns.difference(['FIPS'])].groupby(['STATE','AGEGRP','YEAR','STNAME']).sum()
    df_state['CTYNAME'] = 'State Level'
    df_state.reset_index(inplace = True)
    df_state.insert(0, 'FIPS', (df_state['STATE']*1000).astype(str).str.pad(width = 5, side = 'left', fillchar = '0')) 
    df = pd.concat([df, df_state[list(df)]])
    
    # Get Demographics as a percentage of age
    out_cols = ['TOT_MALE','TOT_FEMALE']
    df['TOT_MALE'] = round(df['TOT_MALE'] / df['TOT_POP_CNTY'] * 100, 8)
    df['TOT_FEMALE'] = round(df['TOT_FEMALE'] / df['TOT_POP_CNTY'] * 100, 8)
    for key, value in CENSUS_FIELDS.items():
        for gender in ['MALE','FEMALE']:
            demo = '%s_%s' % (value, gender)
            df[demo] = df[demo].astype(float)
            df['H_%s' % gender] = df['H_%s' % gender].astype(float)
      
            if value == 'WA':
                df[demo] = round((df[demo] - df['H_%s' % gender]) / df['TOT_POP_CNTY'] * 100, 8)
            else:
                df[demo] = round(df[demo] / df['TOT_POP_CNTY'] * 100, 8)

            out_cols.append(demo)

    return df[['YEAR','FIPS', 'CTYNAME', 'AGEGRP','STATE'] + out_cols]

In [6]:
# Format all the history files and write them to the output
histPath = dataPath + '/raw/demographics/2009'
demofiles = [f for f in listdir(histPath) if isfile(join(histPath, f))]

df_out = pd.DataFrame()
for file_name in demofiles:
    df_tmp = pd.read_csv(histPath + '/'+ file_name,
                   encoding='latin-1')
    df_format = formatDemographicData(df_tmp, 2000)
    df_out = pd.concat([df_out, df_format])

#download the dataframes for the age fields and the census fields
df_demo_src = pd.read_csv(dataPath + '/raw/demographics/2018/cc-est2018-alldata.csv.gz',
                index_col = False,
                compression = 'gzip',
                sep='\t',
                encoding='latin-1')

df_format = formatDemographicData(df_demo_src, 2010)

df_demo = pd.concat([df_out, df_format])
df_demo.head()

Unnamed: 0,YEAR,FIPS,CTYNAME,AGEGRP,STATE,TOT_MALE,TOT_FEMALE,AA_MALE,AA_FEMALE,IA_MALE,...,WA_MALE,WA_FEMALE,BA_MALE,BA_FEMALE,NA_MALE,NA_FEMALE,TOM_MALE,TOM_FEMALE,H_MALE,H_FEMALE
0,2000,39001,Adams County,Total_Population,39,49.028789,50.971211,0.03658,0.087793,0.380437,...,47.69726,49.573838,0.13169,0.073161,0.01829,0.014632,0.446282,0.563339,0.31825,0.321908
1,2000,39001,Adams County,15_to_19,39,3.584885,3.665362,0.003658,0.007316,0.032922,...,3.504408,3.540988,0.0,0.007316,0.0,0.003658,0.021948,0.043897,0.021948,0.03658
2,2000,39001,Adams County,20_to_24,39,3.0691,3.123971,0.003658,0.003658,0.021948,...,2.973991,3.061784,0.01829,0.007316,0.0,0.0,0.01829,0.025606,0.032922,0.010974
3,2000,39001,Adams County,25_to_29,39,3.178842,3.123971,0.0,0.0,0.029264,...,3.087391,3.058126,0.014632,0.003658,0.0,0.0,0.032922,0.025606,0.014632,0.01829
4,2000,39001,Adams County,30_to_34,39,3.445879,3.244687,0.003658,0.014632,0.025606,...,3.343454,3.149577,0.014632,0.003658,0.003658,0.0,0.03658,0.01829,0.01829,0.029264


In [9]:
# Format the Age Groups appropriately
for grp in list(df_demo)[5:]:
    df_demo.loc[df_demo['AGEGRP'] == '15_to_19', grp] = df_demo[df_demo['AGEGRP'] == '15_to_19'][grp].astype(float) * 2 / 5

# Aggregate the ages into different groups and filter out people who are too young to vote.  Also filter odd numbered years
df_demo['AGEGRP'].replace({'15_to_19':'18_to_29', '20_to_24': '18_to_29', '25_to_29':'18_to_29',
                          '30_to_34':'30_to_49', '35_to_39': '30_to_49', '40_to_44': '30_to_49' , '45_to_49': '30_to_49',
                          '50_to_54': '50_to_64', '55_to_59': '50_to_64' , '60_to_64': '50_to_64',
                          '65_to_69': '65+', '70_to_74': '65+' , '75_to_79': '65+', '75_to_79': '65+', '85_to_Older': '65+'},
                          inplace = True)
df_demo = df_demo.groupby(['YEAR','FIPS','AGEGRP'], as_index=False).sum()

# Create the final Feature dataset per year
df_demo = pd.pivot_table(df_demo, index = ['YEAR','FIPS'], columns = 'AGEGRP')
#df_demo = df_demo.pivot(index = ['YEAR','FIPS'], columns = 'AGEGRP')
df_demo.columns = ['%s_%s' % (demo, age) for (demo, age) in df_demo.columns]
df_demo.reset_index(inplace = True)

df_demo.head()

Unnamed: 0,YEAR,FIPS,AA_FEMALE_18_to_29,AA_FEMALE_30_to_49,AA_FEMALE_50_to_64,AA_FEMALE_65+,AA_FEMALE_80_to_84,AA_FEMALE_Total_Population,AA_MALE_18_to_29,AA_MALE_30_to_49,...,WA_FEMALE_50_to_64,WA_FEMALE_65+,WA_FEMALE_80_to_84,WA_FEMALE_Total_Population,WA_MALE_18_to_29,WA_MALE_30_to_49,WA_MALE_50_to_64,WA_MALE_65+,WA_MALE_80_to_84,WA_MALE_Total_Population
0,2000,1000,0.081112,0.139223,0.054899,0.019273,0.001123,0.384537,0.091126,0.112515,...,6.335615,5.244854,0.912497,35.919704,5.402142,10.374925,5.990792,3.730495,0.500039,34.175328
1,2000,1001,0.035558,0.1436,0.07066,0.029632,0.0,0.328228,0.031911,0.047867,...,6.833516,4.253282,0.615427,40.501915,5.438093,12.620806,6.386761,3.184263,0.328228,39.136579
2,2000,1003,0.031268,0.095502,0.029712,0.020515,0.000707,0.239109,0.026741,0.055886,...,8.29808,6.738211,1.048402,43.894226,5.428911,12.362229,7.728604,5.754184,0.771092,42.139108
3,2000,1005,0.030308,0.05855,0.030997,0.003444,0.003444,0.154985,0.037196,0.05855,...,4.856208,4.250043,0.74393,24.387808,4.129499,8.179783,5.007749,3.061822,0.361633,26.147753
4,2000,1007,0.019061,0.015048,0.0,0.0,0.0,0.055177,0.007022,0.010032,...,7.03752,5.186597,0.682183,39.571629,6.17175,11.592095,6.79675,3.867376,0.331059,38.448034


In [10]:
# Write the data into the processed folder
df_demo.to_csv(dataPath + '/processed/demographics.tsv.gz',
                    compression = 'gzip',
                    mode = 'w',
                    sep='\t',
                    index = False,
                    encoding='utf-8',
                    line_terminator = '\n')  

# Indicator Analysis

1. Cultural Indicators for the County
2. Unemployment + Employment Values

Files that are skipped until further notice.

    - People: Analysis of net immigration from different areas over the different time zones.
    - Veterans: Percent of population that is veterans

In [60]:
df_culture = pd.read_csv(dataPath+ '/raw/indicators/regions.csv',
                        index_col = False,
                        encoding='latin-1',
                        dtype = {'FIPS':np.str})

df_county = pd.read_csv(dataPath + '/raw/indicators/Rural_Atlas_Update22/County Classifications.tsv.gz',
                index_col = False,
                compression = 'gzip',
                sep='\t')

df_county.head()

Unnamed: 0,ï»¿FIPStxt,State,County,RuralUrbanContinuumCode2013,UrbanInfluenceCode2013,RuralUrbanContinuumCode2003,UrbanInfluenceCode2003,Metro2013,Nonmetro2013,Micropolitan2013,...,FarmDependent2003,ManufacturingDependent2000,LowEducation2000,RetirementDestination2000,PersistentPoverty2000,Noncore2013,Type_2015_Nonspecialized_NO,Metro_Adjacent2013,PersistentChildPoverty2004,RecreationDependent2000
0,1001,AL,Autauga,2.0,2.0,2.0,2.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1003,AL,Baldwin,3.0,2.0,4.0,5.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1005,AL,Barbour,6.0,6.0,6.0,6.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
3,1007,AL,Bibb,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
4,1009,AL,Blount,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [61]:
# Format the County file
df_county.rename({'ï»¿FIPStxt':"FIPS"}, axis = 'columns', inplace = True)
df_county['FIPS'] = df_county['FIPS'].astype(str).str.pad(width = 5, side = 'left', fillchar = '0')

# Format the Rural Urban Continuum
df_county = df_county[['FIPS'] + ['%s%s' % (col, yr) for yr in [2003,2013] for col in ['RuralUrbanContinuumCode','UrbanInfluenceCode','Metro','Nonmetro','Micropolitan']]] 
df_county.columns = df_county.columns.str.replace("2003", "2000s").str.replace('2013','2010s')
    
df_culture = df_county.merge(df_culture, on = 'FIPS', how = 'left')
df_culture.loc[df_culture['REGION'].isnull(),'REGION'] = 12
df_culture.loc[df_culture['DESCRIPTION'].isnull(),'DESCRIPTION'] = 'HI & AK'

df_cult_out = pd.DataFrame()
for yr in [2000, 2010]:
    df_tmp = df_culture[['FIPS','State','REGION','DESCRIPTION']]
    df_tmp['year_join'] = yr
    for col in ['RuralUrbanContinuumCode','UrbanInfluenceCode','Metro','Nonmetro','Micropolitan']:
        df_tmp[col] = df_culture[col + str(yr) + 's']
    df_cult_out = pd.concat([df_cult_out, df_tmp])

df_cult_out.head()

Unnamed: 0,FIPS,State,REGION,DESCRIPTION,year_join,RuralUrbanContinuumCode,UrbanInfluenceCode,Metro,Nonmetro,Micropolitan
0,1001,AL,1.0,DEEP SOUTH,2000,2.0,2.0,1.0,0.0,0.0
1,1003,AL,1.0,DEEP SOUTH,2000,4.0,5.0,0.0,1.0,1.0
2,1005,AL,1.0,DEEP SOUTH,2000,6.0,6.0,0.0,1.0,0.0
3,1007,AL,1.0,DEEP SOUTH,2000,1.0,1.0,1.0,0.0,0.0
4,1009,AL,11.0,GREATER APPALACHIA,2000,1.0,1.0,1.0,0.0,0.0


In [63]:
# Write the data into Google Drive
df_cult_out.to_csv(dataPath + '/processed/culture.tsv.gz',
                    compression = 'gzip',
                    mode = 'w',
                    sep='\t',
                    index = False,
                    encoding='utf-8',
                    line_terminator = '\n')

#### Unemployment and Employment Numbers

In [5]:
def outputCleanEmploymentValues(df_emp):
    """ Function that outputs each of the employment datasets for merging
    """
    df_emp.rename(columns = {'GeoFIPS':'FIPS'}, inplace = True)
    df_emp.drop(columns = ['GeoName','Region','TableName','IndustryClassification','Unit','LineCode'], inplace = True)
    df_emp = df_emp[~df_emp['FIPS'].str.replace("\"",'').apply(pd.to_numeric, errors='coerce').isna()]
    df_emp['FIPS'] = df_emp['FIPS'].str.replace('\"','').str.strip()
    df_emp['Description'] = df_emp['Description'].str.strip().str.upper()
    return df_emp

In [33]:
EMPLOY_SECTOR_REPLACE = {'AGRICULTURAL SERVICES, FORESTRY, AND FISHING':'FORESTRY, FISHING, AND RELATED ACTIVITIES',
'MINING':'MINING, QUARRYING, AND OIL AND GAS EXTRACTION',
'TRANSPORTATION AND PUBLIC UTILITIES':'TRANSPORTATION AND WAREHOUSING',
'FINANCE, INSURANCE, AND REAL ESTATE':'FINANCE AND INSURANCE',
'SERVICES':'OTHER SERVICES (EXCEPT GOVERNMENT AND GOVERNMENT ENTERPRISES)'}

In [53]:
# Format all the history files and write them to the output
df_empHist = pd.read_csv(dataPath + '/raw/employment/CAEMP25S__ALL_AREAS_1969_2000.csv',
                index_col = False,
                encoding = 'latin-1',
                dtype = {'LineCode':np.str})

# Format all the history files and write them to the output
df_empCurr = pd.read_csv(dataPath + '/raw/employment/CAEMP25N__ALL_AREAS_2001_2018.csv',
                index_col = False,
                encoding = 'latin-1',
                dtype = {'LineCode':np.str})

df_empHist = outputCleanEmploymentValues(df_empHist)
df_empHist['Description'] = df_empHist['Description'].replace(EMPLOY_SECTOR_REPLACE)
df_empCurr = outputCleanEmploymentValues(df_empCurr)

df_employment = df_empCurr.merge(df_empHist, how = 'left', on = ['FIPS','Description'])
df_employment.head()

Unnamed: 0,FIPS,Description,2001,2002,2003,2004,2005,2006,2007,2008,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000
0,0,TOTAL EMPLOYMENT (NUMBER OF JOBS),165522200,165095100,165921500,168839700,172338400,175868600,179543700,179213900,...,137612800,138166100,140774400,144196600,147915800,151056200,154541200,158481200,161531300,165370800
1,0,WAGE AND SALARY EMPLOYMENT,137334000,136301000,135967000,137404000,139341000,141660000,143170000,142584000,...,115086000,115666000,117717000,120568000,123412000,125711000,128681000,131920000,134766000,137610000
2,0,PROPRIETORS EMPLOYMENT,28188200,28794100,29954500,31435700,32997400,34208600,36373700,36629900,...,22526800,22500100,23057400,23628600,24503800,25345200,25860200,26561200,26765300,27760800
3,0,FARM PROPRIETORS EMPLOYMENT,2190000,2032000,1953000,1894000,1875000,1827000,1914000,1897000,...,2194000,2191000,2273000,2245000,2240000,2205000,2212000,2249000,2237000,2224000
4,0,NONFARM PROPRIETORS EMPLOYMENT 2/,25998200,26762100,28001500,29541700,31122400,32381600,34459700,34732900,...,20332800,20309100,20784400,21383600,22263800,23140200,23648200,24312200,24528300,25536800


In [54]:
# Reverse Pivot the years into rows and take care of formatting issues for the below pivot
df_employment = pd.melt(df_employment, id_vars = ['FIPS','Description'])
df_employment.rename(columns = {'variable':'YR'}, inplace = True)
df_employment['value'] = df_employment['value'].str.replace('\(NA\)','').str.replace('\(D\)','')
df_employment['value'] = df_employment['value'].apply(lambda x: float(x) if x != '' else np.nan)

# Pivot the table to get years as the rows and employment as the column values
df_employment = df_employment.pivot(index = ['FIPS','YR'], columns = 'Description')
df_employment.reset_index(inplace = True)
df_employment.columns = ['FIPS','YR'] + [y for (x,y) in df_employment.columns[2:]]
df_employment['YR'] = df_employment['YR'].astype(int)

# Get percentage of employment for each county for each year
for sector in list(df_employment)[2:]:
    if sector == 'TOTAL EMPLOYMENT (NUMBER OF JOBS)':
        continue
    df_employment[sector] = df_employment[sector].astype(float) / df_employment['TOTAL EMPLOYMENT (NUMBER OF JOBS)'].astype(float)

df_employment.head()

Unnamed: 0,FIPS,YR,ACCOMMODATION AND FOOD SERVICES,ADMINISTRATIVE AND SUPPORT AND WASTE MANAGEMENT AND REMEDIATION SERVICES,"ARTS, ENTERTAINMENT, AND RECREATION",CONSTRUCTION,EDUCATIONAL SERVICES,FARM EMPLOYMENT,FARM PROPRIETORS EMPLOYMENT,FEDERAL CIVILIAN,...,PROPRIETORS EMPLOYMENT,REAL ESTATE AND RENTAL AND LEASING,RETAIL TRADE,STATE AND LOCAL,STATE GOVERNMENT,TOTAL EMPLOYMENT (NUMBER OF JOBS),TRANSPORTATION AND WAREHOUSING,UTILITIES,WAGE AND SALARY EMPLOYMENT,WHOLESALE TRADE
0,0,1969,,,,0.049101,,0.043689,0.030213,0.032058,...,0.135429,,0.147704,0.104455,,91053200.0,0.052671,,0.864571,0.045006
1,0,1970,,,,0.048191,,0.043395,0.029766,0.031793,...,0.136776,,0.150078,0.108975,,91277600.0,0.053304,,0.863224,0.045714
2,0,1971,,,,0.04868,,0.04276,0.029307,0.031295,...,0.139432,,0.153127,0.112861,,91581400.0,0.05272,,0.860568,0.046199
3,0,1972,,,,0.050183,,0.041045,0.028035,0.030282,...,0.141235,,0.152572,0.113845,,94312200.0,0.052215,,0.858765,0.045995
4,0,1973,,,,0.051554,,0.039582,0.02669,0.028844,...,0.139905,,0.152519,0.113027,,98427500.0,0.051537,,0.860095,0.046016


In [55]:
# Read Files necessary for Unemployment Analysis
df_unemp = pd.read_excel(dataPath + '/raw/indicators/Unemployment.xls',
                index_col = False)

#df_jobs = pd.read_csv(dataPath + '/raw/indicators/Rural_Atlas_Update22/Jobs.csv.gz',
#                index_col = False,
#                compression = 'gzip',
#                sep='\t')
#df_jobs.rename({'ï»¿FIPS':"FIPS"}, axis = 'columns', inplace = True)
#df_jobs = df_jobs[['FIPS','State','PctEmpAgriculture','PctEmpMining','PctEmpConstruction','PctEmpManufacturing',
# 'PctEmpTrade','PctEmpTrans','PctEmpInformation','PctEmpFIRE','PctEmpServices','PctEmpGovt',]]

df_income = pd.read_csv(dataPath + '/raw/indicators/Rural_Atlas_Update22/Income.csv.gz',
                index_col = False,
                compression = 'gzip',
                sep='\t')

df_income.head()

Unnamed: 0,ï»¿FIPS,State,County,MedHHInc,PerCapitaInc,PovertyUnder18Pct,PovertyAllAgesPct,Deep_Pov_All,Deep_Pov_Children,PovertyUnder18Num,PovertyAllAgesNum
0,0,US,United States,61937.0,32621.0,18.0,13.1,6.24959,8.598276,12997532.0,41852315.0
1,1000,AL,Alabama,49881.0,26846.0,23.9,16.8,7.611623,11.591313,255613.0,801758.0
2,1001,AL,Autauga,59338.0,29372.0,19.3,13.8,6.142609,8.910594,2509.0,7587.0
3,1003,AL,Baldwin,57588.0,31203.0,13.9,9.8,4.482528,6.214526,6442.0,21069.0
4,1005,AL,Barbour,34382.0,18461.0,43.9,30.9,12.749387,26.709797,2242.0,6788.0


In [56]:
# Select the desired columns the income values and rename to reflect the current status
df_income.rename({'ï»¿FIPS':"FIPS"}, axis = 'columns', inplace = True)
df_income = df_income[['FIPS','State','MedHHInc','PerCapitaInc','PovertyUnder18Pct','PovertyAllAgesPct',
                         'Deep_Pov_All', 'Deep_Pov_Children' ]]
df_income.columns = ['FIPS','State'] + ['Curr_%s' % col for col in df_income.columns[2:]]
df_income['FIPS'] = df_income['FIPS'].astype(str).str.pad(width = 5, side = 'left', fillchar = '0')

# Reverse Pivot the years into rows and take care of formatting issues for the below pivot
df_unemp = df_unemp[['FIPS','State'] + ['Unemployment_rate_%s' % yr for yr in range(2007,2019)]]
df_unemp = pd.melt(df_unemp, id_vars = ['FIPS','State'])
df_unemp.rename(columns = {'value': 'Unemployment_Rate', 'variable':'YR'}, inplace = True)
df_unemp['YR'] = df_unemp['YR'].str.replace('Unemployment_rate_','').astype(int)
df_unemp['FIPS'] = df_unemp['FIPS'].astype(str).str.pad(width = 5, side = 'left', fillchar = '0')

# Format the dataframes and merge them to form one output
df_employ = df_employment.merge(df_unemp, on = ['FIPS','YR'], how = 'left')\
                    .merge(df_income, on = ['FIPS','State'], how = 'left')
                    
#df_employ = df_unemp.merge(df_income, on = ['FIPS','State'], how = 'left').\
#                    merge(df_employment, on = ['FIPS','YR'], how = 'left')
df_employ.head()

Unnamed: 0,FIPS,YR,ACCOMMODATION AND FOOD SERVICES,ADMINISTRATIVE AND SUPPORT AND WASTE MANAGEMENT AND REMEDIATION SERVICES,"ARTS, ENTERTAINMENT, AND RECREATION",CONSTRUCTION,EDUCATIONAL SERVICES,FARM EMPLOYMENT,FARM PROPRIETORS EMPLOYMENT,FEDERAL CIVILIAN,...,WAGE AND SALARY EMPLOYMENT,WHOLESALE TRADE,State,Unemployment_Rate,Curr_MedHHInc,Curr_PerCapitaInc,Curr_PovertyUnder18Pct,Curr_PovertyAllAgesPct,Curr_Deep_Pov_All,Curr_Deep_Pov_Children
0,0,1969,,,,0.049101,,0.043689,0.030213,0.032058,...,0.864571,0.045006,,,,,,,,
1,0,1970,,,,0.048191,,0.043395,0.029766,0.031793,...,0.863224,0.045714,,,,,,,,
2,0,1971,,,,0.04868,,0.04276,0.029307,0.031295,...,0.860568,0.046199,,,,,,,,
3,0,1972,,,,0.050183,,0.041045,0.028035,0.030282,...,0.858765,0.045995,,,,,,,,
4,0,1973,,,,0.051554,,0.039582,0.02669,0.028844,...,0.860095,0.046016,,,,,,,,


In [60]:
# Write the data into Google Drive
df_employ.to_csv(dataPath + '/processed/employment.tsv.gz',
                    compression = 'gzip',
                    mode = 'w',
                    sep='\t',
                    index = False,
                    encoding='utf-8',
                    line_terminator = '\n')

#### Education Statistics of a County

In [26]:
# Read Files necessary for Education Analysis
df_edu = pd.read_excel(dataPath + '/raw/indicators/Education.xls',
                index_col = False)

# Format and select the appropriate columns
df_edu['FIPS'] = df_edu['FIPS'].astype(str).str.pad(width = 5, side = 'left', fillchar = '0')
df_edu = df_edu[['FIPS','State'] + [col for col in df_edu.columns if 'percent' in col.lower() or 'PCT_' in col]]
df_edu.columns = df_edu.columns.str.replace('Percent of adults with less than a high school diploma, ', 'PCT_LESS_HS_')\
                    .str.replace('Percent of adults with a high school diploma only, ','PCT_HS_')\
                    .str.replace('Percent of adults completing some college \(1-3 years\),', 'PCT_SOME_BA_')\
                    .str.replace('Percent of adults completing four years of college or higher, ', 'PCT_EQ_MORE_BA_')\
                    .str.replace("Percent of adults completing some college or associate's degree, ", 'PCT_SOME_BA_')\
                    .str.replace("Percent of adults with a bachelor's degree or higher,", 'PCT_EQ_MORE_BA_')\
                    .str.replace("13_17", '2010').str.replace(' ','')

# Pivot the formatting to the year level
df_edu_out = pd.DataFrame()
for yr in ['1970', '1980', '1990', '2000', '2010']:
    df_yrs = df_edu[['FIPS','State']]
    df_yrs["year_join"] = int(yr)
    for edu_lvl in ['PCT_LESS_HS_%s','PCT_HS_%s','PCT_SOME_BA_%s','PCT_EQ_MORE_BA_%s']:
        df_yrs[edu_lvl.replace('_%s','')] = df_edu[edu_lvl % yr]
    df_edu_out = pd.concat([df_edu_out, df_yrs])

df_edu_out.head()

Unnamed: 0,FIPS,State,year_join,PCT_LESS_HS,PCT_HS,PCT_SOME_BA,PCT_EQ_MORE_BA
0,0,US,1970,47.7,31.1,10.6,10.7
1,1000,AL,1970,58.7,25.9,7.5,7.8
2,1001,AL,1970,54.8,31.1,7.7,6.4
3,1003,AL,1970,59.4,26.7,7.4,6.5
4,1005,AL,1970,68.8,19.0,4.9,7.3


In [27]:
# Write the data into Google Drive
df_edu_out.to_csv(dataPath + '/processed/education.tsv.gz',
                    compression = 'gzip',
                    mode = 'w',
                    sep='\t',
                    index = False,
                    encoding='utf-8',
                    line_terminator = '\n')