In [1]:
# The sources of data that you will extract from.
# The type of transformation needed for this data (cleaning, joining, filtering, aggregating, etc).
# The type of final production database to load the data into (relational or non-relational).
# The final tables or collections that will be used in the production database.
import getpass
import os
import re
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import xlrd
from sqlalchemy import create_engine, inspect

In [2]:
# Open the source workbook once; later cells parse individual sheets from
# this handle instead of re-reading the file.
fea_xls = './data/DataDownload.xls'
xls = pd.ExcelFile(fea_xls)

# List the available sheets so the names used below can be checked.
print(xls.sheet_names)

['Read_Me', 'Variable List', 'Supplemental Data - County', 'Supplemental Data - State', 'ACCESS', 'STORES', 'RESTAURANTS', 'ASSISTANCE', 'INSECURITY', 'PRICES_TAXES', 'LOCAL', 'HEALTH', 'SOCIOECONOMIC']


In [3]:
# Load the data dictionary sheet and show the rows describing the ACCESS
# category (variable codes, geography and units).
variables = xls.parse(sheet_name='Variable List')
variables.loc[variables['Category Code'] == "ACCESS"]

Unnamed: 0,Category Name,Category Code,Subcategory Name,Variable Name,Variable Code,Geography,Units
0,Access and Proximity to Grocery Store,ACCESS,Overall,"Population, low access to store, 2010",LACCESS_POP10,CNTY10,Count
1,Access and Proximity to Grocery Store,ACCESS,Overall,"Population, low access to store, 2015",LACCESS_POP15,CNTY10,Count
2,Access and Proximity to Grocery Store,ACCESS,Overall,"Population, low access to store (% change), 20...",PCH_LACCESS_POP_10_15,CNTY10,% change
3,Access and Proximity to Grocery Store,ACCESS,Overall,"Population, low access to store (%), 2010",PCT_LACCESS_POP10,CNTY10,Percent
4,Access and Proximity to Grocery Store,ACCESS,Overall,"Population, low access to store (%), 2015",PCT_LACCESS_POP15,CNTY10,Percent
5,Access and Proximity to Grocery Store,ACCESS,Household Resources,"Low income & low access to store, 2010",LACCESS_LOWI10,CNTY10,Count
6,Access and Proximity to Grocery Store,ACCESS,Household Resources,"Low income & low access to store, 2015",LACCESS_LOWI15,CNTY10,Count
7,Access and Proximity to Grocery Store,ACCESS,Household Resources,"Low income & low access to store (% change), 2...",PCH_LACCESS_LOWI_10_15,CNTY10,% change
8,Access and Proximity to Grocery Store,ACCESS,Household Resources,"Low income & low access to store (%), 2010",PCT_LACCESS_LOWI10,CNTY10,Percent
9,Access and Proximity to Grocery Store,ACCESS,Household Resources,"Low income & low access to store (%), 2015",PCT_LACCESS_LOWI15,CNTY10,Percent


In [4]:
# Food-access sheet. One column arrives as 'LACCESS_CHILD_10_15'; it is
# renamed to carry the 'PCH_' prefix used by the other *_10_15 columns, which
# also keeps it out of the "^LACCESS...15$" regex selection further down.
access = (
    xls.parse(sheet_name='ACCESS')
    .rename(columns={'LACCESS_CHILD_10_15': 'PCH_LACCESS_CHILD_10_15'})
)
print(access.columns)
access.head()

Index(['FIPS', 'State', 'County', 'LACCESS_POP10', 'LACCESS_POP15',
       'PCH_LACCESS_POP_10_15', 'PCT_LACCESS_POP10', 'PCT_LACCESS_POP15',
       'LACCESS_LOWI10', 'LACCESS_LOWI15', 'PCH_LACCESS_LOWI_10_15',
       'PCT_LACCESS_LOWI10', 'PCT_LACCESS_LOWI15', 'LACCESS_HHNV10',
       'LACCESS_HHNV15', 'PCH_LACCESS_HHNV_10_15', 'PCT_LACCESS_HHNV10',
       'PCT_LACCESS_HHNV15', 'LACCESS_SNAP15', 'PCT_LACCESS_SNAP15',
       'LACCESS_CHILD10', 'LACCESS_CHILD15', 'PCH_LACCESS_CHILD_10_15',
       'PCT_LACCESS_CHILD10', 'PCT_LACCESS_CHILD15', 'LACCESS_SENIORS10',
       'LACCESS_SENIORS15', 'PCH_LACCESS_SENIORS_10_15',
       'PCT_LACCESS_SENIORS10', 'PCT_LACCESS_SENIORS15', 'LACCESS_WHITE15',
       'PCT_LACCESS_WHITE15', 'LACCESS_BLACK15', 'PCT_LACCESS_BLACK15',
       'LACCESS_HISP15', 'PCT_LACCESS_HISP15', 'LACCESS_NHASIAN15',
       'PCT_LACCESS_NHASIAN15', 'LACCESS_NHNA15', 'PCT_LACCESS_NHNA15',
       'LACCESS_NHPI15', 'PCT_LACCESS_NHPI15', 'LACCESS_MULTIR15',
       'PCT_LACCESS_

Unnamed: 0,FIPS,State,County,LACCESS_POP10,LACCESS_POP15,PCH_LACCESS_POP_10_15,PCT_LACCESS_POP10,PCT_LACCESS_POP15,LACCESS_LOWI10,LACCESS_LOWI15,...,LACCESS_HISP15,PCT_LACCESS_HISP15,LACCESS_NHASIAN15,PCT_LACCESS_NHASIAN15,LACCESS_NHNA15,PCT_LACCESS_NHNA15,LACCESS_NHPI15,PCT_LACCESS_NHPI15,LACCESS_MULTIR15,PCT_LACCESS_MULTIR15
0,1001,AL,Autauga,18428.439685,17496.693038,-5.056026,33.769657,32.062255,5344.427472,6543.676824,...,471.136164,0.863345,86.767975,0.159,61.169869,0.112092,8.817961,0.016159,482.848633,0.884808
1,1003,AL,Baldwin,35210.814078,30561.26443,-13.204891,19.318473,16.767489,9952.144027,9886.831137,...,1377.874834,0.755973,212.946378,0.116833,181.649648,0.099662,14.819634,0.008131,1127.696098,0.618712
2,1005,AL,Barbour,5722.305602,6069.523628,6.067799,20.840972,22.10556,3135.676086,2948.790251,...,509.377525,1.855183,17.09641,0.062266,39.960527,0.145539,8.082376,0.029436,462.382655,1.684025
3,1007,AL,Bibb,1044.867327,969.378841,-7.224696,4.559753,4.230324,491.449066,596.162829,...,8.596762,0.037516,1.994318,0.008703,2.513097,0.010967,0.0,0.0,5.259244,0.022951
4,1009,AL,Blount,1548.175559,3724.428242,140.568857,2.70084,6.49738,609.027708,1650.959482,...,497.489891,0.867886,8.428994,0.014705,28.938242,0.050484,1.062851,0.001854,202.914187,0.35399


In [5]:
# 2015 slice of the low-access columns.
# Columns are chosen by regex (name starts with LACCESS and ends with 15); the
# trailing "15" is stripped so the names no longer carry the year, and the
# FIPS key plus a constant Year column are placed in front.
laccess = access.filter(regex=("^LACCESS+.*15$")).rename(
    columns=lambda col: re.sub('15$', '', col)
)
laccess.insert(0, 'FIPS', access['FIPS'])
laccess.insert(1, 'Year', 2015)
laccess.head()

Unnamed: 0,FIPS,Year,LACCESS_POP,LACCESS_LOWI,LACCESS_HHNV,LACCESS_SNAP,LACCESS_CHILD,LACCESS_SENIORS,LACCESS_WHITE,LACCESS_BLACK,LACCESS_HISP,LACCESS_NHASIAN,LACCESS_NHNA,LACCESS_NHPI,LACCESS_MULTIR
0,1001,2015,17496.693038,6543.676824,677.672769,931.935186,4616.97148,2180.809285,12640.615414,4216.473194,471.136164,86.767975,61.169869,8.817961,482.848633
1,1003,2015,30561.26443,9886.831137,1394.162766,950.53529,7007.972675,5580.66279,25483.186811,3540.965826,1377.874834,212.946378,181.649648,14.819634,1127.696098
2,1005,2015,6069.523628,2948.790251,425.144927,422.56904,1031.927776,824.175262,2828.876711,2713.12493,509.377525,17.09641,39.960527,8.082376,462.382655
3,1007,2015,969.378841,596.162829,224.388125,53.818772,232.64267,137.688231,540.512239,419.099942,8.596762,1.994318,2.513097,0.0,5.259244
4,1009,2015,3724.428242,1650.959482,719.931444,175.370338,910.686113,505.914282,3387.241266,95.842687,497.489891,8.428994,28.938242,1.062851,202.914187


In [6]:
# 2010 slice of the low-access columns: names starting with LACCESS and
# ending with 10, with the trailing "10" stripped so they line up with the
# year-neutral names already used in `laccess`.
laccess_2010 = access.filter(regex=("^LACCESS+.*10$")).rename(
    columns=lambda col: re.sub('10$', '', col)
)
laccess_2010.insert(0, 'FIPS', access['FIPS'])
laccess_2010.insert(1, 'Year', 2010)

# Stack the 2010 rows under the existing rows of `laccess`.
# - DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# - Any 2010 rows already present are dropped first, so re-running this cell
#   does not pile up duplicate 2010 rows.
# Columns that exist only for 2015 are left as NaN on the 2010 rows.
laccess = pd.concat(
    [laccess[laccess['Year'] != 2010], laccess_2010],
    sort=False,
)

In [7]:
def _pct_access_for_year(source, suffix, year):
    """Return the PCT_LACCESS columns of ``source`` for one year.

    Parameters
    ----------
    source : pandas.DataFrame
        Frame holding a 'FIPS' column and PCT_LACCESS* columns whose names
        end in a two-digit year suffix.
    suffix : str
        Two-digit year suffix to select on and strip (e.g. '15').
    year : int
        Value written to the constant 'Year' column.

    Returns
    -------
    pandas.DataFrame
        'FIPS', 'Year', then the selected columns with the suffix removed.
    """
    frame = source.filter(regex=f"^PCT_LACCESS+.*{suffix}$").rename(
        columns=lambda col: re.sub(f'{suffix}$', '', col)
    )
    frame.insert(0, 'FIPS', source['FIPS'])
    frame.insert(1, 'Year', year)
    return frame


# Percentage table: 2015 rows first, then 2010 rows.
# DataFrame.append was removed in pandas 2.0, so the two slices are stacked
# with pd.concat instead. Columns that exist only for 2015 are NaN on the
# 2010 rows.
pct_access = pd.concat(
    [
        _pct_access_for_year(access, '15', 2015),
        _pct_access_for_year(access, '10', 2010),
    ],
    sort=False,
)

In [24]:
# Final dataframe: counts (laccess) joined to percentages (pct_access) on the
# county/year key. This is an inner join (the pd.merge default).
# validate='one_to_one' makes pandas raise a MergeError if either side has a
# repeated (FIPS, Year) pair, instead of silently multiplying rows -- e.g.
# when an upstream cell was executed more than once.
access_df = pd.merge(
    laccess,
    pct_access,
    on=['FIPS', 'Year'],
    validate='one_to_one',
)
access_df.head()

Unnamed: 0,FIPS,Year,LACCESS_POP,LACCESS_LOWI,LACCESS_HHNV,LACCESS_SNAP,LACCESS_CHILD,LACCESS_SENIORS,LACCESS_WHITE,LACCESS_BLACK,...,PCT_LACCESS_SNAP,PCT_LACCESS_CHILD,PCT_LACCESS_SENIORS,PCT_LACCESS_WHITE,PCT_LACCESS_BLACK,PCT_LACCESS_HISP,PCT_LACCESS_NHASIAN,PCT_LACCESS_NHNA,PCT_LACCESS_NHPI,PCT_LACCESS_MULTIR
0,1001,2015,17496.693038,6543.676824,677.672769,931.935186,4616.97148,2180.809285,12640.615414,4216.473194,...,4.608749,8.460485,3.996279,23.163613,7.726582,0.863345,0.159,0.112092,0.016159,0.884808
1,1003,2015,30561.26443,9886.831137,1394.162766,950.53529,7007.972675,5580.66279,25483.186811,3540.965826,...,1.2989,3.844936,3.06184,13.981393,1.942757,0.755973,0.116833,0.099662,0.008131,0.618712
2,1005,2015,6069.523628,2948.790251,425.144927,422.56904,1031.927776,824.175262,2828.876711,2713.12493,...,4.303147,3.758341,3.001695,10.302934,9.88136,1.855183,0.062266,0.145539,0.029436,1.684025
3,1007,2015,969.378841,596.162829,224.388125,53.818772,232.64267,137.688231,540.512239,419.099942,...,0.67671,1.015242,0.600865,2.35877,1.828933,0.037516,0.008703,0.010967,0.0,0.022951
4,1009,2015,3724.428242,1650.959482,719.931444,175.370338,910.686113,505.914282,3387.241266,95.842687,...,0.812727,1.58872,0.882583,5.909147,0.167201,0.867886,0.014705,0.050484,0.001854,0.35399


In [25]:
# Materialise the row position as an 'index' column (renamed to 'id' in the
# next cell). Any 'index'/'id' column left over from a previous run of these
# cells is dropped first, so running this cell again always produces the same
# frame instead of adding a 'level_0' column or raising.
access_df = access_df.drop(columns=['index', 'id'], errors='ignore').reset_index()

In [29]:
# Expose the former row index under the name 'id'.
access_df = access_df.rename({'index': 'id'}, axis='columns')
access_df.head()

Unnamed: 0,id,FIPS,Year,LACCESS_POP,LACCESS_LOWI,LACCESS_HHNV,LACCESS_SNAP,LACCESS_CHILD,LACCESS_SENIORS,LACCESS_WHITE,...,PCT_LACCESS_SNAP,PCT_LACCESS_CHILD,PCT_LACCESS_SENIORS,PCT_LACCESS_WHITE,PCT_LACCESS_BLACK,PCT_LACCESS_HISP,PCT_LACCESS_NHASIAN,PCT_LACCESS_NHNA,PCT_LACCESS_NHPI,PCT_LACCESS_MULTIR
0,0,1001,2015,17496.693038,6543.676824,677.672769,931.935186,4616.97148,2180.809285,12640.615414,...,4.608749,8.460485,3.996279,23.163613,7.726582,0.863345,0.159,0.112092,0.016159,0.884808
1,1,1003,2015,30561.26443,9886.831137,1394.162766,950.53529,7007.972675,5580.66279,25483.186811,...,1.2989,3.844936,3.06184,13.981393,1.942757,0.755973,0.116833,0.099662,0.008131,0.618712
2,2,1005,2015,6069.523628,2948.790251,425.144927,422.56904,1031.927776,824.175262,2828.876711,...,4.303147,3.758341,3.001695,10.302934,9.88136,1.855183,0.062266,0.145539,0.029436,1.684025
3,3,1007,2015,969.378841,596.162829,224.388125,53.818772,232.64267,137.688231,540.512239,...,0.67671,1.015242,0.600865,2.35877,1.828933,0.037516,0.008703,0.010967,0.0,0.022951
4,4,1009,2015,3724.428242,1650.959482,719.931444,175.370338,910.686113,505.914282,3387.241266,...,0.812727,1.58872,0.882583,5.909147,0.167201,0.867886,0.014705,0.050484,0.001854,0.35399


In [9]:
# Database credentials must not live in the notebook. The password is read
# from the ETL_DB_PASSWORD environment variable; when that is unset or empty
# the user is prompted, so nothing secret is saved with the notebook.
password = os.environ.get('ETL_DB_PASSWORD') or getpass.getpass(
    'MySQL password for root@localhost: '
)

# quote_plus keeps characters such as '@', ':' or '/' in the password from
# corrupting the connection URL.
connection_string = (
    f'mysql://root:{quote_plus(password)}@localhost:3306/etl_county_db'
)
engine = create_engine(connection_string)

# Alternative kept from the original cell: a local SQLite database.
# database_path = "etl_county_db"
# engine = create_engine(f"sqlite:///{database_path}")

In [34]:
# List the tables in the target database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API is the supported replacement.
inspect(engine).get_table_names()

['access']

In [35]:
# Load step: append the rows of access_df to the existing 'access' table.
# The DataFrame index is not written (index=False).
# NOTE(review): with if_exists='append', running this cell again inserts the
# same rows a second time -- confirm the table is emptied between loads.
access_df.to_sql('access', engine, if_exists='append', index=False)

In [36]:
# Sanity check: read rows back from the 'access' table to confirm the load.
pd.read_sql_query(
    sql='select * from access WHERE LACCESS_POP > 1000',
    con=engine,
)

Unnamed: 0,ID,FIPS,Year,LACCESS_POP,PCT_LACCESS_POP,LACCESS_LOWI,PCT_LACCESS_LOWI,LACCESS_HHNV,PCT_LACCESS_HHNV,LACCESS_CHILD,...,LACCESS_NHASIAN,PCT_LACCESS_NHASIAN,LACCESS_NHNA,PCT_LACCESS_NHNA,LACCESS_NHPI,PCT_LACCESS_NHPI,LACCESS_MULTIR,PCT_LACCESS_MULTIR,LACCESS_SNAP,PCT_LACCESS_SNAP
0,0,1001,2015,17496.693038,32.062255,6543.676824,11.991125,677.672769,3.351332,4616.971480,...,86.767975,0.159000,61.169869,0.112092,8.817961,0.016159,482.848633,0.884808,931.935186,4.608749
1,1,1003,2015,30561.264430,16.767489,9886.831137,5.424427,1394.162766,1.905114,7007.972675,...,212.946378,0.116833,181.649648,0.099662,14.819634,0.008131,1127.696098,0.618712,950.535290,1.298900
2,2,1005,2015,6069.523628,22.105560,2948.790251,10.739667,425.144927,4.329378,1031.927776,...,17.096410,0.062266,39.960527,0.145539,8.082376,0.029436,462.382655,1.684025,422.569040,4.303147
3,4,1009,2015,3724.428242,6.497380,1650.959482,2.880150,719.931444,3.336414,910.686113,...,8.428994,0.014705,28.938242,0.050484,1.062851,0.001854,202.914187,0.353990,175.370338,0.812727
4,5,1011,2015,4141.900365,37.950342,2154.789300,19.743351,415.861301,11.104441,616.715390,...,0.968748,0.008876,7.705288,0.070600,0.000000,0.000000,36.523171,0.334645,225.322525,6.016623
5,6,1013,2015,1304.328756,6.226805,612.934761,2.926122,320.168301,3.770678,264.140116,...,4.955498,0.023657,7.298759,0.034844,0.000000,0.000000,11.440987,0.054619,91.437795,1.076879
6,7,1015,2015,27785.986765,23.433852,10883.209969,9.178567,1235.289629,2.609895,6167.764487,...,235.071203,0.198252,118.296515,0.099768,21.961507,0.018522,886.573464,0.747709,1931.256358,4.080320
7,8,1017,2015,7131.505717,20.843214,3533.916216,10.328558,730.201067,5.240803,1595.353850,...,108.493406,0.317093,20.688146,0.060465,0.061101,0.000179,104.189620,0.304514,502.632028,3.607493
8,10,1021,2015,1469.899110,3.368007,637.419527,1.460531,641.110768,3.871909,337.049923,...,2.446086,0.005605,9.460583,0.021677,27.709124,0.063490,59.318065,0.135917,115.455204,0.697277
9,11,1023,2015,2702.895199,19.502815,1010.379176,7.290419,423.036624,7.211671,557.553869,...,0.000000,0.000000,5.000000,0.036078,0.000000,0.000000,8.868541,0.063991,360.071289,6.138276


In [None]:
# STORES sheet. Measures of interest noted for this sheet:
#   - groceries (2009, 2014)
#   - supercenters
#   - convenience_stores
#   - specialized_food_stores
stores = xls.parse('STORES')
stores.columns

In [None]:
# Preview the FIPS key together with the GROC / SUPERC / CONVS / SPECS
# columns (and their *PTH counterparts) for the 09 and 14 suffixes.
# A regex alternative for a single group would be:
#   stores.filter(regex=("^GROC+.*14$"))
# Columns considered but currently left out:
#   'SNAPS12', 'SNAPS16', 'SNAPSPTH12', 'SNAPSPTH16',
#   'WICS08', 'WICS12', 'WICSPTH08', 'WICSPTH12'
stores.loc[:, [
    'FIPS',
    'GROC09', 'GROC14', 'GROCPTH09', 'GROCPTH14',
    'SUPERC09', 'SUPERC14', 'SUPERCPTH09', 'SUPERCPTH14',
    'CONVS09', 'CONVS14', 'CONVSPTH09', 'CONVSPTH14',
    'SPECS09', 'SPECS14', 'SPECSPTH09', 'SPECSPTH14',
]]

In [None]:
# RESTAURANTS sheet. Measures of interest noted for this stage:
#   - fast_food
#   - full_service_restaurant
#   - farmers_market (2007, 2016)
restaurants = xls.parse('RESTAURANTS')
restaurants.head()

In [None]:
# HEALTH sheet (health related data). Measures of interest noted here:
#   - recreation_facilities (2009, 2013)
#   - obesity_rate (2008, 2013)
#   - diabetes_rate
health = xls.parse('HEALTH')
health.head()

In [None]:
# SOCIOECONOMIC sheet. Measure of interest noted here: poverty_rate (2015).
socioeconomic = xls.parse('SOCIOECONOMIC')
socioeconomic.head()

In [None]:
# crime rate related data

In [None]:
# Property value related data, read from a separate CSV. The file is decoded
# as ISO-8859-1.
file = "./data/County_MedianValuePerSqft_AllHomes.csv"
df = pd.read_csv(file, encoding='ISO-8859-1')

# Spot-check the rows for one county name.
# Note for joining to the other tables:
#   FIPS = 2 digit StateCodeFIPS + 3 digit MunicipalCodeFIPS
df.loc[df['RegionName'] == 'Blount County']