This notebook will pull other data that we plan to use for the CS 109 final projects. Planning on bringing in education, demographic, and economic data to help us predict crime

In [1]:
import pandas as pd
import urllib
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
"""
Function
---------------
split_MSA

This method takes in a dataframe with MSA and splits into a city_key (largest city)
and state_key. This will help facilitate MSA merging

Returns dataframe with these two additional features
"""
def split_MSA(df):
    df['MSA'] = df['MSA'].str.replace('Metro Area', '')
    # Need to manually fix how this MSA is written
    df.loc[df['MSA'].str.contains("Texarkana"), "MSA"] = "Texarkana, AR-TX"

    #Grab Everything before comma
    df['city_key'] = df['MSA'].str.split(",").str[0]
    # Then grab everything before first hyphen if it has it
    df['city_key'] = df['city_key'].str.split("-").str[0].str.strip()
    # State will be everying after comma 
    df['state_key']=df['MSA'].str.split(",").str[1].str.strip()
    return(df)

"""
Function
--------
append_df

This function appends two dataframes

Parameters:
    input - dataframe to be appended
    output - dataframe to be appended onto
    
Returns a single dataframe 
"""
def append_df(input,output):
    if output.empty:
        output=input.copy()
    else:
        output=pd.concat([output,input])
        output.reset_index(drop='Index',inplace=True)
    return(output)

'''
Function
-----------
var_thresh

This function takes in a dataframe and keeps only thos varaibles that have a pct
non-missing that is above that threshold
'''
def var_thresh(df, thresh=0.65):
    return(df.loc[:, pd.notnull(df).sum() > len (df) *thresh])

'''
Function
---------
slim_df

This function takes in a list of variables to keep
on the the given df. It keep the variables + geography
then renames to MSA and drops the first row of variable descriptions
'''
def slim_df(df, var_list):
    var_list.append('GEO.display-label')
    df = df.loc[:, var_list]
    df = df.rename(index=str, columns={'GEO.display-label': 'MSA'})
    df['MSA'] = df["MSA"].astype(str)
    # Drop first row of var descriptions
    df = df.loc[df.MSA != "Geography", :]
    # Split MSA into city-state key
    return(split_MSA(df))

In [3]:
'''
Function
---------
match_crime

This function will take in a dataframe and make changes to MSA
in order to match crime data
'''
def match_crime(df):
    df.loc[df['MSA'].str.contains('Crestview'),'city_key']='Crestview'
    df.loc[df['MSA'].str.contains('Sarasota'),'city_key']='North Port'
    df.loc[df['MSA'].str.contains('Louisville'),'city_key']='Louisville'
    df.loc[df['MSA'].str.contains('Santa Maria'),'city_key']='Santa Maria'
    df.loc[df['MSA'].str.contains('Weirton'),'city_key']='Weirton'
    df.loc[df['MSA'].str.contains('San Germán'),'city_key']='San German'
    df.loc[df['MSA'].str.contains('Mayagüez'),'city_key']='Mayaguez'
    df.loc[df['MSA'].str.contains('Honolulu'),'city_key']='Urban Honolulu'

    #State
    df.loc[df['MSA'].str.contains('Worcester'),'state_key']='MA-CT'
    df.loc[df['MSA'].str.contains('Myrtle Beach'),'state_key']='SC-NC'
    df.loc[df['MSA'].str.contains('Salisbury'),'state_key']='MD-DE'
    df.loc[df['MSA'].str.contains('Weirton'),'state_key']='WV-OH'
    return(df)

In [4]:
#####################
# Employment Data
#####################
emp_all = pd.DataFrame()
for year in range(2006, 2017):
    if year == 2006:
        f = 'ACS_06_EST_S2301'
    else:
        f = 'ACS_' + str(year)[2:] + '_1YR_S2301'
    employ = pd.read_csv("data/employ/%s.csv" %f, encoding='Latin-1')
    
    # Grab Unemployment
    un = [v for v in employ.columns if "HC04" in v and "EST" in v]
    employ = slim_df(employ, un)
    
    employ = employ.loc[:, ["MSA", "city_key", "state_key", 
                          "HC04_EST_VC01", "HC04_EST_VC03",
                         'HC04_EST_VC24']]
    employ['year'] = year
    emp_all = append_df(employ, emp_all) 

# Process Final DataFrame
emp_all = emp_all.sort_values(['city_key', 'state_key', 'year'])
emp_all = match_crime(emp_all)
del emp_all['MSA']
emp_all = emp_all.rename(index=str,
                        columns={'HC04_EST_VC01': 'unemp_16_ovr',
                                'HC04_EST_VC03': 'unemp_16_19',
                                'HC04_EST_VC24': 'unemp_female'})
emp_all.head()

Unnamed: 0,city_key,state_key,unemp_16_ovr,unemp_16_19,unemp_female,year
0,Abilene,TX,6.6,23.1,5.2,2006
367,Abilene,TX,3.2,7.0,3.8,2007
736,Abilene,TX,2.8,14.2,1.8,2008
1105,Abilene,TX,5.7,22.6,4.8,2009
1479,Abilene,TX,8.5,19.9,7.7,2010


In [5]:
############
# Age Data
############
age_all = pd.DataFrame()
for year in range(2006, 2017):
    if year == 2006:
        f = 'ACS_06_EST_S0101'
    else:
        f = 'ACS_' + str(year)[2:] + '_1YR_S0101'
    age = pd.read_csv("data/age/%s.csv" %f, encoding='Latin-1')
    age = slim_df(age, [v for v in age.columns if "EST" in v])
    age = age.replace("(X)", np.nan)

    age = age.loc[:, ['MSA','city_key','state_key',
                      'HC01_EST_VC33','HC01_EST_VC34',
                      'HC01_EST_VC01', 'HC02_EST_VC01',
                      'HC03_EST_VC01', 'HC01_EST_VC06',
                      'HC01_EST_VC07', 'HC02_EST_VC07']]
    age['year'] = year
    age_all = append_df(age, age_all) 


# Process Final DataFrame
age_all = age_all.sort_values(['city_key', 'state_key', 'year'])
age_all = age_all.rename(index=str,
                         columns={'HC01_EST_VC33':'median_age',
                                'HC01_EST_VC34': 'sex_ratio',
                                'HC01_EST_VC01': 'total_pop',
                                'HC02_EST_VC01': 'male_pop',
                                'HC03_EST_VC01': 'female_pop',
                                'HC01_EST_VC06': 'pop_15_19',
                                'HC01_EST_VC07': 'pop_20_24',
                                'HC02_EST_VC07': 'male_pop_20_24'})

# Convert to Int and Get Proportions
age_all[['total_pop', 'male_pop', 'female_pop']] = age_all[['total_pop', 'male_pop', 'female_pop']].astype(int)
age_all['male_pop'] = age_all['male_pop'] / age_all['total_pop']
age_all['female_pop'] = age_all['female_pop'] / age_all['total_pop']
del age_all['total_pop']
# Match Crime Data and then get rid of MSA
age_all = match_crime(age_all)
del age_all['MSA']
age_all.head()

Unnamed: 0,city_key,state_key,median_age,sex_ratio,male_pop,female_pop,pop_15_19,pop_20_24,male_pop_20_24,year
0,Abilene,TX,34.4,99.1,0.497717,0.502283,8.3,8.7,10.2,2006
367,Abilene,TX,34.9,99.1,0.497777,0.502223,9.5,7.7,8.6,2007
736,Abilene,TX,34.6,101.0,0.502381,0.497619,9.2,7.6,8.9,2008
1105,Abilene,TX,33.2,97.0,0.492269,0.507731,7.9,9.0,9.6,2009
1479,Abilene,TX,,,0.501355,0.498645,7.3,9.5,9.9,2010


In [6]:
###############
# Income Data
###############
inc_all = pd.DataFrame()
for year in range(2006, 2017):
    if year == 2006:
        f = 'ACS_06_EST_B19001F'
    else:
        f = 'ACS_' + str(year)[2:] + '_1YR_B19001F'
    inc = pd.read_csv("data/house_income/%s.csv" %f, encoding='Latin-1')
    # Keep only the estimates
    inc = slim_df(inc, [v for v in inc.columns if "HD01" in v])
    inc['year'] = year
    inc_all = append_df(inc, inc_all) 

# Proccess Final Data Frame
inc_all =  inc_all.rename(index=str,
                          columns={'HD01_VD01':'total',
                                  'HD01_VD02': 'inc_lt10',
                                  'HD01_VD03': 'inc_10_15',
                                  'HD01_VD04': 'inc_15_19',
                                  'HD01_VD05': 'inc_20_24',
                                  'HD01_VD06': 'inc_25_29',
                                  'HD01_VD07': 'inc_30_34',
                                  'HD01_VD08': 'inc_35_39',
                                  'HD01_VD09': 'inc_40_44',
                                  'HD01_VD10': 'inc_45_49',
                                  'HD01_VD11': 'inc_50_59',
                                  'HD01_VD12': 'inc_60_74',
                                  'HD01_VD13':'inc_75_99',
                                  'HD01_VD14':'inc_100_124',
                                  'HD01_VD15':'inc_125_149',
                                  'HD01_VD16':'inc_150_199',
                                  'HD01_VD17':'inc_gt_200'})

numeric_vars =  [v for v in inc_all.columns if "inc" in v]
inc_all[numeric_vars] = inc_all[numeric_vars].astype(int)
inc_all['total'] = inc_all['total'].astype(int)
# Get propotion of each imcome bracket by dividing by total
inc_all.loc[:, numeric_vars] = inc_all[numeric_vars].apply(lambda x: x / inc_all["total"])
del inc_all['total']
# Match Crime data and Get rid of MSA
inc_all = match_crime(inc_all)
del inc_all['MSA']
inc_all.head()

Unnamed: 0,inc_lt10,inc_10_15,inc_15_19,inc_20_24,inc_25_29,inc_30_34,inc_35_39,inc_40_44,inc_45_49,inc_50_59,inc_60_74,inc_75_99,inc_100_124,inc_125_149,inc_150_199,inc_gt_200,city_key,state_key,year
0,0.066006,0.039355,0.191044,0.142857,0.034552,0.051751,0.09498,0.044159,0.004803,0.138829,0.060738,0.099628,0.004029,0.006508,0.018748,0.002014,Abilene,TX,2006
1,0.042883,0.102766,0.078153,0.191829,0.056077,0.063436,0.136514,0.011165,0.071048,0.108094,0.062928,0.046435,0.025121,0.003552,0.0,0.0,Albany,NY,2006
2,0.111215,0.064565,0.078066,0.069276,0.056352,0.080003,0.071902,0.061101,0.054006,0.094082,0.10114,0.101661,0.034098,0.007486,0.009591,0.005456,Albuquerque,NM,2006
3,0.150833,0.063611,0.084792,0.084514,0.105903,0.057778,0.043542,0.024583,0.080417,0.095417,0.075556,0.068958,0.042639,0.0,0.021458,0.0,Allentown,PA-NJ,2006
4,0.084392,0.086299,0.077275,0.135104,0.043213,0.134596,0.061642,0.051856,0.037239,0.120742,0.05427,0.070285,0.011185,0.022496,0.0,0.009405,Amarillo,TX,2006


In [7]:
###############
# GINI INDEX
###############
gini_all = pd.DataFrame()
for year in range(2006, 2017):
    if year == 2006:
        f = 'ACS_06_EST_B19083'
    else:
        f = 'ACS_' + str(year)[2:] + '_1YR_B19083'
    gini = pd.read_csv("data/gini/%s.csv" %f, encoding='Latin-1')
    # Don't need micro areas
    micro_area = gini['GEO.display-label'].str.contains("Micro Area")
    gini = gini.loc[~micro_area, :]
    gini = slim_df(gini, ["HD01_VD01"])
    gini['year'] = year
    gini_all = append_df(gini, gini_all) 

# Clean Final Dataframes
gini_all = gini_all.rename(index=str,
                           columns={"HD01_VD01":"gini"})
gini_all['gini'] = gini_all['gini'].astype(float)
del gini_all['MSA']
gini_all.head()

Unnamed: 0,gini,city_key,state_key,year
0,0.443,Abilene,TX,2006
1,0.533,Aguadilla,PR,2006
2,0.445,Akron,OH,2006
3,0.481,Albany,GA,2006
4,0.405,Albany,NY,2006


In [8]:
#################################
# Head of Household Information
#################################


In [9]:
#################
# Education Data
#################
'''
edu = pd.read_csv("data/education/ACS_06_EST_B15001.csv", encoding='Latin-1')

# There are too many variables

# I am going to create 4 groupings. 
# male - no hs number, male - high school graduates and same for female
# Then I will sum up the variables to create aggregates above


# Going to get the variables for 9-12 no diploma for male and female
var_groups = {}
for g in ['Male', 'Female']:
    g_list = []
    for r in edu.columns:
        var_desc = edu[r].iloc[0]
        # Make sure gender mentioned and 9th grade and estimate
        if g in var_desc and '9th' in var_desc and 'HD01' in r:
            g_list.append(r)
    var_groups['no_hs_%s' %g] = g_list

print(var_groups)
'''

'\nedu = pd.read_csv("data/education/ACS_06_EST_B15001.csv", encoding=\'Latin-1\')\n\n# There are too many variables\n\n# I am going to create 4 groupings. \n# male - no hs number, male - high school graduates and same for female\n# Then I will sum up the variables to create aggregates above\n\n\n# Going to get the variables for 9-12 no diploma for male and female\nvar_groups = {}\nfor g in [\'Male\', \'Female\']:\n    g_list = []\n    for r in edu.columns:\n        var_desc = edu[r].iloc[0]\n        # Make sure gender mentioned and 9th grade and estimate\n        if g in var_desc and \'9th\' in var_desc and \'HD01\' in r:\n            g_list.append(r)\n    var_groups[\'no_hs_%s\' %g] = g_list\n\nprint(var_groups)\n'

In [10]:
############
# Race Data
############
race_all = pd.DataFrame()
for year in range(2006, 2017):
    if year == 2006:
        f = 'ACS_06_EST_B02001'
    else:
        f = 'ACS_' + str(year)[2:] + '_1YR_B02001'
    race = pd.read_csv("data/race/%s.csv" %f, encoding='Latin-1')
    race = slim_df(race, [v for v in race.columns if "HD01" in v])
    race = race.loc[:, ['MSA', 'city_key', 'state_key',
                       'HD01_VD01','HD01_VD02',
                       'HD01_VD03', 'HD01_VD05']]
    
    race['year'] = year
    race_all = append_df(race, race_all) 

# Proccess Final Data Frame
race_all =  race_all.rename(index=str,
                            columns={'HD01_VD01':'total',
                                    'HD01_VD02': 'white',
                                    'HD01_VD03': 'black',
                                    'HD01_VD05': 'asian'})

race_num_v = ['total', 'white','black','asian']
race_all[race_num_v] = race_all[race_num_v].astype(int)
race_all[race_num_v] = race_all[race_num_v].apply(lambda x: x / race_all["total"])
del race_all['total']
# Match Crime Data
race_all = match_crime(race_all)
race_all.head()

Unnamed: 0,MSA,city_key,state_key,white,black,asian,year
0,"Abilene, TX",Abilene,TX,0.741838,0.068295,0.014292,2006
1,"Aguadilla-Isabela-San Sebastián, PR",Aguadilla,PR,0.896527,0.019961,0.000758,2006
2,"Akron, OH",Akron,OH,0.844866,0.11688,0.017715,2006
3,"Albany, GA",Albany,GA,0.485957,0.494136,0.006355,2006
4,"Albany-Schenectady-Troy, NY",Albany,NY,0.867237,0.070216,0.030761,2006


In [11]:
# Bring Everything Together
census_df = race_all.copy()

merge_df = lambda df: census_df.merge(df,
                                     how='outer',
                                     on=['city_key','state_key','year'],
                                     indicator=True)

# Merge in Employment
census_df = merge_df(emp_all)

# Check to see if any non-matches that should be matches
# All left matched so we should be good
print("Employment Merge Stats")
print(census_df['_merge'].value_counts())
del census_df['_merge']

# Merge in House Income
census_df = merge_df(age_all)
print("Age Merge Stats")
print(census_df['_merge'].value_counts())
del census_df['_merge']

# Bring in INcome
census_df = merge_df(inc_all)
print("Income Merge Stats")
print(census_df['_merge'].value_counts())
del census_df['_merge']

# Merge in Gini
census_df = merge_df(gini_all)
print("Gini Merge Stats")
print("Blake checked these. no typos")
print(census_df['_merge'].value_counts())
del census_df['_merge']

Employment Merge Stats
both          4118
right_only      36
left_only        0
Name: _merge, dtype: int64
Age Merge Stats
both          4154
right_only       0
left_only        0
Name: _merge, dtype: int64
Income Merge Stats
left_only     3121
both          1033
right_only       0
Name: _merge, dtype: int64
Gini Merge Stats
Blake checked these. no typos
both          4079
right_only      75
left_only       75
Name: _merge, dtype: int64


In [12]:
float_cols = census_df.columns.difference(["MSA", "city_key", "state_key","year"])
census_df[float_cols] = census_df[float_cols].astype(float)
census_df.head()

Unnamed: 0,MSA,city_key,state_key,white,black,asian,year,unemp_16_ovr,unemp_16_19,unemp_female,...,inc_40_44,inc_45_49,inc_50_59,inc_60_74,inc_75_99,inc_100_124,inc_125_149,inc_150_199,inc_gt_200,gini
0,"Abilene, TX",Abilene,TX,0.741838,0.068295,0.014292,2006,6.6,23.1,5.2,...,0.044159,0.004803,0.138829,0.060738,0.099628,0.004029,0.006508,0.018748,0.002014,0.443
1,"Aguadilla-Isabela-San Sebastián, PR",Aguadilla,PR,0.896527,0.019961,0.000758,2006,20.6,56.8,19.1,...,,,,,,,,,,0.533
2,"Akron, OH",Akron,OH,0.844866,0.11688,0.017715,2006,6.3,23.0,4.8,...,,,,,,,,,,0.445
3,"Albany, GA",Albany,GA,0.485957,0.494136,0.006355,2006,10.0,31.0,10.1,...,,,,,,,,,,0.481
4,"Albany-Schenectady-Troy, NY",Albany,NY,0.867237,0.070216,0.030761,2006,5.4,16.8,4.4,...,0.011165,0.071048,0.108094,0.062928,0.046435,0.025121,0.003552,0.0,0.0,0.405


In [13]:
#####################
# Bring in BEA Data
#####################
bea_gdp = pd.read_csv("data/BEA_real_GDP_pc.csv",skiprows=[0,1,2], header=1)
del bea_gdp['Fips']
bea_gdp= bea_gdp.iloc[1:, :].rename(index=str, columns={"Area": 'MSA'})
bea_gdp = pd.melt(bea_gdp, id_vars=["MSA"], var_name='year', value_name='real_pc_gdp')
bea_gdp = bea_gdp.loc[bea_gdp.MSA.notnull(), :]
bea_gdp['year'] = bea_gdp['year'].astype(int)
bea_gdp = bea_gdp.loc[bea_gdp.year >= 2006, :]

# Get rid of MSA in paranthesis
bea_gdp['MSA'] = bea_gdp['MSA'].str.replace(r"\(.*\)","")
bea_gdp = split_MSA(bea_gdp)
del bea_gdp['MSA']

In [14]:
census_df = merge_df(bea_gdp)
print("Bea Merge Stats")
print(census_df['_merge'].value_counts())
#del census_df['_merge']
census_df = census_df.sort_values(["MSA"])
# Look at left right combinations

Bea Merge Stats
both          4004
left_only      225
right_only     198
Name: _merge, dtype: int64


In [15]:
# Check to make sure that there were no typos
names = census_df.loc[census_df._merge != "both", ['city_key', 'state_key', '_merge']]
names = names.drop_duplicates()
print(names.shape[0])
del census_df["_merge"]

62


In [16]:
census_df.to_json("output/census_df.json")
census_df.head()

Unnamed: 0,MSA,city_key,state_key,white,black,asian,year,unemp_16_ovr,unemp_16_19,unemp_female,...,inc_45_49,inc_50_59,inc_60_74,inc_75_99,inc_100_124,inc_125_149,inc_150_199,inc_gt_200,gini,real_pc_gdp
0,"Abilene, TX",Abilene,TX,0.741838,0.068295,0.014292,2006,6.6,23.1,5.2,...,0.004803,0.138829,0.060738,0.099628,0.004029,0.006508,0.018748,0.002014,0.443,33978.0
2208,"Abilene, TX",Abilene,TX,0.832574,0.07919,0.015352,2012,6.8,19.7,6.5,...,,,,,,,,,0.4851,35406.0
1835,"Abilene, TX",Abilene,TX,0.840689,0.085917,0.016213,2011,6.9,19.8,6.6,...,,,,,,,,,0.4344,33964.0
2963,"Abilene, TX",Abilene,TX,0.782534,0.073649,0.015273,2014,5.1,13.0,4.9,...,,,,,,,,,0.4626,39776.0
1466,"Abilene, TX",Abilene,TX,0.843283,0.074948,0.01021,2010,8.5,19.9,7.7,...,,,,,,,,,0.459,34004.0
