This notebook will pull other data that we plan to use for the CS 109 final projects. Planning on bringing in education, demographic, and economic data to help us predict crime

In [1]:
import pandas as pd
import urllib
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [10]:
"""
Function
---------------
split_MSA

This method takes in a dataframe with MSA and splits into a city_key (largest city)
and state_key. This will help facilitate MSA merging

Returns dataframe with these two additional features
"""
def split_MSA(df):
    df['MSA'] = df['MSA'].str.replace('Metro Area', '')
    # Need to manually fix how this MSA is written
    df.loc[df['MSA'].str.contains("Texarkana"), "MSA"] = "Texarkana, AR-TX"

    #Grab Everything before comma
    df['city_key'] = df['MSA'].str.split(",").str[0]
    # Then grab everything before first hyphen if it has it
    df['city_key'] = df['city_key'].str.split("-").str[0].str.strip()
    # State will be everying after comma 
    df['state_key']=df['MSA'].str.split(",").str[1].str.strip()
    return(df)

"""
Function
--------
append_df

This function appends two dataframes

Parameters:
    input - dataframe to be appended
    output - dataframe to be appended onto
    
Returns a single dataframe 
"""
def append_df(input,output):
    if output.empty:
        output=input.copy()
    else:
        output=pd.concat([output,input])
        output.reset_index(drop='Index',inplace=True)
    return(output)

'''
Function
-----------
var_thresh

This function takes in a dataframe and keeps only thos varaibles that have a pct
non-missing that is above that threshold
'''
def var_thresh(df, thresh=0.65):
    return(df.loc[:, pd.notnull(df).sum() > len (df) *thresh])

'''
Function
---------
slim_df

This function takes in a list of variables to keep
on the the given df. It keep the variables + geography
then renames to MSA and drops the first row of variable descriptions
'''
def slim_df(df, var_list):
    var_list.append('GEO.display-label')
    df = df.loc[:, var_list]
    df = df.rename(index=str, columns={'GEO.display-label': 'MSA'})
    # Drop first row of var descriptions
    df = df.loc[df.MSA != "Geography", :]
    # Split MSA into city-state key
    return(split_MSA(df))

In [11]:
# PROCESS EMPLOYMENT
emp_all = pd.DataFrame()
for year in range(2006, 2017):
    if year == 2006:
        f = 'ACS_06_EST_S2301'
    else:
        f = 'ACS_' + str(year)[2:] + '_1YR_S2301'
    employ = pd.read_csv("data/employ/%s.csv" %f, encoding='Latin-1')
    
    # Grab Unemployment
    un = [v for v in employ.columns if "HC04" in v and "EST" in v]
    employ = slim_df(employ, un)
    
    employ = employ.loc[:, ["MSA", "city_key", "state_key", 
                          "HC04_EST_VC01", "HC04_EST_VC03",
                         'HC04_EST_VC24']]
    employ['year'] = year
    emp_all = append_df(employ, emp_all) 

# Process Final DataFrame
emp_all = emp_all.sort_values(['city_key', 'state_key', 'year'])
emp_all = emp_all.rename(index=str,
                        columns={'HC04_EST_VC01': 'unemp_16_ovr',
                                'HC04_EST_VC03': 'unemp_16_19',
                                'HC04_EST_VC24': 'unemp_female'})
emp_all.head()
emp_all.to_json("output/employment.json")

In [12]:
# Age Data
age_all = pd.DataFrame()
for year in range(2006, 2017):
    if year == 2006:
        f = 'ACS_06_EST_S0101'
    else:
        f = 'ACS_' + str(year)[2:] + '_1YR_S0101'
    age = pd.read_csv("data/age/%s.csv" %f, encoding='Latin-1')
    age = slim_df(age, [v for v in age.columns if "EST" in v])
    age = age.replace("(X)", np.nan)

    age = age.loc[:, ['MSA','city_key','state_key',
                      'HC01_EST_VC33','HC01_EST_VC34',
                      'HC01_EST_VC01', 'HC02_EST_VC01',
                      'HC03_EST_VC01', 'HC01_EST_VC06',
                      'HC01_EST_VC07', 'HC02_EST_VC07']]
    age['year'] = year
    age_all = append_df(age, age_all) 


# Process Final DataFrame
age_all = age_all.sort_values(['city_key', 'state_key', 'year'])
age_all = age_all.rename(index=str,
                         columns={'HC01_EST_VC33':'median_age',
                                'HC01_EST_VC34': 'sex_ratio',
                                'HC01_EST_VC01': 'total_pop',
                                'HC02_EST_VC01': 'male_pop',
                                'HC03_EST_VC01': 'female_pop',
                                'HC01_EST_VC06': 'pop_15_19',
                                'HC01_EST_VC07': 'pop_20_24',
                                'HC02_EST_VC07': 'male_pop_20_24'})

age_all[['total_pop', 'male_pop', 'female_pop']] = age_all[['total_pop', 'male_pop', 'female_pop']].astype(int)

age_all['male_pop'] = age_all['male_pop'] / age_all['total_pop']
age_all['female_pop'] = age_all['female_pop'] / age_all['total_pop']
del age_all['total_pop']
age_all.head()

Unnamed: 0,MSA,city_key,state_key,median_age,sex_ratio,male_pop,female_pop,pop_15_19,pop_20_24,male_pop_20_24,year
0,"Abilene, TX",Abilene,TX,34.4,99.1,0.497717,0.502283,8.3,8.7,10.2,2006
367,"Abilene, TX",Abilene,TX,34.9,99.1,0.497777,0.502223,9.5,7.7,8.6,2007
736,"Abilene, TX",Abilene,TX,34.6,101.0,0.502381,0.497619,9.2,7.6,8.9,2008
1105,"Abilene, TX",Abilene,TX,33.2,97.0,0.492269,0.507731,7.9,9.0,9.6,2009
1479,"Abilene, TX",Abilene,TX,,,0.501355,0.498645,7.3,9.5,9.9,2010


In [30]:
inc_all = pd.DataFrame()
for year in range(2006, 2017):
    if year == 2006:
        f = 'ACS_06_EST_B19001F'
    else:
        f = 'ACS_' + str(year)[2:] + '_1YR_B19001F'
    inc = pd.read_csv("data/house_income/%s.csv" %f, encoding='Latin-1')
    # Keep only the estimates
    inc = slim_df(inc, [v for v in inc.columns if "HD01" in v])
    inc_all = append_df(inc, inc_all) 

# Proccess Final Data Frame
inc_all =  inc_all.rename(index=str,
                          columns={'HD01_VD01':'total',
                                  'HD01_VD02': 'inc_lt10',
                                  'HD01_VD03': 'inc_10_15',
                                  'HD01_VD04': 'inc_15_19',
                                  'HD01_VD05': 'inc_20_24',
                                  'HD01_VD06': 'inc_25_29',
                                  'HD01_VD07': 'inc_30_34',
                                  'HD01_VD08': 'inc_35_39',
                                  'HD01_VD09': 'inc_40_44',
                                  'HD01_VD10': 'inc_45_49',
                                  'HD01_VD11': 'inc_50_59',
                                  'HD01_VD12': 'inc_60_74',
                                  'HD01_VD13':'inc_75_99',
                                  'HD01_VD14':'inc_100_124',
                                  'HD01_VD15':'inc_125_149',
                                  'HD01_VD16':'inc_150_199',
                                  'HD01_VD17':'inc_gt_200'})

numeric_vars =  [v for v in inc_all.columns if "inc" in v]
inc_all[numeric_vars] = inc_all[numeric_vars].astype(int)
inc_all['total'] = inc_all['total'].astype(int)
# Get propotion of each imcome bracket by dividing by total
inc_all.loc[:, numeric_vars] = inc_all[numeric_vars].apply(lambda x: x / inc_all["total"])
del inc_all['total']
inc_all.head()

Unnamed: 0,inc_lt10,inc_10_15,inc_15_19,inc_20_24,inc_25_29,inc_30_34,inc_35_39,inc_40_44,inc_45_49,inc_50_59,inc_60_74,inc_75_99,inc_100_124,inc_125_149,inc_150_199,inc_gt_200,MSA,city_key,state_key
0,0.066006,0.039355,0.191044,0.142857,0.034552,0.051751,0.09498,0.044159,0.004803,0.138829,0.060738,0.099628,0.004029,0.006508,0.018748,0.002014,"Abilene, TX",Abilene,TX
1,0.042883,0.102766,0.078153,0.191829,0.056077,0.063436,0.136514,0.011165,0.071048,0.108094,0.062928,0.046435,0.025121,0.003552,0.0,0.0,"Albany-Schenectady-Troy, NY",Albany,NY
2,0.111215,0.064565,0.078066,0.069276,0.056352,0.080003,0.071902,0.061101,0.054006,0.094082,0.10114,0.101661,0.034098,0.007486,0.009591,0.005456,"Albuquerque, NM",Albuquerque,NM
3,0.150833,0.063611,0.084792,0.084514,0.105903,0.057778,0.043542,0.024583,0.080417,0.095417,0.075556,0.068958,0.042639,0.0,0.021458,0.0,"Allentown-Bethlehem-Easton, PA-NJ",Allentown,PA-NJ
4,0.084392,0.086299,0.077275,0.135104,0.043213,0.134596,0.061642,0.051856,0.037239,0.120742,0.05427,0.070285,0.011185,0.022496,0.0,0.009405,"Amarillo, TX",Amarillo,TX
