This notebook pulls the additional data that we plan to use for the CS 109 final projects. We bring in education, demographic, and economic data to help us predict crime.

In [1]:
import pandas as pd
import urllib
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [38]:
def split_MSA(df):
    """
    Split the ``MSA`` column into a ``city_key`` and a ``state_key``.

    This facilitates merging data sets on MSA. The steps are:

    * the literal text 'Metro Area' is removed from ``MSA``;
    * every row whose ``MSA`` mentions "Texarkana" is rewritten to
      "Texarkana, AR-TX";
    * ``city_key`` is the text before the first comma, cut at the first
      hyphen (i.e. the first-listed city) and stripped of whitespace;
    * ``state_key`` is the text between the first and second comma,
      stripped of whitespace.

    Parameters
    ----------
    df : DataFrame
        Must contain a string column named ``MSA``.

    Returns
    -------
    DataFrame
        A copy of ``df`` with the cleaned ``MSA`` column and the two
        additional key columns. The frame passed in is left unmodified.
    """
    # Work on a copy: callers typically pass a ``.loc`` slice, and writing
    # into it would mutate (or ambiguously not mutate) the parent frame.
    df = df.copy()

    df['MSA'] = df['MSA'].str.replace('Metro Area', '')

    # Need to manually fix how this MSA is written.
    # na=False keeps the mask strictly boolean when MSA has missing values;
    # a mask containing NaN cannot be used with .loc.
    is_texarkana = df['MSA'].str.contains("Texarkana", na=False)
    df.loc[is_texarkana, "MSA"] = "Texarkana, AR-TX"

    comma_parts = df['MSA'].str.split(",")
    # Everything before the comma, then everything before the first hyphen.
    df['city_key'] = comma_parts.str[0].str.split("-").str[0].str.strip()
    # The state part is the piece right after the first comma.
    df['state_key'] = comma_parts.str[1].str.strip()
    return df

def append_df(input,output):
    """
    Stack ``input`` underneath ``output`` and return the combined frame.

    Parameters:
        input - dataframe to be appended
        output - dataframe to be appended onto

    Returns a single dataframe. When ``output`` is empty the result is a
    copy of ``input`` with its index untouched; otherwise the rows of
    ``output`` come first, followed by those of ``input``, and the index is
    renumbered from zero.
    """
    # Nothing accumulated yet: start the running frame from the new rows.
    if output.empty:
        return input.copy()

    # ignore_index renumbers the rows, replacing the separate reset step.
    stacked = pd.concat([output, input], ignore_index=True)
    return stacked

def var_thresh(df, thresh=0.65):
    """
    Keep only the well-populated columns of a dataframe.

    A column is retained when its count of non-missing values is strictly
    greater than ``len(df) * thresh``; all rows are returned unchanged.

    Parameters:
        df - dataframe to filter
        thresh - required fraction of non-missing values (default 0.65)

    Returns the dataframe restricted to the columns that pass.
    """
    filled_counts = df.notnull().sum()
    required = len(df) * thresh
    keep = filled_counts > required
    return df.loc[:, keep]

In [43]:
# PROCESS EMPLOYMENT
# Reads the yearly S2301 extracts for 2006-2016 from data/employ/, keeps three
# HC04 estimate columns per MSA, stacks the years into one long table and
# writes it to output/employment.json.
unemp_codes = ["HC04_EST_VC01", "HC04_EST_VC03", 'HC04_EST_VC24']

# Collect the yearly frames and concatenate once after the loop instead of
# re-copying the growing table on every iteration.
emp_frames = []
for year in range(2006, 2017):
    # The 2006 file follows a different naming scheme than the later years.
    if year == 2006:
        f = 'ACS_06_EST_S2301'
    else:
        f = 'ACS_' + str(year)[2:] + '_1YR_S2301'
    employ = pd.read_csv("data/employ/%s.csv" % f, encoding='Latin-1')

    employ = employ.rename(columns={'GEO.display-label': 'MSA'})
    # Drop rows labelled "Geography" (presumably the descriptive second header
    # row of the extract -- confirm against the raw files).
    employ = employ.loc[employ.MSA != "Geography", :]

    # Split MSA into city-state key
    employ = split_MSA(employ)

    # Grab unemployment; selecting the explicit columns makes a prior
    # "every HC04 estimate" selection unnecessary.
    employ = employ.loc[:, ["MSA", "city_key", "state_key"] + unemp_codes]
    emp_frames.append(employ.assign(year=year))

# Process Final DataFrame
emp_all = pd.concat(emp_frames, ignore_index=True)
emp_all = emp_all.sort_values(['city_key', 'state_key', 'year'])
# NOTE(review): the estimate columns are written out as read from the CSV;
# they are likely strings rather than numbers -- verify before modelling.
emp_all = emp_all.rename(index=str,
                         columns={'HC04_EST_VC01': 'unemp_16_ovr',
                                  'HC04_EST_VC03': 'unemp_16_19',
                                  'HC04_EST_VC24': 'unemp_female'})
emp_all.to_json("output/employment.json")

# Preview goes last so the notebook actually renders it.
emp_all.head()

In [49]:
# Age Data
# Reads the yearly S0101 extracts for 2006-2016 from data/age/, keeps a fixed
# set of estimate columns per MSA, stacks the years into one long table and
# converts the male/female population counts into shares of total population.
age_codes = ['HC01_EST_VC33', 'HC01_EST_VC34',
             'HC01_EST_VC01', 'HC02_EST_VC01',
             'HC03_EST_VC01', 'HC01_EST_VC06',
             'HC01_EST_VC07', 'HC02_EST_VC07']

# Collect the yearly frames and concatenate once after the loop instead of
# re-copying the growing table on every iteration.
age_frames = []
for year in range(2006, 2017):
    # The 2006 file follows a different naming scheme than the later years.
    if year == 2006:
        f = 'ACS_06_EST_S0101'
    else:
        f = 'ACS_' + str(year)[2:] + '_1YR_S0101'
    age = pd.read_csv("data/age/%s.csv" % f, encoding='Latin-1')

    age = age.rename(columns={'GEO.display-label': 'MSA'})
    # Drop rows labelled "Geography" (presumably the descriptive second header
    # row of the extract -- confirm against the raw files).
    age = age.loc[age.MSA != "Geography", :]
    # "(X)" cells are treated as missing.
    age = age.replace("(X)", np.nan)

    # Selecting the explicit columns makes a prior "every estimate column"
    # selection unnecessary.
    age = age.loc[:, ['MSA'] + age_codes]
    age = split_MSA(age)
    age_frames.append(age.assign(year=year))


# Process Final DataFrame
age_all = pd.concat(age_frames, ignore_index=True)
age_all = age_all.sort_values(['city_key', 'state_key', 'year'])
age_all = age_all.rename(index=str,
                         columns={'HC01_EST_VC33': 'median_age',
                                  'HC01_EST_VC34': 'sex_ratio',
                                  'HC01_EST_VC01': 'total_pop',
                                  'HC02_EST_VC01': 'male_pop',
                                  'HC03_EST_VC01': 'female_pop',
                                  'HC01_EST_VC06': 'pop_15_19',
                                  'HC01_EST_VC07': 'pop_20_24',
                                  'HC02_EST_VC07': 'male_pop_20_24'})

# astype(int) raises if any of these counts is missing or non-numeric, so a
# bad input file fails loudly here rather than producing silent NaN shares.
count_cols = ['total_pop', 'male_pop', 'female_pop']
age_all[count_cols] = age_all[count_cols].astype(int)

# From here on male_pop / female_pop hold shares of total population, not
# raw counts; total_pop is only needed as the denominator and is dropped.
age_all['male_pop'] = age_all['male_pop'] / age_all['total_pop']
age_all['female_pop'] = age_all['female_pop'] / age_all['total_pop']
age_all = age_all.drop('total_pop', axis=1)
age_all.head()

Unnamed: 0,MSA,median_age,sex_ratio,male_pop,female_pop,pop_15_19,pop_20_24,male_pop_20_24,city_key,state_key,year
0,"Abilene, TX",34.4,99.1,0.497717,0.502283,8.3,8.7,10.2,Abilene,TX,2006
367,"Abilene, TX",34.9,99.1,0.497777,0.502223,9.5,7.7,8.6,Abilene,TX,2007
736,"Abilene, TX",34.6,101.0,0.502381,0.497619,9.2,7.6,8.9,Abilene,TX,2008
1105,"Abilene, TX",33.2,97.0,0.492269,0.507731,7.9,9.0,9.6,Abilene,TX,2009
1479,"Abilene, TX",,,0.501355,0.498645,7.3,9.5,9.9,Abilene,TX,2010
