## This notebook restructures the census-data to work with the Lending Club files

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 75)

### Import and select the columns we need for our purpose, and add a column for the first 3 digits of the zip-code, as the Lending Club data is limited to this

In [2]:
df = pd.read_csv("../census_zipcode_level.csv")

In [3]:
filtered = df[['Zip', 'State', 'Population','Black','Median_household_inc', 'Households','No_Diploma_pct', 'High_School_pct',
       'Some_College_pct', 'Bachelors_Degree_pct', 'Graduate_Degree_pct','Family_Poverty_pct','Unemployment_Rate_pct']]

In [4]:
filtered.head()

Unnamed: 0,Zip,State,Population,Black,Median_household_inc,Households,No_Diploma_pct,High_School_pct,Some_College_pct,Bachelors_Degree_pct,Graduate_Degree_pct,Family_Poverty_pct,Unemployment_Rate_pct
0,35004,Alabama,10418,1657,61371,4225,6.7,29.6,42.2,14.3,7.2,5.9,4.9
1,35005,Alabama,7708,3130,46504,2927,16.7,36.8,31.7,9.3,5.4,14.9,9.9
2,35006,Alabama,3099,205,40664,1189,14.4,45.7,29.3,7.1,3.6,11.3,10.2
3,35007,Alabama,26630,3210,67794,8735,11.5,26.9,31.6,19.4,10.7,9.6,4.7
4,35010,Alabama,20826,7048,34865,7907,25.7,32.4,26.9,10.2,4.7,22.0,9.6


In [5]:
filtered['Zip3'] = filtered['Zip'].astype(str).str.slice(0,3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [6]:
filtered.head()

Unnamed: 0,Zip,State,Population,Black,Median_household_inc,Households,No_Diploma_pct,High_School_pct,Some_College_pct,Bachelors_Degree_pct,Graduate_Degree_pct,Family_Poverty_pct,Unemployment_Rate_pct,Zip3
0,35004,Alabama,10418,1657,61371,4225,6.7,29.6,42.2,14.3,7.2,5.9,4.9,350
1,35005,Alabama,7708,3130,46504,2927,16.7,36.8,31.7,9.3,5.4,14.9,9.9,350
2,35006,Alabama,3099,205,40664,1189,14.4,45.7,29.3,7.1,3.6,11.3,10.2,350
3,35007,Alabama,26630,3210,67794,8735,11.5,26.9,31.6,19.4,10.7,9.6,4.7,350
4,35010,Alabama,20826,7048,34865,7907,25.7,32.4,26.9,10.2,4.7,22.0,9.6,350


### The percentages in the cencus-data is for each individual zip-code. We need to transform this to each 3-digit zip-code. But, to ensure that the percentage actually is scaled to the population of the 3 digit zip, we need to convert the percentages into actual persons

In [7]:
percent = filtered[['Zip', 'Zip3','Population', 'State','No_Diploma_pct', 'High_School_pct','Some_College_pct', 'Bachelors_Degree_pct', 'Graduate_Degree_pct','Family_Poverty_pct','Unemployment_Rate_pct']]
percent = percent.replace(' -   ',0)
percent['No_Diploma'] = (percent['Population'] / 100 * percent['No_Diploma_pct'].astype('float')).astype('int')
percent['High_School'] = (percent['Population'] / 100 * percent['High_School_pct'].astype('float')).astype('int')
percent['Some_College'] = (percent['Population'] / 100 * percent['Some_College_pct'].astype('float')).astype('int')
percent['Bachelors_Degree'] = (percent['Population'] / 100 * percent['Bachelors_Degree_pct'].astype('float')).astype('int')
percent['Graduate_Degree'] = (percent['Population'] / 100 * percent['Graduate_Degree_pct'].astype('float')).astype('int')
percent['Family_Poverty'] = (percent['Population'] / 100 * percent['Family_Poverty_pct'].astype('float')).astype('int')
percent['Unemployment_Rate'] = (percent['Population'] / 100 * percent['Unemployment_Rate_pct'].astype('float')).astype('int')
percent = percent[['Zip','Zip3','Population','State','No_Diploma','High_School', 'Some_College','Bachelors_Degree','Graduate_Degree','Family_Poverty','Unemployment_Rate']]
percent.head()

Unnamed: 0,Zip,Zip3,Population,State,No_Diploma,High_School,Some_College,Bachelors_Degree,Graduate_Degree,Family_Poverty,Unemployment_Rate
0,35004,350,10418,Alabama,698,3083,4396,1489,750,614,510
1,35005,350,7708,Alabama,1287,2836,2443,716,416,1148,763
2,35006,350,3099,Alabama,446,1416,908,220,111,350,316
3,35007,350,26630,Alabama,3062,7163,8415,5166,2849,2556,1251
4,35010,350,20826,Alabama,5352,6747,5602,2124,978,4581,1999


### Here we add up for each 3 digit zip-code, according to the total population in that area

In [8]:
percent_zip3 = percent.groupby('Zip3').sum()
percent_zip3 = percent_zip3.drop(columns='Zip')
percent_zip3['No_Diploma_pct'] = percent_zip3['No_Diploma'] / percent_zip3['Population'] *100
percent_zip3['High_School_pct'] = percent_zip3['High_School'] / percent_zip3['Population'] *100
percent_zip3['Some_College_pct'] = percent_zip3['Some_College'] / percent_zip3['Population'] *100
percent_zip3['Bachelors_Degree_pct'] = percent_zip3['Bachelors_Degree'] / percent_zip3['Population'] *100
percent_zip3['Graduate_Degree_pct'] = percent_zip3['Graduate_Degree'] / percent_zip3['Population'] *100
percent_zip3['Family_Poverty_pct'] = percent_zip3['Family_Poverty'] / percent_zip3['Population'] *100
percent_zip3['Unemployment_Rate_pct'] = percent_zip3['Unemployment_Rate'] / percent_zip3['Population'] *100
percent_zip3 = percent_zip3[['Population','No_Diploma_pct','High_School_pct','Some_College_pct','Bachelors_Degree_pct','Graduate_Degree_pct','Family_Poverty_pct','Unemployment_Rate_pct']]
percent_zip3.head()

Unnamed: 0_level_0,Population,No_Diploma_pct,High_School_pct,Some_College_pct,Bachelors_Degree_pct,Graduate_Degree_pct,Family_Poverty_pct,Unemployment_Rate_pct
Zip3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100,1626010,13.812646,13.558773,15.215528,30.102951,27.301124,13.500163,7.332366
101,92971,6.963462,16.094266,14.772348,31.791634,30.366458,7.409837,5.879253
102,84350,7.184351,27.224659,26.350919,22.144635,17.05987,5.478364,6.394784
103,510347,10.817738,31.259124,26.24626,18.823859,12.831466,10.262233,6.063521
104,1479033,29.038703,27.420416,24.624738,12.462264,6.42866,27.767197,12.648129


### The same is done for each state. Convert the percentages to individuals, to avoid that zips with tiny populations count as much as the zips with large populations

In [9]:
percent_state = percent.groupby('State').sum()
percent_state = percent_state.drop(columns=['Zip'])
percent_state = percent_state.reset_index()
percent_state = percent_state.loc[~percent_state['State'].str.contains(';')]
percent_state = percent_state.set_index('State')
percent_state['No_Diploma_pct_st'] = percent_state['No_Diploma'] / percent_state['Population'] *100
percent_state['High_School_pct_st'] = percent_state['High_School'] / percent_state['Population'] *100
percent_state['Some_College_pct_st'] = percent_state['Some_College'] / percent_state['Population'] *100
percent_state['Bachelors_Degree_pct_st'] = percent_state['Bachelors_Degree'] / percent_state['Population'] *100
percent_state['Graduate_Degree_pct_st'] = percent_state['Graduate_Degree'] / percent_state['Population'] *100
percent_state['Family_Poverty_pct_st'] = percent_state['Family_Poverty'] / percent_state['Population'] *100
percent_state['Unemployment_Rate_pct_st'] = percent_state['Unemployment_Rate'] / percent_state['Population'] *100
percent_state = percent_state[['Population','No_Diploma_pct_st','High_School_pct_st','Some_College_pct_st','Bachelors_Degree_pct_st','Graduate_Degree_pct_st','Family_Poverty_pct_st','Unemployment_Rate_pct_st']]
percent_state.head()

Unnamed: 0_level_0,Population,No_Diploma_pct_st,High_School_pct_st,Some_College_pct_st,Bachelors_Degree_pct_st,Graduate_Degree_pct_st,Family_Poverty_pct_st,Unemployment_Rate_pct_st
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alabama,4835661,15.153978,30.876854,29.790302,15.150028,8.96713,14.425267,8.562449
Alaska,736161,7.761753,27.828831,35.882776,18.18719,10.296117,7.315383,8.287182
Arizona,6687474,14.4836,24.326824,33.876977,17.159155,10.144967,13.849325,8.243486
Arkansas,2959728,14.85694,34.426238,28.961242,13.993752,7.716554,13.99338,7.025342
California,38640444,18.75348,20.905306,29.490243,19.42458,11.413445,12.363792,8.960016


In [10]:
black_zip3 = filtered[['Population','Black','Zip3']].groupby('Zip3').sum()
black_zip3['% Black'] = black_zip3['Black'] / black_zip3['Population'] *100
black_zip3.head()

Unnamed: 0_level_0,Population,Black,% Black
Zip3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,1626010,204370,12.568803
101,92971,3083,3.316088
102,84350,2839,3.365738
103,510347,44943,8.806361
104,1479033,422330,28.554468


### Here the median income is calculated for each 3 digit zip-zone and joined with the rest of the data for each 3 digit zip

In [11]:
median_households_zip3 = filtered[['Zip3','Households', 'Median_household_inc']]
median_households_zip3 = median_households_zip3[median_households_zip3['Median_household_inc'] > 0]
median_households_zip3['Total'] = median_households_zip3['Households'] * median_households_zip3['Median_household_inc']
median_households_zip3 = median_households_zip3.groupby('Zip3').sum()
median_households_zip3['Median income'] = median_households_zip3['Total'] / median_households_zip3['Households']
median_households_zip3 = median_households_zip3.drop(columns=['Median_household_inc','Total'])
median_households_zip3.head()

Unnamed: 0_level_0,Households,Median income
Zip3,Unnamed: 1_level_1,Unnamed: 2_level_1
100,734645,81159.651507
101,44367,90385.881534
102,35646,87417.430146
103,180856,73993.719373
104,509233,37744.630317


In [12]:
zip3 = black_zip3.join(median_households_zip3)
zip3 = pd.concat([zip3, percent_zip3], axis=1, sort=False)
zip3 = zip3.reset_index()
zip3.head()

Unnamed: 0,Zip3,Population,Black,% Black,Households,Median income,Population.1,No_Diploma_pct,High_School_pct,Some_College_pct,Bachelors_Degree_pct,Graduate_Degree_pct,Family_Poverty_pct,Unemployment_Rate_pct
0,100,1626010,204370,12.568803,734645.0,81159.651507,1626010,13.812646,13.558773,15.215528,30.102951,27.301124,13.500163,7.332366
1,101,92971,3083,3.316088,44367.0,90385.881534,92971,6.963462,16.094266,14.772348,31.791634,30.366458,7.409837,5.879253
2,102,84350,2839,3.365738,35646.0,87417.430146,84350,7.184351,27.224659,26.350919,22.144635,17.05987,5.478364,6.394784
3,103,510347,44943,8.806361,180856.0,73993.719373,510347,10.817738,31.259124,26.24626,18.823859,12.831466,10.262233,6.063521
4,104,1479033,422330,28.554468,509233.0,37744.630317,1479033,29.038703,27.420416,24.624738,12.462264,6.42866,27.767197,12.648129


### Again, here is data for each state as a whole calculated

In [13]:
black_state = filtered[['Population','Black','State']]
black_state = black_state[~black_state['State'].str.contains(';')]
black_state = black_state.groupby('State').sum()
black_state['% Black_st'] = black_state['Black'] / black_state['Population'] *100
black_state.head()

Unnamed: 0_level_0,Population,Black,% Black_st
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,4835661,1275390,26.374678
Alaska,736161,22982,3.121871
Arizona,6687474,269639,4.032001
Arkansas,2959728,457169,15.446318
California,38640444,2156986,5.582198


In [14]:
median_households_state = filtered[['State','Households', 'Median_household_inc']]
median_households_state = median_households_state[median_households_state['Median_household_inc'] > 0]
median_households_state['Total'] = median_households_state['Households'] * median_households_state['Median_household_inc']
median_households_state = median_households_state[~median_households_state['State'].str.contains(';')]
median_households_state = median_households_state.groupby('State').sum()
median_households_state['Median income_st'] = median_households_state['Total'] / median_households_state['Households']
median_households_state = median_households_state.drop(columns=['Median_household_inc','Total'])
median_households_state.head()

Unnamed: 0_level_0,Households,Median income_st
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,1846735,47208.240899
Alaska,249796,76199.876611
Arizona,2435443,54770.912528
Arkansas,1135309,43922.248236
California,12797642,69526.556889


### We find the percentage of blacks in the whole country so we can compare with the individual areas

In [15]:
black = int(filtered[['Black']].sum())
population = int(filtered[['Population']].sum())
blacks_usa = black / population *100

In [16]:
state = pd.merge(median_households_state, black_state, on='State')
state = pd.merge(state, percent_state, on='State')
state.head()

Unnamed: 0_level_0,Households,Median income_st,Population_x,Black,% Black_st,Population_y,No_Diploma_pct_st,High_School_pct_st,Some_College_pct_st,Bachelors_Degree_pct_st,Graduate_Degree_pct_st,Family_Poverty_pct_st,Unemployment_Rate_pct_st
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Alabama,1846735,47208.240899,4835661,1275390,26.374678,4835661,15.153978,30.876854,29.790302,15.150028,8.96713,14.425267,8.562449
Alaska,249796,76199.876611,736161,22982,3.121871,736161,7.761753,27.828831,35.882776,18.18719,10.296117,7.315383,8.287182
Arizona,2435443,54770.912528,6687474,269639,4.032001,6687474,14.4836,24.326824,33.876977,17.159155,10.144967,13.849325,8.243486
Arkansas,1135309,43922.248236,2959728,457169,15.446318,2959728,14.85694,34.426238,28.961242,13.993752,7.716554,13.99338,7.025342
California,12797642,69526.556889,38640444,2156986,5.582198,38640444,18.75348,20.905306,29.490243,19.42458,11.413445,12.363792,8.960016


### All the data is joined together, and compared. We compare the following:

- Does the 3 digit zip-code have a higher percentage of blacks than in the state the zip-code is located?
- Does the 3 digit zip-code have a higher percentage of blacks than in the country as a whole?
- Does the 3 digit zip-code have a higher percentage of persons with bacherlor degrees than in the state the zip-code is located?
- Does the 3 digit zip-code have a higher percentage of persons with graduate degrees than in the state the zip-code is located?
- Does the 3 digit zip-code have a higher percentage of persons with high-school degrees than in the state the zip-code is located?
- Does the 3 digit zip-code have a higher percentage of persons with no diploma than in the state the zip-code is located?
- Does the 3 digit zip-code have a higher percentage of persons with some college than in the state the zip-code is located?
- Does the 3 digit zip-code have a higher percentage of unemployment than in the state the zip-code is located?
- Does the 3 digit zip-code have a higher median income than in the state the zip-code is located?
- Does the 3 digit zip-code have a higher percentage families living in powerty than in the state the zip-code is located?

In [17]:
Census_OneHotEncoded = filtered[['Zip3','State']]
Census_OneHotEncoded = Census_OneHotEncoded.drop_duplicates()
Census_OneHotEncoded = pd.merge(Census_OneHotEncoded, zip3, on='Zip3')
Census_OneHotEncoded = pd.merge(Census_OneHotEncoded, state, on='State')
Census_OneHotEncoded = Census_OneHotEncoded.drop(['Population','Population_x','Black_x','Black_y','Households_x','Households_y'], axis=1)
Census_OneHotEncoded = Census_OneHotEncoded.reindex(columns=sorted(Census_OneHotEncoded.columns))
Census_OneHotEncoded = Census_OneHotEncoded.set_index('Zip3')
Census_OneHotEncoded['Higher % blacks than state'] = Census_OneHotEncoded['% Black'] > Census_OneHotEncoded['% Black_st']
Census_OneHotEncoded['Higher % blacks than country'] = Census_OneHotEncoded['% Black'] > blacks_usa
Census_OneHotEncoded['Higher % bachelors degrees than state'] = Census_OneHotEncoded['Bachelors_Degree_pct'] > Census_OneHotEncoded['Bachelors_Degree_pct_st']
Census_OneHotEncoded['Higher % graduate degrees than state'] = Census_OneHotEncoded['Graduate_Degree_pct'] > Census_OneHotEncoded['Graduate_Degree_pct_st']
Census_OneHotEncoded['Higher % high school degree than state'] = Census_OneHotEncoded['High_School_pct'] > Census_OneHotEncoded['High_School_pct_st']
Census_OneHotEncoded['Higher % no diploma than state'] = Census_OneHotEncoded['No_Diploma_pct'] > Census_OneHotEncoded['No_Diploma_pct_st']
Census_OneHotEncoded['Higher % some college than state'] = Census_OneHotEncoded['Some_College_pct'] > Census_OneHotEncoded['Some_College_pct_st']
Census_OneHotEncoded['Higher % unemployment than state'] = Census_OneHotEncoded['Unemployment_Rate_pct'] > Census_OneHotEncoded['Unemployment_Rate_pct_st']
Census_OneHotEncoded['Higher median income than state'] = Census_OneHotEncoded['Median income'] > Census_OneHotEncoded['Median income_st']
Census_OneHotEncoded['Higher % powerty than state'] = Census_OneHotEncoded['Family_Poverty_pct'] > Census_OneHotEncoded['Family_Poverty_pct_st']
Census_OneHotEncoded = Census_OneHotEncoded.drop(['% Black','% Black_st','Bachelors_Degree_pct','Bachelors_Degree_pct_st','Family_Poverty_pct','Family_Poverty_pct_st','Graduate_Degree_pct','Graduate_Degree_pct_st','High_School_pct','High_School_pct_st','Median income','Median income_st','No_Diploma_pct','No_Diploma_pct_st','Population_y','Some_College_pct','Some_College_pct_st','State','Unemployment_Rate_pct','Unemployment_Rate_pct_st'], axis=1)
Census_OneHotEncoded.head()

Unnamed: 0_level_0,Higher % blacks than state,Higher % blacks than country,Higher % bachelors degrees than state,Higher % graduate degrees than state,Higher % high school degree than state,Higher % no diploma than state,Higher % some college than state,Higher % unemployment than state,Higher median income than state,Higher % powerty than state
Zip3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
350,False,True,False,False,True,True,True,False,True,False
351,False,True,False,False,True,True,True,False,True,False
352,True,True,True,True,False,False,False,False,True,True
354,True,True,True,True,True,False,False,False,False,True
355,False,False,False,False,True,True,False,True,False,True


In [18]:
Census_OneHotEncoded.to_csv("../data/census_one_hot_encoded.csv")