In [10]:
import pandas as pd 
import numpy as np

### Importing the dataframe

In [11]:
# Read fd_2010_cleaned.csv and discard its 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv call).
fd_2010_cleaned = pd.read_csv("fd_2010_cleaned.csv").drop(columns=['Unnamed: 0'])

In [12]:
# Preview the first rows to check the load and the column drop.
fd_2010_cleaned.head()

Unnamed: 0,CensusTract,State,County,LILATracts_1And10,Urban
0,1001020100,AL,Autauga,0,1
1,1001020200,AL,Autauga,0,1
2,1001020300,AL,Autauga,0,1
3,1001020400,AL,Autauga,0,1
4,1001020500,AL,Autauga,0,1


Pad `CensusTract` with leading zeros to form an 11-digit unique code for merging

In [13]:
# Cast the tract identifier to text, then left-pad it with zeros to 11 characters.
fd_2010_cleaned['CensusTract'] = (
    fd_2010_cleaned['CensusTract'].astype('str').str.zfill(11)
)

In [14]:
# Read census_2010_cleaned.csv and rewrite its CensusTract column as
# zero-padded, 11-character text in a single chained expression.
census_2010_cleaned = pd.read_csv("census_2010_cleaned.csv").assign(
    CensusTract=lambda frame: frame['CensusTract'].astype('str').str.zfill(11)
)

In [22]:
# Read tract_area.csv, derive the 11-character zero-padded CensusTract key
# from GEOID, and discard the GEOID and LILATracts_1And10 columns.
area = (
    pd.read_csv("tract_area.csv")
    .assign(CensusTract=lambda frame: frame['GEOID'].astype('str').str.zfill(11))
    .drop(columns=["GEOID", "LILATracts_1And10"])
)

Merging all the dataframes into one

In [26]:
# Join the three tables on the padded tract code. merge() defaults to an inner
# join, so only tracts present in all three tables are kept.
df = (
    fd_2010_cleaned
    .merge(census_2010_cleaned, on='CensusTract')
    .merge(area, on="CensusTract")
)

# Preprocessing

We believe that Alaska and Hawaii are heterogeneous compared to the contiguous US. Hence, we decided to remove these two states from our study area.

In [27]:
# Remove every Alaska ('AK') and Hawaii ('HI') row with one in-place drop.
df.drop(index=df.index[df['State'].isin(['AK', 'HI'])], inplace=True)

In [28]:
# Summary statistics of the numeric columns after the AK/HI rows were removed.
df.describe()

Unnamed: 0,LILATracts_1And10,Urban,DP03_0006PE,DP03_0088E,DP03_0119PE,DP02_0006PE,P001003PE,P001004PE,P001005PE,P001006PE,P001007PE,P001008PE,P001009PE,DP03_UNEMPLOY,H004002PE,H004003PE,H004004PE,area
count,60416.0,60416.0,60416.0,60416.0,60416.0,60416.0,60416.0,60416.0,60416.0,60416.0,60416.0,60416.0,60416.0,60416.0,60416.0,60416.0,60416.0,60416.0
mean,0.119786,0.769465,0.400723,27233.609259,11.486623,4.757616,71.915706,14.054224,0.986996,4.229618,0.118297,5.907888,2.787272,8.470487,43.932695,20.423404,35.643902,103356500.0
std,0.324714,0.421179,3.154642,14316.018326,11.315254,3.552253,25.992977,22.808764,4.581177,8.025331,0.335114,9.156833,1.757091,5.612966,17.713708,11.505348,22.351998,548971100.0
min,0.0,0.0,0.0,762.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22158.31
25%,0.0,1.0,0.0,18432.0,3.4,2.3,58.26518,1.079485,0.187715,0.462935,0.0,0.61474,1.437487,4.746835,32.161375,11.726348,17.952789,1693802.0
50%,0.0,1.0,0.0,23992.0,8.0,4.1,81.128554,3.881114,0.379733,1.426668,0.028727,1.849439,2.372837,7.248764,44.191163,19.300166,30.23101,4303568.0
75%,0.0,1.0,0.0,32105.0,15.9,6.4,92.477101,14.827888,0.763603,4.192217,0.103093,6.645819,3.791838,10.729614,56.504612,27.677403,49.574673,28479870.0
max,1.0,1.0,97.8,220611.0,100.0,100.0,100.0,100.0,99.337748,88.875103,14.657718,79.578359,50.0,100.0,100.0,90.717703,100.0,24676950000.0


In [29]:
# Normalise DP03_0088E by its own mean, so the rescaled column averages 1.
df['DP03_0088E'] = df['DP03_0088E'].div(df['DP03_0088E'].mean())

Add state dummy variables

In [30]:
# Build one integer indicator column per value of State (prefixed 'state'),
# then attach those columns to df by matching on the row index.
state_dummies = pd.get_dummies(df["State"], prefix="state", dtype='int')
df = pd.merge(df, state_dummies, left_index=True, right_index=True)

Compute the population density

In [33]:
# Divide every P00100* column by the tract area and store the result in a new
# '<column>/AREA' column.
# Columns that already end in '/AREA' are skipped: without that guard,
# re-running this cell would pick up its own output, create
# '<column>/AREA/AREA' columns, and make the following drop of `pop_col`
# remove the derived columns instead of the raw ones.
# NOTE(review): the P00100* columns span 0-100 in the describe() output, so
# they look like percentages rather than head counts -- confirm that
# "value / area" is the intended density measure.
AREA_SUFFIX = '/AREA'
pop_col = [
    col for col in df.columns
    if "P00100" in col and not col.endswith(AREA_SUFFIX)
]
for col in pop_col:
    df[col + AREA_SUFFIX] = df[col] / df['area']

In [35]:
# Discard the raw columns listed in pop_col; their '/AREA' versions stay in df.
df = df.drop(labels=pop_col, axis='columns')

# Export File

In [36]:
# Write the final frame to data.csv. The row index is written as well (pandas'
# default), so a later read_csv will see it as an extra 'Unnamed: 0' column.
# NOTE(review): CensusTract is stored zero-padded in the file, but a reader
# that lets pandas infer dtypes may parse it back as an integer and lose the
# leading zeros -- consider reading with dtype={'CensusTract': str}.
df.to_csv('data.csv')