In [1]:
import pandas as pd
import functools
import os

Workflow:

I. Loading 2010 SF1 data from Massachussetts and creating tables based on variables of interest

II. Loading 2010 DHC data from Massachussetts and creating tables based on variables of interest

In [2]:
# Checking working directory
os.getcwd
path = '/Users/christinaxu/Documents/dp_census'

I. Loading 2010 SF1 data from Massachusetts

a) Read in the segments of the 2010 SF1 for Massachusetts downloaded from [here](https://archive.ciser.cornell.edu/explore/download-centers/census-2010-sf1/files)

b) The specific segments are selected based on the columns from on Abie and Os's work:
* P8 - race in 63 categories
* P9 - race in 63 categories, non-hispanic
* P10 - race in 63 categories for 18+
* P11 - race in 63 categories, non-hispanic for 18+
* P12 - sex by age
* P14 - sex by for below 20 years of age

In [3]:
sf1_1 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma000012010ur1.CSV')
sf1_2 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma000022010ur1.CSV')
sf1_3 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma000032010ur1.CSV')
sf1_4 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma000042010ur1.CSV')
sf1_7 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma000072010ur1.CSV')
sf1_8 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma000082010ur1.CSV')
mass_geo = pd.read_csv(path + '/ma2010ur1_49segments_csv/mageo2010ur1.CSV')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
sf1_1.head()

Unnamed: 0,FILEID,STUSAB,CHARITER,CIFSN,LOGRECNO,P0010001
0,UR1ST,MA,0,,1,6547629
1,UR1ST,MA,0,,2,6021989
2,UR1ST,MA,0,,3,5912700
3,UR1ST,MA,0,,4,109289
4,UR1ST,MA,0,,5,525640


In [5]:
sf1_2.head()

Unnamed: 0,FILEID,STUSAB,CHARITER,CIFSN,LOGRECNO,P0020001,P0020002,P0020003,P0020004,P0020005,P0020006
0,UR1ST,MA,0,,1,6547629,6021989,5912700,109289,525640,0
1,UR1ST,MA,0,,2,6021989,6021989,5912700,109289,0,0
2,UR1ST,MA,0,,3,5912700,5912700,5912700,0,0,0
3,UR1ST,MA,0,,4,109289,109289,0,109289,0,0
4,UR1ST,MA,0,,5,525640,0,0,0,525640,0


In [6]:
mass_geo.head()
mass_geo = mass_geo[['FILEID', 'STUSAB', 'CHARITER', 'CIFSN', 'LOGRECNO', 'BLOCK', 'COUNTY', 'TRACT']]

c) Rather than dealing with 4 tables, let's merge them into a larger table based on LOGRECNO

In [7]:
table_list = [sf1_1, sf1_2, sf1_3, sf1_4, sf1_7, sf1_8, mass_geo]
sf1_mass = functools.reduce(lambda x, y: pd.merge(x,y, on=sf1_1.columns[:5].to_list()), table_list)

In [8]:
sf1_mass.head()

Unnamed: 0,FILEID,STUSAB,CHARITER,CIFSN,LOGRECNO,P0010001,P0020001,P0020002,P0020003,P0020004,...,P016H003,P016I001,P016I002,P016I003,P017A001,P017A002,P017A003,BLOCK,COUNTY,TRACT
0,UR1ST,MA,0,,1,6547629,6547629,6021989,5912700,109289,...,385633,4892298,1003956,3888342,2.4,0.5,1.89,,,
1,UR1ST,MA,0,,2,6021989,6021989,6021989,5912700,109289,...,381060,4394984,891147,3503837,2.38,0.49,1.88,,,
2,UR1ST,MA,0,,3,5912700,5912700,5912700,5912700,0,...,378278,4300601,872634,3427967,2.38,0.5,1.88,,,
3,UR1ST,MA,0,,4,109289,109289,109289,0,109289,...,2782,94383,18513,75870,2.2,0.44,1.76,,,
4,UR1ST,MA,0,,5,525640,525640,0,0,0,...,4573,497314,112809,384505,2.62,0.6,2.02,,,


d) Dropping column CIFSN and any rows that contain NaN values and columns P013+ since they aren't included in the model. In addition, renaming STUSAB to state.

In [9]:
sf1_mass.isna().sum() # CIFSN is the only column with all na values so drop it

FILEID           0
STUSAB           0
CHARITER         0
CIFSN       196412
LOGRECNO         0
             ...  
P017A002         0
P017A003         0
BLOCK        38904
COUNTY        3130
TRACT         9439
Length: 952, dtype: int64

In [10]:
cols_to_drop = ['CIFSN'] + list(sf1_mass.filter(regex='P013|P015|P016|P017'))

In [11]:
sf1_mass.drop(cols_to_drop, axis=1, inplace=True)
sf1_mass.dropna(inplace=True)
sf1_mass.rename(columns={'STUSAB':'STATE'}, inplace=True)

In [12]:
sf1_mass.head()

Unnamed: 0,FILEID,STATE,CHARITER,LOGRECNO,P0010001,P0020001,P0020002,P0020003,P0020004,P0020005,...,P012I043,P012I044,P012I045,P012I046,P012I047,P012I048,P012I049,BLOCK,COUNTY,TRACT
59,UR1ST,MA,0,60,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3000.0,1.0,10206.0
60,UR1ST,MA,0,61,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3163.0,1.0,10206.0
65,UR1ST,MA,0,66,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3000.0,1.0,10208.0
66,UR1ST,MA,0,67,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3001.0,1.0,10208.0
71,UR1ST,MA,0,72,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1019.0,1.0,10400.0


In [13]:
print(sf1_mass['BLOCK'].nunique())
print(sf1_mass['COUNTY'].nunique())
print(sf1_mass['TRACT'].nunique())

1838
14
1469


e) Dividing sf1_mass into smaller dfs based on variables of interest

In [14]:
cols = ['STATE', 'LOGRECNO', 'BLOCK', 'COUNTY', 'TRACT']

P1 = sf1_mass[cols + list(sf1_mass.filter(regex='P001'))] # Total pop
P8 = sf1_mass[cols + list(sf1_mass.filter(regex='P008'))] # Race
P9 = sf1_mass[cols + list(sf1_mass.filter(regex='P009'))] # Hispanic or Latino
P10 = sf1_mass[cols + list(sf1_mass.filter(regex='P010'))] # Race for 18+
P11 = sf1_mass[cols + list(sf1_mass.filter(regex='P011'))] # Hispanic or Latio for 18+
P12 = sf1_mass[cols + list(sf1_mass.filter(regex='P012'))] # Sex by age
P14 = sf1_mass[cols + list(sf1_mass.filter(regex='P014'))] # Sex by age for under 20 years

In [15]:
P12.head()

Unnamed: 0,STATE,LOGRECNO,BLOCK,COUNTY,TRACT,P0120001,P0120002,P0120003,P0120004,P0120005,...,P012I040,P012I041,P012I042,P012I043,P012I044,P012I045,P012I046,P012I047,P012I048,P012I049
59,MA,60,3000.0,1.0,10206.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60,MA,61,3163.0,1.0,10206.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65,MA,66,3000.0,1.0,10208.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66,MA,67,3001.0,1.0,10208.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71,MA,72,1019.0,1.0,10400.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


f) Futher dividing P12 into smaller dfs based on race categories which can be found [here](https://api.census.gov/data/2010/dec/sf1/variables.html).

In [16]:
letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
df_list_filtered = []

for letter in letters:
    df = P12[cols + list(sf1_mass.filter(regex= f'P012{letter}'))]
    df_list_filtered.append(df)
    
P12A = df_list_filtered[0] # sex by age (White)
P12B = df_list_filtered[1] # sex by age (Black or African American)
P12C = df_list_filtered[2] # sex by age (Native American or Alaska Native)
P12D = df_list_filtered[3] # ... (Asian)
P12E = df_list_filtered[4] # ... (Native Hawaiian and other Pacific Islander)
P12F = df_list_filtered[5] # ... ("Some other Race")
P12G = df_list_filtered[6] # ... (2 or more races)                    
P12H = df_list_filtered[7] # ... (Hispanic or Latino)
P12I = df_list_filtered[8] # .... (None Hispanic or Latino White)

In [17]:
# Dropping the above variables from P12 
P12.drop(list(P12.filter(regex='A|B|C|D|E|F|G|H|I')), axis=1, inplace=True)
P12.columns.to_list()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


['P0120001',
 'P0120002',
 'P0120003',
 'P0120004',
 'P0120005',
 'P0120006',
 'P0120007',
 'P0120008',
 'P0120009',
 'P0120010',
 'P0120011',
 'P0120012',
 'P0120013',
 'P0120014',
 'P0120015',
 'P0120016',
 'P0120017',
 'P0120018',
 'P0120019',
 'P0120020',
 'P0120021',
 'P0120022',
 'P0120023',
 'P0120024',
 'P0120025',
 'P0120026',
 'P0120027',
 'P0120028',
 'P0120029',
 'P0120030',
 'P0120031',
 'P0120032',
 'P0120033',
 'P0120034',
 'P0120035',
 'P0120036',
 'P0120037',
 'P0120038',
 'P0120039',
 'P0120040',
 'P0120041',
 'P0120042',
 'P0120043',
 'P0120044',
 'P0120045',
 'P0120046',
 'P0120047',
 'P0120048',
 'P0120049']

g) Saving dfs to csv files 

In [22]:
table_list = [P1, P8, P9, P10, P11, P12, P12A, P12B, P12C, P12D, P12E, P12F, P12G, P12H, P12I, P14]
name_list = ['P1', 'P8', 'P9', 'P10', 'P11', 'P12', 'P12A', 'P12B', 'P12C', 'P12D', 'P12E', 'P12F', 'P12G', 'P12H', 'P12I', 'P14']
for table, name in zip(table_list,name_list):
    table.to_csv(path + '/data/mass_sf1/table_{}.csv'.format(name))

 II. Loading 2010 DHC data from Massachusetts
 a) Read in the segments of the 2010 SF1 for Massachusetts downloaded from [here](https://archive.ciser.cornell.edu/explore/download-centers/census-2010-sf1/files)

a) The specific segments are selected based on the columns from on Abie and Os's work:
* P1 - total population
* P8 - race in 63 categories
* P9 - race in 63 categories, non-hispanic
* P10 - race in 63 categories for 18+
* P11 - race in 63 categories, non-hispanic for 18+
* P12 - sex by age
* P14 - sex by for below 20 years of age

In [19]:
# Demographic and Housing Characteristics File
col_names = 'FILEID,STUSAB,SUMLEV,GEOVAR,GEOCOMP,CHARITER,CIFSN,LOGRECNO,GEOID,GEOCODE,REGION,DIVISION,STATE,STATENS,COUNTY,COUNTYCC,COUNTYNS,COUSUB,COUSUBCC,COUSUBNS,SUBMCD,SUBMCDCC,SUBMCDNS,ESTATEFP,ESTATECC,ESTATENS,CONCIT,CONCITCC,CONCITNS,PLACE,PLACECC,PLACENS,TRACT,BLKGRP,BLOCK,AIANHH,AIHHTLI,AIANHHFP,AIANHHCC,AIANHHNS,AITS,AITSFP,AITSCC,AITSNS,TTRACT,BTBG,ANRC,ANRCCC,ANRCNS,CBSA,MEMI,CSA,METDIV,NECTA,NMEMI,CNECTA,NECTADIV,CBSAPCI,NECTAPCI,UA,UATYPE,UR,CD111,CD113,CD114,CD115,CD116,SLDU11,SLDU12,SLDU14,SLDU16,SLDU18,SLDL11,SLDL12,SLDL14,SLDL16,SLDL18,VTD,VTDI,ZCTA,SDELM,SDSEC,SDUNI,PUMA,AREALAND,AREAWATR,BASENAME,NAME,FUNCSTAT,GCUNI,POP100,HU100,INTPTLAT,INTPTLON,LSADC,PARTFLAG,UGA'.split(',')
mass_dhc = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma2010.dhc /mageo2010.dhc',
                     sep='|',
                     header=None,
                     names=col_names,
                     low_memory=False,
                     encoding='latin1')
mass_dhc.head()

Unnamed: 0,FILEID,STUSAB,SUMLEV,GEOVAR,GEOCOMP,CHARITER,CIFSN,LOGRECNO,GEOID,GEOCODE,...,NAME,FUNCSTAT,GCUNI,POP100,HU100,INTPTLAT,INTPTLON,LSADC,PARTFLAG,UGA
0,DHCST,MA,40,0,0,0,0,1,0400000US25,25,...,Massachusetts,A,N,6547629,0,42.15652,-71.489592,0,,
1,DHCST,MA,40,0,1,0,0,2,0400001US25,25,...,Massachusetts,A,N,6020932,0,42.223216,-71.313364,0,,
2,DHCST,MA,40,0,43,0,0,3,0400043US25,25,...,Massachusetts,A,N,526697,0,42.19481,-71.769011,0,,
3,DHCST,MA,40,0,44,0,0,4,0400044US25,25,...,Massachusetts,A,N,68532,0,42.163135,-71.498091,0,,
4,DHCST,MA,40,0,48,0,0,5,0400048US25,25,...,Massachusetts,A,N,458165,0,42.195537,-71.812951,0,,


In [20]:
mass_dhc_1 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma2010.dhc /ma000012010.dhc',sep='|',header=None,low_memory=False)
mass_dhc_2 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma2010.dhc /ma000022010.dhc',sep='|',header=None,low_memory=False)
mass_dhc_4 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma2010.dhc /ma000042010.dhc',sep='|',header=None,low_memory=False)
mass_dhc_5 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma2010.dhc /ma000052010.dhc',sep='|',header=None,low_memory=False)
mass_dhc_6 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma2010.dhc /ma000062010.dhc',sep='|',header=None,low_memory=False)
mass_dhc_7 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma2010.dhc /ma000072010.dhc',sep='|',header=None,low_memory=False)
mass_dhc_8 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma2010.dhc /ma000082010.dhc',sep='|',header=None,low_memory=False)
mass_dhc_9 = pd.read_csv(path + '/ma2010ur1_49segments_csv/ma2010.dhc /ma000092010.dhc',sep='|',header=None,low_memory=False)

In [23]:
mass_dhc_1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,195,196,197,198,199,200,201,202,203,204
0,DHCST,MA,0,1,1,6308764,6547629,6547629,6020932,526697,...,0,82,75,7,0,0,0,0,7,7
1,DHCST,MA,0,1,2,5788754,6020932,6020932,6020932,0,...,0,74,67,7,0,0,0,0,6,6
2,DHCST,MA,0,1,3,520010,526697,526697,0,526697,...,0,8,8,0,0,0,0,0,1,1
3,DHCST,MA,0,1,4,67298,68532,68532,0,68532,...,0,2,2,0,0,0,0,0,0,0
4,DHCST,MA,0,1,5,452712,458165,458165,0,458165,...,0,6,6,0,0,0,0,0,1,1


b) The DHC file segments doesn't come with column names so naming them below based on the technical documentation found [here](https://www2.census.gov/programs-surveys/decennial/2020/program-management/data-product-planning/2010-demonstration-data-products/02-Demographic_and_Housing_Characteristics/2022-03-16_Summary_File/2022-03-16_Technical%20Document/2022-03-16_Technical%20Document.pdf).

In [24]:
# Dropping the columns 1-6 for now to make it easier to count
P1_P9=mass_dhc_1.iloc[:,6:] # P1-P9
P1_P9.rename(columns={x:y for x,y in zip(P1_P9.columns,range(0,len(P1_P9.columns)))})

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,189,190,191,192,193,194,195,196,197,198
0,6547629,6547629,6020932,526697,0,6547629,5265246,434398,18849,349772,...,0,82,75,7,0,0,0,0,7,7
1,6020932,6020932,6020932,0,0,6020932,4761797,429562,17893,342669,...,0,74,67,7,0,0,0,0,6,6
2,526697,526697,0,526697,0,526697,503449,4836,956,7103,...,0,8,8,0,0,0,0,0,1,1
3,68532,68532,0,68532,0,68532,65051,958,130,826,...,0,2,2,0,0,0,0,0,0,0
4,458165,458165,0,458165,0,458165,438398,3878,826,6277,...,0,6,6,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196392,17503,17503,17503,0,0,17503,16057,347,32,214,...,0,0,0,0,0,0,0,0,0,0
196393,38120,38120,38120,0,0,38120,32110,1575,49,2793,...,0,0,0,0,0,0,0,0,0,0
196394,181041,181041,181041,0,0,181041,125726,21044,781,11058,...,0,4,4,0,0,0,0,0,3,3
196395,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# First, let's isolate variables P1, P8, P9 into their own dfs
cols_to_keep = mass_dhc_1[[0,1,4]] # col 0 - FILEID, col 1 - STATE, col 4 - LOGRECNO
dhc_P1 = cols_to_keep.join(P1_P9.iloc[:, 0]) # Total Population
dhc_P8 = cols_to_keep.join(P1_P9.iloc[:,(1+4+8+3+17+7+16):(1+4+8+3+17+7+15+72)])
dhc_P9 = cols_to_keep.join(P1_P9.iloc[:,(1+4+8+3+17+7+15+71):])

In [26]:
# Sanity check to make sure each df has the right number of cols:
print(dhc_P1.shape) # P1 should have 3 + 1 = 4 cols
print(dhc_P8.shape) # P8 should have 3 + 71 = 74 cols
print(dhc_P9.shape) # P9 should have 3 + 73 = 76 cols

(196397, 4)
(196397, 74)
(196397, 76)


In [27]:
# Now, let's start renaming columns in each df, starting with P1
dhc_P1.rename(columns={0:'FILEID', 1: 'STATE', 4: 'LOGRECNO',6:'P0010001'}, inplace=True)
dhc_P1.head()

Unnamed: 0,FILEID,STATE,LOGRECNO,P0010001
0,DHCST,MA,1,6547629
1,DHCST,MA,2,6020932
2,DHCST,MA,3,526697
3,DHCST,MA,4,68532
4,DHCST,MA,5,458165


In [28]:
# Creating a function to rename columns for the rest of the dfs
def rename_cols(variable, length, df):
    col_names = []
    for i in range(1,length+1):
        if variable < 10:
            if i < 10:
                col_names.append('P00{}00{}'.format(variable,i))
            else: 
                col_names.append('P00{}0{}'.format(variable,i))
        else:
            if i < 10:
                col_names.append('P0{}000{}'.format(variable,i))
            else: 
                col_names.append('P0{}00{}'.format(variable,i))
             
    col_names = ['FILEID', 'STATE', 'LOGRECNO'] + col_names
    
    df.columns = col_names
    
    print(df.columns)

In [29]:
rename_cols(8, 71, dhc_P8)

Index(['FILEID', 'STATE', 'LOGRECNO', 'P008001', 'P008002', 'P008003',
       'P008004', 'P008005', 'P008006', 'P008007', 'P008008', 'P008009',
       'P008010', 'P008011', 'P008012', 'P008013', 'P008014', 'P008015',
       'P008016', 'P008017', 'P008018', 'P008019', 'P008020', 'P008021',
       'P008022', 'P008023', 'P008024', 'P008025', 'P008026', 'P008027',
       'P008028', 'P008029', 'P008030', 'P008031', 'P008032', 'P008033',
       'P008034', 'P008035', 'P008036', 'P008037', 'P008038', 'P008039',
       'P008040', 'P008041', 'P008042', 'P008043', 'P008044', 'P008045',
       'P008046', 'P008047', 'P008048', 'P008049', 'P008050', 'P008051',
       'P008052', 'P008053', 'P008054', 'P008055', 'P008056', 'P008057',
       'P008058', 'P008059', 'P008060', 'P008061', 'P008062', 'P008063',
       'P008064', 'P008065', 'P008066', 'P008067', 'P008068', 'P008069',
       'P008070', 'P008071'],
      dtype='object')


In [30]:
rename_cols(9, 73, dhc_P9)

Index(['FILEID', 'STATE', 'LOGRECNO', 'P009001', 'P009002', 'P009003',
       'P009004', 'P009005', 'P009006', 'P009007', 'P009008', 'P009009',
       'P009010', 'P009011', 'P009012', 'P009013', 'P009014', 'P009015',
       'P009016', 'P009017', 'P009018', 'P009019', 'P009020', 'P009021',
       'P009022', 'P009023', 'P009024', 'P009025', 'P009026', 'P009027',
       'P009028', 'P009029', 'P009030', 'P009031', 'P009032', 'P009033',
       'P009034', 'P009035', 'P009036', 'P009037', 'P009038', 'P009039',
       'P009040', 'P009041', 'P009042', 'P009043', 'P009044', 'P009045',
       'P009046', 'P009047', 'P009048', 'P009049', 'P009050', 'P009051',
       'P009052', 'P009053', 'P009054', 'P009055', 'P009056', 'P009057',
       'P009058', 'P009059', 'P009060', 'P009061', 'P009062', 'P009063',
       'P009064', 'P009065', 'P009066', 'P009067', 'P009068', 'P009069',
       'P009070', 'P009071', 'P009072', 'P009073'],
      dtype='object')


c) Repeating the above steps to isolate variables P10, P11, P12, and P12A from mass_dhc_2.

In [31]:
mass_dhc_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,237,238,239,240,241,242,243,244,245,246
0,DHCST,MA,0,2,1,5128695,5030711,4217037,315900,13622,...,217997,195178,71765,100367,54093,71879,94551,85126,79002,94943
1,DHCST,MA,0,2,2,4722168,4627997,3825503,312383,12875,...,193310,172626,63786,89262,48744,65139,86333,78538,73259,88879
2,DHCST,MA,0,2,3,406527,402714,391534,3517,747,...,24687,22552,7979,11105,5349,6740,8218,6588,5743,6064
3,DHCST,MA,0,2,4,54797,54233,52365,762,102,...,3025,2932,1044,1607,728,973,1405,1171,1035,1264
4,DHCST,MA,0,2,5,351730,348481,339169,2755,645,...,21662,19620,6935,9498,4621,5767,6813,5417,4708,4800


In [32]:
P10_P12A = mass_dhc_2.iloc[:,5:] # P10-P12A
P10_P12A.rename(columns={x:y for x,y in zip(P10_P12A.columns,range(0,len(P10_P12A.columns)))})

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,232,233,234,235,236,237,238,239,240,241
0,5128695,5030711,4217037,315900,13622,270514,1706,211932,97984,90785,...,217997,195178,71765,100367,54093,71879,94551,85126,79002,94943
1,4722168,4627997,3825503,312383,12875,265571,1607,210058,94171,87342,...,193310,172626,63786,89262,48744,65139,86333,78538,73259,88879
2,406527,402714,391534,3517,747,4943,99,1874,3813,3443,...,24687,22552,7979,11105,5349,6740,8218,6588,5743,6064
3,54797,54233,52365,762,102,596,14,394,564,501,...,3025,2932,1044,1607,728,973,1405,1171,1035,1264
4,351730,348481,339169,2755,645,4347,85,1480,3249,2942,...,21662,19620,6935,9498,4621,5767,6813,5417,4708,4800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196392,14382,14149,13279,273,27,183,4,383,233,209,...,772,642,257,376,197,273,313,305,254,264
196393,30563,30118,26202,1141,31,2127,8,609,445,421,...,1343,1185,428,584,318,446,675,708,642,697
196394,141019,137122,103340,14483,544,8360,57,10338,3897,3670,...,4720,4035,1534,1970,1085,1469,2087,2030,2043,2681
196395,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
cols_to_keep = mass_dhc_2[[0,1,4]]
dhc_P10 = cols_to_keep.join(P10_P12A.iloc[:, :71]) #first 71 cols
dhc_P11 = cols_to_keep.join(P10_P12A.iloc[:, 71:(71+73)])
dhc_P12 = cols_to_keep.join(P10_P12A.iloc[:, (71+73):(71+73+49)])
dhc_P12A = cols_to_keep.join(P10_P12A.iloc[:, (71+73+49):(71+73+49+49)])

In [34]:
# Sanity check to make sure we have the right number of columns
print(dhc_P10.shape) # P10 should have 3 + 71 = 74 cols
print(dhc_P11.shape) # ... 3 + 73 = 76
print(dhc_P12.shape) # ... 3 + 49 = 52
print(dhc_P12A.shape) # ... 3 + 49 = 52

(196397, 74)
(196397, 76)
(196397, 52)
(196397, 52)


In [35]:
rename_cols(10,71,dhc_P10)

Index(['FILEID', 'STATE', 'LOGRECNO', 'P0100001', 'P0100002', 'P0100003',
       'P0100004', 'P0100005', 'P0100006', 'P0100007', 'P0100008', 'P0100009',
       'P0100010', 'P0100011', 'P0100012', 'P0100013', 'P0100014', 'P0100015',
       'P0100016', 'P0100017', 'P0100018', 'P0100019', 'P0100020', 'P0100021',
       'P0100022', 'P0100023', 'P0100024', 'P0100025', 'P0100026', 'P0100027',
       'P0100028', 'P0100029', 'P0100030', 'P0100031', 'P0100032', 'P0100033',
       'P0100034', 'P0100035', 'P0100036', 'P0100037', 'P0100038', 'P0100039',
       'P0100040', 'P0100041', 'P0100042', 'P0100043', 'P0100044', 'P0100045',
       'P0100046', 'P0100047', 'P0100048', 'P0100049', 'P0100050', 'P0100051',
       'P0100052', 'P0100053', 'P0100054', 'P0100055', 'P0100056', 'P0100057',
       'P0100058', 'P0100059', 'P0100060', 'P0100061', 'P0100062', 'P0100063',
       'P0100064', 'P0100065', 'P0100066', 'P0100067', 'P0100068', 'P0100069',
       'P0100070', 'P0100071'],
      dtype='object')


In [36]:
rename_cols(11, 73, dhc_P11)

Index(['FILEID', 'STATE', 'LOGRECNO', 'P0110001', 'P0110002', 'P0110003',
       'P0110004', 'P0110005', 'P0110006', 'P0110007', 'P0110008', 'P0110009',
       'P0110010', 'P0110011', 'P0110012', 'P0110013', 'P0110014', 'P0110015',
       'P0110016', 'P0110017', 'P0110018', 'P0110019', 'P0110020', 'P0110021',
       'P0110022', 'P0110023', 'P0110024', 'P0110025', 'P0110026', 'P0110027',
       'P0110028', 'P0110029', 'P0110030', 'P0110031', 'P0110032', 'P0110033',
       'P0110034', 'P0110035', 'P0110036', 'P0110037', 'P0110038', 'P0110039',
       'P0110040', 'P0110041', 'P0110042', 'P0110043', 'P0110044', 'P0110045',
       'P0110046', 'P0110047', 'P0110048', 'P0110049', 'P0110050', 'P0110051',
       'P0110052', 'P0110053', 'P0110054', 'P0110055', 'P0110056', 'P0110057',
       'P0110058', 'P0110059', 'P0110060', 'P0110061', 'P0110062', 'P0110063',
       'P0110064', 'P0110065', 'P0110066', 'P0110067', 'P0110068', 'P0110069',
       'P0110070', 'P0110071', 'P0110072', 'P0110073'],
 

In [37]:
rename_cols(12, 49, dhc_P12)

Index(['FILEID', 'STATE', 'LOGRECNO', 'P0120001', 'P0120002', 'P0120003',
       'P0120004', 'P0120005', 'P0120006', 'P0120007', 'P0120008', 'P0120009',
       'P0120010', 'P0120011', 'P0120012', 'P0120013', 'P0120014', 'P0120015',
       'P0120016', 'P0120017', 'P0120018', 'P0120019', 'P0120020', 'P0120021',
       'P0120022', 'P0120023', 'P0120024', 'P0120025', 'P0120026', 'P0120027',
       'P0120028', 'P0120029', 'P0120030', 'P0120031', 'P0120032', 'P0120033',
       'P0120034', 'P0120035', 'P0120036', 'P0120037', 'P0120038', 'P0120039',
       'P0120040', 'P0120041', 'P0120042', 'P0120043', 'P0120044', 'P0120045',
       'P0120046', 'P0120047', 'P0120048', 'P0120049'],
      dtype='object')


In [38]:
def rename_lettered_cols(variable, length, df):
    col_names = []
    for i in range(1,length+1):
        if i < 10:
            col_names.append('P0{}00{}'.format(variable,i))
        else: 
            col_names.append('P0{}0{}'.format(variable,i))
    
    col_names = ['FILEID', 'STATE', 'LOGRECNO'] + col_names
    
    df.columns = col_names
    
    print(df.columns)

In [39]:
rename_lettered_cols('12A', 49, dhc_P12A)

Index(['FILEID', 'STATE', 'LOGRECNO', 'P012A001', 'P012A002', 'P012A003',
       'P012A004', 'P012A005', 'P012A006', 'P012A007', 'P012A008', 'P012A009',
       'P012A010', 'P012A011', 'P012A012', 'P012A013', 'P012A014', 'P012A015',
       'P012A016', 'P012A017', 'P012A018', 'P012A019', 'P012A020', 'P012A021',
       'P012A022', 'P012A023', 'P012A024', 'P012A025', 'P012A026', 'P012A027',
       'P012A028', 'P012A029', 'P012A030', 'P012A031', 'P012A032', 'P012A033',
       'P012A034', 'P012A035', 'P012A036', 'P012A037', 'P012A038', 'P012A039',
       'P012A040', 'P012A041', 'P012A042', 'P012A043', 'P012A044', 'P012A045',
       'P012A046', 'P012A047', 'P012A048', 'P012A049'],
      dtype='object')


d) Repeating above steps to isolate 12B-12U from mass_dhc_4 - mass_dhc_8

In [40]:
mass_dhc_4.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
0,DHCST,MA,0,4,1,243593,119672,11453,10413,10539,...,647,521,144,214,98,132,173,155,99,97
1,DHCST,MA,0,4,2,241904,118812,11375,10330,10474,...,600,475,130,193,90,126,165,148,89,93
2,DHCST,MA,0,4,3,1689,860,78,83,65,...,47,46,14,21,8,6,8,7,10,4
3,DHCST,MA,0,4,4,402,228,15,17,12,...,8,8,2,1,1,1,2,2,2,0
4,DHCST,MA,0,4,5,1287,632,63,66,53,...,39,38,12,20,7,5,6,5,8,4


In [41]:
def isolate_variable(df, start, end):
    cols_to_keep = df[[0,1,4]]
    appended_df = df.iloc[:,5:]
    appended_df.rename(columns={x:y for x,y in zip(appended_df.columns,range(0,len(appended_df.columns)))})
    variable_df = cols_to_keep.join(appended_df.iloc[:,start:end])
    print(variable_df.shape[1])
    return variable_df

In [42]:
# All of these should have a length of 52
dhc_P12B = isolate_variable(mass_dhc_4,0,49)
dhc_P12C = isolate_variable(mass_dhc_4,49,(49+49))
dhc_P12D = isolate_variable(mass_dhc_5,0,49)
dhc_P12E = isolate_variable(mass_dhc_5,49,(49+49))
dhc_P12F = isolate_variable(mass_dhc_5,(49+49),(49+49+49))
dhc_P12G = isolate_variable(mass_dhc_5,(49+49+49),(49+49+49+49))
dhc_P12H = isolate_variable(mass_dhc_5,(49+49+49+49),(49+49+49+49+49))
dhc_P12I = isolate_variable(mass_dhc_6,0,49)
dhc_P12K = isolate_variable(mass_dhc_6,49,(49+49))
dhc_P12M = isolate_variable(mass_dhc_6,(49+49+49),(49+49+49+49))
dhc_P12O = isolate_variable(mass_dhc_7,49,(49+49))
dhc_P12Q = isolate_variable(mass_dhc_7,(49+49+49), (49+49+49+49))
dhc_P12S = isolate_variable(mass_dhc_8,0,49)
dhc_P12U = isolate_variable(mass_dhc_8,(49+49),(49+49+49))

52
52
52
52
52
52
52
52
52
52
52
52
52
52


In [43]:
letters = ['B','C','D','E','F','G','H','I','K','M','O','Q','S','U']

for letter in letters:
    rename_lettered_cols('12{}'.format(letter), 49, globals()['dhc_P12{}'.format(letter)])

Index(['FILEID', 'STATE', 'LOGRECNO', 'P012B001', 'P012B002', 'P012B003',
       'P012B004', 'P012B005', 'P012B006', 'P012B007', 'P012B008', 'P012B009',
       'P012B010', 'P012B011', 'P012B012', 'P012B013', 'P012B014', 'P012B015',
       'P012B016', 'P012B017', 'P012B018', 'P012B019', 'P012B020', 'P012B021',
       'P012B022', 'P012B023', 'P012B024', 'P012B025', 'P012B026', 'P012B027',
       'P012B028', 'P012B029', 'P012B030', 'P012B031', 'P012B032', 'P012B033',
       'P012B034', 'P012B035', 'P012B036', 'P012B037', 'P012B038', 'P012B039',
       'P012B040', 'P012B041', 'P012B042', 'P012B043', 'P012B044', 'P012B045',
       'P012B046', 'P012B047', 'P012B048', 'P012B049'],
      dtype='object')
Index(['FILEID', 'STATE', 'LOGRECNO', 'P012C001', 'P012C002', 'P012C003',
       'P012C004', 'P012C005', 'P012C006', 'P012C007', 'P012C008', 'P012C009',
       'P012C010', 'P012C011', 'P012C012', 'P012C013', 'P012C014', 'P012C015',
       'P012C016', 'P012C017', 'P012C018', 'P012C019', 'P012C02

e) Lastly, isolatating P14 from mass_dhc_9

In [44]:
dhc_P14 = isolate_variable(mass_dhc_9, 30,(30+43)) 
rename_cols(14,43,dhc_P14)

46
Index(['FILEID', 'STATE', 'LOGRECNO', 'P0140001', 'P0140002', 'P0140003',
       'P0140004', 'P0140005', 'P0140006', 'P0140007', 'P0140008', 'P0140009',
       'P0140010', 'P0140011', 'P0140012', 'P0140013', 'P0140014', 'P0140015',
       'P0140016', 'P0140017', 'P0140018', 'P0140019', 'P0140020', 'P0140021',
       'P0140022', 'P0140023', 'P0140024', 'P0140025', 'P0140026', 'P0140027',
       'P0140028', 'P0140029', 'P0140030', 'P0140031', 'P0140032', 'P0140033',
       'P0140034', 'P0140035', 'P0140036', 'P0140037', 'P0140038', 'P0140039',
       'P0140040', 'P0140041', 'P0140042', 'P0140043'],
      dtype='object')


f) Merging in COUNTY, TRACT, and BLOCK info

In [63]:
mass_dhc_cols = mass_dhc[['LOGRECNO', 'COUNTY', 'TRACT', 'BLOCK' ]]
dhc_P1 = (pd.merge(dhc_P1, mass_dhc_cols, how='inner', on ='LOGRECNO')).dropna(axis=0,inplace=True)

TypeError: Can only merge Series or DataFrame objects, a <class 'NoneType'> was passed

AttributeError: 'NoneType' object has no attribute 'head'

In [50]:
mass_dhc_cols = mass_dhc[['LOGRECNO', 'COUNTY', 'TRACT', 'BLOCK' ]]
table_list = [dhc_P1, dhc_P8, dhc_P9, dhc_P10, dhc_P11, dhc_P12, dhc_P12A, dhc_P12B, dhc_P12C, dhc_P12D, dhc_P12E, dhc_P12F, dhc_P12G, dhc_P12H, dhc_P12I, dhc_P12K, dhc_P12M, dhc_P12O, dhc_P12Q, dhc_P12S, dhc_P12U, dhc_P14]

for table in table_list:
    table.merge(mass_dhc_cols, how='inner', on='')
    
    
    

Index(['FILEID', 'STUSAB', 'SUMLEV', 'GEOVAR', 'GEOCOMP', 'CHARITER', 'CIFSN',
       'LOGRECNO', 'GEOID', 'GEOCODE', 'REGION', 'DIVISION', 'STATE',
       'STATENS', 'COUNTY', 'COUNTYCC', 'COUNTYNS', 'COUSUB', 'COUSUBCC',
       'COUSUBNS', 'SUBMCD', 'SUBMCDCC', 'SUBMCDNS', 'ESTATEFP', 'ESTATECC',
       'ESTATENS', 'CONCIT', 'CONCITCC', 'CONCITNS', 'PLACE', 'PLACECC',
       'PLACENS', 'TRACT', 'BLKGRP', 'BLOCK', 'AIANHH', 'AIHHTLI', 'AIANHHFP',
       'AIANHHCC', 'AIANHHNS', 'AITS', 'AITSFP', 'AITSCC', 'AITSNS', 'TTRACT',
       'BTBG', 'ANRC', 'ANRCCC', 'ANRCNS', 'CBSA', 'MEMI', 'CSA', 'METDIV',
       'NECTA', 'NMEMI', 'CNECTA', 'NECTADIV', 'CBSAPCI', 'NECTAPCI', 'UA',
       'UATYPE', 'UR', 'CD111', 'CD113', 'CD114', 'CD115', 'CD116', 'SLDU11',
       'SLDU12', 'SLDU14', 'SLDU16', 'SLDU18', 'SLDL11', 'SLDL12', 'SLDL14',
       'SLDL16', 'SLDL18', 'VTD', 'VTDI', 'ZCTA', 'SDELM', 'SDSEC', 'SDUNI',
       'PUMA', 'AREALAND', 'AREAWATR', 'BASENAME', 'NAME', 'FUNCSTAT', 'GCUNI',
     

g) Finally saving tables to csv files

In [46]:
table_list = [dhc_P1, dhc_P8, dhc_P9, dhc_P10, dhc_P11, dhc_P12, dhc_P12A, dhc_P12B, dhc_P12C, dhc_P12D, dhc_P12E, dhc_P12F, dhc_P12G, dhc_P12H, dhc_P12I, dhc_P12K, dhc_P12M, dhc_P12O, dhc_P12Q, dhc_P12S, dhc_P12U, dhc_P14]
name_list = ['P1', 'P8', 'P9', 'P10', 'P11', 'P12', 'P12A', 'P12B', 'P12C', 'P12D', 'P12E', 'P12F', 'P12G', 'P12H', 'P12I', 'P12K', 'P12M', 'P12O', 'P12Q', 'P12S', 'P12U','P14']
for table, name in zip(table_list,name_list):
    table.to_csv(path + '/data/mass_dhc/table_{}.csv'.format(name))