In [129]:
import pandas as pd
import os

In [130]:
# Directory holding the NHGIS 2000 CSV extracts read by the load cells.
# The default is the mount point of the large-file USB drive on a Mac. On any other
# machine, set the CENSUS_2000_DATA_DIR environment variable instead of editing this cell.
# os.path.join(..., '') guarantees a trailing separator, so code that appends a file
# name to this string keeps working even when the override omits the final slash.
absolute_filepath = os.path.join(
    os.environ.get('CENSUS_2000_DATA_DIR', '/Volumes/Lexar/2000 data/'), '')

In [131]:
# Columns removed from raw_df as soon as it is loaded: four context
# identifiers followed by the FWB/FWC/FWD/FWK table columns.
raw_drop_cols = (
    ['AIANHHA', 'STATEA', 'COUNTYA', 'PLACEA']
    + ['FWB001']
    + ['FWC00{}'.format(n) for n in range(1, 7)]   # FWC001 .. FWC006
    + ['FWD00{}'.format(n) for n in range(1, 8)]   # FWD001 .. FWD007
    + ['FWK001']
)

In [132]:
# Columns removed from median_df as soon as it is loaded (five names per row).
median_drop_cols = [
    'URB_AREAA', 'URBRURALA', 'CD106A', 'CD108A', 'CD109A',
    'ZIP3A', 'ZCTAA', 'TRACTA', 'TRBL_CTA', 'TRBL_BGA',
    'BLCK_GRPA', 'TRUSTA', 'RES_ONLYA', 'AIANHHA', 'TRBL_SUBA',
    'ANRCA', 'PLACEA', 'DIVISIONA', 'REGIONA', 'PMSAA',
    'NECMAA', 'NAME', 'STATEA', 'COUNTYA', 'CTY_SUBA',
]

In [133]:
# Columns removed from raw_df immediately before it is merged with the median frame.
merge_drop_cols = [
    'YEAR',
    'STATE',
    'COUNTY',
    'CTY_SUBA',
    'NAME',
]

In [134]:
# Readable names for the coded columns of raw_df (code -> label), one entry per line.
column_mapper = {
    # FYE*: race counts
    'FYE001': 'White alone',
    'FYE002': 'Black or African American alone',
    'FYE003': 'American Indian and Alaska Native alone',
    'FYE004': 'Asian alone',
    'FYE005': 'Native Hawaiian and Other Pacific Islander alone',
    'FYE006': 'Some other race alone',
    'FYE007': 'Two or more races',
    # FV*/FWA*: unit counts
    'FV5001': 'Total Units',
    'FV8001': 'Occupied',
    'FV8002': 'Vacant',
    'FV9001': 'Total Occupied Units',
    'FWA001': 'Owner occupied',
    'FWA002': 'Renter occupied',
}

# Readable name for the single coded column kept from the median frame.
median_col_mapper = {
    'GB7001': 'Median',
}

In [135]:
# Chicago's MSA code; rows of the median frame are kept only when their
# numeric MSA_CMSAA value equals this integer.
chicago_msa_code = 1602

In [136]:
# Load the block-level extract into raw_df, discarding the columns in raw_drop_cols.
# The path is built once with os.path.join so the data directory may be given with or
# without a trailing slash, and the error names the exact file that was looked for.
raw_block_csv = os.path.join(absolute_filepath, 'nhgis0006_ds147_2000_block.csv')
if not os.path.exists(raw_block_csv):
    # Same exception type as before (IOError); only the message is more informative
    raise IOError('This file does not exist in this location: {}'.format(raw_block_csv))
raw_df = pd.read_csv(raw_block_csv).drop(raw_drop_cols, axis = 1)

In [137]:
# Load the tract-level extract into median_df, discarding the columns in median_drop_cols.
# The path is built once with os.path.join so the data directory may be given with or
# without a trailing slash, and the error names the exact file that was looked for.
median_tract_csv = os.path.join(absolute_filepath, 'nhgis0007_ds151_2000_tract.csv')
if not os.path.exists(median_tract_csv):
    # Same exception type as before (IOError); only the message is more informative
    raise IOError('This file does not exist in this location: {}'.format(median_tract_csv))
# low_memory=False makes pandas read the file in one pass rather than in chunks
median_df = pd.read_csv(median_tract_csv, low_memory=False).drop(median_drop_cols, axis = 1)

In [138]:
# Coerce MSA_CMSAA to a number (entries that cannot be parsed become NaN) so it can be
# compared with the integer Chicago code.
median_df['MSA_CMSAA'] = pd.to_numeric(median_df['MSA_CMSAA'], errors = 'coerce')

# Keep only the rows whose MSA code is Chicago's.
chicago_median_df = median_df[median_df['MSA_CMSAA'] == chicago_msa_code]

# Bind the renamed result to chi_median_df, the name the merge cell reads. Previously
# this name was never assigned, so a fresh-kernel run stopped with a NameError here.
# rename() returns a new frame, so nothing is modified in place on a slice of median_df.
chi_median_df = chicago_median_df.rename(columns = median_col_mapper)

In [139]:
# Swap the coded column names of raw_df for the readable labels in column_mapper
raw_df = raw_df.rename(columns=column_mapper)

In [140]:
# GIS_SHORT holds the first 14 characters of GISJOIN, i.e. the identifier without its
# trailing block part, so it can be matched against the median frame's GISJOIN.
raw_df['GIS_SHORT'] = raw_df['GISJOIN'].str.slice(stop=14)
# The full-length identifier is kept under a distinct name
raw_df = raw_df.rename(columns={'GISJOIN': 'GISJOIN_BLOCK'})

In [141]:
# Join the Chicago median rows to the raw rows: GISJOIN on the left is matched against
# the shortened identifier (GIS_SHORT) on the right. merge_drop_cols are removed from
# the raw side first.
full_df = pd.merge(
    chi_median_df,
    raw_df.drop(merge_drop_cols, axis='columns'),
    left_on='GISJOIN',
    right_on='GIS_SHORT',
)

In [142]:
# Race categories that are folded into a single 'Other races' count
minor_race_cols = [
    'American Indian and Alaska Native alone',
    'Asian alone',
    'Native Hawaiian and Other Pacific Islander alone',
    'Some other race alone',
    'Two or more races',
]
full_df['Other races'] = full_df[minor_race_cols].sum(axis='columns')

# Total population is the row-wise sum of the three remaining race counts
pop_parts = ['White alone', 'Black or African American alone', 'Other races']
full_df['Total Pop'] = full_df[pop_parts].sum(axis='columns')

# The individual categories are no longer needed once they have been combined
full_df = full_df.drop(minor_race_cols, axis='columns')

In [143]:
# Turn each race count into a fraction of the row's total population,
# then discard the counts. Pairs are (new column, count column).
race_share_pairs = [
    ('Pct_White', 'White alone'),
    ('Pct_Black', 'Black or African American alone'),
    ('Pct_Other', 'Other races'),
]
for share_col, count_col in race_share_pairs:
    full_df[share_col] = full_df[count_col].div(full_df['Total Pop'])

full_df = full_df.drop([count_col for _, count_col in race_share_pairs], axis='columns')

In [144]:
# Occupancy and vacancy are fractions of all units; owner and renter shares are
# fractions of occupied units. Triples are (new column, numerator, denominator).
unit_ratio_specs = [
    ('% Occupied', 'Total Occupied Units', 'Total Units'),
    ('% Vacant', 'Vacant', 'Total Units'),
    ('% Owner Occupied', 'Owner occupied', 'Total Occupied Units'),
    ('% Renter', 'Renter occupied', 'Total Occupied Units'),
]
for ratio_col, numerator_col, denominator_col in unit_ratio_specs:
    full_df[ratio_col] = full_df[numerator_col].div(full_df[denominator_col])

# Drop the count columns that are not carried forward
full_df = full_df.drop(['Occupied', 'Vacant', 'Owner occupied', 'Renter occupied'],
                       axis='columns')

In [145]:
# Final column selection and order (laid out to match the other years' files).
# Any column of full_df that is not listed here is dropped by the selection below.
reorder = [
    # identifiers
    'GISJOIN_BLOCK', 'YEAR', 'BLOCKA', 'BLCK_GRPA', 'TRACTA',
    'C_CITYA', 'COUNTY', 'MSA_CMSAA', 'STATE',
    # population
    'Total Pop', 'Pct_White', 'Pct_Black', 'Pct_Other',
    # units
    'Total Units', 'Median',
    '% Occupied', '% Vacant', '% Owner Occupied', '% Renter',
]

full_df = full_df[reorder]

In [146]:
# Restore the identifier's name to GISJOIN (matching the other years) and make it the index
full_df = (
    full_df
    .rename(columns={'GISJOIN_BLOCK': 'GISJOIN'})
    .set_index('GISJOIN')
)

In [147]:
# Save the finished frame one directory above the notebook as a pipe-delimited file;
# the GISJOIN index is written as the first column.
full_df.to_csv(path_or_buf="../2000_census_data.csv", sep="|")