In [67]:
import pandas as pd
import os

In [68]:
# Directory that holds the 2000 NHGIS CSV extracts read by this notebook.
# Defaults to the original author's local folder; set the CENSUS_2000_DATA_DIR
# environment variable to point somewhere else without editing the notebook.
# os.path.join(..., '') guarantees a trailing separator, because later cells
# build file paths by plain string concatenation (absolute_filepath + filename).
absolute_filepath = os.path.join(
    os.environ.get('CENSUS_2000_DATA_DIR')
    or '/Users/claire/Documents/CS 122 Project Files/2000 data/',
    '')

In [69]:
# Columns removed from the block-level table immediately after it is loaded:
# four geographic identifier columns followed by the FWB/FWC/FWD/FWK fields,
# none of which are referenced again in this notebook.
drop_cols = (
    ['AIANHHA', 'STATEA', 'COUNTYA', 'PLACEA']
    + ['FWB001']
    + ['FWC{:03d}'.format(n) for n in range(1, 7)]   # FWC001 .. FWC006
    + ['FWD{:03d}'.format(n) for n in range(1, 8)]   # FWD001 .. FWD007
    + ['FWK001']
)

In [70]:
# Columns removed from the tract-level (median) table right after loading.
# None of them is used by the later cells; the identifiers that are kept
# (GISJOIN, YEAR, STATE, COUNTY, C_CITYA, MSA_CMSAA) are deliberately absent here.
median_drop_cols = [
    'URB_AREAA', 'URBRURALA',
    'CD106A', 'CD108A', 'CD109A',
    'ZIP3A', 'ZCTAA',
    'TRACTA', 'TRBL_CTA', 'TRBL_BGA', 'BLCK_GRPA',
    'TRUSTA', 'RES_ONLYA', 'AIANHHA', 'TRBL_SUBA', 'ANRCA',
    'PLACEA', 'DIVISIONA', 'REGIONA',
    'PMSAA', 'NECMAA',
    'NAME', 'STATEA', 'COUNTYA', 'CTY_SUBA',
]

In [71]:
# Columns dropped from the block-level table just before the merge, so that the
# merged frame carries a single YEAR/STATE/COUNTY (the tract table's) instead of
# suffixed duplicates.
raw_drop_cols = [
    'YEAR',
    'STATE',
    'COUNTY',
    'CTY_SUBA',
    'NAME',
]

In [72]:
# Maps the coded field names of the block-level table to readable labels.
column_mapper = {
    # Population counts by race
    'FYE001': 'White alone',
    'FYE002': 'Black or African American alone',
    'FYE003': 'American Indian and Alaska Native alone',
    'FYE004': 'Asian alone',
    'FYE005': 'Native Hawaiian and Other Pacific Islander alone',
    'FYE006': 'Some other race alone',
    'FYE007': 'Two or more races',
    # Housing unit counts
    'FV5001': 'Total Units',
    'FV8001': 'Occupied',
    'FV8002': 'Vacant',
    'FV9001': 'Total Occupied Units',
    # Occupied units split by owner/renter
    'FWA001': 'Owner occupied',
    'FWA002': 'Renter occupied',
}

# Maps the single coded field kept from the tract-level table to its label.
median_col_mapper = {
    'GB7001': 'Median',
}

In [73]:
# Numeric code compared against the MSA_CMSAA column of the tract-level table
# to keep only the Chicago-area rows.
# NOTE(review): presumably the Census MSA/CMSA code for the Chicago metro area - confirm against the NHGIS codebook.
chicago_msa_code = 1602

In [74]:
# Load the block-level extract and discard the columns listed in drop_cols.
# The path is built once with os.path.join, which works whether or not
# absolute_filepath ends with a separator.
raw_csv_path = os.path.join(absolute_filepath, 'nhgis0006_ds147_2000_block.csv')
if not os.path.exists(raw_csv_path):
    # Fail early and name the offending path so a wrong data directory is obvious.
    raise IOError('This file does not exist in this location: ' + raw_csv_path)
raw_df = pd.read_csv(raw_csv_path).drop(drop_cols, axis = 1)

In [75]:
# Load the tract-level extract (source of the 'Median' field) and discard the
# columns listed in median_drop_cols.
# The path is built once with os.path.join, which works whether or not
# absolute_filepath ends with a separator.
# NOTE(review): an earlier run of this cell emitted a warning whose text is
# truncated in the saved output - possibly read_csv's mixed-dtype warning; if so,
# consider passing an explicit dtype= for the affected columns. Confirm on re-run.
median_csv_path = os.path.join(absolute_filepath, 'nhgis0007_ds151_2000_tract.csv')
if not os.path.exists(median_csv_path):
    # Fail early and name the offending path so a wrong data directory is obvious.
    raise IOError('This file does not exist in this location: ' + median_csv_path)
median_df = pd.read_csv(median_csv_path).drop(median_drop_cols, axis = 1)

  interactivity=interactivity, compiler=compiler, result=result)


In [76]:
# Subset the tract-level table to the Chicago MSA/CMSA code.
# MSA_CMSAA is coerced to numeric so it can be compared with the integer code;
# values that cannot be parsed become NaN and therefore never match.
median_df['MSA_CMSAA'] = pd.to_numeric(median_df['MSA_CMSAA'], errors = 'coerce')
is_chicago = median_df['MSA_CMSAA'] == chicago_msa_code
# Select with .loc and rename WITHOUT inplace=True: rename() then returns a new,
# independent DataFrame, which avoids the SettingWithCopyWarning raised when a
# boolean-mask slice of median_df is modified in place.
chi_median_df = median_df.loc[is_chicago].rename(columns = median_col_mapper)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [77]:
# Replace the coded field names of the block-level table with the readable
# labels defined in column_mapper.
raw_df = raw_df.rename(columns = column_mapper)

In [78]:
# Build a shortened join key from the block-level GISJOIN by keeping only its
# first 14 characters (i.e. cutting off the trailing block identifier), then
# relabel the full identifier as GISJOIN_BLOCK. GIS_SHORT is what the merge
# below matches against the tract table's GISJOIN.
raw_df = (
    raw_df
    .assign(GIS_SHORT = raw_df['GISJOIN'].str.slice(stop = 14))
    .rename(columns = {'GISJOIN': 'GISJOIN_BLOCK'})
)

In [79]:
# Attach the Chicago tract-level rows to every matching block: the tract table's
# GISJOIN (left) is matched to the shortened block key GIS_SHORT (right).
# The default inner join keeps only blocks whose key appears in chi_median_df.
blocks_for_merge = raw_df.drop(raw_drop_cols, axis = 1)
full_df = pd.merge(chi_median_df, blocks_for_merge,
                   left_on = 'GISJOIN', right_on = 'GIS_SHORT')

In [80]:
# Display the merged frame's column names as a sanity check on the merge.
full_df.columns

Index(['GISJOIN', 'YEAR', 'STATE', 'COUNTY', 'C_CITYA', 'MSA_CMSAA', 'Median',
       'GISJOIN_BLOCK', 'TRACTA', 'BLCK_GRPA', 'BLOCKA', 'White alone',
       'Black or African American alone',
       'American Indian and Alaska Native alone', 'Asian alone',
       'Native Hawaiian and Other Pacific Islander alone',
       'Some other race alone', 'Two or more races', 'Total Units', 'Occupied',
       'Vacant', 'Total Occupied Units', 'Owner occupied', 'Renter occupied',
       'GIS_SHORT'],
      dtype='object')

In [81]:
# Collapse the five smaller race categories into one 'Other races' count, derive
# the total population from the three resulting groups, then discard the five
# original columns.
minor_race_cols = [
    'American Indian and Alaska Native alone',
    'Asian alone',
    'Native Hawaiian and Other Pacific Islander alone',
    'Some other race alone',
    'Two or more races',
]
pop_group_cols = ['White alone', 'Black or African American alone', 'Other races']

full_df['Other races'] = full_df[minor_race_cols].sum(axis = 1)
full_df['Total Pop'] = full_df[pop_group_cols].sum(axis = 1)
full_df = full_df.drop(minor_race_cols, axis = 1)

In [82]:
# Express each race group as a share of 'Total Pop', then drop the raw counts.
# Despite the '%' in the column names, the values are plain ratios (no x100).
race_share_sources = [
    ('% White', 'White alone'),
    ('% Black', 'Black or African American alone'),
    ('% Other races', 'Other races'),
]
for share_col, count_col in race_share_sources:
    full_df[share_col] = full_df[count_col].div(full_df['Total Pop'])

full_df = full_df.drop([count_col for _, count_col in race_share_sources], axis = 1)

In [83]:
# Housing shares. Occupied/vacant are relative to all units; owner/renter are
# relative to occupied units only. As with the race columns, values are ratios.
all_units = full_df['Total Units']
occupied_units = full_df['Total Occupied Units']

full_df['% Occupied'] = occupied_units.div(all_units)
full_df['% Vacant'] = full_df['Vacant'].div(all_units)
full_df['% Owner Occupied'] = full_df['Owner occupied'].div(occupied_units)
full_df['% Renter'] = full_df['Renter occupied'].div(occupied_units)

# 'Total Occupied Units' is not dropped here; it falls away when the columns
# are re-selected in the reorder step.
full_df = full_df.drop(['Occupied', 'Vacant', 'Owner occupied', 'Renter occupied'], axis = 1)

In [84]:
# Final column layout, chosen to match the other years' outputs. Selecting by
# this list also discards every column not named in it (the tract GISJOIN,
# GIS_SHORT and 'Total Occupied Units').
reorder = [
    # identifiers
    'GISJOIN_BLOCK', 'YEAR', 'BLOCKA', 'BLCK_GRPA', 'TRACTA',
    'C_CITYA', 'COUNTY', 'MSA_CMSAA', 'STATE',
    # population
    'Total Pop', '% White', '% Black', '% Other races',
    # housing
    'Total Units', 'Median',
    '% Occupied', '% Vacant', '% Owner Occupied', '% Renter',
]

full_df = full_df.loc[:, reorder]

In [86]:
# Give the block identifier the same name the other years use ('GISJOIN') and
# make it the frame's index.
full_df = (
    full_df
    .rename(columns = {'GISJOIN_BLOCK': 'GISJOIN'})
    .set_index(keys = 'GISJOIN')
)

In [87]:
# Persist the finished table as a pipe-delimited file in the current working
# directory; the GISJOIN index is written as the first column.
full_df.to_csv(path_or_buf = "2000_census_data.csv", sep = "|")