In [2]:
import pandas as pd
import numpy as np
import csv 
import os 
import re

# Raise pandas' display limits so wide/long frames are not truncated when shown.
pd.options.display.max_rows = 101
pd.options.display.max_columns = 101
pd.options.display.max_seq_items = 500

In [3]:
def read_csv(csv_file, state='Illinois', county='Cook'):
    '''
    Read a csv extract and keep only the rows for one county.

    Parameters
    ----------
    csv_file : str or path-like
        Path to the csv file. It must contain STATE, COUNTY and GISJOIN columns.
    state : str, optional
        Value of the STATE column to keep (default 'Illinois').
    county : str, optional
        Value of the COUNTY column to keep (default 'Cook').

    Returns
    -------
    pandas.DataFrame
        The rows matching both `state` and `county`, indexed by GISJOIN.

    Raises
    ------
    AssertionError
        If `csv_file` is not an existing file.
    '''
    assert os.path.isfile(csv_file), "csv file does not exist"

    # low_memory=False parses the file in one pass so that each column
    # ends up with a single inferred dtype instead of chunk-dependent ones.
    data = pd.read_csv(csv_file, low_memory=False)

    # Bracket access is used for the filter columns because attribute access
    # can be shadowed by DataFrame attributes/methods of the same name.
    in_county = (data['STATE'] == state) & (data['COUNTY'] == county)
    working_df = data[in_county]

    # GISJOIN becomes the index so frames from different files align on it.
    return working_df.set_index('GISJOIN')

In [7]:
# Load the three source files; read_csv filters each one and indexes it by GISJOIN.
df96 = read_csv('./nhgis0009_ds96_1970_tract.csv')
df97 = read_csv('./nhgis0016_ds97_1970_tract.csv')
df97_2 = read_csv('./nhgis0017_ds97_1970_tract.csv')

# Pick out the columns behind each quantity of interest.
# NOTE: the column codes were chosen by hand from the code table;
# re-check them against the codebook if the source extracts change.

# -- from df97 (table 97) --
ALL_PEOPLE = df97['CY7001']
BLACK_PEOPLE = df97['CY8001']
OCCUPIED = df97['CU9001']
VACANT = (
    df97['CU9002']
    + df97['CU9003']
    + df97['CU9004']
    + df97['CU9005']
    + df97['CU9006']
)

# -- from df97_2 --
TOTAL_UNITS = df97_2['CZQ001']

# -- from df96 --
NON_WHITE_PEOPLE = df96['CM6003'] + df96['CM6004']
# white and black are tabulated separately, so each tenure sums two columns
OWNER = df96['CK3001'] + df96['CK4001']
RENTER = df96['CK4002'] + df96['CK3002']

In [268]:
# Build the output frame: identifying columns from df97 plus the derived
# per-tract measures, then move GISJOIN from the index back into a column.
final_df = (
    df97[['YEAR', 'TRACTA', 'COUNTY', 'STATE']]
    .assign(
        total_pop=ALL_PEOPLE,
        Pwhite=(ALL_PEOPLE - BLACK_PEOPLE - NON_WHITE_PEOPLE) / ALL_PEOPLE,
        Pblack=BLACK_PEOPLE / ALL_PEOPLE,
        Pnonwhite=NON_WHITE_PEOPLE / ALL_PEOPLE,
        units=TOTAL_UNITS,
        occupied=OCCUPIED / TOTAL_UNITS,
        vacant=VACANT / TOTAL_UNITS,
        # the tables differ slightly, so the tenure shares are taken over
        # owner+renter rather than over "occupied"
        own=OWNER / (OWNER + RENTER),
        rent=RENTER / (OWNER + RENTER),
    )
    .reset_index()
)

In [269]:
# Header labels for the exported file. They are applied positionally, so the
# order here must follow the column order of final_df.
column_names = [
    'GISJOIN', 'YEAR', 'TRACTA', 'COUNTY', 'STATE',
    'Total Pop', '% White', '% Black', '% Other races',
    'Total Units', '% Occupied', '% Vacant',
    '% Owner Occupied', '% Renter',
]

# Write the table without the row index, using the labels above as the header.
final_df.to_csv('1970S_fordb.csv', header=column_names, index=False)