Note: this code is copied between notebooks, because it is used only twice and we are working in notebooks. Alternatively, we could have kept one shared notebook for the two directories that use the same function, or we could have used something other than Jupyter notebooks. For our purposes, however, copying the code seemed the most efficient option.

In [2]:
import pandas as pd
import numpy as np
import csv 
import os 
import re

pd.set_option("display.max_rows",101)
pd.set_option("display.max_columns",101)
pd.set_option("display.max_seq_items",500)

In [3]:
def read_csv(csv_file, state='Illinois', county='Cook'):
    '''
    read in the csv file and keep only the rows for one county

    input:
        csv_file: path to the csv file (data); it must contain the
                  STATE, COUNTY and GISJOIN columns
        state: value of the STATE column to keep (default 'Illinois')
        county: value of the COUNTY column to keep (default 'Cook')
    returns pandas df of the data for just the specified county,
            indexed by GISJOIN
    raises FileNotFoundError if csv_file is not an existing file
    '''
    # Raise explicitly rather than assert: assert statements are stripped
    # when Python runs with -O, which would silently skip this check.
    if not os.path.isfile(csv_file):
        raise FileNotFoundError(
            "csv file does not exist: {}".format(csv_file))

    # low_memory=False parses the file in one pass instead of in chunks,
    # which avoids mixed-dtype inference within a column.
    data = pd.read_csv(csv_file, low_memory=False)

    # only keep rows matching both the requested state and county
    wanted = (data['STATE'] == state) & (data['COUNTY'] == county)
    working_df = data[wanted]

    # SET INDEX (set_index returns a new frame; `data` is not modified)
    return working_df.set_index('GISJOIN')

In [4]:
# Load the two 1980 tract-level extracts; read_csv filters each one and
# indexes it by GISJOIN.
df104 = read_csv('./nhgis0010_ds104_1980_tract.csv')
df116 = read_csv('./nhgis0010_ds116_1980_tract.csv')

# Population counts come from the ds116 extract.
# NOTE(review): the meaning of each NHGIS code is taken from the variable
# names used here -- confirm against the NHGIS codebook.
ALL_PEOPLE = df116['C6W001']
WHITE_PEOPLE = df116['C6X001']
BLACK_PEOPLE = df116['C6X002']
# "other" is whatever remains after removing the black and white counts
OTHER_PEOPLE = ALL_PEOPLE - BLACK_PEOPLE - WHITE_PEOPLE

# Housing-unit counts come from the ds104 extract.
ALL_UNITS = df104['C8Y001']
OCCUPIED = df104['C7V001']
# vacant units are derived as total units minus occupied units
VACANT = ALL_UNITS - OCCUPIED

# Tenure counts, also from ds104.
OWNER = df104['C7W001']
RENTER = df104['C7W002']

In [5]:
# Build the output table: the identifying columns from the ds104 extract
# plus the derived measures. Each Series is aligned to the frame's index
# on assignment. The race, occupancy and tenure columns are stored as
# fractions (count / denominator), not multiplied by 100.
final_df = df104[['YEAR', 'TRACTA', 'COUNTY', 'STATE']].assign(
    total_pop=ALL_PEOPLE,
    # race shares are relative to the total population
    Pwhite=WHITE_PEOPLE / ALL_PEOPLE,
    Pblack=BLACK_PEOPLE / ALL_PEOPLE,
    Pnonwhite=OTHER_PEOPLE / ALL_PEOPLE,
    units=ALL_UNITS,
    # occupancy shares are relative to all housing units
    occupied=OCCUPIED / ALL_UNITS,
    vacant=VACANT / ALL_UNITS,
    # tenure shares are relative to occupied units only
    own=OWNER / OCCUPIED,
    rent=RENTER / OCCUPIED,
)

# move the index back into an ordinary first column
final_df = final_df.reset_index()

In [6]:
# Header labels written to the csv in place of final_df's own column
# names; one label per column, in column order.
column_names = [
    'GISJOIN', 'YEAR', 'TRACTA', 'COUNTY', 'STATE',
    'Total Pop', '% White', '% Black', '% Other races',
    'Total Units', '% Occupied', '% Vacant',
    '% Owner Occupied', '% Renter',
]

# Write the table without the row index, using the labels above as header.
final_df.to_csv('1980s_fordb.csv', header=column_names, index=False)