In [3]:
import pandas as pd
import numpy as np
import csv 
import os 
import re

In [51]:
CODEFILE = './nhgis0015_ds92_1960_tract_codebook.txt'
DATAFILE = './nhgis0015_ds92_1960_tract.csv'
OUTPUTFILE = './1960s_fordb.csv'

pd.set_option("display.max_rows",101)
pd.set_option("display.max_columns",101)
pd.set_option("display.max_seq_items",500)

In [52]:
f = open(CODEFILE, 'r')

# keep all lines that are not whitespace
lines = [line for line in f.readlines() if not line is ""]

lines

['--------------------------------------------------------------------------------\n',
 "Codebook for NHGIS data file 'nhgis0015_ds92_1960_tract'\n",
 '--------------------------------------------------------------------------------\n',
 ' \n',
 'Contents\n',
 '    - Data Summary\n',
 '    - Data Dictionary\n',
 '    - Citation and Use\n',
 ' \n',
 'Additional documentation on NHGIS data sources is available at: \n',
 '    https://www.nhgis.org/documentation/tabular-data \n',
 ' \n',
 '--------------------------------------------------------------------------------\n',
 'Data Summary\n',
 '--------------------------------------------------------------------------------\n',
 ' \n',
 'Year:             1960\n',
 'Geographic level: Census Tract (by State--County)\n',
 'Dataset:          1960 Census: Population & Housing Data [Tracts: Major Cities & Surrounds]\n',
 '   NHGIS code:    1960_tPH\n',
 '   NHGIS ID:      ds92\n',
 ' \n',
 'Tables:\n',
 ' \n',
 '1. Total Persons\n',
 '   Univers

In [53]:
# regex to save lines that start with #.,  "NHGIS code:"
tab = []
code = []
mini_code = {}

for line in lines:
    if re.match('[0-9]+', line):
        temp = re.sub('[0-9]*\. ', '', line)
        temp = re.sub(' ', '_', temp)
        tab.append(temp[:-1])
    if re.match('NHGIS code:', line):
        code.append(line[-4:-1])
    if re.match('.*[0-9]{3}:', line):
        preproc = re.sub(' ', '', line)
        preproc = re.sub('\n', '', preproc)
        c, n = preproc.split(":")[0], "_".join(preproc.split(":")[1:])
        threeletter = c[:3]
        store = mini_code.get(threeletter, {})
        store[c] = n
        mini_code[threeletter] = store

In [54]:
# construct a dictionary of code to table
code_dict = {}

for i, c in enumerate(code):
    code_dict[c] = tab[i]

In [55]:
# convert the pd cols from NHGIS names
data = pd.read_csv(DATAFILE)

# only keep IL
working_df = data[(data.STATE == 'Illinois') & (data.COUNTY == 'Cook')]

In [56]:
new_cols = []
for colname in working_df.columns:
    if colname[:3] in code_dict.keys():
        sub_cat_dict = mini_code.get(colname[:3], {}) 
        subcat = sub_cat_dict[colname]
        keep = code_dict[colname[:3]]+"_"+ subcat
    else: 
        keep = colname
    new_cols.append(keep)

In [57]:
working_df.columns = new_cols

In [58]:
working_df.columns

Index(['GISJOIN', 'YEAR', 'STATE', 'STATEA', 'COUNTY', 'COUNTYA', 'PRETRACTA',
       'TRACTA', 'POSTTRCTA', 'AREANAME', 'Total_Persons_Total',
       'Total_Housing_Units_Total',
       'Owner-Occupied_Housing_Units_Reporting_Value_Total',
       'White_Persons_by_Household_Relationship_Headofprimaryfamily',
       'White_Persons_by_Household_Relationship_Primaryindividual',
       'White_Persons_by_Household_Relationship_Wifeofhead',
       'White_Persons_by_Household_Relationship_Relatedsinglechildunder18',
       'White_Persons_by_Household_Relationship_Otherrelativeofhead',
       'White_Persons_by_Household_Relationship_Non-relativeofhead',
       'White_Persons_by_Household_Relationship_Inmateofinstitution',
       'White_Persons_by_Household_Relationship_Otheringroupquarters',
       'Occupied_Year-Round_Housing_Units_by_Tenure_Owner-occupied',
       'Occupied_Year-Round_Housing_Units_by_Tenure_Renter-occupied',
       'Vacant_Year-Round_Housing_Units_by_Occupancy/Vacancy_Stat

In [59]:
final_df = working_df[['GISJOIN', 'YEAR', 'TRACTA', 'COUNTY', 'STATE']]

# race
final_df = final_df.assign(Totalpop = working_df['Total_Persons_Total'])
final_df = final_df.assign(Pwhite = (final_df.Totalpop - working_df.Total_Nonwhite_Population_Total)/final_df.Totalpop)
final_df = final_df.assign(Pblack = (working_df['Nonwhite_Persons_by_Race_by_Sex_Negroes>>Male'] +
                                     working_df['Nonwhite_Persons_by_Race_by_Sex_Negroes>>Female'])/final_df.Totalpop)
final_df = final_df.assign(Pnonwhite = (working_df['Nonwhite_Persons_by_Race_by_Sex_OtherNonwhites>>Male']+
                                     working_df['Nonwhite_Persons_by_Race_by_Sex_OtherNonwhites>>Female'])/final_df.Totalpop)

# units
final_df = final_df.assign(Totalunits = working_df['Total_House_Units_[from_printed_report]_Total'])

#value = np.where(working_df[
#    'Median_Value_of_Homes_for_Which_Value_was_Reported_Medianvalue'] == 0, 'NaN', 
#                working_df['Median_Value_of_Homes_for_Which_Value_was_Reported_Medianvalue'])
#final_df = final_df.assign(Medianhomeval = value)
Vac = (working_df['Housing_Units_by_Vacancy_[from_printed_report]_Availablevacant'] + 
                          working_df['Housing_Units_by_Vacancy_[from_printed_report]_Othervacant']) / final_df.Totalunits

final_df = final_df.assign(Poccupied = 1-Vac)
final_df = final_df.assign(Pvacant = Vac)

Allowner = working_df['Occupied_Housing_Units_by_Tenure_[from_printed_report]_Owner-occupied']
final_df = final_df.assign(Powner = Allowner / final_df.Totalunits) #check not just of non-vacant

Allrenter = working_df['Occupied_Housing_Units_by_Tenure_[from_printed_report]_Renter-occupied']
final_df = final_df.assign(Prented = Allrenter / final_df.Totalunits)

final_df.head()

KeyError: 'Total_Occupied_Dwelling_Units_Total'