In [66]:
import pandas as pd
import numpy as np
import csv 
import os 
import re

In [67]:
CODEFILE = './nhgis0015_ds92_1960_tract_codebook.txt'
DATAFILE = './nhgis0015_ds92_1960_tract.csv'
OUTPUTFILE = './1960s_fordb.csv'

pd.set_option("display.max_rows",101)
pd.set_option("display.max_columns",101)
pd.set_option("display.max_seq_items",500)

In [68]:
f = open(CODEFILE, 'r')

# keep all lines that are not whitespace
lines = [line for line in f.readlines() if not line is ""]

lines

['--------------------------------------------------------------------------------\n',
 "Codebook for NHGIS data file 'nhgis0015_ds92_1960_tract'\n",
 '--------------------------------------------------------------------------------\n',
 ' \n',
 'Contents\n',
 '    - Data Summary\n',
 '    - Data Dictionary\n',
 '    - Citation and Use\n',
 ' \n',
 'Additional documentation on NHGIS data sources is available at: \n',
 '    https://www.nhgis.org/documentation/tabular-data \n',
 ' \n',
 '--------------------------------------------------------------------------------\n',
 'Data Summary\n',
 '--------------------------------------------------------------------------------\n',
 ' \n',
 'Year:             1960\n',
 'Geographic level: Census Tract (by State--County)\n',
 'Dataset:          1960 Census: Population & Housing Data [Tracts: Major Cities & Surrounds]\n',
 '   NHGIS code:    1960_tPH\n',
 '   NHGIS ID:      ds92\n',
 ' \n',
 'Tables:\n',
 ' \n',
 '1. Total Persons\n',
 '   Univers

In [69]:
# regex to save lines that start with #.,  "NHGIS code:"
tab = []
code = []
mini_code = {}

for line in lines:
    if re.match('[0-9]+', line):
        temp = re.sub('[0-9]*\. ', '', line)
        temp = re.sub(' ', '_', temp)
        tab.append(temp[:-1])
    if re.match('NHGIS code:', line):
        code.append(line[-4:-1])
    if re.match('.*[0-9]{3}:', line):
        preproc = re.sub(' ', '', line)
        preproc = re.sub('\n', '', preproc)
        c, n = preproc.split(":")[0], "_".join(preproc.split(":")[1:])
        threeletter = c[:3]
        store = mini_code.get(threeletter, {})
        store[c] = n
        mini_code[threeletter] = store

In [77]:
# construct a dictionary of code to table
code_dict = {}

for i, c in enumerate(code):
    code_dict[c] = tab[i]

In [78]:
# convert the pd cols from NHGIS names
data = pd.read_csv(DATAFILE)

# only keep IL
working_df = data[(data.STATE == 'Illinois') & (data.COUNTY == 'Cook')]

In [79]:
new_cols = []
for colname in working_df.columns:
    if colname[:3] in code_dict.keys():
        sub_cat_dict = mini_code.get(colname[:3], {}) 
        subcat = sub_cat_dict[colname]
        keep = code_dict[colname[:3]]+"_"+ subcat
    else: 
        keep = colname
    new_cols.append(keep)

In [80]:
working_df.columns = new_cols

In [81]:
working_df.columns

Index(['GISJOIN', 'YEAR', 'STATE', 'STATEA', 'COUNTY', 'COUNTYA', 'PRETRACTA',
       'TRACTA', 'POSTTRCTA', 'AREANAME', 'Total_Persons_Total',
       'Total_Housing_Units_Total',
       'Owner-Occupied_Housing_Units_Reporting_Value_Total',
       'White_Persons_by_Household_Relationship_Headofprimaryfamily',
       'White_Persons_by_Household_Relationship_Primaryindividual',
       'White_Persons_by_Household_Relationship_Wifeofhead',
       'White_Persons_by_Household_Relationship_Relatedsinglechildunder18',
       'White_Persons_by_Household_Relationship_Otherrelativeofhead',
       'White_Persons_by_Household_Relationship_Non-relativeofhead',
       'White_Persons_by_Household_Relationship_Inmateofinstitution',
       'White_Persons_by_Household_Relationship_Otheringroupquarters',
       'Occupied_Year-Round_Housing_Units_by_Tenure_Owner-occupied',
       'Occupied_Year-Round_Housing_Units_by_Tenure_Renter-occupied',
       'Vacant_Year-Round_Housing_Units_by_Occupancy/Vacancy_Stat

In [83]:
final_df = working_df[['GISJOIN', 'YEAR', 'TRACTA', 'COUNTY', 'STATE']]

# race
final_df = final_df.assign(Totalpop = working_df['Total_Persons_Total'])
final_df = final_df.assign(Pwhite = (final_df.Totalpop - working_df.Total_Nonwhite_Population_Total)/final_df.Totalpop)
final_df = final_df.assign(Pblack = (working_df['Nonwhite_Persons_by_Race_by_Sex_Negroes>>Male'] +
                                     working_df['Nonwhite_Persons_by_Race_by_Sex_Negroes>>Female'])/final_df.Totalpop)
final_df = final_df.assign(Pnonwhite = (working_df['Nonwhite_Persons_by_Race_by_Sex_OtherNonwhites>>Male']+
                                     working_df['Nonwhite_Persons_by_Race_by_Sex_OtherNonwhites>>Female'])/final_df.Totalpop)

# units
final_df = final_df.assign(Totalunits = working_df['Total_House_Units_[from_printed_report]_Total'])

#value = np.where(working_df[
#    'Median_Value_of_Homes_for_Which_Value_was_Reported_Medianvalue'] == 0, 'NaN', 
#                working_df['Median_Value_of_Homes_for_Which_Value_was_Reported_Medianvalue'])
#final_df = final_df.assign(Medianhomeval = value)
Vac = (working_df['Housing_Units_by_Vacancy_[from_printed_report]_Availablevacant'] + 
                          working_df['Housing_Units_by_Vacancy_[from_printed_report]_Othervacant']) / final_df.Totalunits

final_df = final_df.assign(Poccupied = 1-Vac)
final_df = final_df.assign(Pvacant = Vac)

Allowner = working_df['Occupied_Housing_Units_by_Tenure_[from_printed_report]_Owner-occupied']
final_df = final_df.assign(Powner = Allowner / (final_df.Poccupied*final_df.Totalunits)) #check not just of non-vacant

Allrenter = working_df['Occupied_Housing_Units_by_Tenure_[from_printed_report]_Renter-occupied']
final_df = final_df.assign(Prented = Allrenter / (final_df.Poccupied*final_df.Totalunits))

final_df.head()

Unnamed: 0,GISJOIN,YEAR,TRACTA,COUNTY,STATE,Totalpop,Pwhite,Pblack,Pnonwhite,Totalunits,Poccupied,Pvacant,Powner,Prented
5110,G17003100001,1960,1,Cook,Illinois,4237.0,0.989379,0.000236,0.0059,1698.0,0.957008,0.042992,0.158154,0.841846
5111,G17003100002,1960,2,Cook,Illinois,1510.0,1.0,0.0,0.003311,477.0,0.981132,0.018868,0.239316,0.760684
5112,G17003100003,1960,3,Cook,Illinois,6622.0,0.996074,0.000604,0.00453,2793.0,0.964554,0.035446,0.143281,0.856719
5113,G17003100004,1960,4,Cook,Illinois,8051.0,0.990809,0.000869,0.00857,3348.0,0.93399,0.06601,0.159578,0.840422
5114,G17003100005A,1960,5,Cook,Illinois,9596.0,0.989162,0.001042,0.010004,4410.0,0.931293,0.068707,0.089847,0.910153


In [85]:
column_names = ['GISJOIN','YEAR', 'TRACTA', 'COUNTY','STATE', 'Total Pop', '% White', 
               '% Black', '% Other races', 'Total Units', '% Occupied', 
               '% Vacant', '% Owner Occupied', '% Renter']

final_df.to_csv(path_or_buf=OUTPUTFILE, index=False, header=column_names)