Much of the code across these notebooks is duplicated, but the handling that each dataset requires varies. As a result, each notebook is self-contained, although we could have written one shared function to read the files and another to parse them. Keeping the notebooks separate was the most efficient approach, however, given the care needed to interpret the codes in each file.

In [2]:
import pandas as pd
import numpy as np
import csv 
import os 
import re

In [3]:
# Hard-coded locations of the NHGIS inputs and of the CSV this notebook writes.
CODEFILE = './nhgis0007_ds82_1950_tract_codebook.txt'
DATAFILE = './nhgis0007_ds82_1950_tract.csv'
OUTPUTFILE = './1950s_fordb.csv'

# Raise the pandas display limits so wide frames and long column indexes are
# shown in full instead of being elided.
_DISPLAY_LIMITS = {
    "display.max_rows": 101,
    "display.max_columns": 101,
    "display.max_seq_items": 500,
}
for _option, _limit in _DISPLAY_LIMITS.items():
    pd.set_option(_option, _limit)

In [4]:
# Read the codebook into memory. The context manager closes the file handle
# as soon as the lines have been collected (the original never closed it).
with open(CODEFILE, 'r') as f:
    # keep all lines that are not whitespace
    # `line.strip()` is the empty string (falsy) for blank or whitespace-only
    # lines; the previous `line is ""` identity test never filtered anything.
    lines = [line for line in f if line.strip()]

In [5]:
# Parse the codebook lines into three lookup structures:
#   tab       - one entry per line that starts with a digit: the text with a
#               leading "N. " numbering removed and spaces turned into "_"
#   code      - the last three characters of every "NHGIS code:" line
#   mini_code - {first three chars of a column code: {column code: label}}
#               for every line containing three digits followed by ":"
# `tab` and `code` are paired up by position later on, so this relies on the
# codebook listing a numbered title and an "NHGIS code:" line per table.
tab = []
code = []
mini_code = {}

for line in lines:
    # Drop only the line terminator; unlike slicing off the last character,
    # this is also correct for a final line that has no trailing newline.
    text = line.rstrip('\n')

    # The three checks are deliberately independent (not elif): a single line
    # is tested against every pattern.
    if re.match(r'[0-9]+', line):
        # Strip just the leading "N. " numbering. The pattern is anchored so a
        # ". " occurring later in the title is left untouched.
        title = re.sub(r'^[0-9]+\. ', '', text)
        tab.append(title.replace(' ', '_'))

    if re.match(r'NHGIS code:', line):
        code.append(text[-3:])

    if re.match(r'.*[0-9]{3}:', line):
        compact = line.replace(' ', '').replace('\n', '')
        # Everything before the first ":" is the column code; any further ":"
        # in the label is replaced by "_".
        c, _, label = compact.partition(':')
        n = label.replace(':', '_')
        threeletter = c[:3]
        mini_code.setdefault(threeletter, {})[c] = n

In [6]:
# Map each NHGIS table code to the table title found at the same position in
# `tab` (indexing, rather than zip, so a missing title still raises).
code_dict = {table_code: tab[position] for position, table_code in enumerate(code)}

In [7]:
# Load the raw NHGIS tract table; its columns still carry the NHGIS codes.
data = pd.read_csv(DATAFILE)

# Restrict the analysis to tracts in Cook County, Illinois.
in_cook_county_il = data['STATE'].eq('Illinois') & data['COUNTY'].eq('Cook')
working_df = data[in_cook_county_il]

In [8]:
# Build readable column names: "<table title>_<sub-category label>" for every
# column whose three-character prefix is a known table code; all other columns
# keep their original name.
new_cols = []
for colname in working_df.columns:
    prefix = colname[:3]
    if prefix not in code_dict:
        new_cols.append(colname)
        continue
    # A known table code whose column is missing from mini_code raises
    # KeyError here, exactly as before.
    label = mini_code.get(prefix, {})[colname]
    new_cols.append(code_dict[prefix] + "_" + label)

In [12]:
# Replace the NHGIS column codes on working_df with the readable names in
# new_cols. The assignment is positional, so new_cols must hold exactly one
# entry per existing column. The bare expression on the last line displays
# the resulting column index as the cell output.
working_df.columns = new_cols
working_df.columns

Index(['GISJOIN', 'YEAR', 'STATE', 'STATEA', 'COUNTY', 'COUNTYA', 'PRETRACTA',
       'TRACTA', 'POSTTRCTA', 'AREANAME', 'Population_by_Race_White',
       'Population_by_Race_Negro', 'Population_by_Race_Othernon-white',
       'Total_Number_of_Households_Total',
       'Persons_per_Occupied_Household_Personsperoccupiedhousehold',
       'Years_of_School_Completed_by_Persons_25_Years_and_Over_Noschoolcompleted',
       'Years_of_School_Completed_by_Persons_25_Years_and_Over_Elementary1-4',
       'Years_of_School_Completed_by_Persons_25_Years_and_Over_Elementary5-6',
       'Years_of_School_Completed_by_Persons_25_Years_and_Over_Elementary7',
       'Years_of_School_Completed_by_Persons_25_Years_and_Over_Elementary8',
       'Years_of_School_Completed_by_Persons_25_Years_and_Over_Highschool1-3',
       'Years_of_School_Completed_by_Persons_25_Years_and_Over_Highschool4',
       'Years_of_School_Completed_by_Persons_25_Years_and_Over_College1-3',
       'Years_of_School_Completed_by_Per

In [15]:
### store and calculate final entries
#
# Builds final_df: the tract identifier columns followed by the derived
# population and housing measures. The export cell labels columns by position,
# so the order in which columns are added below must not change.

id_columns = ['GISJOIN', 'YEAR', 'TRACTA', 'COUNTY', 'STATE']

# race
white = working_df['Population_by_Race_White']
black = working_df['Population_by_Race_Negro']
other_nonwhite = working_df['Population_by_Race_Othernon-white']
total_pop = white + black + other_nonwhite

# units
total_units = working_df['Total_Dwelling_Units_Total']
occupied_units = working_df['Total_Occupied_Dwelling_Units_Total']

# A median value of 0 is treated as "not reported". Mask it with a real
# missing value so the column stays numeric: the previous
# np.where(..., 'NaN', ...) mixed a string with numbers, which turned the
# whole column into text. Note that to_csv writes missing values as empty
# fields by default rather than the literal text "NaN".
median_value = working_df['Median_Value_Medianvalue']
value = median_value.mask(median_value == 0)

Allowner = working_df['Dwelling_Units_by_Occupancy_Type_Occupied_Owneroccupied']
Allrenter = working_df['Dwelling_Units_by_Occupancy_Type_Occupied_Renteroccupied']

poccupied = occupied_units / total_units

# One assign per column keeps the column order explicit.
final_df = (
    working_df[id_columns]
    .assign(Totalpop=total_pop)
    .assign(Pwhite=white / total_pop)
    .assign(Pblack=black / total_pop)
    .assign(Pnonwhite=other_nonwhite / total_pop)
    .assign(Totalunits=total_units)
    .assign(Medianhomeval=value)
    .assign(Poccupied=poccupied)
    .assign(Pvacant=1 - poccupied)
    # Owner and renter shares use occupied units as the denominator.
    # TODO (carried over from the original note): check this is not just of
    # non-vacant units.
    .assign(Powner=Allowner / occupied_units)
    .assign(Prented=Allrenter / occupied_units)
)

final_df.head()

Unnamed: 0,GISJOIN,YEAR,TRACTA,COUNTY,STATE,Totalpop,Pwhite,Pblack,Pnonwhite,Totalunits,Medianhomeval,Poccupied,Pvacant,Powner,Prented
1849,G17003100001,1950,1,Cook,Illinois,4775.0,0.998325,0.001257,0.000419,1631.0,13539.0,0.992643,0.007357,0.137122,0.862878
1850,G17003100002,1950,2,Cook,Illinois,1578.0,1.0,0.0,0.0,485.0,,0.991753,0.008247,0.24948,0.75052
1851,G17003100003,1950,3,Cook,Illinois,7147.0,0.998041,0.001679,0.00028,2536.0,17316.0,0.989748,0.010252,0.139044,0.860956
1852,G17003100004,1950,4,Cook,Illinois,9050.0,0.995359,0.002541,0.002099,3088.0,13647.0,0.990609,0.009391,0.161818,0.838182
1853,G17003100005,1950,5,Cook,Illinois,17082.0,0.9976,0.001639,0.000761,6273.0,,0.973059,0.026941,0.091579,0.908421


In [18]:
# Export final_df with presentation-friendly headers. The labels are applied
# by position, so this list must stay in the same order (and have the same
# length) as final_df's columns.
column_names = [
    'GISJOIN', 'YEAR', 'TRACTA', 'COUNTY', 'STATE',
    'Total Pop', '% White', '% Black', '% Other races',
    'Total Units', 'Median', '% Occupied', '% Vacant',
    '% Owner Occupied', '% Renter',
]

final_df.to_csv(OUTPUTFILE, header=column_names, index=False)