In [44]:
import pandas as pd
import numpy as np
import csv 
import os 
import re

pd.set_option("display.max_rows",101)
pd.set_option("display.max_columns",101)
pd.set_option("display.max_seq_items",500)


In [45]:
CODEFILE = './nhgis0011_ds76_1940_tract_codebook.txt'
DATAFILE = './nhgis0011_ds76_1940_tract.csv'
OUTPUTFILE = './1940s_full.csv'

In [46]:
f = open(CODEFILE, 'r')

# keep all lines that are not whitespace
lines = [line for line in f.readlines() if not line is ""]

In [47]:
# regex to save lines that start with #.,  "NHGIS code:"
tab = []
code = []
mini_code = {}

for line in lines:
    if re.match('[0-9]+', line):
        temp = re.sub('[0-9]*\. ', '', line)
        temp = re.sub(' ', '_', temp)
        tab.append(temp[:-1])
    if re.match('NHGIS code:', line):
        code.append(line[-4:-1])
    if re.match('.*[0-9]{3}:', line):
        preproc = re.sub(' ', '', line)
        preproc = re.sub('\n', '', preproc)
        c, n = preproc.split(":")
        threeletter = c[:3]
        store = mini_code.get(threeletter, {})
        store[c] = n
        mini_code[threeletter] = store

In [48]:
# construct a dictionary of code to table
code_dict = {}

for i, c in enumerate(code):
    code_dict[c] = tab[i]

In [49]:
# convert the pd cols from NHGIS names
data = pd.read_csv(DATAFILE)

# only keep IL
working_df = data[(data.STATE == 'Illinois') & (data.COUNTY == 'Cook')]

In [50]:
new_cols = []
for colname in working_df.columns:
    if colname[:3] in code_dict.keys():
        sub_cat_dict = mini_code.get(colname[:3], {}) 
        subcat = sub_cat_dict[colname]
        keep = code_dict[colname[:3]]+"_"+ subcat
    else: 
        keep = colname
    new_cols.append(keep)

In [51]:
working_df.columns = new_cols

In [52]:
working_df.columns

Index(['GISJOIN', 'YEAR', 'STATE', 'STATEA', 'COUNTY', 'COUNTYA', 'PRETRACTA',
       'TRACTA', 'POSTTRCTA', 'AREANAME', 'Population_Total',
       'Population_by_Race_White', 'Population_by_Race_Nonwhite',
       'Negro_Population_Total', 'Occupied_Dwelling_Units_Total',
       'Total_Occupied_Dwelling_Units_Total', 'Total_Dwelling_Units_Total',
       'Occupied_Dwelling_Units_by_Tenure_Owneroccupied',
       'Occupied_Dwelling_Units_by_Tenure_Tenantoccupied',
       'Occupied_Dwelling_Units_by_Tenure_by_Race_Owneroccupied>>White',
       'Occupied_Dwelling_Units_by_Tenure_by_Race_Owneroccupied>>Negro',
       'Occupied_Dwelling_Units_by_Tenure_by_Race_Owneroccupied>>Othernon-white',
       'Occupied_Dwelling_Units_by_Tenure_by_Race_Tenantoccupied>>White',
       'Occupied_Dwelling_Units_by_Tenure_by_Race_Tenantoccupied>>Negro',
       'Occupied_Dwelling_Units_by_Tenure_by_Race_Tenantoccupied>>Othernon-white',
       'Vacant_Dwelling_Units_by_Market_Status_Forsaleorrent',
       'Vaca

In [53]:
working_df.to_csv(OUTPUTFILE)

In [65]:
final_df = working_df[['GISJOIN', 'YEAR', 'TRACTA', 'COUNTY', 'STATE']]

# race
final_df = final_df.assign(Totalpop = working_df['Population_Total'])
final_df = final_df.assign(Pwhite = working_df['Population_by_Race_White']/Totalpop)
final_df = final_df.assign(Pblack = working_df['Negro_Population_Total']/Totalpop)
final_df = final_df.assign(Pnonwhite = working_df['Population_by_Race_Nonwhite']/Totalpop)

# units
final_df = final_df.assign(Totalunits = working_df['Total_Dwelling_Units_Total'])

value = np.where(working_df[
    'Median_Value_of_Homes_for_Which_Value_was_Reported_Medianvalue'] == 0, 'NaN', 
                working_df['Median_Value_of_Homes_for_Which_Value_was_Reported_Medianvalue'])
final_df = final_df.assign(Medianhomeval = value)
final_df = final_df.assign(Poccupied = working_df['Total_Occupied_Dwelling_Units_Total'
                                                 ]/Totalunits)
final_df = final_df.assign(Pvacant = 1-Poccupied)

Allowner = working_df['Occupied_Dwelling_Units_by_Tenure_by_Race_Owneroccupied>>Negro'] +\
working_df['Occupied_Dwelling_Units_by_Tenure_by_Race_Owneroccupied>>Othernon-white'] +\
working_df['Occupied_Dwelling_Units_by_Tenure_by_Race_Owneroccupied>>White']
final_df = final_df.assign(Powner = Allowner / Totalunits) #check not just of non-vacant

Allrenter = working_df['Occupied_Dwelling_Units_by_Tenure_by_Race_Tenantoccupied>>Negro'] +\
working_df['Occupied_Dwelling_Units_by_Tenure_by_Race_Tenantoccupied>>Othernon-white'] +\
working_df['Occupied_Dwelling_Units_by_Tenure_by_Race_Tenantoccupied>>White']
final_df = final_df.assign(Prented = Allrenter / Totalunits)

final_df.head()

Unnamed: 0,GISJOIN,YEAR,TRACTA,COUNTY,STATE,Totalpop,Pwhite,Pblack,Pnonwhite,Totalunits,Medianhomeval,Poccupied,Pvacant,Powner,Prented
1316,G17003100001,1940,1,Cook,Illinois,4606,0.998697,0.000868,0.001303,1555,5630.0,0.936334,0.063666,0.085531,0.850804
1317,G17003100002,1940,2,Cook,Illinois,1619,0.998765,0.001235,0.001235,467,,0.927195,0.072805,0.158458,0.768737
1318,G17003100003,1940,3,Cook,Illinois,7391,0.997159,0.002841,0.002841,2470,6971.0,0.940891,0.059109,0.088259,0.852632
1319,G17003100004,1940,4,Cook,Illinois,8901,0.99618,0.002135,0.00382,3001,4922.0,0.945018,0.054982,0.104965,0.840053
1320,G17003100005,1940,5,Cook,Illinois,16447,0.996656,0.002371,0.003344,5662,5978.0,0.916284,0.083716,0.052985,0.863299


In [66]:
column_names = ['GISJOIN','YEAR', 'TRACTA', 'COUNTY','STATE', 'Total Pop', '% White', 
               '% Black', '% Other races', 'Total Units', 'Median', '% Occupied', 
               '% Vacant', '% Owner Occupied', '% Renter']

final_df.to_csv(index=False, header=column_names)

'GISJOIN,YEAR,TRACTA,COUNTY,STATE,Total Pop,% White,% Black,% Other races,Total Units,Median,% Occupied,% Vacant,% Owner Occupied,% Renter\nG17003100001,1940,1,Cook,Illinois,4606,0.998697351280938,0.0008684324793747286,0.0013026487190620929,1555,5630,0.9363344051446946,0.06366559485530543,0.08553054662379421,0.8508038585209003\nG17003100002,1940,2,Cook,Illinois,1619,0.9987646695491044,0.0012353304508956147,0.0012353304508956147,467,NaN,0.9271948608137045,0.0728051391862955,0.15845824411134904,0.7687366167023555\nG17003100003,1940,3,Cook,Illinois,7391,0.997158706534975,0.0028412934650250306,0.0028412934650250306,2470,6971,0.9408906882591093,0.059109311740890735,0.08825910931174089,0.8526315789473684\nG17003100004,1940,4,Cook,Illinois,8901,0.9961802044714078,0.0021345916189192226,0.003819795528592293,3001,4922,0.9450183272242586,0.0549816727757414,0.10496501166277908,0.8400533155614796\nG17003100005,1940,5,Cook,Illinois,16447,0.996655925092722,0.0023712531160698,0.003344074907277923,5662