In [1]:
import os, pathlib
base_dir = pathlib.Path("../..")
data_dir = os.path.join(base_dir, "data")
data_archive_dir = os.path.join(base_dir, "data_archive")
shapes_dir = os.path.join(data_dir,"shapes")
json_dir = os.path.join(data_dir,"geojson")

In [2]:
import pandas as pd

os.makedirs(json_dir, exist_ok=True)
shapefiles=os.listdir(shapes_dir)
files_by_fips = []
for file in shapefiles:
    state_fips = file.replace(".zip","").split("_")[2]
    path = os.path.join(shapes_dir,file)
    files_by_fips.append([state_fips, str(path)])
shapes_df = pd.DataFrame.from_records(files_by_fips, columns=['STATEFP', 'FILE'], index='STATEFP')
shapes_df.head(5)

Unnamed: 0_level_0,FILE
STATEFP,Unnamed: 1_level_1
37,../../data/shapes/tl_2020_37_tract.zip
2,../../data/shapes/tl_2020_02_tract.zip
49,../../data/shapes/tl_2020_49_tract.zip
47,../../data/shapes/tl_2020_47_tract.zip
78,../../data/shapes/tl_2020_78_tract.zip


In [3]:
# data files are stored compressed to save time and space
import tarfile

# 17mb of GIS data saved in a 2.4mb archive
gis_file = "2020_Gaz_tracts_national.gz"

# 32mb+ of census data saved in a 4.7mb archive
census_data_archive = os.path.join(base_dir, "data_archive/census_data_2022_03_01.tgz")

# This is the US Census file with population data we will extract
# this file is contained in the above tgz file
census_2020_file = "DECENNIALPL2020.P1_data_with_overlays_2021-12-02T121459.csv"

use_cols = [0, 1, 2]
col_names = ['GEOID', 'CENSUS TRACT NAME', 'POPULATION']

# This extracts a DataFrame from a tgz archived file
def extract_from_tgz(filename):
    with tarfile.open(filename) as tf:
        for file in tf.getmembers():
            if file.name == census_2020_file:
                data = tf.extractfile(file)
                return pd.read_csv(data, low_memory=False, skiprows=1, header=0, usecols=use_cols, names=col_names)
                        
df_census_pop = extract_from_tgz(census_data_archive)

# change some options that determine how much data is displayed in the notebook
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

df_census_pop.head(5)

# below is the decennial census data, first 3 columns in a new df

Unnamed: 0,GEOID,CENSUS TRACT NAME,POPULATION
0,1400000US01001020100,"Census Tract 201, Autauga County, Alabama",1775
1,1400000US01001020200,"Census Tract 202, Autauga County, Alabama",2055
2,1400000US01001020300,"Census Tract 203, Autauga County, Alabama",3216
3,1400000US01001020400,"Census Tract 204, Autauga County, Alabama",4246
4,1400000US01001020501,"Census Tract 205.01, Autauga County, Alabama",4322


In [4]:
import numpy, codecs

decoder = codecs.getdecoder('utf8')
col_names = ['TRACT NAME', 'COUNTY', 'STATE NAME']
df_census_pop[col_names] = (df_census_pop.iloc[:,1]
                            .str.encode('ascii', 'ignore')
                            .apply(lambda x: decoder(x)[0])
                            .str.split(",")
                            .values.tolist()
                           )
cols = [x for x in col_names]
cols = ['GEOID', *cols, 'POPULATION']
df_census_exp = df_census_pop.drop(columns='CENSUS TRACT NAME')[cols]

df_census_exp['STATE NAME'] = df_census_exp['STATE NAME'].str.strip()

df_census_exp.head(5)

Unnamed: 0,GEOID,TRACT NAME,COUNTY,STATE NAME,POPULATION
0,1400000US01001020100,Census Tract 201,Autauga County,Alabama,1775
1,1400000US01001020200,Census Tract 202,Autauga County,Alabama,2055
2,1400000US01001020300,Census Tract 203,Autauga County,Alabama,3216
3,1400000US01001020400,Census Tract 204,Autauga County,Alabama,4246
4,1400000US01001020501,Census Tract 205.01,Autauga County,Alabama,4322


In [5]:
id_col_names = ['GEOID', 'STATEFP', 'COUNTYFP', 'TRACT']
tract_geoids = [[x, x[9:11], x[11:14], x[14:]] for x in df_census_pop['GEOID'].values]
df_ids = pd.DataFrame.from_records(tract_geoids, columns=id_col_names, index=id_col_names[0])
df_ids.head(5)

Unnamed: 0_level_0,STATEFP,COUNTYFP,TRACT
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1400000US01001020100,1,1,20100
1400000US01001020200,1,1,20200
1400000US01001020300,1,1,20300
1400000US01001020400,1,1,20400
1400000US01001020501,1,1,20501


In [6]:
id_cols = [x for x in id_col_names]
exp_cols = df_census_exp.columns.values.tolist()[1:]
exp_cols = [*id_cols, *exp_cols]
df_all = df_census_exp.join(df_ids, on='GEOID')[exp_cols]
df_all['COUNTY'] = df_all['COUNTY'].str.strip()
df_all.head(5)

Unnamed: 0,GEOID,STATEFP,COUNTYFP,TRACT,TRACT NAME,COUNTY,STATE NAME,POPULATION
0,1400000US01001020100,1,1,20100,Census Tract 201,Autauga County,Alabama,1775
1,1400000US01001020200,1,1,20200,Census Tract 202,Autauga County,Alabama,2055
2,1400000US01001020300,1,1,20300,Census Tract 203,Autauga County,Alabama,3216
3,1400000US01001020400,1,1,20400,Census Tract 204,Autauga County,Alabama,4246
4,1400000US01001020501,1,1,20501,Census Tract 205.01,Autauga County,Alabama,4322


## PLACElist.txt file 

ftp://ftp2.census.gov/geo/docs/reference/codes/PLACElist.txt

In [7]:
# this is a list of all "Places" in the US
# a place can span multiple counties

places_file = os.path.join(data_archive_dir, 'PLACElist.txt')
df_places = pd.read_csv(places_file, delimiter="|", encoding_errors='ignore', dtype={'STATEFP':str})
df_places.loc[df_places['PLACENAME'] == 'New York city']

Unnamed: 0,STATE,STATEFP,PLACEFP,PLACENAME,TYPE,FUNCSTAT,COUNTY
24316,NY,36,51000,New York city,Incorporated Place,A,"Bronx County, Kings County, New York County, Queens County, Richmond County"


In [8]:
# code below splits every Place into separate record for each county

df_places_stack = (df_places['COUNTY']    # County can contain multiple comma-separated names
                   .str.split(",")        # split into lists
                   .apply(pd.Series)      # convert to multiple cols / Series
                   .stack()               # pivot the series into rows
                   .str.strip()           # strip leading/trailing spaces
                   .reset_index(level=1)  # convert to DF by resetting index
                   .drop('level_1',axis=1) # drop the new 'index'
                   .rename(columns={0:'COUNTY'}) # now-split counties are in column "0" so rename
                   .join(df_places.drop(['COUNTY','FUNCSTAT'],axis=1), how='left') # Join with orig dataframe but remove old COUNTIES and FUNCSTAT
                  )

df_places_stack.loc[df_places_stack['PLACENAME'] == 'New York city']

Unnamed: 0,COUNTY,STATE,STATEFP,PLACEFP,PLACENAME,TYPE
24316,Bronx County,NY,36,51000,New York city,Incorporated Place
24316,Kings County,NY,36,51000,New York city,Incorporated Place
24316,New York County,NY,36,51000,New York city,Incorporated Place
24316,Queens County,NY,36,51000,New York city,Incorporated Place
24316,Richmond County,NY,36,51000,New York city,Incorporated Place


In [9]:
# this merges the NYC places list with the massaged census data
# now we have COUNTYFP joined with PLACENAME

df_all_places = (df_all.merge(df_places_stack, how='left', on=['COUNTY', 'STATEFP']))
df_all_places.loc[df_all_places['PLACENAME'] == 'New York city']

Unnamed: 0,GEOID,STATEFP,COUNTYFP,TRACT,TRACT NAME,COUNTY,STATE NAME,POPULATION,STATE,PLACEFP,PLACENAME,TYPE
1962171,1400000US36005000100,36,005,000100,Census Tract 1,Bronx County,New York,3772,NY,51000.0,New York city,Incorporated Place
1962173,1400000US36005000200,36,005,000200,Census Tract 2,Bronx County,New York,4779,NY,51000.0,New York city,Incorporated Place
1962175,1400000US36005000400,36,005,000400,Census Tract 4,Bronx County,New York,6272,NY,51000.0,New York city,Incorporated Place
1962177,1400000US36005001600,36,005,001600,Census Tract 16,Bronx County,New York,5795,NY,51000.0,New York city,Incorporated Place
1962179,1400000US36005001901,36,005,001901,Census Tract 19.01,Bronx County,New York,2292,NY,51000.0,New York city,Incorporated Place
...,...,...,...,...,...,...,...,...,...,...,...,...
2079079,1400000US36085030302,36,085,030302,Census Tract 303.02,Richmond County,New York,6669,NY,51000.0,New York city,Incorporated Place
2079081,1400000US36085031901,36,085,031901,Census Tract 319.01,Richmond County,New York,3262,NY,51000.0,New York city,Incorporated Place
2079083,1400000US36085031902,36,085,031902,Census Tract 319.02,Richmond County,New York,5500,NY,51000.0,New York city,Incorporated Place
2079085,1400000US36085032300,36,085,032300,Census Tract 323,Richmond County,New York,1233,NY,51000.0,New York city,Incorporated Place


In [10]:
# now we are joining tract record with the shapefiles it can be dound in

# rename shapes_df to shapefiles_by_state

df_all_shapes = df_all_places.join(shapes_df, on='STATEFP')
df_all_shapes.loc[(df_all_shapes['PLACENAME'] == 'New York city') & (df_all_shapes['COUNTY'] == 'Kings County')]

Unnamed: 0,GEOID,STATEFP,COUNTYFP,TRACT,TRACT NAME,COUNTY,STATE NAME,POPULATION,STATE,PLACEFP,PLACENAME,TYPE,FILE
1999627,1400000US36047000100,36,047,000100,Census Tract 1,Kings County,New York,4616,NY,51000.0,New York city,Incorporated Place,../../data/shapes/tl_2020_36_tract.zip
1999629,1400000US36047000200,36,047,000200,Census Tract 2,Kings County,New York,1205,NY,51000.0,New York city,Incorporated Place,../../data/shapes/tl_2020_36_tract.zip
1999631,1400000US36047000301,36,047,000301,Census Tract 3.01,Kings County,New York,3850,NY,51000.0,New York city,Incorporated Place,../../data/shapes/tl_2020_36_tract.zip
1999633,1400000US36047000501,36,047,000501,Census Tract 5.01,Kings County,New York,4404,NY,51000.0,New York city,Incorporated Place,../../data/shapes/tl_2020_36_tract.zip
1999635,1400000US36047000502,36,047,000502,Census Tract 5.02,Kings County,New York,2640,NY,51000.0,New York city,Incorporated Place,../../data/shapes/tl_2020_36_tract.zip
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2001227,1400000US36047122000,36,047,122000,Census Tract 1220,Kings County,New York,6169,NY,51000.0,New York city,Incorporated Place,../../data/shapes/tl_2020_36_tract.zip
2001229,1400000US36047123700,36,047,123700,Census Tract 1237,Kings County,New York,8586,NY,51000.0,New York city,Incorporated Place,../../data/shapes/tl_2020_36_tract.zip
2001231,1400000US36047150200,36,047,150200,Census Tract 1502,Kings County,New York,2616,NY,51000.0,New York city,Incorporated Place,../../data/shapes/tl_2020_36_tract.zip
2001233,1400000US36047152200,36,047,152200,Census Tract 1522,Kings County,New York,3287,NY,51000.0,New York city,Incorporated Place,../../data/shapes/tl_2020_36_tract.zip


In [11]:
files = df_all_shapes.loc[df_all_shapes['PLACENAME'] == 'New York city']['FILE'].unique()
files

array(['../../data/shapes/tl_2020_36_tract.zip'], dtype=object)

In [17]:
import geopandas as gpd

zipfile = f"zip://{files[0]}"
geo_df = gpd.read_file(zipfile).rename(columns={'TRACTCE':'TRACT'})


In [13]:
geo_df_all = geo_df.merge(df_all_shapes, on=['STATEFP', 'COUNTYFP', 'TRACT'], how='left')
#geo_df_all.loc[(geo_df_all['PLACENAME'] == 'New York city') & (geo_df_all['COUNTY'] == 'Kings County') ].apply(lambda x: gpd.GeoDataFrame([x.values], columns=geo_df_all.columns.values).to_json(), axis=1)
geo_df_all.head(5)

Unnamed: 0,STATEFP,COUNTYFP,TRACT,GEOID_x,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,...,GEOID_y,TRACT NAME,COUNTY,STATE NAME,POPULATION,STATE,PLACEFP,PLACENAME,TYPE,FILE
0,36,47,700,36047000700,7,Census Tract 7,G5020,S,176774,0,...,1400000US36047000700,Census Tract 7,Kings County,New York,4415,NY,10022.0,Brooklyn borough,County Subdivision,../../data/shapes/tl_2020_36_tract.zip
1,36,47,700,36047000700,7,Census Tract 7,G5020,S,176774,0,...,1400000US36047000700,Census Tract 7,Kings County,New York,4415,NY,51000.0,New York city,Incorporated Place,../../data/shapes/tl_2020_36_tract.zip
2,36,47,900,36047000900,9,Census Tract 9,G5020,S,163469,0,...,1400000US36047000900,Census Tract 9,Kings County,New York,5167,NY,10022.0,Brooklyn borough,County Subdivision,../../data/shapes/tl_2020_36_tract.zip
3,36,47,900,36047000900,9,Census Tract 9,G5020,S,163469,0,...,1400000US36047000900,Census Tract 9,Kings County,New York,5167,NY,51000.0,New York city,Incorporated Place,../../data/shapes/tl_2020_36_tract.zip
4,36,47,1100,36047001100,11,Census Tract 11,G5020,S,168507,0,...,1400000US36047001100,Census Tract 11,Kings County,New York,1578,NY,10022.0,Brooklyn borough,County Subdivision,../../data/shapes/tl_2020_36_tract.zip


In [14]:
# Load an empty map
from keplergl import KeplerGl
ny_map = KeplerGl(height=1200, show_docs=False)
map_df = geo_df_all.loc[(geo_df_all['PLACENAME'] == 'New York city') & (geo_df_all['COUNTY'] == 'Kings County') ]
ny_map.add_data(data=map_df, name="Brooklyn")
ny_map

KeplerGl(data={'Brooklyn': {'index': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 213, 215, 217, 219, 221, 223, 225…

In [15]:
#map_df_file = os.path.join(data_dir,"brooklyn_geo.parquet")

#map_df_json_file = os.path.join(data_dir,"brooklyn_geo.json")

#map_df.to_parquet(map_df_file)
#with open(map_df_json_file, "w") as file:
#    file.write(map_df.to_json())


In [16]:
#import json
#map_config_file = os.path.join(data_archive_dir, "brooklyn_map_config.json")
#with open(map_config_file, "w") as file:
#    file.write(json.dumps(ny_map.config))