## Mapping New York City

### Data Sources

*GIS*
* **2020 Census Redistricting Data (P.L. 94-171) Shapefiles** - downloaded from the [ftp archive](https://www2.census.gov/geo/tiger/TIGER2020PL/LAYER/TRACT/2020/) via ftp client
* To download just the files for NY State, selecting only files whose names begin with "tl_2020_36"
* Data can also be downloaded through a browser, but may result in formatting issues, so I recommend avoiding this, is possible

In [1]:
import os, pathlib
base_dir = pathlib.Path(os.getcwd()).parent
data_archive_dir = os.path.join(base_dir, "data_archive")
data_dir = os.path.join(base_dir, "data")
shapes_dir = os.path.join(data_dir,"shapes")
json_dir = os.path.join(data_dir,"geojson")

In [2]:
import tarfile
import pandas as pd

file_census_data_tgz = os.path.join(data_archive_dir, 'census_data_2022_03_01.tgz')
file_census_data_csv = 'DECENNIALPL2020.P1_data_with_overlays_2021-12-02T121459.csv'

# this function creates a DataFrame from our tgz archive file
def extract_from_tgz(filename):
    with tarfile.open(filename) as tf:
        for file in tf.getmembers():
            if file.name == file_census_data_csv:
                data = tf.extractfile(file)
                return pd.read_csv(data, low_memory=False, usecols=[0, 1, 2])

df_census = extract_from_tgz(file_census_data_tgz)
df_census = df_census.drop(0)
df_census.columns = ["GEOID Census Tract Full", "Census Tract Name", "Population"]
df_census['Census Tract Code'] = df_census['GEOID Census Tract Full'].str.slice(-6)
df_census['State FIPS'] = df_census['GEOID Census Tract Full'].str.slice(9,11)
df_census['County FIPS'] = df_census['GEOID Census Tract Full'].str.slice(11,14)
#df_census.dtypes
df_census.head(2)

Unnamed: 0,GEOID Census Tract Full,Census Tract Name,Population,Census Tract Code,State FIPS,County FIPS
1,1400000US01001020100,"Census Tract 201, Autauga County, Alabama",1775,20100,1,1
2,1400000US01001020200,"Census Tract 202, Autauga County, Alabama",2055,20200,1,1


In [3]:
import pandas as pd

file_place = os.path.join(data_archive_dir, 'PLACElist.txt')

df_place = pd.read_csv(file_place, 
                       delimiter="|", 
                       usecols=['STATE', 'STATEFP', 'PLACEFP', 'PLACENAME', 'COUNTY'], # use only these columns
                       encoding="iso-8859-1" # QUESTION: Patrick, this txt file uses ANSI encoding, what should we use here?
                       )[['STATE','STATEFP', 'PLACENAME', 'PLACEFP', 'COUNTY']] # reorder columns

#encoding_errors='ignore'
df_place.rename(columns={'STATE': 'State', 'STATEFP': 'State FIPS', 'PLACENAME': 'Place', 'PLACEFP': 'Place FIPS', 'COUNTY': 'County'}, inplace=True) # rename columns
df_place.head(3)

Unnamed: 0,State,State FIPS,Place,Place FIPS,County
0,AL,1,Abanda CDP,100,Chambers County
1,AL,1,Abbeville city,124,Henry County
2,AL,1,Adamsville city,460,Jefferson County


## Geographic Data

The Census Bureau provides geographic this information in the form of shapefiles. We'll convert this to GeoJSON for mapping.

In [4]:
# Uses the geopandas function read_file to grab our file
import geopandas as gpd
# Julie download this file from here: /geo/tiger/TIGER2020PL/LAYER/PLACE/2020
# this is a different file than Patrick is using
shapefiles_dir_place = os.path.join(shapes_dir,"tl_2020_36_place20.zip")
shape_place_df = gpd.read_file(shapefiles_dir_place)

In [14]:
shapefiles_dir_tract = os.path.join(shapes_dir,"tiger2020PL_NY_tracts") # provide the full path to our shapefiles
shapefiles_tract_ny=[x for x in pathlib.Path(shapefiles_dir_tract).iterdir() if x.is_file()] # make a list of all the files in the directory with their full path
# shapefiles_tract_ny

df_shapes_tract_ny = pd.DataFrame()
for file in shapefiles_tract_ny:
    df_shapes_tract_ny = pd.concat([df_shapes_tract_ny, gpd.read_file(file)], ignore_index=True, copy=False)
    # df_shapes_tract_ny.rename(columns={'STATEFP20': 'State FIPS', 'COUNTYFP20': 'County FIPS', 'TRACTCE20': 'Census Tract Code'}, inplace=True)

df_shapes_tract_ny.rename(columns={'STATEFP20': 'State FIPS', 'COUNTYFP20': 'County FIPS', 'TRACTCE20': 'Census Tract Code'}, inplace=True)
df_shapes_tract_ny.head(2)


Unnamed: 0,State FIPS,County FIPS,Census Tract Code,GEOID20,NAME20,NAMELSAD20,MTFCC20,FUNCSTAT20,ALAND20,AWATER20,INTPTLAT20,INTPTLON20,geometry
0,36,25,970101,36025970101,9701.01,Census Tract,G5020,S,108853793,691913,42.4452144,-74.7028619,"POLYGON ((-74.79374 42.49635, -74.79359 42.496..."
1,36,25,970102,36025970102,9701.02,Census Tract,G5020,S,133548339,904696,42.4544478,-74.9018086,"POLYGON ((-75.05907 42.43064, -75.05219 42.430..."


In [15]:
df_shapes_tract_ny = df_shapes_tract_ny.merge(df_census, left_on=["STATEFP20", "COUNTYFP20", "TRACTCE20"], right_on=["State FIPS", "County FIPS", "Tract ID"], how='inner')
# df_shapes_tract_ny = df_shapes_tract_ny[["Census Tract Name", "Population", "geometry"]]
df_shapes_tract_ny

KeyError: 'STATEFP20'

In [None]:
shape_df.loc[ shape_df["NAMELSAD"].str.startswith("New York") ]

In [None]:
ny_shape = shape_df.loc[ shape_df["NAMELSAD"] == "New York city" ]
ny_shape

In [None]:
from keplergl import KeplerGl
ny_map = KeplerGl( height=600, show_docs=False)
ny_map.add_data(ny_shape, name='New York City')
ny_map.add_data(ny_tract_shapes_df, name='Population') 

In [None]:
ny_map.config = {'version': 'v1',
 'config': {'visState': {'filters': [],
   'layers': [{'id': '657ayan',
     'type': 'geojson',
     'config': {'dataId': 'New York City',
      'label': 'New York City',
      'color': [77, 193, 156],
      'highlightColor': [252, 242, 26, 255],
      'columns': {'geojson': 'geometry'},
      'isVisible': True,
      'visConfig': {'opacity': 0.8,
       'strokeOpacity': 0.8,
       'thickness': 1,
       'strokeColor': [212, 204, 243],
       'colorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#5A1846',
         '#900C3F',
         '#C70039',
         '#E3611C',
         '#F1920E',
         '#FFC300']},
       'strokeColorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#5A1846',
         '#900C3F',
         '#C70039',
         '#E3611C',
         '#F1920E',
         '#FFC300']},
       'radius': 10,
       'sizeRange': [0, 10],
       'radiusRange': [0, 50],
       'heightRange': [0, 500],
       'elevationScale': 5,
       'enableElevationZoomFactor': True,
       'stroked': True,
       'filled': False,
       'enable3d': False,
       'wireframe': False},
      'hidden': False,
      'textLabel': [{'field': None,
        'color': [255, 255, 255],
        'size': 18,
        'offset': [0, 0],
        'anchor': 'start',
        'alignment': 'center'}]},
     'visualChannels': {'colorField': None,
      'colorScale': 'quantile',
      'strokeColorField': None,
      'strokeColorScale': 'quantile',
      'sizeField': None,
      'sizeScale': 'linear',
      'heightField': None,
      'heightScale': 'linear',
      'radiusField': None,
      'radiusScale': 'linear'}},
    {'id': 'g18qutg',
     'type': 'geojson',
     'config': {'dataId': 'Population',
      'label': 'Population',
      'color': [23, 184, 190],
      'highlightColor': [252, 242, 26, 255],
      'columns': {'geojson': 'geometry'},
      'isVisible': True,
      'visConfig': {'opacity': 0.8,
       'strokeOpacity': 0.8,
       'thickness': 0.5,
       'strokeColor': [246, 209, 138],
       'colorRange': {'name': 'Global Warming 8',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#4C0035',
         '#650031',
         '#7F0023',
         '#98000A',
         '#B21800',
         '#CB4600',
         '#E57F00',
         '#FFC300']},
       'strokeColorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#5A1846',
         '#900C3F',
         '#C70039',
         '#E3611C',
         '#F1920E',
         '#FFC300']},
       'radius': 10,
       'sizeRange': [0, 10],
       'radiusRange': [0, 50],
       'heightRange': [0, 500],
       'elevationScale': 5,
       'enableElevationZoomFactor': True,
       'stroked': False,
       'filled': True,
       'enable3d': False,
       'wireframe': False},
      'hidden': False,
      'textLabel': [{'field': None,
        'color': [255, 255, 255],
        'size': 18,
        'offset': [0, 0],
        'anchor': 'start',
        'alignment': 'center'}]},
     'visualChannels': {'colorField': {'name': 'Population',
       'type': 'integer'},
      'colorScale': 'quantile',
      'strokeColorField': None,
      'strokeColorScale': 'quantile',
      'sizeField': None,
      'sizeScale': 'linear',
      'heightField': None,
      'heightScale': 'linear',
      'radiusField': None,
      'radiusScale': 'linear'}}],
   'interactionConfig': {'tooltip': {'fieldsToShow': {'New York City': [{'name': 'STATEFP',
        'format': None},
       {'name': 'PLACEFP', 'format': None},
       {'name': 'PLACENS', 'format': None},
       {'name': 'AFFGEOID', 'format': None},
       {'name': 'GEOID', 'format': None}],
      'Population': [{'name': 'Census Tract Name', 'format': None},
       {'name': 'Population', 'format': None}]},
     'compareMode': False,
     'compareType': 'absolute',
     'enabled': True},
    'brush': {'size': 0.5, 'enabled': False},
    'geocoder': {'enabled': False},
    'coordinate': {'enabled': False}},
   'layerBlending': 'normal',
   'splitMaps': [],
   'animationConfig': {'currentTime': None, 'speed': 1}},
  'mapState': {'bearing': 0,
   'dragRotate': False,
   'latitude': 42.7462215,
   'longitude': -75.7700405,
   'pitch': 0,
   'zoom': 6,
   'isSplit': False},
  'mapStyle': {'styleType': 'dark',
   'topLayerGroups': {},
   'visibleLayerGroups': {'label': True,
    'road': True,
    'border': False,
    'building': True,
    'water': True,
    'land': True,
    '3d building': False},
   'threeDBuildingColor': [9.665468314072013,
    17.18305478057247,
    31.1442867897876],
   'mapStyles': {}}}}

In [None]:
ny_map

In [None]:
ny_map.config

In [None]:
# this code block loops through our 'ny_shapefiles' list, and creates a separate list of FIPS codes for county and state 
county_codes=[] # create an empty list for County FIPS
state_codes=[] # create an empty list for State FIPS
filtered_shapefiles=[]

for file in ny_shapefiles: 
    filename_parts = file.name.replace(".zip","").split("_") # take each filename - remove '.zip', split the remaining string wherever "_"  appears, and save it as a list
    if len(filename_parts) >=3: # take every 'filename_parts' list containing 3 or more elements (fewer than 3 parts indicates a file is extraneous and we don't want it)
        if len(filename_parts[2]) ==5: # take from each list the element at index 2 (position 3), but only if it contains 5 digits [State FIPS + County FIPS = 5 digits]
            # filename_parts -->  tl_2020_36013_tract20.zip
            # 36013 <---filename_parts[2]
            # 013 <---filename_parts[2][1:4]
            county_codes.append(filename_parts[2][-3:]) # take the last 3 digits of the element at index 2, and append it to the list 
            state_codes.append(filename_parts[2][0:2])  # take the first 3 digits of the element at index 2, State FIPS, and append it to our list
            filtered_shapefiles.append(file)

In [None]:
# lets zip our 3 lists into one and call it 'files_to_load'
files_list = list(zip(state_codes, county_codes, filtered_shapefiles))

In [None]:
# now let's turn it into a DataFrame and rename the columns for consistency
df_ny_files = pd.DataFrame.from_records(files_list).rename({0: 'State FIPS', 1: 'County FIPS', 2: 'File name'}, axis=1)
df_ny_files['State FIPS']=df_ny_files['State FIPS'].astype(int)
df_ny_files['County FIPS']=df_ny_files['County FIPS'].astype(int)
df_ny_files

# let's check out the datatypes
df_ny_files.info()

In [None]:
df_ny_files.head()

In [None]:
df_codes = pd.read_csv('codes.csv')
# need to drop column "unnamed: 0"
df_codes['State FIPS']=df_codes['State FIPS'].astype(int)
df_codes['County FIPS']=df_codes['County FIPS'].astype(int)
df_codes.head()

In [None]:
# make a df here with just the files for ny state, including columns: State Name, State FIPS, County, County Fips, File name
df_ny_codes_files = pd.merge(df_codes, df_ny_files)
df_ny_codes_files
df_ny_codes_files.head()
# df_ny_codes_files[['State Name', 'State FIPS', 'County', 'County FIPS', 'File name']]

In [None]:
df_ny_codes_files.shape

In [None]:
#### now I want to load the map of NY state census tracts

from keplergl import KeplerGl
ny_map = KeplerGl(height=600, show_docs=False)
for row in df_ny_codes_files.itertuples():
    zipfile = f"zip://{row[7]}"
    ny_map.add_data(data=gpd.read_file(zipfile), name=row[5])
ny_map

In [None]:
# now I want to load the population data for just NYC and add it to the map
# should i join this with geo_df_nyc?
import tarfile

# 32mb+ of census data saved in a 4.7mb archive
census_data_archive = os.path.join(data_archive_dir, "census_data_2022_03_01.tgz")

# This is the US Census file with population data we will extract
# this file is contained in the above tgz file
census_2020_file = "DECENNIALPL2020.P1_data_with_overlays_2021-12-02T121459.csv"

use_cols = [0, 1, 2]
col_names = ['GEOID', 'CENSUS TRACT NAME', 'POPULATION']

# This extracts a DataFrame from a tgz archived file
def extract_from_tgz(filename):
    with tarfile.open(filename) as tf:
        for file in tf.getmembers():
            if file.name == census_2020_file:
                data = tf.extractfile(file)
                return pd.read_csv(data, low_memory=False, skiprows=1, header=0, usecols=use_cols, names=col_names)

df_census_raw = extract_from_tgz(census_data_archive)

# change some options that determine how much data is displayed in the notebook


df_census_raw.head(5)

The last 6 digits of the GEOID are the census tract code.

In [None]:
# make a GEO DataFrame for all of NY State, so that I can add in population data and calculate pop density
# geo_df_ny.info()

In [None]:
# this is for later if i want to save the config file for my map
#ny_map.config

In [None]:
# state_fp = df_census_raw['GEOID'].str.slice(9,11).rename('State FIPS').astype(int)
# county_fp = df_census_raw['GEOID'].str.slice(11,14).rename('County FIPS').astype(int)
# df_census_pop = pd.concat([df_census_raw, state_fp, county_fp], axis=1).drop('GEOID', axis=1)
# df_census_pop