#### Example of using crosswalks between census tracts of different years

This notebook used to provide example code for using the output from the script in creating geojsons with multiple years of census data for the same tract. As I'm migrating it to a dedicated script in main.py, it's now for testing different parts of that script.  

#### 1.) Download the crosswalks from the Azure container 

In [3]:
import utils

In [5]:
import tract_crosswalk
import pandas as pd
import requests
import geopandas as gpd 
import pygris
import json
import sys
import os
import us 
from logger import logger
import yaml
import census_data
import datetime as dt
import shapely
import numpy as np
import sys, getopt
import yaml
import utils
from collections import defaultdict


with open('config.yaml', 'r') as file: 
    config = yaml.full_load(file)

CENSUS_VARS = config['census_vars']
YEARS = range(config['start_year'], config['end_year']+1)
STATES = utils.load_state_list(config['states'])

from tract_crosswalk import get_all_tract_geoms_year, get_tract_crosswalks
from census_data import load_raw_census_data, _transform_raw_data_long, _widen_df, _extract_2020_data, _collapse_df, join_crosswalk, apply_crosswalk, rejoin_2020, join_geoms, bin_variables
from logger import logger


In [None]:

## Download/load crosswalk and pygris geoms 
logger.info(f'Obtaining tract crosswalks (tract_crosswalk.py)')
get_tract_crosswalks() 

logger.info(f'Obtaining simplified geometries from pygris')
py_geoms = get_all_tract_geoms_year(year=2020, erase_water=False, simplify_tolerance=.001)
states_fips_to_keep = [s['fips'] for s in STATES]
py_geoms = py_geoms[py_geoms['STATEFP'].isin(states_fips_to_keep)]

# get_tract_crosswalks uses raw (unsimplified) geometries to calculate overlaps -- these are for our final output here

## Download (or load cached) raw census data
df = load_raw_census_data()

## Transform raw data (long format)
df_long = _transform_raw_data_long(df)

## Widen data 
wide_df = _widen_df(df_long)

## Separate 2020 data from other years
df_2020 = _extract_2020_data(wide_df)

## Collapse pre-2020 data 
df = _collapse_df(wide_df)

## Join and apply the crosswalk column to the pre-2020 data
df = join_crosswalk(df)
apply_df = apply_crosswalk(df)

## Re-Join the 2020 data to crosswalked pre-2020 data
rejoined_df = rejoin_2020(apply_df, df_2020)

## Bin the census variable columns 
binned_df = bin_variables(rejoined_df)

# ## Join the geometries 
df_geoms = binned_df.merge(py_geoms['geometry'], how='right', left_on='GEOID', right_index=True)

df_geoms

In [2]:
df_geoms

Unnamed: 0,GEOID,state_fips,state_name,state_usps,county_fips,county_name,tract_fips,tract_dec,B19013_001E,geometry
276.0,01045020801,01,Alabama,AL,045,Dale,020801,208.01,"{'2010': {'value': 42076.8, 'state_bin': '39,9...","POLYGON ((-85.70064 31.38509, -85.69648 31.388..."
147.0,01017954200,01,Alabama,AL,017,Chambers,954200,9542,"{'2010': {'value': 34587.38, 'state_bin': '33,...","POLYGON ((-85.31549 32.80580, -85.31003 32.813..."
143.0,01017953800,01,Alabama,AL,017,Chambers,953800,9538,"{'2010': {'value': 32068.0, 'state_bin': '28,8...","POLYGON ((-85.59345 33.00012, -85.59318 33.107..."
148.0,01017954300,01,Alabama,AL,017,Chambers,954300,9543,"{'2010': {'value': 28136.0, 'state_bin': '22,7...","POLYGON ((-85.38077 32.78301, -85.37879 32.782..."
151.0,01017954700,01,Alabama,AL,017,Chambers,954700,9547,"{'2010': {'value': 31368.0, 'state_bin': '28,8...","POLYGON ((-85.22897 32.74543, -85.22332 32.754..."
...,...,...,...,...,...,...,...,...,...,...
16092.0,11001009801,11,District of Columbia,DC,001,District of Columbia,009801,98.01,"{'2010': {'value': 31917.0, 'state_bin': '26,8...","POLYGON ((-77.00386 38.83099, -77.00228 38.833..."
15969.0,11001002801,11,District of Columbia,DC,001,District of Columbia,002801,28.01,"{'2010': {'value': 43661.0, 'state_bin': '42,1...","POLYGON ((-77.03646 38.93412, -77.03645 38.937..."
15970.0,11001002802,11,District of Columbia,DC,001,District of Columbia,002802,28.02,"{'2010': {'value': 41328.64, 'state_bin': '35,...","POLYGON ((-77.03671 38.92712, -77.03646 38.933..."
16057.0,11001008001,11,District of Columbia,DC,001,District of Columbia,008001,80.01,"{'2010': {'value': 89038.87, 'state_bin': '78,...","POLYGON ((-76.99025 38.89731, -76.98360 38.900..."


In [6]:
py_geoms

Unnamed: 0_level_0,STATEFP,COUNTYFP,TRACTCE,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
GEOID_TRACT_20,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
01045020801,01,045,020801,208.01,Census Tract 208.01,G5020,S,19261326,0,+31.3971437,-085.6615959,"POLYGON ((-85.70064 31.38509, -85.69648 31.388..."
01017954200,01,017,954200,9542,Census Tract 9542,G5020,S,62998696,34641,+32.8341436,-085.2678516,"POLYGON ((-85.31549 32.80580, -85.31003 32.813..."
01017953800,01,017,953800,9538,Census Tract 9538,G5020,S,515692207,11516898,+33.0302580,-085.3867880,"POLYGON ((-85.59345 33.00012, -85.59318 33.107..."
01017954300,01,017,954300,9543,Census Tract 9543,G5020,S,132642536,178276,+32.7766468,-085.2734917,"POLYGON ((-85.38077 32.78301, -85.37879 32.782..."
01017954700,01,017,954700,9547,Census Tract 9547,G5020,S,38036820,264376,+32.7702191,-085.1744682,"POLYGON ((-85.22897 32.74543, -85.22332 32.754..."
...,...,...,...,...,...,...,...,...,...,...,...,...
72119130300,72,119,130300,1303,Census Tract 1303,G5020,S,8150827,110315,+18.3989008,-065.8358049,"POLYGON ((-65.85254 18.40170, -65.84382 18.405..."
72119130500,72,119,130500,1305,Census Tract 1305,G5020,S,1215550,0,+18.3759303,-065.8453190,"POLYGON ((-65.85240 18.37359, -65.85014 18.380..."
72087110102,72,087,110102,1101.02,Census Tract 1101.02,G5020,S,2518426,27500,+18.4151874,-065.8422860,"POLYGON ((-65.85261 18.40230, -65.84763 18.413..."
72087110200,72,087,110200,1102,Census Tract 1102,G5020,S,1045973,791595,+18.4298876,-065.8446315,"POLYGON ((-65.84922 18.43887, -65.83888 18.438..."


In [4]:
binned_df['B19013_001E']

0        {'2010': {'value': 70151.78, 'state_bin': '68,...
1        {'2010': {'value': 58420.0, 'state_bin': '55,5...
2        {'2010': {'value': 41091.0, 'state_bin': '39,9...
3        {'2010': {'value': 44031.0, 'state_bin': '43,8...
4        {'2010': {'value': 56627.0, 'state_bin': '55,5...
                               ...                        
82167    {'2010': {'value': 36862.0, 'state_bin': '36,8...
82168    {'2010': {'value': 34796.0, 'state_bin': '34,6...
82169    {'2010': {'value': 29198.0, 'state_bin': '25,8...
82170    {'2010': {'value': 39272.0, 'state_bin': '36,8...
82171    {'2010': {'value': 35972.0, 'state_bin': '34,6...
Name: B19013_001E, Length: 83849, dtype: object

In [33]:
split_df = rejoined_df.drop(list(rejoined_df.filter(regex='|'.join(CENSUS_VARS)).columns), axis=1) # to re-concatenate with cvar_df
non_cvar_cols = list(split_df.columns)
for cvar in CENSUS_VARS: 
    cvar_df = pd.json_normalize(rejoined_df[cvar])
    cvar_df.columns = [f'{cvar}-{year}' for year in cvar_df.columns]
    split_df = pd.concat([split_df, cvar_df], axis=1)

display('Split Df')
display(split_df)

'Split Df'

Unnamed: 0,GEOID,state_fips,state_name,state_usps,county_fips,county_name,tract_fips,tract_dec,B19013_001E-2010,B19013_001E-2011,B19013_001E-2012,B19013_001E-2013,B19013_001E-2014,B19013_001E-2015,B19013_001E-2016,B19013_001E-2017,B19013_001E-2018,B19013_001E-2019,B19013_001E-2020
0,01001020100,01,Alabama,AL,001,Autauga,020100,201,70151.78,58787.15,56443.50,62966.97,59940.00,61776.16,65934.00,67758.17,58566.38,60147.79,60388.0
1,01001020803,01,Alabama,AL,001,Autauga,020803,208.03,58420.00,59149.00,60019.00,54711.00,56681.00,60063.00,61242.00,64439.00,75793.00,65878.00,29893.0
2,01001020200,01,Alabama,AL,001,Autauga,020200,202,41091.00,42019.00,41250.00,44019.00,42971.00,32303.00,41107.00,41287.00,43531.00,43958.00,49144.0
3,01001020300,01,Alabama,AL,001,Autauga,020300,203,44031.00,43145.00,43088.00,43201.00,43717.00,44922.00,51250.00,46806.00,51875.00,55345.00,62423.0
4,01001020400,01,Alabama,AL,001,Autauga,020400,204,56627.00,58419.00,54503.00,54730.00,55814.00,54329.00,52704.00,55895.00,54050.00,59663.00,64310.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83844,56043000200,56,Wyoming,WY,043,Washakie,000200,2,53686.00,60560.00,52804.00,55938.00,51799.00,54545.00,51024.00,59706.00,54936.00,55859.00,55268.0
83845,56043000302,56,Wyoming,WY,043,Washakie,000302,3.02,56066.80,61292.10,57984.22,58513.23,49859.64,55186.82,52568.54,52273.94,56131.36,58611.80,65543.0
83846,56043000301,56,Wyoming,WY,043,Washakie,000301,3.01,40304.00,40086.00,37700.00,34619.00,35691.00,34643.00,38221.00,41496.00,50428.00,46500.00,52074.0
83847,56045951100,56,Wyoming,WY,045,Weston,951100,9511,52440.00,64103.00,64412.00,66653.00,66515.00,69222.00,69048.00,62435.00,50000.00,52763.00,51694.0


In [48]:
import pandas as pd

# Sample data
state_df = pd.DataFrame({
    'B19013_001E-2010': [178694, 68751, 43869, 48824, 68751, 39912, 36425, 33433, 22747, 28803],
})

col = 'B19013_001E-2010'
state_bins = 10

state_df[f'{col}-state_bin'] = pd.qcut(state_df[col], q=state_bins).apply(
    lambda b: f"{int(b.left):,} <= {int(b.right):,}" if not pd.isna(b) else 'NaN'
)

# Extract and modify the categories
categories = state_df[f'{col}-state_bin'].cat.categories
categories = [f"{int(c.split(' <= ')[0].replace(',', '')) + 1:,}" + c[c.find(' <= '):] if i != 0 else c for i, c in enumerate(categories)]

# Map the old categories to the new ones in the Series
state_df[f'{col}-state_bin'] = state_df[f'{col}-state_bin'].map(dict(zip(state_df[f'{col}-state_bin'].cat.categories, categories)))
state_df[f'{col}-state_bin'] = state_df[f'{col}-state_bin'].astype('category')

print(state_df[f'{col}-state_bin'])

0    79,746 <= 178,694
1     54,803 <= 68,751
2     41,891 <= 45,851
3     45,852 <= 54,802
4     54,803 <= 68,751
5     38,518 <= 41,890
6     35,528 <= 38,517
7     32,508 <= 35,527
8     22,746 <= 28,197
9     28,198 <= 32,507
Name: B19013_001E-2010-state_bin, dtype: category
Categories (10, object): ['22,746 <= 28,197' < '28,198 <= 32,507' < '32,508 <= 35,527' < '35,528 <= 38,517' ... '45,852 <= 54,802' < '54,803 <= 68,751' < '68,752 <= 79,745' < '79,746 <= 178,694']


In [46]:
state_bins = 10
df = split_df.copy()
dataframes = []
for state_name in df['state_name'].unique():     
    state_df = df[df['state_name'] == state_name]
    for col in state_df.filter(regex='|'.join(CENSUS_VARS)): # first column loop
        state_df[f'{col}-state_bin'] = pd.qcut(state_df[col], q=state_bins)\
                .apply(lambda b:f"{int(b.left):,} <= {int(b.right):,}") ## TO-DO: Use more complex bin formatting function to increment left endpoints by 1, starting from left 
        
        display(state_df[f'{col}-state_bin'])
        # Adjust the left endpoint for all levels except the first (TO-DO: write utils function for this)
        categories = state_df[f'{col}-state_bin'].cat.categories
        categories = [f"{int(c.split(' <= ')[0].replace(',', '')) + 1:,}" + c[c.find(' <= '):] if i != 0 else c for i, c in enumerate(categories)]
        print(categories)
        state_df[f'{col}-state_bin'] = state_df[f'{col}-state_bin'].cat.set_categories(categories)

        display(state_df[f'{col}-state_bin'])

        # # # Record the bin levels in the logging dict
        # cvar, year = col.split("-") # column format: <cvar>-<year>
        # bin_dict[cvar][year][state_name] = list(categories) 
        break 
    dataframes.append(state_df)
    break 
# df = pd.concat(dataframes)

# df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df[f'{col}-state_bin'] = pd.qcut(state_df[col], q=state_bins)\


0       68,751 <= 178,694
1        55,524 <= 68,751
2        39,912 <= 43,869
3        43,869 <= 48,224
4        55,524 <= 68,751
              ...        
1429     36,425 <= 39,912
1430     33,433 <= 36,425
1431     33,433 <= 36,425
1432     22,747 <= 28,803
1433     28,803 <= 33,433
Name: B19013_001E-2010-state_bin, Length: 1433, dtype: category
Categories (10, object): ['37 <= 22,747' < '22,747 <= 28,803' < '28,803 <= 33,433' < '33,433 <= 36,425' ... '43,869 <= 48,224' < '48,224 <= 55,524' < '55,524 <= 68,751' < '68,751 <= 178,694']

['37 <= 22,747', '22,748 <= 28,803', '28,804 <= 33,433', '33,434 <= 36,425', '36,426 <= 39,912', '39,913 <= 43,869', '43,870 <= 48,224', '48,225 <= 55,524', '55,525 <= 68,751', '68,752 <= 178,694']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df[f'{col}-state_bin'] = state_df[f'{col}-state_bin'].cat.set_categories(categories)


0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
1429    NaN
1430    NaN
1431    NaN
1432    NaN
1433    NaN
Name: B19013_001E-2010-state_bin, Length: 1433, dtype: category
Categories (10, object): ['37 <= 22,747' < '22,748 <= 28,803' < '28,804 <= 33,433' < '33,434 <= 36,425' ... '43,870 <= 48,224' < '48,225 <= 55,524' < '55,525 <= 68,751' < '68,752 <= 178,694']

In [43]:
df['B19013_001E-2011-state_bin']
categories = df[f'{col}-state_bin'].cat.categories
print(categories)
categories = [f"{int(c.split(' <= ')[0].replace(',', '')) + 1:,}" + c[c.find(' <= '):] if i != 0 else c for i, c in enumerate(categories)]
print(categories)


AttributeError: Can only use .cat accessor with a 'category' dtype

In [51]:
def bin_variables(rejoined_df:pd.DataFrame):
    """Bin each census variable per year in the widened dataframe (depending on config parameters), recollapsing at the end.
    Logs the bin levels for each census variable, year, and national + state (depending on config parameters).

    Previously was done on wide_df and before the crosswalk step, but the conversion to 2020 tracts affects a variable's per-tract distribution.
    Hence if bins are assigned to values in 2020 tracts it needs to occur after the tract conversion. But binning also needs to be done on an un-collapsed dataframe 
    (if we're using pd.qcut()), and our functions to convert to 2020 tracts are written to work on the already-collapsed dataframe (Catch-22).

    At least for the time being, we will un-collapse the completed dataframe, calculate/assign bins, and re-collapse.
    Given the current JSON-Rows data model of the crosswalk (convert-ctracts_pct-area_2010-to-2020.json), to edit the conversion functions to apply 
    on the un-collapsed dataframe would require normalizing the 'GEOID_TRACT_20_overlap' column, which creates a massive sparse dataframe of 85395 columns (one for every 2020 tract)
    that is time-consuming to produce. I might be able to adjust the cross-walk structure to work around this, but the current data model is flexible and applicable beyond this script.

    This un-collapse and re-collapse step from my tests adds <2 seconds for a single variable run on a national dataset. 
    """ 

    logger.info(f'Checking binning parameters ({config["bins"]})')
    state_bins = config['bins']['state'] if isinstance(config['bins']['state'], int) else None
    nat_bins = config['bins']['national'] if isinstance(config['bins']['national'], int) else None

    if (state_bins is None) and (nat_bins is None): 
        logger.info('Skipping binning.')
        return rejoined_df
    else:
        # Initialize dict for logging bins to JSON for front end's reference 
        bin_dict = {cvar:{str(year):{} for year in YEARS} for cvar in CENSUS_VARS}

        # Break out the census variable columns by year 
        df = rejoined_df.drop(list(rejoined_df.filter(regex='|'.join(CENSUS_VARS)).columns), axis=1) # to re-concatenate with cvar_df
        non_cvar_cols = list(df.columns)
        for cvar in CENSUS_VARS: 
            cvar_df = pd.json_normalize(rejoined_df[cvar])
            cvar_df.columns = [f'{cvar}-{year}' for year in cvar_df.columns]
            df = pd.concat([df, cvar_df], axis=1)

        ## Calculate, assign, and log state/national bins in dict
        # Bin by state first -- reduce amount of state loops/filters vs. doing it inside the column loop, 
        # though we have to loop through the columns again separately for the national bins.
        if state_bins is not None: 
            dataframes = []
            for state_name in df['state_name'].unique():     
                state_df = df[df['state_name'] == state_name]
                for col in state_df.filter(regex='|'.join(CENSUS_VARS)): # first column loop
                    state_df[f'{col}-state_bin'] = pd.qcut(state_df[col], q=state_bins)\
                            .apply(lambda b:f"{int(b.left):,} <= {int(b.right):,}") ## TO-DO: Use more complex bin formatting function to increment left endpoints by 1, starting from left 
                    
                    # Adjust the left endpoint for all levels except the first (TO-DO: write utils function for this)
                    categories = state_df[f'{col}-state_bin'].cat.categories
                    categories = [f"{int(c.split(' <= ')[0].replace(',', '')) + 1:,}" + c[c.find(' <= '):] if i != 0 else c for i, c in enumerate(categories)]

                    # Map the old categories to the new ones in the Series
                    state_df[f'{col}-state_bin'] = state_df[f'{col}-state_bin'].map(dict(zip(state_df[f'{col}-state_bin'].cat.categories, categories)))
                    state_df[f'{col}-state_bin'] = state_df[f'{col}-state_bin'].astype('category')

                    # Record the bin levels in the logging dict
                    cvar, year = col.split("-") # column format: <cvar>-<year>
                    bin_dict[cvar][year][state_name] = list(categories) 


                dataframes.append(state_df)

            df = pd.concat(dataframes)

        # National Bins
        if nat_bins is not None: 
            for col in df.filter(regex='|'.join(CENSUS_VARS)): # second column loop
                df[f'{col}-nat_bin'] = pd.qcut(df[col], q=nat_bins)\
                    .apply(lambda b:f"{int(b.left):,} <= {int(b.right):,}")
                
                # Adjust the left endpoint for all levels except the first (TO-DO: write utils function for this)
                categories = df[f'{col}-nat_bin'].cat.categories
                categories = [f"{int(c.split(' <= ')[0].replace(',', '')) + 1:,}" + c[c.find(' <= '):] if i != 0 else c for i, c in enumerate(categories)]

                # Map the old categories to the new ones in the Series
                state_df[f'{col}-state_bin'] = state_df[f'{col}-state_bin'].map(dict(zip(state_df[f'{col}-state_bin'].cat.categories, categories)))
                state_df[f'{col}-state_bin'] = state_df[f'{col}-state_bin'].astype('category')

                # Record the bin levels in the logging dict
                cvar, year = col.split("-") # column format: <cvar>-<year>
                bin_dict[cvar][year]['nat_bins'] = list(categories) 

        # Collapse cvar columns and bin columns (separately)
        def collapse_cvar_row(row:pd.Series, cvar:str) -> dict: 
            # Collapsed column (named cvar): {'2010':{'state_bin':<bin>, 'nat_bin':<bin>, 'value': <var_value>}, '2011':{...}, ...}
            result_dict = defaultdict(dict)
            for col in row.keys(): 
                if cvar in col: 
                    col_split = col.split('-') # either a value column (<cvar>-<year>) or a bin column (<cvar>-<year>-<state/national>_bin)
                    if len(col_split) == 3: 
                        cvar, year, bin_level_str = col_split
                        result_dict[year][bin_level_str] = row[col]
                    else: 
                        cvar, year = col_split
                        result_dict[year]['value'] = row[col]
            return result_dict

        for cvar in CENSUS_VARS: 
            df[cvar] = df.apply(lambda row: collapse_cvar_row(row, cvar), axis=1)
            # drop other non-collapsed cvar columns 
            other_cvar_cols = [col for col in df.columns if cvar in col and len(col.split('-')) > 1]
            df.drop(other_cvar_cols, axis=1, inplace=True)
                
        # Sort columns alphabetically
        cols_sorted = non_cvar_cols + sorted(list(df.filter(regex='|'.join(CENSUS_VARS)).columns))
        df = df[cols_sorted]

        # Log the dict of bins
        logger.info('BIN LEVELS:\n' + json.dumps(bin_dict, indent=2)) ## TO-DO: Log the bins to separate file for frontend to assign colors to bins

        return df
    
binned_df = bin_variables(rejoined_df)

binned_df

2023-12-29 12:17:42,743 - logger - INFO - Checking binning parameters ({'state': 10, 'national': None})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df[f'{col}-state_bin'] = pd.qcut(state_df[col], q=state_bins)\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  state_df[f'{col}-state_bin'] = state_df[f'{col}-state_bin'].map(dict(zip(state_df[f'{col}-state_bin'].cat.categories, categories)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: htt

Unnamed: 0,GEOID,state_fips,state_name,state_usps,county_fips,county_name,tract_fips,tract_dec,B19013_001E
0,01001020100,01,Alabama,AL,001,Autauga,020100,201,"{'2010': {'value': 70151.78, 'state_bin': '68,..."
1,01001020803,01,Alabama,AL,001,Autauga,020803,208.03,"{'2010': {'value': 58420.0, 'state_bin': '55,5..."
2,01001020200,01,Alabama,AL,001,Autauga,020200,202,"{'2010': {'value': 41091.0, 'state_bin': '39,9..."
3,01001020300,01,Alabama,AL,001,Autauga,020300,203,"{'2010': {'value': 44031.0, 'state_bin': '43,8..."
4,01001020400,01,Alabama,AL,001,Autauga,020400,204,"{'2010': {'value': 56627.0, 'state_bin': '55,5..."
...,...,...,...,...,...,...,...,...,...
82167,54109002901,54,West Virginia,WV,109,Wyoming,002901,29.01,"{'2010': {'value': 36862.0, 'state_bin': '36,8..."
82168,54109002902,54,West Virginia,WV,109,Wyoming,002902,29.02,"{'2010': {'value': 34796.0, 'state_bin': '34,6..."
82169,54109003000,54,West Virginia,WV,109,Wyoming,003000,30,"{'2010': {'value': 29198.0, 'state_bin': '25,8..."
82170,54109003100,54,West Virginia,WV,109,Wyoming,003100,31,"{'2010': {'value': 39272.0, 'state_bin': '36,8..."


In [54]:
binned_df['B19013_001E'][0]

defaultdict(dict,
            {'2010': {'value': 70151.78, 'state_bin': '68,752 <= 178,694'},
             '2011': {'value': 58787.15, 'state_bin': '56,439 <= 69,554'},
             '2012': {'value': 56443.5, 'state_bin': '49,914 <= 57,054'},
             '2013': {'value': 62966.97, 'state_bin': '56,894 <= 69,174'},
             '2014': {'value': 59940.0, 'state_bin': '56,927 <= 69,681'},
             '2015': {'value': 61776.16, 'state_bin': '57,380 <= 69,638'},
             '2016': {'value': 65934.0, 'state_bin': '58,745 <= 71,495'},
             '2017': {'value': 67758.17, 'state_bin': '61,210 <= 73,731'},
             '2018': {'value': 58566.38, 'state_bin': '54,666 <= 63,786'},
             '2019': {'value': 60147.79, 'state_bin': '56,889 <= 65,255'},
             '2020': {'value': 60388.0, 'state_bin': '53,425 <= 60,491'}})

In [17]:
binned_df['B19013_001E']

0        {'2010': {'value': 70151.78, 'state_bin': nan}...
1        {'2010': {'value': 58420.0, 'state_bin': nan},...
2        {'2010': {'value': 41091.0, 'state_bin': nan},...
3        {'2010': {'value': 44031.0, 'state_bin': nan},...
4        {'2010': {'value': 56627.0, 'state_bin': nan},...
                               ...                        
82167    {'2010': {'value': 36862.0, 'state_bin': nan},...
82168    {'2010': {'value': 34796.0, 'state_bin': nan},...
82169    {'2010': {'value': 29198.0, 'state_bin': nan},...
82170    {'2010': {'value': 39272.0, 'state_bin': nan},...
82171    {'2010': {'value': 35972.0, 'state_bin': nan},...
Name: B19013_001E, Length: 83849, dtype: object

In [116]:
df_geoms = binned_df.merge(py_geoms['geometry'], how='right', left_on='GEOID', right_index=True)


In [119]:
df_geoms['B19013_001E'][0]

defaultdict(dict,
            {'2010': {'value': 70151.78, 'state_bin': nan},
             '2011': {'value': 58787.15, 'state_bin': nan},
             '2012': {'value': 56443.5, 'state_bin': nan},
             '2013': {'value': 62966.97, 'state_bin': nan},
             '2014': {'value': 59940.0, 'state_bin': nan},
             '2015': {'value': 61776.16, 'state_bin': nan},
             '2016': {'value': 65934.0, 'state_bin': nan},
             '2017': {'value': 67758.17, 'state_bin': nan},
             '2018': {'value': 58566.38, 'state_bin': nan},
             '2019': {'value': 60147.79, 'state_bin': nan},
             '2020': {'value': 60388.0, 'state_bin': nan}})

In [26]:
gdf = gpd.read_file('data/B19013_001E_allStates+DC_2010-2020_state-bin-10.json')

In [1]:
import folium
from folium import plugins
import geopandas as gpd

gdf = gpd.read_file('data/B19013_001E_allStates+DC_2010-2020_state-bin-10.json')

# Create a Folium map centered at the mean of the polygons
map_center = [gdf.geometry.centroid.y.mean(), gdf.geometry.centroid.x.mean()]
mymap = folium.Map(location=map_center, zoom_start=12)

# Add GeoDataFrame polygons to the map with custom style
folium.GeoJson(gdf, style_function=lambda feature: {
    'fillColor': 'green',    # Set the fill color of the polygons
    'color': 'black',        # Set the border color
    'weight': 2,             # Set the border width
    'fillOpacity': 0.6       # Set the fill opacity
}).add_to(mymap)

# Display the map


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd

  map_center = [gdf.geometry.centroid.y.mean(), gdf.geometry.centroid.x.mean()]


AttributeError: 'Map' object has no attribute 'plot'

In [32]:
x = gdf[gdf['GEOID'] == '10005980000']
x['B19013_001E'].iloc[0]

{'2010': {'value': 57348.0, 'state_bin': '56,588 <= 60,887'},
 '2011': {'value': 65404.0, 'state_bin': '62,944 <= 66,696'},
 '2012': {'value': 52875.0, 'state_bin': '50,525 <= 52,891'},
 '2013': {'value': 58382.0, 'state_bin': '56,920 <= 60,120'},
 '2014': {'value': 56500.0, 'state_bin': '53,918 <= 57,527'},
 '2015': {'value': 58355.0, 'state_bin': '58,285 <= 61,631'},
 '2016': {'value': 69578.0, 'state_bin': '68,206 <= 76,094'},
 '2017': {'value': 73316.0, 'state_bin': '71,283 <= 82,031'},
 '2018': {'value': 78547.0, 'state_bin': '75,505 <= 83,859'},
 '2019': {'value': 81515.0, 'state_bin': '78,264 <= 86,701'},
 '2020': {'value': 'NaN', 'state_bin': 'NaN'}}

In [34]:
gdf[gdf['state_fips'] == '10'].to_file('test_file_de.json', index=False)

In [35]:
from utils import AzureBlobStorageManager, load_state_list, validate_config, read_json_rows

with open('config.yaml', 'r') as file: 
    config = yaml.full_load(file)

# Set parameters from config 
DATA_DIR = config['data_dir']
OVERLAP_PRECISION = config['tract_crosswalk']['overlap_precision']
OVERWRITE_AZURE = config['tract_crosswalk']['overwrite_azure']
OVERWRITE_LOCAL = config['tract_crosswalk']['overwrite_local']

# Create azure client 
azure_manager = AzureBlobStorageManager(connection_str=config['api-info']['azure']['connection-str'], 
                                        container_name=config['api-info']['azure']['container-name'], 
                                        download_dir=DATA_DIR)

In [37]:
azure_manager.upload_blob('data/B19013_001E_allStates+DC_2010-2020_state-bin-10.json')

Blob B19013_001E_allStates+DC_2010-2020_state-bin-10.json uploaded successfully.
