In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns

import shrug_processing
from importlib import reload
from shrug_processing import merge_raw_shrug_files, prepare_shrug_data_for_geom, split_urban_rural

import shrid_aggregation
from shrid_aggregation import loop_over_states

from multiprocessing import Pool

### This notebook processes the SHRUG data into shapefiles ready for downstream analysis.

`shrug_stats.dta` and `shrid2.gpkg` are the files that Paul Novosad shared with us directly via email. As of Sep 2022, he said that a public version would be released.

In [2]:
# Area threshold in km^2: rural regions are merged until they reach this size.
# Values tried so far: 14 and 25.
# NOTE(review): the original note called 14 "the median area of rural regions", but the
# medians printed further down are ~13.9 km^2 for urban and ~2.9 km^2 for rural -- confirm.
sq_km_thresh = 25

## 1. process the raw data and shapefiles into one dataframe

In [3]:
# Root directory holding the raw SHRUG inputs and every output written by this notebook.
# Defaults to the original location; set SHRUG_DATA_DIR to run on another machine.
# os.path.join(..., '') guarantees a trailing separator, which later cells rely on
# when they build paths by concatenating onto data_dir.
data_dir = os.path.join(os.environ.get('SHRUG_DATA_DIR', '/data/mosaiks/shrug/'), '')

In [4]:
# Raw inputs: the SHRUG statistics table and the shrid geometries.
data_fp, shapefile_fp = (
    os.path.join(data_dir, fname) for fname in ('shrug_stats.dta', 'shrid2.gpkg')
)

# Merge the two raw files; per the log output, the compiled results are saved
# under data_dir as shrug.csv and shrug.geojson.
merge_raw_shrug_files(data_fp, shapefile_fp, data_dir=data_dir)

26422 entries dropped for not having consumption value
525868 entries remaining
for the 126 shrids with urban and rural, consumption  is a weighted average of urban and rural consumption, weighted by urban and rural population counts.
saving compiled data in /data/mosaiks/shrug/shrug.csv
saving geo compiled data in /data/mosaiks/shrug/shrug.geojson


In [5]:
# Sanity-check the compiled csv written by the merge step.
# Build the path once (os.path.join avoids the doubled '/' that the f-string form
# produced) and reuse it here and in the later cells that read the same file.
shrug_csv_fp = os.path.join(data_dir, 'shrug.csv')
all_df = pd.read_csv(shrug_csv_fp)
all_df.head(2)

Unnamed: 0,shrid,pc11_pca_tot_p,shrid_pc11_pca_tot_p_r,shrid_pc11_pca_tot_p_u,ec13_emp_all,ec13_emp_manuf,secc_cons_pc_rural,secc_pov_rate_rural,secc_pov_rate_urban,secc_cons_pc_urban,...,pc11_id,geometry,rural,urban,secc_cons_pc_combined,pc11_pca_tot_p_combined,frac_rural,frac_urban,state,dummy_ones
0,11-01-001-00001-000002,3770.0,3770.0,,49.0,0.0,13509.517578,0.477891,,,...,2.0,"POLYGON ((73.83561696 34.55965044, 73.82536307...",True,False,13509.517578,3770.0,1.0,0.0,1,1.0
1,11-01-001-00001-000005,5255.0,5255.0,,82.0,9.0,8611.757812,0.776421,,,...,5.0,"POLYGON ((73.94628924 34.5699081, 73.94711292 ...",True,False,8611.757812,5255.0,1.0,0.0,1,1.0


## 2. combine small rural shapefiles 

In [6]:
# Directory that holds the condensed (merged) geometries.
# os.path.join keeps the path correct whether or not data_dir ends with a separator.
data_dir_condensed = os.path.join(data_dir, 'shrug_condensed_geoms')
if not os.path.isdir(data_dir_condensed):
    # makedirs creates missing parents and does not fail if the directory
    # appears between the check and the call
    os.makedirs(data_dir_condensed, exist_ok=True)
    print(data_dir_condensed)
    

In [7]:
# Read the compiled per-shrid table; the helper also adds the projected area
# (proj_area, presumably in m^2 since it is divided by 1e6 to report km^2 -- confirm).
gdf_by_shrid = prepare_shrug_data_for_geom(shrug_csv_fp)

# Partition into an urban and a rural GeoDataFrame.
gdfs_split = split_urban_rural(gdf_by_shrid)
gdf_urban, gdf_rural = (gdfs_split[key] for key in ('gdf_urban', 'gdf_rural'))

# Report how many shrids fall on each side, plus the typical urban region size.
print(
    f'there are {gdf_urban.shape[0]} urban shrids',
    f'there are {gdf_rural.shape[0]} rural shrids before merging',
    f'median area of urban regions: {np.median(gdf_urban["proj_area"]) / 1e6} km^2',
    sep='\n',
)

there are 3524 urban shrids
there are 522344 rural shrids before merging
median area of urban regions: 13.919551831449446 km^2


In [8]:
# Median area of the rural shrids (before merging). The label previously said
# "urban" even though the value is computed from gdf_rural.
print(f'median area of rural regions: {np.median(gdf_rural.proj_area) / 1e6} km^2')

median area of urban regions: 2.919125231763903 km^2


In [9]:
# Output directory for the per-state condensed rural files at this merge threshold.
chunked_file_dir = os.path.join(data_dir_condensed, f'thresh_{sq_km_thresh}_sq_km')
if not os.path.isdir(chunked_file_dir):
    # makedirs also creates data_dir_condensed if it is missing, and tolerates
    # the directory being created between the check and the call
    os.makedirs(chunked_file_dir, exist_ok=True)
    print('making dir: ',chunked_file_dir)

In [10]:
# Urban shrids are saved as they are; only rural shrids go through the merge below.
gdf_urban.to_file(
    os.path.join(data_dir_condensed, 'shrug_urban.geojson'),
    driver='GeoJSON',
)

### 2.1 loop over states in parallel

In [11]:
# Slow step: per the log below, each state takes up to several minutes and ends up
# as one shrug_rural_state_<state>.geojson file inside chunked_file_dir.
loop_over_states(gdf_rural, sq_km_thresh, chunked_file_dir, num_threads=20)

01
writing to /data/mosaiks/shrug/shrug_condensed_geoms/thresh_25_sq_km/shrug_rural_state_01.geojson
state 01 took 0.47 minutes
02
writing to /data/mosaiks/shrug/shrug_condensed_geoms/thresh_25_sq_km/shrug_rural_state_02.geojson
state 02 took 2.97 minutes
03
writing to /data/mosaiks/shrug/shrug_condensed_geoms/thresh_25_sq_km/shrug_rural_state_03.geojson
state 03 took 1.03 minutes
05
writing to /data/mosaiks/shrug/shrug_condensed_geoms/thresh_25_sq_km/shrug_rural_state_05.geojson
state 05 took 2.19 minutes
06
writing to /data/mosaiks/shrug/shrug_condensed_geoms/thresh_25_sq_km/shrug_rural_state_06.geojson
state 06 took 0.37 minutes
08
writing to /data/mosaiks/shrug/shrug_condensed_geoms/thresh_25_sq_km/shrug_rural_state_08.geojson
state 08 took 2.26 minutes
09
writing to /data/mosaiks/shrug/shrug_condensed_geoms/thresh_25_sq_km/shrug_rural_state_09.geojson
state 09 took 4.76 minutes
10
writing to /data/mosaiks/shrug/shrug_condensed_geoms/thresh_25_sq_km/shrug_rural_state_10.geojson
sta

### 2.2  Read in the separate chunked files and compile

In [12]:
# States present in the rural data; one condensed rural file per state is expected
# in chunked_file_dir (written by the loop_over_states step).
states_rural = np.sort(gdf_rural.state.unique())

# Files to compile: the urban regions first, then each state's condensed rural regions.
fps = [os.path.join(data_dir_condensed,'shrug_urban.geojson')] +\
      [os.path.join(chunked_file_dir,f'shrug_rural_state_{state}.geojson') for state in states_rural]

# Read every file and bring it to a common CRS (EPSG:4326) before concatenating.
dfs_by_state = []
print(f'of {len(fps)}: ', end = '')
for i, fp in enumerate(fps):
    print(f'{i} ', end = '')
    df_by_state = gpd.read_file(fp)
    if fp.endswith('shrug_urban.geojson'):
        # for the urban file, 'region' is taken from the third dash-separated field of the shrid
        df_by_state['region'] = df_by_state['shrid'].apply(lambda x: x.split('-')[2])
    dfs_by_state.append(df_by_state.to_crs(epsg=4326))
print()

# Concatenate the results into one frame with a clean 0..n-1 index.
shrug_condensed = pd.concat(dfs_by_state).reset_index(drop=True)

# Urban shrids were never merged, so their "merged" id list is just their own shrid.
# The same mask is used on both sides so the selected rows always line up.
is_urban = shrug_condensed.urban == True
shrug_condensed.loc[is_urban, 'shrids_merged_str'] = shrug_condensed.loc[is_urban, 'shrid']

# Sanity check: the number of shrid ids listed across all condensed regions must equal
# the number of original shrids. This compares counts; it does not test uniqueness.
num_ids_per_row = shrug_condensed.shrids_merged_str.apply(lambda x: len(x.split(',')))
all_shrids_kept = bool(num_ids_per_row.sum() == len(gdf_by_shrid))
print(f'same number of original shrids after merged? {all_shrids_kept}')
# Fail before writing anything instead of silently saving an inconsistent result.
assert all_shrids_kept, (
    f'{num_ids_per_row.sum()} shrids listed after merging, '
    f'expected {len(gdf_by_shrid)}'
)

# Keep the original flags under has_rural / has_urban and define 'rural' as
# "has no urban part".
shrug_condensed = shrug_condensed.rename(columns={'rural':'has_rural',
                                                  'urban':'has_urban'})
shrug_condensed['rural'] = ~shrug_condensed['has_urban']

# New integer id for every condensed region (its row position).
shrug_condensed['condensed_shrug_id'] = np.arange(len(shrug_condensed))

# Save the output, with geometry (GeoJSON) and without (csv).
shrug_condensed.to_file(os.path.join(data_dir,
                                     f'shrug_condensed_regions_{sq_km_thresh}.geojson'),driver='GeoJSON')
shrug_condensed.drop('geometry',axis=1).to_csv(os.path.join(data_dir,
                                     f'shrug_condensed_regions_{sq_km_thresh}.csv'))

of 27: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
same number of original shrids after merged? True


In [13]:
# To skip the compile step above, load the previously saved result instead:
# shrug_condensed = gpd.read_file(os.path.join(data_dir,
#                                 f'shrug_condensed_regions_{sq_km_thresh}.geojson'))

# Region counts after merging, in total and split by the 'rural' flag.
print(f'there are {shrug_condensed.shape[0]} total shrids after merging')
print(f'there are {(~shrug_condensed["rural"]).sum()} urban shrids after merging')
print(f'there are {shrug_condensed["rural"].sum()} rural shrids after merging')

there are 63356 total shrids after merging
there are 3524 urban shrids after merging
there are 59832 rural shrids after merging


### 2.3 Make and save dictionaries explaining the relationship between condensed shrids and original

In [14]:
# Build both lookups in a single pass over the condensed regions:
#   new_id_to_shrids: condensed_shrug_id -> list of the original shrids merged into it
#   shrids_to_new_id: original shrid     -> condensed_shrug_id
new_id_to_shrids = {}
shrids_to_new_id = {}

for new_id, merged_str in zip(shrug_condensed['condensed_shrug_id'],
                              shrug_condensed['shrids_merged_str']):
    # shrids_merged_str is a comma-separated list of shrids
    shrids_this_id = merged_str.split(',')
    new_id_to_shrids[new_id] = shrids_this_id
    for shrid in shrids_this_id:
        shrids_to_new_id[shrid] = new_id

# Save as JSON. The context managers close (and therefore flush) each file;
# the bare open() calls used before left the handles open.
with open(os.path.join(data_dir,f'shrids_to_condensed_ids_{sq_km_thresh}.json'), 'w') as f:
    json.dump(shrids_to_new_id, f)
with open(os.path.join(data_dir,f'condensed_ids_to_shrids_{sq_km_thresh}.json'), 'w') as f:
    json.dump(new_id_to_shrids, f)