## Prepare data for regression
#prepare case data for conditional regression to calculate population density for each cluster
#pixel level population weights are aggregated into various combinations of admin units
#this is to deal with the modifiable areal unit problem (MAUP)

In [1]:
import numpy as np
import rasterio
import rasterio.mask as mask
import geopandas as gpd
import pandas as pd
import tqdm
import os
import sys
import argparse 
import multiprocessing as mp

In [2]:
# Announce which stage of the population pipeline is starting.
banner_rule = '-----------------------------------------------------------------------------'
banner_title = 'Population data process 6/7: preparing census data for conditional regression'
print(banner_rule, banner_title, banner_rule, sep='\n')

-----------------------------------------------------------------------------
Population data process 6/7: preparing census data for conditional regression
-----------------------------------------------------------------------------


In [3]:
#setting up paths for files
#top_folder="/home/dohyungkim/population"
#ISO="SGP"
#year="2015"

In [26]:
# Parse the three positional command-line arguments that drive this stage:
# the working folder, the 3-character country ISO code and the population year.
my_parser = argparse.ArgumentParser(description='initial input')
my_parser.add_argument('top_folder', metavar='top_folder', type=str, help='working folder')
my_parser.add_argument('ISO', metavar='ISO', type=str, help='3 character country iso code')
my_parser.add_argument('year', metavar='year', type=str, help='population year')
args = my_parser.parse_args()
top_folder = args.top_folder
ISO = args.ISO
year = args.year
if not os.path.isdir(top_folder):
    print('The path specified does not exist')
    # Exit with a non-zero status so a calling pipeline can detect the
    # failure; a bare sys.exit() reports success (status 0).
    sys.exit(1)

In [4]:
# Load the per-country run information stored under <top_folder>/<ISO>.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# produced by a trusted run of this pipeline.
info_path = os.path.join(top_folder, ISO, "df_info.pkl")
df_info = pd.read_pickle(info_path)
# Number of raster classes counted for every admin unit further down.
n_component = int(df_info["n_component"])

In [5]:
# All inputs of this stage live under <top_folder>/<ISO>/wp_data.
wp_data_dir = os.path.join(top_folder, ISO, "wp_data")
# Class raster, admin boundary shapefile and census table, in that order.
fb_file = os.path.join(wp_data_dir, "fb_weight_gmm.tif")
wp_admin_file = os.path.join(wp_data_dir, "shp", ISO + "_subnational_2000_2020.shp")
wp_census_file = os.path.join(wp_data_dir, ISO + "_population_2000_2020.csv")

In [6]:
# Read the admin boundaries and the census table.
admin = gpd.read_file(wp_admin_file)
wp_census = pd.read_csv(wp_census_file)

# Both tables are joined on their *second* column.
# NOTE(review): presumably this is the shared admin-unit identifier -- confirm
# against the input files.
amdin_key = admin.columns[1]
wp_key = wp_census.columns[1]
admin = admin.merge(
    wp_census,
    how="inner",
    left_on=amdin_key,
    right_on=wp_key,
)

# One column of X per merged admin unit, one row per raster class.
rec_no = admin.shape[0]
X = np.zeros(shape=(n_component, rec_no))

In [7]:
# The raster is not opened here: it is opened again immediately before the
# aggregation loop, and opening it twice left the first handle unclosed.

# Census population of every admin unit for the requested year, as float32.
df_y=pd.DataFrame(np.float32(admin["P_"+year]))

# Tag every original record with the label 'admin3'. The column is built with
# string values from the start instead of filling a float column with zeros
# and overwriting it with text, which is a dtype-incompatible assignment
# (recent pandas versions deprecate this kind of silent upcast).
df_s=pd.DataFrame({0: ['admin3']*rec_no})

In [8]:
# Open the class raster (fb_file) that the aggregation loop masks per admin unit.
raster=rasterio.open(fb_file)

In [9]:
# Tell the user that the per-unit raster aggregation is about to start.
progress_message = "aggregation in progress.."
print(progress_message)

aggregation in progress..


In [10]:
# Count, for every admin unit, how many raster pixels fall into each class
# 1..n_component; X[j, k] receives the count of class j+1 inside unit k.
try:
    # Iterate the geometries positionally; the progress bar still shows rec_no steps.
    for k, geom in enumerate(tqdm.tqdm(admin['geometry'], total=rec_no)):
        # Crop the raster to the unit's geometry. The second return value
        # (the transform of the cropped window) is not needed here.
        tmp, _ = mask.mask(raster, [geom], crop=True)
        for j in range(n_component):
            X[j, k] = np.count_nonzero(tmp == j + 1)
finally:
    # The raster is only needed for this aggregation; release the file
    # handle even if masking one of the units fails.
    raster.close()

100%|██████████| 932/932 [00:52<00:00, 17.76it/s] 


In [11]:
# Turn the (class x unit) count matrix into one row per admin unit ...
df_x = pd.DataFrame(X).transpose()
# ... and attach the census population ('y') and the record label ('s').
df_x['y'] = df_y
df_x['s'] = df_s
#df_x['ADM1_NAME']=pd.DataFrame(admin["adm_name"])

In [12]:
# For every admin unit, build one extra record holding the column sums of the
# units whose boundaries touch it, labelled 'sum1'. This adds a second
# combination of admin units on top of the original records.
neighbour_rows = []
for index, row in admin.iterrows():
    try:
        touching = admin.geometry.touches(row['geometry'])
        neighbors = admin.index[touching.to_numpy()].tolist()
        # Neighbour labels refer to the original per-unit rows of df_x, which
        # is left untouched until all sums have been collected.
        df_tmp = df_x.loc[neighbors].agg('sum')
    except Exception as exc:
        # Best effort, as before: report which unit failed and why, then
        # carry on with the remaining units.
        print('error', index, exc)
        continue
    df_tmp['s'] = 'sum1'
    neighbour_rows.append(df_tmp)

# DataFrame.append was removed in pandas 2.0; with the former bare `except`
# that failure was swallowed and no 'sum1' rows were ever added. Collecting
# the rows and concatenating once also avoids copying df_x on every iteration.
if neighbour_rows:
    # Each collected row is an object-dtype Series (numbers plus the label);
    # infer_objects() restores numeric dtypes for the count and 'y' columns.
    df_sum1 = pd.DataFrame(neighbour_rows).infer_objects()
    df_x = pd.concat([df_x, df_sum1], ignore_index=True)

In [13]:
#df_x_admin1=df_x.groupby('ADM1_NAME').agg('sum')
#df_x_admin1['s']='admin1'

In [14]:
#df_x=df_x.append(df_x_admin1, sort=False)

In [15]:
# Discard records without any census population.
zero_population = df_x.index[(df_x['y'] == 0).to_numpy()]
df_x = df_x.drop(index=zero_population)

# Total pixel count over all classes; records covering no pixels are dropped too.
df_x["sum"] = df_x.iloc[:, :n_component].sum(axis=1)
no_pixels = df_x.index[(df_x['sum'] == 0).to_numpy()]
df_x = df_x.drop(index=no_pixels)

In [16]:
# Renumber the remaining rows 0..n-1; the old index labels are discarded.
df_x=df_x.reset_index(drop=True)

In [17]:
# Store the regression table as <top_folder>/<ISO>/df_x.pkl.
output_path = os.path.join(top_folder, ISO, "df_x.pkl")
df_x.to_pickle(output_path)