# Flood Vulnerability Index (FVI) Data Preparation
This notebook imports socioeconomic and physical data for FVI assessment.

In [1]:
import os
import sys
import time
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from shapely.geometry import Point
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, QuantileTransformer
import matplotlib.pyplot as plt
from functools import reduce
import fhv
pd.options.mode.chained_assignment = None

### Load 2011 BBS Census Data

This section imports [Bangladesh Bureau of Statistics (BBS)](http://203.112.218.65:8008/) 2011 census data downloaded from [BBS-REDATAM](http://203.112.218.69/binbgd/RpWebEngine.exe/Portal).

In [2]:
census_name = [['PAGE5','pos','person','Percent of children under 5 years','BBS 2011'],
               ['PAGE65','pos','person','Percent of elder population (65+ years)','BBS 2011'],
               ['PFEMALE','pos','person','Percent of females','BBS 2011'],
               ['PRURAL','pos','house','Percent of households in rural areas','BBS 2011'], 
               ['PWEAKBUILT','pos','house','Percent of households with weak materials','BBS 2011'],
               ['PNOWATER','pos','house','Percent of households without public water supply','BBS 2011'],
               ['PNOSANITARY','pos','house','ercent of households without sanitary facilities','BBS 2011'],
               ['PNOELEC','pos','house','Percent of households without electricity','BBS 2011'],
               ['PDISABL','pos','person','Percent of population with disability','BBS 2011'],
               ['PLITERACY','pos','person','Percent of population who cannot read and write','BBS 2011'],
               ['PETHNIC','pos','person','Percent of ethnic population','BBS 2011'],
               ['PRENT','pos','house','Percent of rented houses','BBS 2011'],
               ['PNOPRIEDU','pos','person','Percent of population who don''t complete primary education','BBS 2011'],
               ['PNOCOLLEGE','pos','person','Percent of population who don''t attend college','BBS 2011'],
               ['PNOEMPLOY','pos','person','Percent of population without employment','BBS 2011'],
               ['PAGRICULT','pos','person','Percent of population with agricultural jobs','BBS 2011']]
census_name = pd.DataFrame(census_name, columns=['Name','Sign','Type','Description','Source'])

In [76]:
# POPULATION DATA
df = fhv.LoadCensusBBS('./data/census2011/age 5 years group.xls')
popu = df.sum(axis=1)
###
# CARIBRATE POPULATION 
###
census = pd.DataFrame(index=df.index)
census.index.name = 'UID'
# - PAGE5: Percent of children under 5 years
census['PAGE5'] = df[df.columns[0]]/df.sum(axis=1)
# - PAGE65: Percent of elderly population (65+ years)
census['PAGE65'] = df[df.columns[14:]].sum(axis=1)/df.sum(axis=1)
# - PFEMALE: Percent of females
df = fhv.LoadCensusBBS('./data/census2011/sex.xls')
census['PFEMALE'] = df['Female']/df.sum(axis=1)


# BUILT ENVIRONMENT
# - PRURAL: Percent of households in rural areas
df = fhv.LoadCensusBBS('./data/census2011/Area of Residence.xls')
census['PRURAL'] = df['Rural']/df.sum(axis=1)
# - PWEAKBUILT: Percent of households with weak materials
# (#house_Kutcha_and_Jhupri / #house_total)
# *Pucca means high quality materials (e.g., cement or RCC)
# *Kutcha & Jhupri means weaker materials (e.g., mud, clay, lime, or thatched)
df = fhv.LoadCensusBBS('./data/census2011/Type of House.xls')
census['PWEAKBUILT'] = df[['Pucca','Semi-pucca']].sum(axis=1)/df.sum(1)
# - PNOWATER: Percent of households without public water supply
# *This includes "Other", excluding "Tap" and "Tube-well" water supply
df = fhv.LoadCensusBBS('./data/census2011/Source of Drinking Water.xls')
census['PNOWATER'] = df[df.columns[-1]]/df.sum(axis=1)
# - PNOSANITARY: Percent of households without sanitary facilities
# *This includes "Non-Sanitary" and "None" and excludes 
# *"Sanitary (with Water Seal)" and "Sanitary (no Water Seal)"
df = fhv.LoadCensusBBS('./data/census2011/Toilet Facilities.xls')
census['PNOSANITARY'] = df[df.columns[2:]].sum(axis=1)/df.sum(axis=1)
# - PNOELEC: Percent household without electricity
df = fhv.LoadCensusBBS('./data/census2011/Electricity Connection.xls')
census['PNOELEC'] = df['No']/df.sum(axis=1)


# SOCIAL
# - PDISABL: Percent of population with disability
# *This includes all kinds of disabilities (Speech, Vision, Hearing, Physical, Mental, Autistic)
df = fhv.LoadCensusBBS('./data/census2011/Disability.xls')
census['PDISABL'] = df[df.columns[1:]].sum(axis=1)/df.sum(axis=1)
# - PLITERACY: Percent of population who cannot read and write
df = fhv.LoadCensusBBS('./data/census2011/Literacy.xls')
census['PLITERACY'] = df['No']/df.sum(axis=1)
# - PETHNIC: Percent of ethnic population 
df = fhv.LoadCensusBBS('./data/census2011/Ethnic Population.xls')
census['PETHNIC'] = df['Yes']/df.sum(axis=1)
# - PRENT: Percent of rented houses
df = fhv.LoadCensusBBS('./data/census2011/Tenancy.xls')
census['PRENT'] = df[['Rented', 'Rent-free']].sum(axis=1)/df.sum(axis=1)


# EDUCATION
# - PNOPRIEDU: Percent of population who dont complete primary education
# *BGD's primary education is ClassI-ClassV
# *https://en.wikipedia.org/wiki/Education_in_Bangladesh#/media/File:BangEduSys.png
df = fhv.LoadCensusBBS('./data/census2011/Educational Attainment.xls')
census['PNOPRIEDU'] = df[df.columns[:5]].sum(axis=1)/df.sum(axis=1)
# - PNOCOLLEGE: Percent of population who don't attend college
census['PNOCOLLEGE'] = df[df.columns[:-4]].sum(axis=1)/df.sum(axis=1)


# EMPLOYMENT
# - PNOEMPLOY: Percent of population without employment
# *This includes "Employed" and "Household Work" and excludes "Looking For Job" and "Do Not Work"
df = fhv.LoadCensusBBS('./data/census2011/Activity Status.xls')
census['PNOEMPLOY'] = df[['Looking For Job','Do Not Work']].sum(axis=1)/df.sum(axis=1)
# - PAGRICULT : Percent of population with agricultural jobs
df = fhv.LoadCensusBBS('./data/census2011/Employment Field.xls')
census['PAGRICULT'] = df['Agriculture']/df.sum(axis=1)

# Post-
census['UID4'] = census.index % 10000


In [78]:
census.sort_values(by='UID4')

Unnamed: 0_level_0,PAGE5,PAGE65,PFEMALE,PRURAL,PWEAKBUILT,PNOWATER,PNOSANITARY,PNOELEC,PDISABL,PLITERACY,PETHNIC,PRENT,PNOPRIEDU,PNOCOLLEGE,PNOEMPLOY,PAGRICULT,UID4
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
400108,0.081670,0.041383,0.498140,0.812727,0.352676,0.052763,0.212208,0.403065,0.013863,0.364230,0.003046,0.158180,0.398776,0.908004,0.353987,0.449099,108
400114,0.102745,0.041395,0.499921,0.988999,0.095130,0.084325,0.346418,0.644892,0.012614,0.438109,0.000000,0.029144,0.502024,0.955627,0.380462,0.788877,114
400134,0.085246,0.040402,0.496317,0.804599,0.359883,0.010262,0.157758,0.456916,0.015647,0.379747,0.014849,0.073492,0.433713,0.933013,0.338677,0.583023,134
400138,0.090340,0.049221,0.507499,0.901043,0.088186,0.389552,0.102452,0.663980,0.013916,0.432220,0.000000,0.034352,0.468910,0.948972,0.351086,0.554738,138
400156,0.104005,0.037867,0.501788,0.906492,0.140396,0.028598,0.109341,0.500421,0.015503,0.499932,0.000000,0.028528,0.523608,0.957912,0.388359,0.750015,156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559408,0.111741,0.026552,0.493620,0.917558,0.205696,0.014656,0.929618,0.755081,0.013033,0.566478,0.001099,0.024303,0.587347,0.959658,0.332747,0.839119,9408
559451,0.114359,0.027861,0.498381,0.948545,0.200356,0.016530,0.623486,0.773423,0.017716,0.578624,0.003895,0.053301,0.625994,0.961238,0.348298,0.865185,9451
559482,0.104116,0.030509,0.496775,0.887757,0.232405,0.017839,0.605825,0.731110,0.019812,0.521903,0.009383,0.066948,0.553899,0.943832,0.325835,0.694034,9482
559486,0.111430,0.028077,0.495263,0.924085,0.247987,0.013810,0.704002,0.721592,0.016007,0.534208,0.011714,0.054573,0.588485,0.951440,0.343466,0.754792,9486


## Other socio-economic data
- [Bangladesh 2010 Poverty Maps (Zila Upazila)](http://203.112.218.65:8008/WebTestApplication/userfiles/Image/LatestReports/Bangladesh_ZilaUpazila_pov_est_2010.pdf) is obtained from [BBS Income, Expenditure & Poverty](http://203.112.218.65:8008/PageWebMenuContent.aspx?MenuKey=366).


In [92]:
from tabula import read_pdf

# Poverty
# Read PDF document and obtain data
df = read_pdf('./data/socioecon/Bangladesh_ZilaUpazila_pov_est_2010.pdf', 
             pages=list(range(3,13)), multiple_tables=False,
             pandas_options={'header': None, 'skiprows':2})
df.columns = ['zl-code','zila-name','UID4','upz-name','PPOOREXTR','PPOOR']
df = df.drop(['zl-code','zila-name','upz-name'], axis=1)
# Here we use only 4 upazila code to match with census UID, since all 4 digits are unique! Which means
assert len(np.unique(census.index % 10000)) == len(np.unique(df['UID4']))
# Sorting by UID4
df = df.set_index('UID4').sort_index()
# Merging

In [97]:
census.merge(df, on='UID4')

Unnamed: 0,PAGE5,PAGE65,PFEMALE,PRURAL,PWEAKBUILT,PNOWATER,PNOSANITARY,PNOELEC,PDISABL,PLITERACY,PETHNIC,PRENT,PNOPRIEDU,PNOCOLLEGE,PNOEMPLOY,PAGRICULT,UID4,PPOOREXTR,PPOOR
0,0.103249,0.040339,0.511939,0.919587,0.053392,0.019316,0.330364,0.783000,0.020299,0.472155,0.004445,0.067313,0.541183,0.962848,0.334860,0.722824,409,12.0,22.8
1,0.095860,0.039666,0.504324,0.908695,0.068895,0.020730,0.166402,0.704202,0.023943,0.388951,0.000000,0.026309,0.421312,0.948729,0.329836,0.601251,419,8.9,17.1
2,0.097443,0.040805,0.508003,0.881363,0.092098,0.050962,0.268048,0.680701,0.020743,0.413562,0.000741,0.097429,0.444761,0.939851,0.345242,0.561402,428,9.9,19.2
3,0.093653,0.044466,0.516130,0.886505,0.071055,0.048994,0.218788,0.648306,0.018080,0.398837,0.000000,0.051930,0.432513,0.942979,0.367495,0.607921,447,10.3,19.6
4,0.098751,0.044111,0.508659,0.828357,0.060392,0.436741,0.223187,0.688012,0.022614,0.395149,0.000023,0.102425,0.425012,0.954199,0.325537,0.638243,485,6.1,12.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539,0.150547,0.026528,0.499579,0.978997,0.195637,0.542778,0.795591,0.686969,0.011467,0.672957,0.008522,0.055613,0.725874,0.982873,0.403099,0.662260,9141,46.5,52.6
540,0.143152,0.030035,0.500637,0.945791,0.282478,0.283740,0.587684,0.593275,0.014548,0.588482,0.011797,0.093185,0.663757,0.974846,0.405466,0.527961,9153,28.9,34.7
541,0.137528,0.028306,0.510098,0.898787,0.385117,0.746354,0.581728,0.565562,0.014551,0.564579,0.001018,0.031226,0.626126,0.976171,0.427244,0.631287,9159,39.7,45.8
542,0.105126,0.023166,0.473312,0.332392,0.741495,0.046615,0.218020,0.132115,0.010691,0.387067,0.007786,0.579076,0.433749,0.875801,0.391028,0.101994,9162,9.7,14.3


In [75]:
census.merge(df, on='UID4')

Unnamed: 0,PAGE5,PAGE65,PFEMALE,PRURAL,PWEAKBUILT,PNOWATER,PNOSANITARY,PNOELEC,PDISABL,PLITERACY,PETHNIC,PRENT,PNOPRIEDU,PNOCOLLEGE,PNOEMPLOY,PAGRICULT,UID4,PPOOREXTR,PPOOR
0,0.103249,0.040339,0.511939,0.919587,0.053392,0.019316,0.330364,0.783000,0.020299,0.472155,0.004445,0.067313,0.541183,0.962848,0.334860,0.722824,409,12.0,22.8
1,0.095860,0.039666,0.504324,0.908695,0.068895,0.020730,0.166402,0.704202,0.023943,0.388951,0.000000,0.026309,0.421312,0.948729,0.329836,0.601251,419,8.9,17.1
2,0.097443,0.040805,0.508003,0.881363,0.092098,0.050962,0.268048,0.680701,0.020743,0.413562,0.000741,0.097429,0.444761,0.939851,0.345242,0.561402,428,9.9,19.2
3,0.093653,0.044466,0.516130,0.886505,0.071055,0.048994,0.218788,0.648306,0.018080,0.398837,0.000000,0.051930,0.432513,0.942979,0.367495,0.607921,447,10.3,19.6
4,0.098751,0.044111,0.508659,0.828357,0.060392,0.436741,0.223187,0.688012,0.022614,0.395149,0.000023,0.102425,0.425012,0.954199,0.325537,0.638243,485,6.1,12.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539,0.150547,0.026528,0.499579,0.978997,0.195637,0.542778,0.795591,0.686969,0.011467,0.672957,0.008522,0.055613,0.725874,0.982873,0.403099,0.662260,9141,46.5,52.6
540,0.143152,0.030035,0.500637,0.945791,0.282478,0.283740,0.587684,0.593275,0.014548,0.588482,0.011797,0.093185,0.663757,0.974846,0.405466,0.527961,9153,28.9,34.7
541,0.137528,0.028306,0.510098,0.898787,0.385117,0.746354,0.581728,0.565562,0.014551,0.564579,0.001018,0.031226,0.626126,0.976171,0.427244,0.631287,9159,39.7,45.8
542,0.105126,0.023166,0.473312,0.332392,0.741495,0.046615,0.218020,0.132115,0.010691,0.387067,0.007786,0.579076,0.433749,0.875801,0.391028,0.101994,9162,9.7,14.3
