## Flood Vulnerability Index (FVI) Data Preparation
This notebook imports socioeconomic and physical data for FVI assessment.

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, QuantileTransformer
import fhv
from tabula import read_pdf

### Load demographic and socio-economic data
This section imports a variety of demographic and socio-economic data from multiple sources:
- [Bangladesh Bureau of Statistics (BBS)](http://203.112.218.65:8008/) 2011 census data downloaded from [BBS-REDATAM](http://203.112.218.69/binbgd/RpWebEngine.exe/Portal).
- [Bangladesh 2010 Poverty Maps (Zila Upazila)](http://203.112.218.65:8008/WebTestApplication/userfiles/Image/LatestReports/Bangladesh_ZilaUpazila_pov_est_2010.pdf) is obtained from [BBS Income, Expenditure & Poverty](http://203.112.218.65:8008/PageWebMenuContent.aspx?MenuKey=366).

In [2]:
# Metadata table for the census-derived indicators, one row per indicator.
#   Name          - column name used for the indicator in the `census` table
#   Sign          - 'pos' or 'neg'; 'neg' indicators are negated before scaling
#   Type          - 'Person' or 'House'
#   Domain        - thematic group of the indicator
#   Description   - human-readable definition
#   Normalization - scaling method label (every indicator is 'MinMax' here)
census_name = pd.DataFrame(
    [
        ('PAGE5', 'pos', 'Person', 'Demographic', 'Percent of children under 5 years', 'MinMax'),
        ('PAGE65', 'pos', 'Person', 'Demographic', 'Percent of elder population (65+ years)', 'MinMax'),
        ('PFEMALE', 'pos', 'Person', 'Demographic', 'Percent of woman', 'MinMax'),
        ('PRURAL', 'pos', 'House', 'Built', 'Percent of households in rural areas', 'MinMax'),
        ('PWEAKBUILT', 'pos', 'House', 'Built', 'Percent of households with weak materials', 'MinMax'),
        ('PNOWATER', 'pos', 'House', 'Built', 'Percent of households without public water supply', 'MinMax'),
        ('PNOSANITARY', 'pos', 'House', 'Built', 'Percent of households without sanitary facilities', 'MinMax'),
        ('PNOELEC', 'pos', 'House', 'Built', 'Percent of households without electricity', 'MinMax'),
        ('PDISABL', 'pos', 'Person', 'Social', 'Percent of population with any sort of disability', 'MinMax'),
        ('PLITERACY', 'pos', 'Person', 'Social', 'Percent of population who cannot read and write', 'MinMax'),
        ('PETHNIC', 'pos', 'Person', 'Social', 'Percent of ethnic population', 'MinMax'),
        ('PRENT', 'pos', 'House', 'Social', 'Percent of rented houses', 'MinMax'),
        ('PNOEMPLOY', 'pos', 'Person', 'Economic', 'Percent of population without employment', 'MinMax'),
        ('PAGRICULT', 'pos', 'Person', 'Economic', 'Percent of population with agricultural jobs', 'MinMax'),
        ('PPOOR', 'pos', 'House', 'Economic', 'Percentage of population below the upper poverty line', 'MinMax'),
        # Typo fix: the description previously read 'povery line'
        ('PPOOREXTR', 'pos', 'House', 'Economic', 'Percentage of population below the lower poverty line', 'MinMax'),
        ('PNOPRIEDU', 'pos', 'Person', 'Education', 'Percent of population without primary education', 'MinMax'),
        ('PNOCOLLEGE', 'pos', 'Person', 'Education', 'Percent of population without college education', 'MinMax'),
    ],
    columns=['Name', 'Sign', 'Type', 'Domain', 'Description', 'Normalization'],
)

In [3]:
# POPULATION DATA
# Population counts by 5-year age group (one row per UID).
df = fhv.LoadCensusBBS('./data/census2011/age 5 years group.xls')
# Row total over all age-group columns, i.e. the population of each unit
popu = df.sum(axis=1)
###
# CALIBRATE POPULATION
###
# Start the indicator table on the same index as the census extract
census = pd.DataFrame(index=df.index)
census.index.name = 'UID'
# Last 4 digits of UID; used further down as the join key for the poverty table
census['UID4'] = census.index % 10000
# - PAGE5: Percent of children under 5 years
#   (assumes the first column is the under-5 age group - TODO confirm)
census['PAGE5'] = df.iloc[:, 0].div(popu)
# - PAGE65: Percent of elderly population (65+ years)
#   NOTE(review): assumes columns 14 onward are the 65+ age groups; verify
#   against the spreadsheet layout.
census['PAGE65'] = df.iloc[:, 14:].sum(axis=1).div(popu)
# - PFEMALE: Percent of females
df = fhv.LoadCensusBBS('./data/census2011/sex.xls')
census['PFEMALE'] = df['Female'].div(df.sum(axis=1))


# BUILT ENVIRONMENT
# Every indicator below is a share of households: selected categories divided
# by the row total of the corresponding census table.
# - PRURAL: Percent of households in rural areas
df = fhv.LoadCensusBBS('./data/census2011/Area of Residence.xls')
census['PRURAL'] = df['Rural']/df.sum(axis=1)
# - PWEAKBUILT: Percent of households with weak materials
# (#house_Kutcha_and_Jhupri / #house_total)
# *Pucca means high quality materials (e.g., cement or RCC)
# *Kutcha & Jhupri means weaker materials (e.g., mud, clay, lime, or thatched)
df = fhv.LoadCensusBBS('./data/census2011/Type of House.xls')
# Bug fix: the previous expression summed 'Pucca' and 'Semi-pucca', i.e. the
# share of STRONG houses, which inverted the indicator. The weak share is every
# category other than those two.
# NOTE(review): presumably the remaining columns are exactly Kutcha and Jhupri;
# verify against the spreadsheet.
census['PWEAKBUILT'] = df.drop(columns=['Pucca','Semi-pucca']).sum(axis=1)/df.sum(axis=1)
# - PNOWATER: Percent of households without public water supply
# *This includes "Other", excluding "Tap" and "Tube-well" water supply
# (takes the last column, which is assumed to be "Other")
df = fhv.LoadCensusBBS('./data/census2011/Source of Drinking Water.xls')
census['PNOWATER'] = df[df.columns[-1]]/df.sum(axis=1)
# - PNOSANITARY: Percent of households without sanitary facilities
# *This includes "Non-Sanitary" and "None" and excludes 
# *"Sanitary (with Water Seal)" and "Sanitary (no Water Seal)"
# (assumes the two sanitary categories are the first two columns)
df = fhv.LoadCensusBBS('./data/census2011/Toilet Facilities.xls')
census['PNOSANITARY'] = df[df.columns[2:]].sum(axis=1)/df.sum(axis=1)
# - PNOELEC: Percent household without electricity
df = fhv.LoadCensusBBS('./data/census2011/Electricity Connection.xls')
census['PNOELEC'] = df['No']/df.sum(axis=1)


# SOCIAL
# Each indicator is the selected categories divided by the row total.
# - PDISABL: Percent of population with disability
# *This includes all kinds of disabilities (Speech, Vision, Hearing, Physical, Mental, Autistic)
# (every column except the first is counted; the first is presumably "no disability")
df = fhv.LoadCensusBBS('./data/census2011/Disability.xls')
census['PDISABL'] = df.iloc[:, 1:].sum(axis=1).div(df.sum(axis=1))
# - PLITERACY: Percent of population who cannot read and write
df = fhv.LoadCensusBBS('./data/census2011/Literacy.xls')
census['PLITERACY'] = df['No'].div(df.sum(axis=1))
# - PETHNIC: Percent of ethnic population 
df = fhv.LoadCensusBBS('./data/census2011/Ethnic Population.xls')
census['PETHNIC'] = df['Yes'].div(df.sum(axis=1))
# - PRENT: Percent of rented houses
# NOTE(review): 'Rent-free' households are counted together with 'Rented';
# confirm this matches the intended definition of "rented houses".
df = fhv.LoadCensusBBS('./data/census2011/Tenancy.xls')
not_owned = df[['Rented', 'Rent-free']].sum(axis=1)
census['PRENT'] = not_owned.div(df.sum(axis=1))


# EDUCATION
# Both indicators come from the same attainment table and share its row total.
# *BGD's primary education is ClassI-ClassV
# *https://en.wikipedia.org/wiki/Education_in_Bangladesh#/media/File:BangEduSys.png
df = fhv.LoadCensusBBS('./data/census2011/Educational Attainment.xls')
edu_total = df.sum(axis=1)
# - PNOPRIEDU: Percent of population who don't complete primary education
#   (sum of the first five attainment columns - TODO confirm which levels they are)
census['PNOPRIEDU'] = df.iloc[:, :5].sum(axis=1).div(edu_total)
# - PNOCOLLEGE: Percent of population who don't attend college
#   (all columns except the last four, which presumably are college level and above)
census['PNOCOLLEGE'] = df.iloc[:, :-4].sum(axis=1).div(edu_total)


# EMPLOYMENT
# - PNOEMPLOY: Percent of population without employment
# *This includes "Looking For Job" and "Do Not Work"; the remaining activity
# *statuses (e.g. "Employed" and "Household Work") count only in the denominator
df = fhv.LoadCensusBBS('./data/census2011/Activity Status.xls')
census['PNOEMPLOY'] = df[['Looking For Job','Do Not Work']].sum(axis=1)/df.sum(axis=1)
# - PAGRICULT : Percent of population with agricultural jobs
# (share of the 'Agriculture' column in the row total of the employment-field table)
df = fhv.LoadCensusBBS('./data/census2011/Employment Field.xls')
census['PAGRICULT'] = df['Agriculture']/df.sum(axis=1)


# POVERTY
# Read the upazila-level poverty estimates from the PDF report (pages 3-12),
# skipping the first two rows and using no header row.
df = read_pdf('./data/socioecon/Bangladesh_ZilaUpazila_pov_est_2010.pdf', 
             pages=list(range(3,13)), multiple_tables=False,
             pandas_options={'header': None, 'skiprows':2})
# Depending on the tabula-py version, read_pdf may return a list of DataFrames
# even when a single table is requested; collapse it into one frame.
if isinstance(df, list):
    df = pd.concat(df, ignore_index=True)
df.columns = ['zl-code','zila-name','UID4','upz-name','PPOOREXTR','PPOOR']
df = df.drop(['zl-code','zila-name','upz-name'], axis=1)
# Percentage to decimal
df[['PPOOREXTR','PPOOR']] = df[['PPOOREXTR','PPOOR']]/100
# Only the 4-digit upazila code is used to match the census UID, which requires
# the codes to be unique and identical on both sides. Compare the actual key
# sets, not just their sizes: equal counts with different codes would otherwise
# pass and the inner merge below would silently drop rows.
census_keys = set(census.index % 10000)
poverty_keys = set(df['UID4'])
assert census_keys == poverty_keys, (
    'UID4 mismatch between census and poverty tables: %s'
    % (census_keys ^ poverty_keys))
# Sorting by UID4
df = df.set_index('UID4').sort_index()
# Merging; validate raises MergeError if UID4 is duplicated on either side,
# so every upazila is matched exactly once.
census = (census.reset_index()
          .merge(df, on='UID4', validate='one_to_one')
          .set_index('UID')
          .drop('UID4', axis=1))
# Reordering to be matched with census_name
census = census[census_name['Name']]


# Additional columns: every census indicator shares the same spatial scale
# and data source, so each new column holds one constant value.
for column, value in (('Scale', 'Upazila'), ('Source', 'BBS (2011)')):
    census_name[column] = value

In [6]:
# Display the indicator metadata table with all of its columns.
# (Compact alternative: census_name[['Name','Domain','Description']])
census_name


Unnamed: 0,Name,Sign,Type,Domain,Description,Normalization,Scale,Source
0,PAGE5,pos,Person,Demographic,Percent of children under 5 years,MinMax,Upazila,BBS (2011)
1,PAGE65,pos,Person,Demographic,Percent of elder population (65+ years),MinMax,Upazila,BBS (2011)
2,PFEMALE,pos,Person,Demographic,Percent of woman,MinMax,Upazila,BBS (2011)
3,PRURAL,pos,House,Built,Percent of households in rural areas,MinMax,Upazila,BBS (2011)
4,PWEAKBUILT,pos,House,Built,Percent of households with weak materials,MinMax,Upazila,BBS (2011)
5,PNOWATER,pos,House,Built,Percent of households without public water supply,MinMax,Upazila,BBS (2011)
6,PNOSANITARY,pos,House,Built,Percent of households without sanitary facilities,MinMax,Upazila,BBS (2011)
7,PNOELEC,pos,House,Built,Percent of households without electricity,MinMax,Upazila,BBS (2011)
8,PDISABL,pos,Person,Social,Percent of population with any sort of disability,MinMax,Upazila,BBS (2011)
9,PLITERACY,pos,Person,Social,Percent of population who cannot read and write,MinMax,Upazila,BBS (2011)


### Load variables of Disaster-related Statistics (BBS, 2015)
- Original data is from [Bangladesh Disaster-related Statistics 2015: Climate Change and Natural Disaster Perspectives](http://203.112.218.65:8008/PageWebMenuContent.aspx?MenuKey=242).
- The extraction of the data is done by [LoadDisasterStatistics.ipynb](LoadDisasterStatistics.ipynb)

In [None]:
# TODO: load the disaster-related statistics extracted by LoadDisasterStatistics.ipynb.
# (This cell previously held only the bare name `load`, which raises NameError on Run All.)

In [None]:
# Downscale data from Zila to Upazila

### Normalization and Save variables

In [39]:
# Work on copies so the raw indicator table and its metadata stay untouched
data = census.copy()
data_name = census_name.copy()

# Validate every sign up front, so a bad entry is reported with its indicator
# name before any column has been modified.
bad_sign = data_name.loc[~data_name['Sign'].isin(['pos', 'neg']), ['Name', 'Sign']]
if len(bad_sign) > 0:
    raise ValueError(
        "Unknown 'Sign' (expected 'pos' or 'neg') for indicators: %s"
        % bad_sign.to_dict('records'))
# Flip signs of the 'neg' indicators; 'pos' indicators are left as they are
for index, row in data_name.iterrows():
    if row['Sign'] == 'neg':
        data[row['Name']] = -data[row['Name']].values
# Scaling each column to 0-1 with its Max/Min values
# NOTE(review): the 'Normalization' column of data_name is not consulted here;
# every indicator is min-max scaled regardless of its label.
scaler = MinMaxScaler()
data[data.columns] = scaler.fit_transform(data[data.columns])



./data/data.hdf is saved.
./data/data_table.hdf is saved.


In [None]:
# Save data
# Writes the normalized indicators and their metadata table to separate HDF5
# files. `key` is passed by keyword because recent pandas releases deprecate
# passing it positionally to to_hdf.
if True:
    fn = './data/data.hdf'
    data.to_hdf(fn, key='data')
    print('%s is saved.' % fn)
    fn = './data/data_table.hdf'
    data_name.to_hdf(fn, key='name')
    print('%s is saved.' % fn)

## Load physical data

### Correlation Matrix

In [None]:
### Type of House and Tenancy
# Read the union-level spreadsheet: skip the 11 rows above the table, take the
# next row as the header and the first column as the index, and ignore the
# last 8 rows of the sheet.
df = pd.read_excel(
    './data/union/Type of House and Tenancy.xls',
    skiprows=11,
    header=0,
    index_col=0,
    skipfooter=8,
)