In [242]:
import os
# Move into the quafing directory before importing the package. The path is
# relative, so running this cell again from inside that directory leaves the
# working directory unchanged.
# NOTE(review): presumably `quafing` is only importable from this directory —
# confirm; installing the package would make the chdir unnecessary.
os.chdir('../quafing/')
print(f"Working directory: {os.getcwd()}")
import quafing as q


import random
import numpy as np

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.stats import pearsonr, mode


# Reset matplotlib to its defaults first, then layer the notebook-wide
# figure style on top.
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update(
    {
        "font.size": 15,
        "figure.dpi": 100,
        "legend.fontsize": 13,
        "grid.alpha": 0.3,
        "axes.grid": True,
        "axes.axisbelow": True,
        "figure.figsize": (6, 5),
    }
)

Working directory: /Users/charlesdupont/Desktop/Thesis/code/quafing


In [161]:
def load_data(path):
    """
    Read the Stata (.dta) file at `path` into a pandas DataFrame.

    Categorical conversion is switched off, so labelled variables are
    returned as their stored codes rather than as pandas categoricals.
    """
    frame = pd.read_stata(path, convert_categoricals=False)
    return frame

In [162]:
# Location of the survey files. Both parts are relative paths ending in "/",
# because the load cells build a file path by plain string concatenation:
# DATA_DIR + DATA_SUBDIR + filename.
DATA_DIR = "../../BCCASII/"
DATA_SUBDIR = "Household/"

## A: Location and identification detail

In [163]:
# Module A file
filename = "001_mod_a.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,vcode,ucode,tcode,dcode,phone,ezcode,hhid,rid_male,rid_female,hhhid,...,a14dd,a14mm,a14yy,a15dd,a15mm,a15yy,a16dd,a16mm,a16yy,flag
0,,1,1,1,,7,1.0,1.0,2.0,1.0,...,,,2012.0,,,2012.0,,,,
1,,1,1,1,,7,2.0,1.0,2.0,1.0,...,,,2012.0,,,,,,,
2,,1,1,1,,7,3.0,1.0,2.0,1.0,...,,,2012.0,,,2012.0,,,,
3,,1,1,1,,7,4.0,1.0,2.0,1.0,...,,,2012.0,,,2012.0,,,,
4,,1,1,1,,7,5.0,,2.0,1.0,...,,,2012.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
822,,40,39,31,,6,796.0,1.0,2.0,1.0,...,,,2012.0,,,,,,,
823,,40,39,31,,6,797.0,1.0,2.0,1.0,...,,,2012.0,,,,,,,
824,,40,39,31,,6,798.0,1.0,2.0,1.0,...,,,2012.0,,,,,,,
825,,40,39,31,,6,799.0,8.0,2.0,2.0,...,,,2012.0,,,,,,,


In [165]:
# keep household ID, agro-ecological zone
# .copy() makes `processed` an independent frame, so adding a column below
# does not assign into a slice of `df` (chained assignment).
processed = df[["hhid", "ezcode"]].copy()

# construct community code from union, thana, district codes,
# joined as "<ucode>_<tcode>_<dcode>"
processed["community_code"] = (
    df["ucode"].astype(str)
    + "_" + df["tcode"].astype(str)
    + "_" + df["dcode"].astype(str)
)

## B: Household composition and education

In [168]:
# Module B file
filename = "002_mod_b.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,mid,b01,b01_1,b02,b03,b04,b05,b06,b07,b08_1,b08_2,b08_3
0,1.0,1,1,0,1.0,47.0,2.0,4.0,6.0,2.0,4.0,5.0,25.0
1,1.0,2,2,0,2.0,40.0,2.0,4.0,6.0,2.0,21.0,5.0,22.0
2,1.0,3,1,0,3.0,19.0,1.0,4.0,12.0,1.0,20.0,4.0,6.0
3,1.0,4,2,0,3.0,17.0,1.0,4.0,9.0,1.0,20.0,5.0,4.0
4,1.0,5,2,0,3.0,13.0,1.0,4.0,6.0,1.0,20.0,99.0,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4536,799.0,9,2,2,4.0,16.0,2.0,4.0,5.0,2.0,21.0,99.0,99.0
4537,800.0,1,1,0,2.0,37.0,2.0,1.0,99.0,2.0,23.0,99.0,99.0
4538,800.0,2,2,0,1.0,32.0,2.0,1.0,99.0,2.0,21.0,4.0,10.0
4539,800.0,3,1,0,3.0,7.0,1.0,4.0,1.0,1.0,20.0,99.0,99.0


In [180]:
def _highest_rank(values, ordering):
    """
    Return the highest position in `ordering` reached by the non-missing
    entries of `values`, or -1 when every entry is missing.

    Raises ValueError if a non-missing value does not appear in `ordering`.
    """
    ranks = [ordering.index(v) for v in values if not np.isnan(v)]
    return max(ranks) if ranks else -1


# One grouping shared by all per-household features below, instead of
# re-filtering the full frame once per household for every feature.
by_household = df.groupby("hhid")

# household size (number of non-missing member ids per household)
processed = pd.DataFrame({"household_size": by_household["mid"].count()})

# household id
processed["hhid"] = processed.index

# Kept as a top-level name: later cells reuse it as the list of households.
unique_hhids = df["hhid"].unique()

# ratio male, female (b01 == 1 counted as male, b01 == 2 as female);
# a household with neither code gets NaN for both ratios
n_male = (df["b01"] == 1).groupby(df["hhid"]).sum()
n_female = (df["b01"] == 2).groupby(df["hhid"]).sum()
n_male_female = n_male + n_female
processed["ratio_male"] = n_male / n_male_female
processed["ratio_female"] = n_female / n_male_female

# number of children (members with b03 < 18; a missing b03 is not counted)
processed["number_children"] = (df["b03"] < 18).groupby(df["hhid"]).sum()

# highest level of education
# Codes are listed from lowest to highest; the stored feature is the position
# in this list, not the raw code. -1 means no member has a b06 value.
education_ordering = [99, 66, 67, 68, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 22, 10, 33, 12, 15, 16, 17]
processed["highest_education"] = by_household["b06"].agg(
    lambda codes: _highest_rank(codes, education_ordering)
)

# highest level of literacy (same encoding: position in the list, -1 if none)
literacy_ordering = [1, 2, 3, 4]
processed["highest_literacy"] = by_household["b05"].agg(
    lambda codes: _highest_rank(codes, literacy_ordering)
)

# primary occupation categories
categories = {
    "day_labor": [1,2,3],
    "self_employ_farm": [4,5,6,7,8],
    "self_employ_non_farm": [9],
    "low_income_profession": [10,11,12,13],
    "mechanic": [14,15],
    "other": [16, 17, 18, 19],
    "non_earning": [20,21,22,23,24,25,26]
}

# For each category, count how many DISTINCT codes of that category occur
# among the household's primary occupations (b08_1); 0 when none occur.
# NOTE(review): this is not the number of members in the category — two
# members sharing one code count once. Confirm that this is intended.
for category, codes in categories.items():
    in_category = df.loc[df["b08_1"].isin(codes)]
    processed[category] = (
        in_category.groupby("hhid")["b08_1"]
        .nunique()
        .reindex(processed.index, fill_value=0)
    )

## C: Roster of land and water bodies owned or under operation

In [184]:
# Module C file
filename = "003_mod_c.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,plotid,c101,c102,c103,c104,c105,c106_1,c106_2,c106_3,c107,c108_1,c108_2,c108_3,c109,c110,c111,c112,c113
0,1.0,1.0,1,9.0,0.0,1.0,4,,,,,1.0,,1.0,90000.0,3.0,1989.0,5.0,
1,1.0,2.0,2,66.0,150.0,2.0,4,1.0,1.0,1.0,1.0,1.0,,1.0,200000.0,3.0,1989.0,5.0,
2,1.0,3.0,2,66.0,150.0,2.0,4,1.0,1.0,1.0,1.0,1.0,,1.0,200000.0,3.0,1989.0,5.0,
3,1.0,4.0,2,66.0,180.0,2.0,4,1.0,1.0,6.0,1.0,1.0,,1.0,200000.0,3.0,1989.0,5.0,
4,1.0,5.0,2,33.0,200.0,2.0,4,1.0,1.0,1.0,10.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4841,800.0,3.0,2,33.0,500.0,1.0,4,1.0,1.0,1.0,1.0,1.0,,1.0,100000.0,3.0,1980.0,5.0,
4842,800.0,4.0,2,33.0,1000.0,2.0,4,1.0,1.0,1.0,1.0,1.0,,1.0,110000.0,3.0,1980.0,5.0,
4843,800.0,5.0,2,23.0,500.0,1.0,4,1.0,1.0,1.0,1.0,1.0,,1.0,80000.0,3.0,1980.0,5.0,
4844,800.0,6.0,2,33.0,100.0,3.0,4,1.0,1.0,1.0,4.0,97.0,,1.0,100000.0,6.0,2009.0,3.0,


In [250]:
def _most_common(values):
    """
    Return the most frequent non-missing entry of the Series `values`
    (the smallest one when several are tied), or -1 if all are missing.
    """
    modes = values.dropna().mode()  # Series.mode returns the modes sorted
    return modes.iloc[0] if len(modes) else -1


by_household = df.groupby("hhid")

# total land area
# Each aggregate is computed on the single column it needs rather than on the
# whole frame. rename_axis(None) keeps the index unnamed, so the "hhid" column
# added below is the only thing called "hhid".
processed = pd.DataFrame(
    {"total_land_size": by_household["c102"].sum()}
).rename_axis(None)

# household id
processed["hhid"] = processed.index

# total value per month (sum of c113; missing values are skipped)
processed["total_land_value_per_month"] = by_household["c113"].sum()

# min land acquisition year
processed["min_land_acquisition_year"] = by_household["c111"].min()

# max land acquisition year
processed["max_land_acquisition_year"] = by_household["c111"].max()

# mean land acquisition year (rounded to a whole year)
processed["mean_land_acquisition_year"] = by_household["c111"].mean().round()

# most common answer from (plot type, soil type, operational status of land, acquisition means of land)
# feature name -> source column; code ranges noted for reference only
categories = {
    "plot_type": "c101",                # codes 1-9
    "soil_type": "c105",                # codes 1-5
    "land_operational_status": "c107",  # codes 1-10
    "land_acquisition_means": "c110",   # codes 1-7
}

# The households come from this module's own frame, so every household in
# `processed` gets a value (-1 when all of its answers are missing).
for feature, col in categories.items():
    processed[feature] = by_household[col].agg(_most_common)

## F1: Livestock and poultry

In [337]:
# Module F1 file
filename = "004_mod_f1.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,f101,f102,f103,f104,f105,f106,f107,f108,f109,f110,f111,f112,f113,f114,f115,f116,f117
0,1.0,1,2,20000.0,2,30000.0,0,0,0,0,0,0,0,0,,,,
1,1.0,2,3,4500.0,4,8000.0,3,0,0,0,0,2,0,0,,,,
2,1.0,4,15,1500.0,30,2200.0,30,0,0,0,0,7,3,5,4.0,17.00,50.0,950.0
3,2.0,1,3,20000.0,3,30000.0,0,0,0,0,0,0,0,0,,,,
4,2.0,4,10,1000.0,4,800.0,0,0,0,0,0,4,2,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,797.0,4,5,900.0,11,1300.0,20,0,0,0,0,14,0,0,,,,
1452,798.0,4,2,500.0,3,750.0,0,6,0,0,0,0,3,2,1.0,0.00,0.0,300.0
1453,799.0,2,5,6000.0,3,6000.0,0,0,0,0,0,1,0,1,1.0,0.25,5.0,1800.0
1454,799.0,4,12,2000.0,6,1200.0,10,0,0,0,0,4,12,0,,,,


In [338]:
# household id
# One row per household in `unique_hhids`; a household with no rows in this
# module gets 0 for every feature.
processed = pd.DataFrame.from_dict({"hhid":list(unique_hhids)})

# species label -> code in column f101
# NOTE(review): "chidken_duck" looks like a typo for "chicken_duck"; left
# unchanged because the label becomes part of the output column names.
categories = {
    "cattle_buffalo":1,
    "goat_sheep":2,
    "pigs":3,
    "chidken_duck":4
}

# (feature suffix, source column)
cols = [("count_2011", "f102"), ("value_2011", "f103"),
        ("count_now", "f104"), ("value_now", "f105"), ("total_selling_value", "f117")]

results = {hhid:{} for hhid in unique_hhids}

for hhid in unique_hhids:
    household = df.loc[df.hhid==hhid]

    for c, number in categories.items():
        # rows of this household for the current species (filtered once)
        species_rows = household.loc[household["f101"]==number]

        for l, col in cols:
            # The first matching row is used; an absent species or a missing
            # value is recorded as 0.
            value = species_rows[col].iloc[0] if len(species_rows) else 0
            results[hhid][c+"_"+l] = 0 if np.isnan(value) else value

# Column names are derived from the definitions above rather than from one
# particular household's entry, so this also works when a given hhid is
# absent or `unique_hhids` is empty.
feature_names = [c+"_"+l for c in categories for l, _ in cols]
for column in feature_names:
    processed[column] = [results[hhid][column] for hhid in results]

## F3: Expenditure for livestock, Poultry/Duckery Production Last 12 Months

In [340]:
# Module F3 file
filename = "005_mod_f3.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,f301,f302,f303,f304_1,f304_2,f304_3,f304_4,f305_1,f305_2,f306_1,f306_2,f307,f308_1,f308_2,f308_3
0,1.0,1,1000,750,350.0,350,1.5,2.00,0,0,,,100,12,9.0,10.0
1,1.0,2,400,50,100.0,350,0.5,1.00,0,0,,,30,9,10.0,
2,1.0,4,500,100,0.0,400,,0.50,0,0,,,0,6,5.0,9.0
3,2.0,1,3000,650,700.0,400,1.5,1.50,0,0,,,200,9,10.0,12.0
4,2.0,4,0,0,0.0,380,,0.25,0,0,,,0,6,5.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1439,798.0,4,200,30,0.0,620,,0.50,0,0,,,0,6,4.0,
1440,799.0,2,0,200,150.0,700,1.0,1.00,0,0,,,100,4,,
1441,799.0,4,0,0,0.0,700,,1.00,0,0,,,0,5,6.0,4.0
1442,800.0,2,0,0,0.0,200,,0.50,0,0,,,0,4,,


In [356]:
# f306 = f306_1 + f306_2, treating a missing part as 0
df["f306"] = df["f306_1"].fillna(0) + df["f306_2"].fillna(0)

# Per-household totals. The columns are selected before aggregating, so only
# these four are summed instead of every column in the frame.
expenditure_cols = ["f302", "f303", "f306", "f307"]
processed = df.groupby("hhid")[expenditure_cols].sum()

# household id
processed["hhid"] = processed.index

## G1: Credit obtained

In [359]:
# Module G file
filename = "006_mod_g.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,mid,loanid,g01,g02,g04,g05,g06,g07,g08,...,g26,g27,g28,g29,g30,g31,g32_1,g32_2,g33_1,g33_2
0,1.0,1,1,11,2,2011,3,2,21000.0,1,...,2,50.0,1.0,50.0,,,1,2.0,1,3.0
1,2.0,1,1,14,17,2010,7,20,50000.0,1,...,17,80.0,1.0,20.0,,,1,3.0,1,
2,3.0,1,1,10,4,1995,9,10,3500.0,1,...,4,100.0,,,,,33,,33,
3,4.0,5,1,9,14,2011,9,18,60000.0,1,...,14,50.0,4.0,50.0,,,1,5.0,1,
4,4.0,1,2,14,3,2012,5,20,30000.0,1,...,3,100.0,,,,,1,,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1147,796.0,1,1,9,4,2001,5,20,16000.0,1,...,4,100.0,,,,,1,,1,
1148,797.0,1,1,14,3,2012,2,16,50000.0,1,...,3,100.0,,,,,1,,1,
1149,799.0,1,1,9,4,2006,8,10,16000.0,1,...,4,100.0,,,,,1,,1,
1150,799.0,2,2,14,15,2012,2,15,200000.0,1,...,15,50.0,11.0,50.0,,,1,2.0,33,8.0


In [361]:
# Per-household totals over all loan rows. The columns are selected before
# aggregating, so only these four are summed instead of every column.
credit_cols = ["g07", "g10", "g16", "g18"]
processed = df.groupby("hhid")[credit_cols].sum()

# household id
processed["hhid"] = processed.index

## G2: Repayment and utilization of Loan

## H: Household Assets

In [363]:
# Module H file
filename = "007_mod_h.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,rid_male,rid_female,asset,h02,h03,h04,h05,h06,h07,h08,h09_1,h09_2,h10_1,h10_2,h11,h12
0,1.0,,2.0,2,1,4,2012,200,150,1,5,99.0,,1,,999,999
1,1.0,,2.0,3,3,8,2011,5500,3000,1,1,1.0,9.0,1,9.0,999,999
2,1.0,,2.0,10,4,6,2007,20000,35000,1,1,1.0,,2,,999,999
3,1.0,,2.0,11,1,6,1992,2250,1000,1,7,99.0,,1,3.0,999,999
4,1.0,,2.0,12,1,9,2009,2250,1200,1,1,1.0,,3,,999,999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345,799.0,,2.0,22,1,10,2007,500,100,2,1,33.0,,2,,999,999
6346,799.0,,2.0,28,3,5,2008,500,200,2,1,33.0,,2,,999,999
6347,800.0,,2.0,10,1,4,2001,500,700,1,1,1.0,,2,,999,999
6348,800.0,,2.0,19,6,6,2011,400,1500,2,1,2.0,,1,2.0,50,50


In [None]:
# household id
# One row per household in `unique_hhids`; a household with no asset rows in
# a category gets 0 for that category.
processed = pd.DataFrame.from_dict({"hhid":list(unique_hhids)})

# category label -> codes in column "asset"
# NOTE(review): "other" only matches code 0 — confirm whether it should
# instead collect every code not listed in the other categories.
categories = {
    "consumable_durables":[i for i in range(1, 12)],
    "transport":[i for i in range(12, 16)],
    "livestock_poultry":[i for i in range(16, 20)],
    "agricultural_equipments":[i for i in range(20,29)],
    "other":[0]
}

# (feature suffix, source column)
# NOTE(review): h06 is aggregated because the draft of this cell singled it
# out ("#h06"). Confirm in the codebook that it is the intended column, that
# summing is the right aggregate, and whether it contains sentinel codes that
# must be excluded first.
asset_cols = [("total_h06", "h06")]

results = {hhid:{} for hhid in unique_hhids}

for hhid in unique_hhids:
    household = df.loc[df.hhid==hhid]

    for c, numbers in categories.items():
        # all asset rows of this household that fall into the category
        category_rows = household.loc[household["asset"].isin(numbers)]

        for l, col in asset_cols:
            # Series.sum skips missing values and returns 0 for no rows
            results[hhid][c+"_"+l] = category_rows[col].sum()

# Column names are derived from the definitions above so that the cell does
# not depend on any particular hhid being present.
feature_names = [c+"_"+l for c in categories for l, _ in asset_cols]
for column in feature_names:
    processed[column] = [results[hhid][column] for hhid in results]

## H1: Assets: Sales of Assets and Change in Ownership

## I: Food Consumption: Purchases, Home Production, and Other Sources

## J1: Non-food Expenditure: Recall Period Last 1 month

## J2: Non-food Expenditure: Recall Period Last 1 month and last 12 months

## K: Housing, Source of Water and sanitation

## L: Weather events adversely affected the household or the farming

## L1: Weather events adversely affected the household or the farming cope

## L2: Shocks

## L3: Positive Economic Events

## M: Agricultural extension

## Q: Perceptions of Climate Change

## O: Changes in farming practices due to climate change

## O1: Any integration about Livestock/Poultry

## O2: Constraints to adaptation

## R1: Information of group based approach

## R2: Group activity related information

## R3: Network

## R4: Trust and Solidarity

## R5: Economic setbacks

## R6: Collective Action and Cooperation and Sociability

## R7: Information and Communication

## R8: Production and climate Information

## R9: Production and climate Information

## S: Empowerment and Political Action

## T: Employment

## U: Time use of male and female members in the household

## V: Anthropometry