In [2]:
import os
# Switch the working directory to ../quafing/ BEFORE importing quafing.
# NOTE(review): presumably this makes the import below resolve against the local
# checkout — confirm. Every relative path used later in the notebook is resolved
# against this new working directory.
os.chdir('../quafing/')
print(f"Working directory: {os.getcwd()}")
import quafing as q


import random
import numpy as np
from scipy.stats import mode

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import matplotlib.pyplot as plt


# Start from matplotlib's built-in defaults, then layer the notebook-wide
# figure styling on top.
plt.rcParams.update(plt.rcParamsDefault)
plot_style = {
    "font.size": 15,
    "figure.dpi": 100,
    "legend.fontsize": 13,
    "grid.alpha": 0.3,
    "axes.grid": True,
    "axes.axisbelow": True,
    "figure.figsize": (6, 5),
}
plt.rcParams.update(plot_style)

Working directory: /Users/charlesdupont/Desktop/Thesis/code/quafing


In [179]:
def load_data(path):
    """
    Read the Stata (.dta) file at `path` and return it as a DataFrame.

    Categorical columns are NOT converted to their value labels
    (convert_categoricals=False), so the stored codes are returned as-is.
    """
    frame = pd.read_stata(path, convert_categoricals=False)
    return frame


def add_missing_hhids(all_hhids, df):
    """
    Adds empty rows for hhids missing from df.
    df must already have an "hhid" column!

    Parameters
    ----------
    all_hhids : iterable
        Every household id that should be present in the result.
    df : pandas.DataFrame
        Frame with an "hhid" column.

    Returns
    -------
    pandas.DataFrame
        The rows of df followed by one row per missing hhid, in ascending
        hhid order, with NaN in every column other than "hhid". The original
        index labels are kept (no re-indexing).
    """
    # Use the argument, not the notebook-level `unique_hhids` global: the
    # function must work for whatever id list the caller passes in.
    missing = sorted(set(all_hhids) - set(df["hhid"].unique()))
    if not missing:
        # Nothing to add; return a copy so callers always get a new frame.
        return df.copy()
    missing_dict = {"hhid": missing}
    for col in df.columns:
        if col != "hhid":
            missing_dict[col] = [np.nan] * len(missing)
    missing_df = pd.DataFrame.from_dict(missing_dict)
    return pd.concat([df, missing_df])

In [180]:
# Root folder of the survey data; a relative path, so it is resolved against
# the current working directory.
DATA_DIR = "../../BCCASII/"
# Subfolder (inside DATA_DIR) that holds the module files loaded below.
DATA_SUBDIR = "Household/"

## A: Location and identification detail

In [181]:
# Module A (location and identification detail): read the raw file and show it.
filename = "001_mod_a.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,vcode,ucode,tcode,dcode,phone,ezcode,hhid,rid_male,rid_female,hhhid,...,a14dd,a14mm,a14yy,a15dd,a15mm,a15yy,a16dd,a16mm,a16yy,flag
0,,1,1,1,,7,1.0,1.0,2.0,1.0,...,,,2012.0,,,2012.0,,,,
1,,1,1,1,,7,2.0,1.0,2.0,1.0,...,,,2012.0,,,,,,,
2,,1,1,1,,7,3.0,1.0,2.0,1.0,...,,,2012.0,,,2012.0,,,,
3,,1,1,1,,7,4.0,1.0,2.0,1.0,...,,,2012.0,,,2012.0,,,,
4,,1,1,1,,7,5.0,,2.0,1.0,...,,,2012.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
822,,40,39,31,,6,796.0,1.0,2.0,1.0,...,,,2012.0,,,,,,,
823,,40,39,31,,6,797.0,1.0,2.0,1.0,...,,,2012.0,,,,,,,
824,,40,39,31,,6,798.0,1.0,2.0,1.0,...,,,2012.0,,,,,,,
825,,40,39,31,,6,799.0,8.0,2.0,2.0,...,,,2012.0,,,,,,,


In [128]:
# keep household ID, agro-ecological zone
# Take an explicit copy: the next statement adds a column, and writing into a
# slice of df is what triggers pandas' chained-assignment warning.
processed = df[["hhid", "ezcode"]].copy()

# construct community code from union, thana, district codes
# (the three codes are joined as "<ucode>_<tcode>_<dcode>")
processed["community_code"] = (
    df["ucode"].astype(str) + "_"
    + df["tcode"].astype(str) + "_"
    + df["dcode"].astype(str)
)

In [129]:
# Distinct household ids of the currently loaded module, in order of first
# appearance; the later feature-building cells iterate over this array.
unique_hhids = pd.unique(df["hhid"])

## B: Household composition and education

In [182]:
# Module B (household composition and education): read the raw file and show it.
filename = "002_mod_b.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,mid,b01,b01_1,b02,b03,b04,b05,b06,b07,b08_1,b08_2,b08_3
0,1.0,1,1,0,1.0,47.0,2.0,4.0,6.0,2.0,4.0,5.0,25.0
1,1.0,2,2,0,2.0,40.0,2.0,4.0,6.0,2.0,21.0,5.0,22.0
2,1.0,3,1,0,3.0,19.0,1.0,4.0,12.0,1.0,20.0,4.0,6.0
3,1.0,4,2,0,3.0,17.0,1.0,4.0,9.0,1.0,20.0,5.0,4.0
4,1.0,5,2,0,3.0,13.0,1.0,4.0,6.0,1.0,20.0,99.0,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4536,799.0,9,2,2,4.0,16.0,2.0,4.0,5.0,2.0,21.0,99.0,99.0
4537,800.0,1,1,0,2.0,37.0,2.0,1.0,99.0,2.0,23.0,99.0,99.0
4538,800.0,2,2,0,1.0,32.0,2.0,1.0,99.0,2.0,21.0,4.0,10.0
4539,800.0,3,1,0,3.0,7.0,1.0,4.0,1.0,1.0,20.0,99.0,99.0


In [8]:
g = df.groupby("hhid").count()

# household size (number of roster rows with a non-null member id)
processed = pd.DataFrame.from_dict({"household_size":g["mid"]})

# household id
processed["hhid"] = processed.index

# Split the roster by household ONCE. Every feature below needs "the rows of
# household X"; re-filtering df for each household in each loop repeats the
# same full-table scan many times over.
empty_roster = df.iloc[0:0]
roster_by_hhid = {key: rows for key, rows in df.groupby("hhid")}
households = {hhid: roster_by_hhid.get(hhid, empty_roster) for hhid in unique_hhids}

# ratio male, female
males = {}
females = {}
for hhid, household in households.items():
    m = int((household["b01"] == 1).sum())
    f = int((household["b01"] == 2).sum())
    s = m + f
    # An id in unique_hhids with no roster rows has s == 0; record NaN instead
    # of raising ZeroDivisionError.
    males[hhid] = m / s if s else np.nan
    females[hhid] = f / s if s else np.nan
processed = processed.join(pd.DataFrame.from_dict({"ratio_male":males}))
processed = processed.join(pd.DataFrame.from_dict({"ratio_female":females}))

# number of children (rows with b03 < 18)
num_children = {hhid: int((household["b03"] < 18).sum())
                for hhid, household in households.items()}
processed = processed.join(pd.DataFrame.from_dict({"number_children":num_children}))

# highest level of education
# The feature is the POSITION of a b06 code in this list, so list order defines
# the ranking. A code that is not listed raises ValueError.
education_ordering = [99, 66, 67, 68, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 22, 10, 33, 12, 15, 16, 17]
max_education = {}
for hhid, household in households.items():
    education_levels = [education_ordering.index(x) for x in household["b06"].dropna()]
    # -1 marks households without any non-missing b06 value
    max_education[hhid] = max(education_levels) if education_levels else -1
processed = processed.join(pd.DataFrame.from_dict({"highest_education":max_education}))

# highest level of literacy (same position-in-list scheme, applied to b05)
literacy_ordering = [1, 2, 3, 4]
max_literacy = {}
for hhid, household in households.items():
    literacy_labels = [literacy_ordering.index(x) for x in household["b05"].dropna()]
    max_literacy[hhid] = max(literacy_labels) if literacy_labels else -1
processed = processed.join(pd.DataFrame.from_dict({"highest_literacy":max_literacy}))

# primary occupation categories
categories = {
    "day_labor": [1,2,3],
    "self_employ_farm": [4,5,6,7,8],
    "self_employ_non_farm": [9],
    "low_income_profession": [10,11,12,13],
    "mechanic": [14,15],
    "other": [16, 17, 18, 19],
    "non_earning": [20,21,22,23,24,25,26]
}

# NOTE(review): each feature counts how many DISTINCT codes of the category
# occur among the household's b08_1 values, not how many members fall in the
# category — confirm that this is the intended definition.
for c, c_list in categories.items():
    c_dict = {}
    for hhid, household in households.items():
        primary_occupation_labels = set(household["b08_1"])
        c_dict[hhid] = sum(1 for item in c_list if item in primary_occupation_labels)
    processed = processed.join(pd.DataFrame.from_dict({c:c_dict}))

## C: Roster of land and water bodies owned or under operation

In [9]:
# Module C (roster of land and water bodies): read the raw file and show it.
filename = "003_mod_c.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,plotid,c101,c102,c103,c104,c105,c106_1,c106_2,c106_3,c107,c108_1,c108_2,c108_3,c109,c110,c111,c112,c113
0,1.0,1.0,1,9.0,0.0,1.0,4,,,,,1.0,,1.0,90000.0,3.0,1989.0,5.0,
1,1.0,2.0,2,66.0,150.0,2.0,4,1.0,1.0,1.0,1.0,1.0,,1.0,200000.0,3.0,1989.0,5.0,
2,1.0,3.0,2,66.0,150.0,2.0,4,1.0,1.0,1.0,1.0,1.0,,1.0,200000.0,3.0,1989.0,5.0,
3,1.0,4.0,2,66.0,180.0,2.0,4,1.0,1.0,6.0,1.0,1.0,,1.0,200000.0,3.0,1989.0,5.0,
4,1.0,5.0,2,33.0,200.0,2.0,4,1.0,1.0,1.0,10.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4841,800.0,3.0,2,33.0,500.0,1.0,4,1.0,1.0,1.0,1.0,1.0,,1.0,100000.0,3.0,1980.0,5.0,
4842,800.0,4.0,2,33.0,1000.0,2.0,4,1.0,1.0,1.0,1.0,1.0,,1.0,110000.0,3.0,1980.0,5.0,
4843,800.0,5.0,2,23.0,500.0,1.0,4,1.0,1.0,1.0,1.0,1.0,,1.0,80000.0,3.0,1980.0,5.0,
4844,800.0,6.0,2,33.0,100.0,3.0,4,1.0,1.0,1.0,4.0,97.0,,1.0,100000.0,6.0,2009.0,3.0,


In [10]:
g = df.groupby("hhid")

# Each aggregate selects its column BEFORE aggregating, so only that column is
# reduced rather than every column of the module.

# total land area
processed = pd.DataFrame.from_dict({"total_land_size":dict(g["c102"].sum())})

# household id
processed["hhid"] = processed.index

# total value per month
processed = processed.join(pd.DataFrame.from_dict({"total_land_value_per_month":
                                                   dict(g["c113"].sum())}))

# min land acquisition year
processed = processed.join(pd.DataFrame.from_dict({"min_land_acquisition_year":
                                                   dict(g["c111"].min())}))

# max land acquisition year
processed = processed.join(pd.DataFrame.from_dict({"max_land_acquisition_year":
                                                   dict(g["c111"].max())}))

# mean land acquisition year (rounded to a whole year)
processed = processed.join(pd.DataFrame.from_dict({"mean_land_acquisition_year":
                                                   dict(g["c111"].mean().round())}))

# most common answer from (plot type, soil type, operational status of land, acquisition means of land)
# Keys are (feature name, source column); the code lists are not used below.
categories = {
    ("plot_type", "c101"): [i for i in range(1, 10)],
    ("soil_type", "c105"): [i for i in range(1, 6)],
    ("land_operational_status", "c107"): [i for i in range(1, 11)],
    ("land_acquisition_means", "c110"): [i for i in range(1, 8)],
}

plots_by_hhid = {key: rows for key, rows in g}
for (c, col), c_list in categories.items():
    c_dict = {}
    for hhid in unique_hhids:
        household = plots_by_hhid.get(hhid)
        labels = [] if household is None else household[col].dropna()
        if len(labels):
            # Series.mode() returns the modes in ascending order, so .iloc[0]
            # picks the smallest value on ties — the same tie-break as
            # scipy.stats.mode, without indexing into its return value (which is
            # a scalar for 1-D input on current SciPy, so [0][0] fails).
            c_dict[hhid] = labels.mode().iloc[0]
        else:
            # -1 marks households with no non-missing answer
            c_dict[hhid] = -1
    processed = processed.join(pd.DataFrame.from_dict({c:c_dict}))

## F1: Livestock and poultry

In [11]:
# Module F1 (livestock and poultry): read the raw file and show it.
filename = "004_mod_f1.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,f101,f102,f103,f104,f105,f106,f107,f108,f109,f110,f111,f112,f113,f114,f115,f116,f117
0,1.0,1,2,20000.0,2,30000.0,0,0,0,0,0,0,0,0,,,,
1,1.0,2,3,4500.0,4,8000.0,3,0,0,0,0,2,0,0,,,,
2,1.0,4,15,1500.0,30,2200.0,30,0,0,0,0,7,3,5,4.0,17.00,50.0,950.0
3,2.0,1,3,20000.0,3,30000.0,0,0,0,0,0,0,0,0,,,,
4,2.0,4,10,1000.0,4,800.0,0,0,0,0,0,4,2,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,797.0,4,5,900.0,11,1300.0,20,0,0,0,0,14,0,0,,,,
1452,798.0,4,2,500.0,3,750.0,0,6,0,0,0,0,3,2,1.0,0.00,0.0,300.0
1453,799.0,2,5,6000.0,3,6000.0,0,0,0,0,0,1,0,1,1.0,0.25,5.0,1800.0
1454,799.0,4,12,2000.0,6,1200.0,10,0,0,0,0,4,12,0,,,,


In [12]:
# household id
processed = pd.DataFrame.from_dict({"hhid":list(unique_hhids)})

# feature-name prefix -> f101 code
# NOTE(review): "chidken_duck" looks like a typo for "chicken_duck"; left
# unchanged because it becomes part of the output column names.
categories = {
    "cattle_buffalo":1,
    "goat_sheep":2,
    "pigs":3,
    "chidken_duck":4
}

# (feature-name suffix, source column)
cols = [("count_2011", "f102"), ("value_2011", "f103"), 
        ("count_now", "f104"), ("value_now", "f105"), ("total_selling_value", "f117")]   

results = {hhid:{} for hhid in unique_hhids}

for hhid in unique_hhids:
    household = df.loc[df.hhid==hhid]
    reported = set(household["f101"])

    for c, number in categories.items():
        if number not in reported:
            # no row for this category: every feature is 0
            for l, _ in cols:
                results[hhid][c+"_"+l] = 0
        else:
            # use the first row reported for this category
            category_rows = household.loc[household["f101"]==number]
            for l, col in cols:
                value = category_rows[col].iloc[0]
                # missing values are recorded as 0
                results[hhid][c+"_"+l] = 0 if np.isnan(value) else value

# Derive the column names from the definitions above instead of peeking at
# results[1], which raises KeyError when there is no household with id 1.
for column in [c+"_"+l for c in categories for l, _ in cols]:
    processed[column] = [results[hhid][column] for hhid in unique_hhids]

## F3: Expenditure for livestock, Poultry/Duckery Production Last 12 Months

In [13]:
# Module F3 (livestock / poultry expenditure, last 12 months): read the raw file and show it.
filename = "005_mod_f3.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,f301,f302,f303,f304_1,f304_2,f304_3,f304_4,f305_1,f305_2,f306_1,f306_2,f307,f308_1,f308_2,f308_3
0,1.0,1,1000,750,350.0,350,1.5,2.00,0,0,,,100,12,9.0,10.0
1,1.0,2,400,50,100.0,350,0.5,1.00,0,0,,,30,9,10.0,
2,1.0,4,500,100,0.0,400,,0.50,0,0,,,0,6,5.0,9.0
3,2.0,1,3000,650,700.0,400,1.5,1.50,0,0,,,200,9,10.0,12.0
4,2.0,4,0,0,0.0,380,,0.25,0,0,,,0,6,5.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1439,798.0,4,200,30,0.0,620,,0.50,0,0,,,0,6,4.0,
1440,799.0,2,0,200,150.0,700,1.0,1.00,0,0,,,100,4,,
1441,799.0,4,0,0,0.0,700,,1.00,0,0,,,0,5,6.0,4.0
1442,800.0,2,0,0,0.0,200,,0.50,0,0,,,0,4,,


In [14]:
# f306 = f306_1 + f306_2, with a missing component counted as 0 (added to df itself).
df["f306"] = df["f306_1"].fillna(0).add(df["f306_2"].fillna(0))
# Per-household totals of the selected columns; hhid is kept both as the index
# and as a regular column.
processed = (
    df.groupby("hhid")[["f302", "f303", "f306", "f307"]]
    .sum()
    .assign(hhid=lambda frame: frame.index)
)

## G1: Credit obtained

In [15]:
# Module G1 (credit obtained): read the raw file and show it.
filename = "006_mod_g.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,mid,loanid,g01,g02,g04,g05,g06,g07,g08,...,g26,g27,g28,g29,g30,g31,g32_1,g32_2,g33_1,g33_2
0,1.0,1,1,11,2,2011,3,2,21000.0,1,...,2,50.0,1.0,50.0,,,1,2.0,1,3.0
1,2.0,1,1,14,17,2010,7,20,50000.0,1,...,17,80.0,1.0,20.0,,,1,3.0,1,
2,3.0,1,1,10,4,1995,9,10,3500.0,1,...,4,100.0,,,,,33,,33,
3,4.0,5,1,9,14,2011,9,18,60000.0,1,...,14,50.0,4.0,50.0,,,1,5.0,1,
4,4.0,1,2,14,3,2012,5,20,30000.0,1,...,3,100.0,,,,,1,,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1147,796.0,1,1,9,4,2001,5,20,16000.0,1,...,4,100.0,,,,,1,,1,
1148,797.0,1,1,14,3,2012,2,16,50000.0,1,...,3,100.0,,,,,1,,1,
1149,799.0,1,1,9,4,2006,8,10,16000.0,1,...,4,100.0,,,,,1,,1,
1150,799.0,2,2,14,15,2012,2,15,200000.0,1,...,15,50.0,11.0,50.0,,,1,2.0,33,8.0


In [16]:
# Per-household totals of g07, g10, g16 and g18; hhid is kept both as the index
# and as a regular column.
processed = (
    df.groupby("hhid")[["g07", "g10", "g16", "g18"]]
    .sum()
    .assign(hhid=lambda frame: frame.index)
)

## G2: Repayment and utilization of Loan

In [None]:
# TODO

## H: Household Assets

In [17]:
# Module H (household assets): read the raw file and show it.
filename = "007_mod_h.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,rid_male,rid_female,asset,h02,h03,h04,h05,h06,h07,h08,h09_1,h09_2,h10_1,h10_2,h11,h12
0,1.0,,2.0,2,1,4,2012,200,150,1,5,99.0,,1,,999,999
1,1.0,,2.0,3,3,8,2011,5500,3000,1,1,1.0,9.0,1,9.0,999,999
2,1.0,,2.0,10,4,6,2007,20000,35000,1,1,1.0,,2,,999,999
3,1.0,,2.0,11,1,6,1992,2250,1000,1,7,99.0,,1,3.0,999,999
4,1.0,,2.0,12,1,9,2009,2250,1200,1,1,1.0,,3,,999,999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345,799.0,,2.0,22,1,10,2007,500,100,2,1,33.0,,2,,999,999
6346,799.0,,2.0,28,3,5,2008,500,200,2,1,33.0,,2,,999,999
6347,800.0,,2.0,10,1,4,2001,500,700,1,1,1.0,,2,,999,999
6348,800.0,,2.0,19,6,6,2011,400,1500,2,1,2.0,,1,2.0,50,50


In [47]:
# household id
processed = pd.DataFrame.from_dict({"hhid":list(unique_hhids)})

# total value per household asset category
# (feature name -> list of `asset` codes that belong to the category)
categories = {
    "value_consumable_durables":[i for i in range(1, 12)],
    "value_transport":[i for i in range(12, 16)],
    "value_livestock_poultry":[i for i in range(16, 20)],
    "value_agricultural_equipments":[i for i in range(20,29)],
    "value_other":[29]
}

results = {hhid:{} for hhid in unique_hhids}
for hhid in unique_hhids:
    household = df.loc[df.hhid==hhid]
    for c, asset_codes in categories.items():
        # Sum of h06 over the household's assets in this category; the sum of
        # an empty selection is 0, so no separate empty-case branch is needed.
        results[hhid][c] = household.loc[household["asset"].isin(asset_codes), "h06"].sum()

# Iterate over the category names directly instead of results[1], which raises
# KeyError when there is no household with id 1.
for column in categories:
    processed[column] = [results[hhid][column] for hhid in unique_hhids]

## H1: Assets: Sales of Assets and Change in Ownership

In [49]:
# Module H1 (sales of assets and change in ownership): read the raw file and show it.
filename = "008_mod_h1.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,asset,h102,h103,h104,h105,h106,h107,h108,h109_1,h109_2,flag
0,1.0,2,1.0,1,2.0,,,,100.0,8.0,2011.0,
1,1.0,3,2.0,2,,,,,,,,
2,1.0,10,4.0,2,,,,,,,,
3,1.0,11,1.0,2,,,,,,,,
4,1.0,12,1.0,2,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
5191,799.0,20,1.0,1,2.0,,,,100.0,5.0,2011.0,
5192,799.0,22,1.0,2,,,,,,,,
5193,799.0,28,2.0,2,,,,,,,,
5194,800.0,19,5.0,2,,,,,,,,


In [58]:
# Per-household total of h108, stored as "price_sold_gift_consumed"; hhid is
# kept both as the index and as a regular column.
processed = (
    df.groupby("hhid")["h108"]
    .sum()
    .to_frame("price_sold_gift_consumed")
    .assign(hhid=lambda frame: frame.index)
)

## I: Food Consumption: Purchases, Home Production, and Other Sources

In [60]:
# Module I (food consumption): read the raw file and show it.
filename = "009_mod_i.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,fcode,i01,i02,i03,i04,i05,i06,i07,i08,i09
0,1.0,1,1,56.0,1.0,,,,,56.0,
1,1.0,5,1,5.0,1.0,,5.0,32.0,160.0,,
2,1.0,6,1,1.0,1.0,,,,,1.0,
3,1.0,8,1,1.0,1.0,,1.0,100.0,100.0,,
4,1.0,11,1,1.0,1.0,,1.0,120.0,120.0,,
...,...,...,...,...,...,...,...,...,...,...,...
24642,800.0,151,1,100.0,2.0,,100.0,160.0,16.0,,
24643,800.0,153,1,250.0,2.0,,250.0,30.0,7.5,,
24644,800.0,159,1,1.0,1.0,,1.0,20.0,20.0,,
24645,800.0,171,1,500.0,2.0,,500.0,60.0,30.0,,


In [61]:
# Per-household total of i07, stored as "total_value_food_consumption"; hhid is
# kept both as the index and as a regular column.
processed = (
    df.groupby("hhid")["i07"]
    .sum()
    .to_frame("total_value_food_consumption")
    .assign(hhid=lambda frame: frame.index)
)

## J1, J2: Non-food Expenditure

In [65]:
# Module J1/J2 (non-food expenditure): read the raw file and show it.
filename = "010_mod_j.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,nfcode,j02_1,j02_2,j03_1,j03_2,j04_1,j04_2
0,1.0,1,,,2000.0,,2.0,
1,1.0,3,,,500.0,,2.0,
2,1.0,4,,,300.0,,2.0,
3,1.0,5,260.0,,,,,
4,1.0,7,40.0,,,,,
...,...,...,...,...,...,...,...,...
20346,800.0,37,,1000.0,,,,
20347,800.0,38,,500.0,,,,
20348,800.0,39,,200.0,,,,
20349,800.0,68,10.0,30.0,,,,


In [66]:
# Per-household totals of j02_2 and j03_2 under descriptive names, computed in
# one groupby pass; hhid is kept both as the index and as a regular column.
processed = (
    df.groupby("hhid")[["j02_2", "j03_2"]]
    .sum()
    .rename(columns={"j02_2": "non_food_cash_expenditure",
                     "j03_2": "non_food_imputed_value"})
    .assign(hhid=lambda frame: frame.index)
)

## K: Housing, Source of Water and sanitation

In [70]:
# Module K (housing, source of water and sanitation): read the raw file and show it.
filename = "011_mod_k.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,k01,k02,k03,k03_1,k04,k05,k05_1,k06,k07,k08,k09,k10
0,1.0,4,2,2,3,3,3,1,5,3,5.0,2,1
1,2.0,3,2,2,3,3,3,1,5,4,4.0,2,1
2,3.0,3,2,2,3,3,3,1,5,2,2.0,1,1
3,4.0,5,2,2,3,2,3,1,5,3,2.0,2,1
4,5.0,2,2,2,3,3,3,1,5,2,5.0,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,796.0,4,1,2,3,1,2,1,5,3,9.0,1,1
801,797.0,3,1,2,1,1,2,1,5,5,5.0,1,1
802,798.0,4,1,2,1,1,4,1,5,3,5.0,4,2
803,799.0,3,3,2,3,3,2,1,5,3,2.0,2,1


In [74]:
# Keep the household id and the selected k-columns (k08 and k09 are left out).
processed = df.loc[:, ["hhid", "k01", "k02", "k03", "k03_1", "k04",
                       "k05", "k05_1", "k06", "k07", "k10"]]

## L: Weather events that adversely affected the household or the farming

In [130]:
# Module L (adverse weather events): read the raw file and show it.
filename = "012_mod_l.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,l01,l02,l03,l04,l05,l06,l07,l08,l09,l10
0,1.0,3,1,1.0,2012.0,2.0,1.0,99.0,99.0,3.0,99.0
1,1.0,4,1,1.0,2011.0,1.0,1.0,1.0,2.0,2.0,99.0
2,2.0,4,1,1.0,2011.0,2.0,2.0,2.0,2.0,2.0,99.0
3,3.0,3,1,1.0,2011.0,99.0,2.0,99.0,99.0,99.0,99.0
4,4.0,3,1,1.0,2011.0,99.0,1.0,99.0,3.0,99.0,99.0
...,...,...,...,...,...,...,...,...,...,...,...
750,791.0,2,1,1.0,2012.0,99.0,99.0,99.0,99.0,99.0,99.0
751,793.0,10,1,1.0,2011.0,3.0,99.0,99.0,99.0,99.0,99.0
752,798.0,2,1,4.0,2012.0,99.0,1.0,99.0,99.0,99.0,99.0
753,799.0,10,1,1.0,2011.0,2.0,99.0,99.0,99.0,99.0,99.0


In [134]:
# l01 codes 1..12; one 0/1 indicator column is built per code
calamity_numbers = [i for i in range(1, 13)]

results = {hhid:{} for hhid in unique_hhids}

for hhid in unique_hhids:
    household = df.loc[df.hhid == hhid]
    for n in calamity_numbers:
        event_rows = household.loc[household["l01"]==n]
        # 1 if any row for this event has l02 == 1, else 0 (also 0 when the
        # household has no row for the event). This replaces int(<Series>),
        # which is deprecated for a one-row selection and raises TypeError when
        # the household has several rows for the same event.
        results[hhid]["calamity_"+str(n)] = int((event_rows["l02"] == 1).any())

# Column names come from calamity_numbers rather than results[1], which raises
# KeyError when there is no household with id 1.
processed = pd.DataFrame()
for column in ["calamity_"+str(n) for n in calamity_numbers]:
    processed[column] = [results[hhid][column] for hhid in unique_hhids]

processed["hhid"] = unique_hhids

## L1: Coping with weather events that adversely affected the household or the farming

In [142]:
# Module L1 (coping with adverse weather events): read the raw file and show it.
filename = "013_mod_l1.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,l1_01,l1_02,l1_03,l1_04,l1_05
0,1.0,3,1,,4,30000
1,1.0,4,1,,4,40000
2,2.0,4,6,1.0,4,15000
3,3.0,3,1,,5,12000
4,4.0,3,8,1.0,5,30000
...,...,...,...,...,...,...
720,791.0,2,5,1.0,4,12000
721,793.0,11,1,,3,1000
722,798.0,2,1,,3,8000
723,799.0,10,1,,2,1500


In [177]:
# Per-household total of l1_05; hhid is kept both as the index and as a
# regular column.
processed = (
    df.groupby("hhid")[["l1_05"]]
    .sum()
    .assign(hhid=lambda frame: frame.index)
)

## L2: Shocks

In [187]:
# Module L2 (shocks): read the raw file and show it.
filename = "014_mod_l2.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,l2_01,l2_02,l2_03,l2_04,l2_05,l2_06,l2_07a,l2_07b,l2_07c,l2_08,l2_09a,l2_09b
0,1.0,11,2,2,2012,1,30000.0,18,,,99,,
1,1.0,13,1,5,2012,2,2200.0,1,,,99,,
2,1.0,27,5,9,2012,2,2000.0,1,,,99,,
3,2.0,11,1,5,2011,1,15000.0,18,,,99,,
4,3.0,4,1,11,2011,1,10000.0,19,,,99,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,787.0,11,1,9,2011,2,20000.0,1,,,99,,
858,787.0,13,1,10,2011,2,15000.0,9,,,99,,
859,795.0,4,1,12,2011,1,15000.0,2,,,99,,
860,795.0,28,1,6,2012,1,4000.0,9,,,99,,


In [188]:
# Per-household total of l2_06; hhid is kept both as the index and as a
# regular column.
processed = (
    df.groupby("hhid")[["l2_06"]]
    .sum()
    .assign(hhid=lambda frame: frame.index)
)

## L3: Positive Economic Events

In [190]:
# Module L3 (positive economic events): read the raw file and show it.
filename = "015_mod_l3.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,l3_01,l3_02,l3_03,l3_04,l3_05,l3_06,l3_07
0,1.0,11,1,7,2012,4.0,300.0,2
1,1.0,13,1,7,2012,4.0,1750.0,1
2,3.0,11,1,7,2012,4.0,300.0,1
3,6.0,11,1,4,2011,3.0,300.0,1
4,11.0,11,1,2,2011,2.0,300.0,1
...,...,...,...,...,...,...,...,...
256,799.0,1,1,4,2011,,,1
257,799.0,2,1,4,2012,,,2
258,800.0,11,1,4,2012,2.0,300.0,1
259,845.0,11,1,7,2012,4.0,100.0,2


In [191]:
# Per-household total of l3_06; hhid is kept both as the index and as a
# regular column.
processed = (
    df.groupby("hhid")[["l3_06"]]
    .sum()
    .assign(hhid=lambda frame: frame.index)
)

## M: Agricultural extension

In [194]:
# Module M (agricultural extension): read the raw file and show it.
filename = "017_mod_m.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,m01,m02_1,m02_2,m02_3,m02_4,m03,m04_1,m04_2,m04_3,...,m08_3,m08_4,m09,m10,m11_1,m11_2,m11_3,m11_4,m11_5,m11_6
0,1.0,1,1.0,6.0,,,4.0,2.0,,,...,,,4.0,2.0,2,1.0,,,,
1,2.0,1,1.0,6.0,,,3.0,2.0,,,...,5.0,,4.0,1.0,99,,,,,
2,3.0,2,,,,,,,,,...,,,4.0,3.0,2,1.0,9.0,,,
3,4.0,1,1.0,,,,3.0,1.0,,,...,,,4.0,3.0,1,5.0,,,,
4,5.0,2,,,,,,,,,...,,,4.0,3.0,2,4.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,796.0,2,,,,,,,,,...,1.0,,7.0,2.0,4,,,,,
801,797.0,2,,,,,,,,,...,,,4.0,2.0,2,3.0,,,,
802,798.0,1,6.0,,,,2.0,1.0,,,...,,,5.0,1.0,99,,,,,
803,799.0,2,,,,,,,,,...,,,7.0,2.0,4,,,,,


In [195]:
# Module M is used as loaded: every column is kept. This is an alias, not a
# copy — `processed` and `df` refer to the same DataFrame object.
processed = df

## Q: Perceptions of Climate Change

In [196]:
# Module Q (perceptions of climate change): read the raw file and show it.
filename = "018_mod_q.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,q01_1,q01_2,q01_3,q02,q03,q04,q05_1,q05_2,q05_3,q06,q07_1,q07_2,q07_3
0,1.0,2,8.0,3.0,1,2,1,5.0,1.0,,1,1.0,3.0,9.0
1,2.0,2,3.0,,1,2,1,1.0,5.0,,1,1.0,5.0,9.0
2,3.0,2,3.0,,1,2,1,3.0,5.0,,1,3.0,5.0,9.0
3,4.0,2,5.0,,1,2,1,1.0,5.0,6.0,1,1.0,5.0,3.0
4,5.0,2,,,1,2,1,1.0,5.0,,1,1.0,5.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,796.0,2,3.0,,1,2,1,1.0,2.0,3.0,1,1.0,5.0,
801,797.0,2,,,1,2,1,5.0,7.0,,1,3.0,9.0,
802,798.0,2,5.0,,1,2,1,5.0,1.0,,1,1.0,3.0,
803,799.0,2,3.0,8.0,1,2,1,1.0,2.0,3.0,1,1.0,5.0,


In [197]:
# Module Q is used as loaded: every column is kept. This is an alias, not a
# copy — `processed` and `df` refer to the same DataFrame object.
processed = df

## O: Changes in farming practices due to climate change

In [199]:
# Module O (changes in farming practices due to climate change): read the raw file and show it.
filename = "019_mod_o.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,rid,o01,o02_1,o02_21,o02_22,o02_31,o02_32,o03_1,o03_21,...,o26_1,o26_2,o26_3,o27_1,o27_2,o27_3,o28_1,o28_2,o28_3,o29
0,1.0,1,1,1.0,11.0,39.0,38.0,22.0,2.0,,...,1.0,1.0,2.0,2.0,,,1.0,1.0,2.0,1.0
1,2.0,1,1,1.0,11.0,,30.0,,1.0,19.0,...,1.0,1.0,2.0,2.0,,,1.0,1.0,2.0,1.0
2,3.0,1,1,1.0,11.0,,32.0,,2.0,,...,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0
3,4.0,1,1,1.0,39.0,,30.0,,2.0,,...,1.0,1.0,2.0,2.0,,,1.0,1.0,2.0,2.0
4,5.0,3,1,1.0,39.0,,49.0,,2.0,,...,1.0,1.0,2.0,2.0,,,1.0,1.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,796.0,1,1,1.0,39.0,,11.0,,2.0,,...,1.0,1.0,2.0,1.0,1.0,2.0,2.0,,,2.0
801,797.0,1,1,2.0,,,,,2.0,,...,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0
802,798.0,1,1,1.0,3.0,,28.0,,2.0,,...,1.0,1.0,2.0,1.0,1.0,2.0,2.0,,,2.0
803,799.0,8,1,1.0,11.0,,29.0,,2.0,,...,1.0,1.0,2.0,1.0,1.0,2.0,2.0,,,2.0


In [210]:
# Module O columns to keep:
#   o02_1 .. o08_1, o09, o10 .. o22, o23_1 .. o28_1, o29
keep = (
    [f"o0{i}_1" for i in range(2, 9)]
    + ["o09"]
    + [f"o{i}" for i in range(10, 23)]
    + [f"o{i}_1" for i in range(23, 29)]
    + ["o29"]
)

In [213]:
# Keep the household id alongside the selected answer columns. `keep` lists
# only o-columns, so selecting df[keep] alone leaves rows that can no longer be
# matched to a household; the other modules' `processed` frames all carry hhid.
processed = df[["hhid"] + keep]

## O1: Any integration about Livestock/Poultry

In [228]:
# Module O1 (livestock / poultry integration): read the raw file and show it.
filename = "020_mod_o1.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,o1_01_1,o1_02_1,o1_03_1,o1_01_2,o1_02_2,o1_03_2,o1_01_3,o1_02_3,o1_03_3,...,o1_03_6,o1_01_7,o1_02_7,o1_03_7,o1_01_8,o1_02_8,o1_03_8,o1_01_9,o1_02_9,o1_03_9
0,1.0,2,,,2,,,1,1.0,2.0,...,,1,9.0,10.0,2,,,2,,
1,2.0,2,,,2,,,1,1.0,2.0,...,,1,9.0,10.0,2,,,2,,
2,3.0,2,,,2,,,1,1.0,2.0,...,,1,9.0,10.0,2,,,2,,
3,4.0,2,,,2,,,2,,,...,8.0,1,9.0,10.0,2,,,2,,
4,5.0,2,,,2,,,2,,,...,,1,9.0,10.0,2,,,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,796.0,2,,,2,,,2,,,...,6.0,1,9.0,10.0,2,,,2,,
801,797.0,2,,,2,,,2,,,...,,2,,,2,,,2,,
802,798.0,2,,,2,,,2,,,...,,2,,,2,,,2,,
803,799.0,2,,,2,,,2,,,...,,2,,,2,,,2,,


In [230]:
# Module O1 columns to keep: o1_01_1 .. o1_01_9
keep = [f"o1_01_{i}" for i in range(1, 10)]

In [231]:
# Keep the household id alongside the selected answer columns. `keep` lists
# only o1_01_* columns, so selecting df[keep] alone leaves rows that can no
# longer be matched to a household; the other modules' `processed` frames all
# carry hhid.
processed = df[["hhid"] + keep]

## O2: Constraints to adaptation

In [233]:
# Module O2 (constraints to adaptation): read the raw file and show it.
filename = "021_mod_o2.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,o2_01,o2_02_1,o2_02_2,o2_02_3,o2_02_4,o2_02_5,o2_03_1,o2_03_2,o2_03_3,o2_03_4,o2_03_5
0,1.0,1,3.0,7.0,8.0,,,1.0,2.0,3.0,,
1,1.0,3,8.0,4.0,3.0,,,1.0,2.0,3.0,,
2,1.0,4,9.0,,,,,1.0,,,,
3,1.0,12,7.0,8.0,3.0,,,1.0,2.0,3.0,,
4,1.0,12,8.0,3.0,7.0,,,1.0,2.0,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2430,799.0,1,8.0,4.0,,,,1.0,2.0,,,
2431,799.0,4,8.0,,,,,1.0,,,,
2432,799.0,11,8.0,,,,,1.0,,,,
2433,799.0,12,8.0,,,,,1.0,,,,


In [None]:
# TODO

## R1: Information of group based approach

In [235]:
# Module R1 (information on group-based approach): read the raw file and show it.
filename = "022_mod_r1.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,rid_male,rid_female,r01,r1_01,r1_02,r1_04_1,r1_04_2,r1_05,r1_06,r1_07,r1_08,r1_09,r1_10_1,r1_10_2,r1_10_3
0,1.0,1.0,2.0,1,1.0,1.0,1.0,,1.0,1990.0,0.0,0.0,3.0,6.0,,
1,1.0,1.0,2.0,1,3.0,1.0,1.0,,1.0,2008.0,1.0,1.0,3.0,11.0,,
2,1.0,1.0,2.0,1,17.0,1.0,2.0,,3.0,2011.0,1.0,1.0,4.0,2.0,,
3,1.0,1.0,2.0,1,18.0,1.0,3.0,,3.0,2011.0,1.0,1.0,2.0,5.0,,
4,2.0,1.0,2.0,1,1.0,1.0,1.0,,1.0,1970.0,0.0,0.0,2.0,6.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1393,798.0,1.0,2.0,1,3.0,1.0,1.0,,1.0,2007.0,2.0,2.0,2.0,11.0,,
1394,798.0,1.0,2.0,1,9.0,1.0,1.0,,1.0,1995.0,1.0,1.0,2.0,10.0,,
1395,799.0,8.0,2.0,1,4.0,1.0,2.0,,2.0,2010.0,1.0,1.0,4.0,12.0,,
1396,799.0,8.0,2.0,1,16.0,1.0,8.0,,1.0,2011.0,1.0,1.0,2.0,7.0,,


In [267]:
# r1_01 codes 1..18; one 0/1 membership column is built per code
groups = np.arange(1, 19)

results = {hhid:{} for hhid in unique_hhids}
for hhid in unique_hhids:
    household = df.loc[df.hhid==hhid]
    # The loop variable is named `group` so that it does not rebind `g`, which
    # earlier cells use for groupby results.
    for group in groups:
        # 1 if the household has any row for this group with r1_02 == 1;
        # 0 otherwise (including when there is no row for the group at all).
        is_member = ((household["r1_01"]==group) & (household["r1_02"]==1)).any()
        results[hhid][f"group_{group}_membership"] = int(is_member)

# Column names come from `groups` rather than results[1], which raises KeyError
# when there is no household with id 1.
processed = pd.DataFrame()
for column in [f"group_{group}_membership" for group in groups]:
    processed[column] = [results[hhid][column] for hhid in unique_hhids]

processed["hhid"] = unique_hhids

## R2: Group activity related information

In [269]:
# TODO: skip?

## R3: Network

In [303]:
# Module R3 (network): read the raw file and show it.
filename = "024_mod_r3.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,r3_01_1,r3_02_1,r3_03_1,r3_04_1,r3_05_1,r3_06_1,r3_07_1,r3_01_2,r3_02_2,r3_03_2,r3_04_2,r3_05_2,r3_06_2,r3_07_2
0,1.0,1.0,8.0,4.0,0.0,1.0,1.0,10.0,2.0,15.0,10.0,0.0,3.0,1.0,8.0
1,2.0,1.0,13.0,7.0,2.0,3.0,1.0,5.0,2.0,5.0,2.0,1.0,3.0,1.0,3.0
2,3.0,1.0,15.0,10.0,0.0,1.0,1.0,6.0,2.0,2.0,2.0,0.0,3.0,1.0,2.0
3,4.0,1.0,12.0,4.0,3.0,1.0,1.0,8.0,2.0,2.0,1.0,1.0,3.0,1.0,0.0
4,5.0,,,,,,,,2.0,8.0,4.0,1.0,3.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,796.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,0.0
801,797.0,1.0,3.0,2.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,0.0,1.0,1.0,0.0
802,798.0,1.0,2.0,2.0,1.0,3.0,1.0,7.0,2.0,3.0,2.0,0.0,3.0,1.0,2.0
803,799.0,8.0,3.0,2.0,1.0,3.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,3.0


In [304]:
# Keep every column except r3_01_1 / r3_01_2; a name that is absent from df is
# simply skipped.
processed = df.drop(columns=["r3_01_1", "r3_01_2"], errors="ignore")

## R4: Trust and Solidarity

In [305]:
# Module R4 (trust and solidarity): read the raw file and show it.
filename = "025_mod_r4.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,r4_01_1,r4_02_1,r4_03_1,r4_04_1_1,r4_04_1_2,r4_04_1_3,r4_04_1_4,r4_04_1_5,r4_04_1_6,...,r4_04_2_1,r4_04_2_2,r4_04_2_3,r4_04_2_4,r4_04_2_5,r4_04_2_6,r4_04_2_7,r4_04_2_8,r4_04_2_9,r4_05_2
0,1.0,1.0,1.0,2.0,4.0,3.0,4.0,3.0,1.0,1.0,...,3.0,3.0,4.0,3.0,1.0,1.0,4.0,5.0,4.0,4.0
1,2.0,1.0,1.0,1.0,4.0,4.0,5.0,4.0,2.0,2.0,...,3.0,3.0,3.0,3.0,1.0,1.0,2.0,4.0,3.0,4.0
2,3.0,1.0,2.0,1.0,3.0,2.0,3.0,4.0,1.0,1.0,...,3.0,3.0,3.0,3.0,1.0,1.0,3.0,4.0,3.0,4.0
3,4.0,1.0,1.0,2.0,4.0,3.0,4.0,4.0,1.0,1.0,...,4.0,3.0,4.0,4.0,1.0,1.0,3.0,4.0,3.0,1.0
4,5.0,,,,,,,,,,...,2.0,2.0,4.0,4.0,1.0,1.0,4.0,4.0,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,796.0,1.0,2.0,2.0,4.0,4.0,4.0,3.0,2.0,3.0,...,4.0,4.0,4.0,2.0,2.0,3.0,4.0,4.0,4.0,3.0
801,797.0,1.0,1.0,2.0,5.0,1.0,5.0,4.0,1.0,1.0,...,4.0,2.0,5.0,3.0,2.0,2.0,5.0,5.0,5.0,4.0
802,798.0,1.0,1.0,1.0,3.0,4.0,4.0,4.0,3.0,4.0,...,4.0,4.0,4.0,2.0,4.0,2.0,5.0,5.0,3.0,2.0
803,799.0,8.0,2.0,2.0,4.0,4.0,4.0,3.0,2.0,2.0,...,4.0,4.0,4.0,2.0,3.0,3.0,4.0,4.0,4.0,3.0


In [306]:
# Keep every column except r4_01_1 / r4_01_2; a name that is absent from df is
# simply skipped.
processed = df.drop(columns=["r4_01_1", "r4_01_2"], errors="ignore")

## R5: Economic setbacks

In [307]:
# Module R5 (economic setbacks): read the raw file and show it.
filename = "026_mod_r5.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,r5,r5_1,r5_2,r5_3_1,r5_3_2,r5_3_3,r5_4_1,r5_4_2,r5_4_3,r5_5_1,r5_5_2,r5_5_3,r5_6,r5_7_1,r5_7_2,r5_7_3
0,1.0,1,1.0,4.0,1.0,2.0,3.0,3.0,,,1.0,,,4.0,3.0,,
1,2.0,1,1.0,4.0,1.0,2.0,3.0,3.0,,,1.0,,,4.0,3.0,,
2,3.0,1,1.0,4.0,1.0,2.0,3.0,3.0,,,1.0,,,4.0,3.0,,
3,3.0,1,2.0,1.0,1.0,2.0,3.0,3.0,,,1.0,,,4.0,3.0,,
4,4.0,1,1.0,1.0,1.0,2.0,,3.0,,,1.0,,,1.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,796.0,2,,,,,,,,,,,,,,,
1111,797.0,2,,,,,,,,,,,,,,,
1112,798.0,1,1.0,4.0,1.0,,,4.0,,,1.0,,,4.0,4.0,,
1113,799.0,1,1.0,2.0,1.0,,,4.0,,,1.0,,,4.0,4.0,,


In [308]:
# Keep only the household id and the r5 answer.
# NOTE(review): the displayed data repeats some hhid values (e.g. 3.0), so this
# frame is not one row per household — confirm that this is intended.
processed = df.loc[:, ["hhid", "r5"]]

## R6: Collective Action and Cooperation and Sociability

In [310]:
# Module R6 (collective action, cooperation and sociability): read the raw file and show it.
filename = "027_mod_r6.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,r6_01_1,r6_02_1,r6_03_1_1,r6_03_1_2,r6_03_1_3,r6_04_1_1,r6_04_1_2,r6_04_1_3,r6_05_1,...,r6_03_2_1,r6_03_2_2,r6_03_2_3,r6_04_2_1,r6_04_2_2,r6_04_2_3,r6_05_2,r6_06_2,r6_07_2,r6_08_2
0,1.0,1.0,1.0,11.0,6.0,7.0,11.0,6.0,7.0,20.0,...,2.0,,,2.0,,,10.0,1.0,1.0,2.0
1,2.0,1.0,2.0,,,,,,,,...,,,,,,,,2.0,2.0,2.0
2,3.0,1.0,1.0,11.0,7.0,2.0,11.0,7.0,2.0,360.0,...,,,,,,,,2.0,1.0,1.0
3,4.0,1.0,1.0,12.0,9.0,,12.0,9.0,,20.0,...,,,,,,,,2.0,2.0,2.0
4,5.0,,,,,,,,,,...,9.0,,,9.0,,,2.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,796.0,1.0,1.0,7.0,12.0,,7.0,,,2.0,...,,,,,,,,2.0,1.0,1.0
801,797.0,1.0,1.0,7.0,,,7.0,,,5.0,...,,,,,,,,1.0,1.0,1.0
802,798.0,1.0,1.0,7.0,9.0,12.0,7.0,9.0,12.0,7.0,...,9.0,,,9.0,,,,1.0,1.0,2.0
803,799.0,8.0,1.0,7.0,12.0,,7.0,,,3.0,...,,,,,,,,2.0,1.0,1.0


In [312]:
# Keep every column except r6_01_1 / r6_01_2; a name that is absent from df is
# simply skipped.
processed = df.drop(columns=["r6_01_1", "r6_01_2"], errors="ignore")

## R7: Information and Communication

In [318]:
# Module R7 (information and communication): read the raw file and show it.
filename = "028_mod_r7.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,r7_01_1,r7_02_1,r7_03_1,r7_04_1,r7_05_1,r7_06_1_1,r7_06_1_2,r7_06_1_3,r7_07_1_1,...,r7_06_2_3,r7_07_2_1,r7_07_2_2,r7_07_2_3,r7_08_2,r7_09_2,r7_10_2,r7_11_2_1,r7_11_2_2,r7_11_2_3
0,1.0,2.0,1.0,45.0,5.0,5.0,1.0,5.0,3.0,1.0,...,,1.0,,,1.0,2.0,1.0,1.0,,
1,2.0,2.0,1.0,30.0,5.0,5.0,1.0,3.0,,1.0,...,,1.0,,,2.0,2.0,2.0,1.0,,
2,3.0,3.0,1.0,80.0,5.0,5.0,8.0,1.0,3.0,1.0,...,,1.0,,,2.0,2.0,1.0,1.0,,
3,4.0,3.0,2.0,60.0,5.0,5.0,1.0,3.0,10.0,1.0,...,,1.0,,,2.0,1.0,1.0,1.0,,
4,5.0,,,,,,,,,,...,,1.0,,,2.0,1.0,1.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,796.0,4.0,3.0,10.0,5.0,4.0,1.0,3.0,,1.0,...,,1.0,,,1.0,2.0,1.0,1.0,,
801,797.0,4.0,4.0,70.0,5.0,2.0,1.0,7.0,3.0,1.0,...,,1.0,,,1.0,2.0,2.0,1.0,,
802,798.0,3.0,2.0,140.0,5.0,1.0,1.0,3.0,7.0,1.0,...,,1.0,7.0,,2.0,2.0,1.0,1.0,,
803,799.0,2.0,2.0,100.0,5.0,1.0,1.0,3.0,7.0,3.0,...,,1.0,,,1.0,2.0,1.0,1.0,,


In [319]:
# Keep every column except r7_01_1 / r7_01_2; a name that is absent from df is
# simply skipped.
processed = df.drop(columns=["r7_01_1", "r7_01_2"], errors="ignore")

## R8: Production and climate Information

In [324]:
# Module R8 (production and climate information): read the raw file and show it.
filename = "029_mod_r8.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,r8_01_1,r8_02_1,r8_03_1,r8_04_1,r8_05_1,r8_06_1,r8_07_1,r8_08_1,r8_09_1,...,r8_12_2,r8_13_2,r8_14_2,r8_15_2,r8_16_2,r8_17_2,r8_18_2,r8_19_2,r8_20_2,r8_21_2
0,1.0,1.0,2.0,,,,1.0,4.0,5.0,5.0,...,,,2.0,,,,,,,
1,2.0,1.0,2.0,,,,1.0,1.0,2.0,5.0,...,,,2.0,,,,,,,
2,3.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,4.0,5.0,...,,,2.0,,,,,,,
3,4.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,3.0,6.0,...,,,2.0,,,,,,,
4,5.0,,,,,,,,,,...,,,2.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,796.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,2.0,5.0,...,,,1.0,2.0,5.0,9.0,9.0,1.0,1.0,
801,797.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,9.0,5.0,...,,,1.0,2.0,5.0,5.0,5.0,2.0,1.0,
802,798.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,...,,,1.0,2.0,5.0,9.0,9.0,2.0,1.0,
803,799.0,8.0,1.0,8.0,1.0,2.0,1.0,8.0,2.0,5.0,...,,,1.0,2.0,5.0,9.0,9.0,1.0,1.0,


In [326]:
# Retain only the household id and questions 02, 06 and 14 of module R8,
# each in its "_1" and "_2" variant.
keep = [
    "hhid",
    "r8_02_1", "r8_02_2",
    "r8_06_1", "r8_06_2",
    "r8_14_1", "r8_14_2",
]
processed = df.loc[:, keep]

## R9: Production and Climate Information

In [329]:
# Module R9: read the raw Stata file into `df` and display it.
filename = "030_mod_r9.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,r9_01_1,r9_02_1,r9_03_1,r9_04_1,r9_05_1,r9_06_1,r9_07_1,r9_08_1,r9_09_1,...,r9_06_2,r9_07_2,r9_08_2,r9_09_2,r9_10_2,r9_11_2,r9_12_2,r9_13_2,r9_14_2,r9_15_2
0,1.0,1.0,2.0,,,,,,,2.0,...,,,,2.0,,,,,,
1,2.0,1.0,2.0,,,,,,,2.0,...,,,,2.0,,,,,,
2,3.0,1.0,2.0,,,,,,,2.0,...,,,,2.0,,,,,,
3,4.0,1.0,2.0,,,,,,,2.0,...,,,,2.0,,,,,,
4,5.0,3.0,2.0,,,,,,,2.0,...,,,,2.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,796.0,1.0,2.0,,,,,,,2.0,...,,,,2.0,,,,,,
801,797.0,1.0,2.0,,,,,,,2.0,...,,,,2.0,,,,,,
802,798.0,1.0,2.0,,,,,,,2.0,...,,,,2.0,,,,,,
803,799.0,8.0,2.0,,,,,,,2.0,...,,,,2.0,,,,,,


In [330]:
# Retain only the household id and questions 02 and 09 of module R9,
# each in its "_1" and "_2" variant.
keep = [
    "hhid",
    "r9_02_1", "r9_02_2",
    "r9_09_1", "r9_09_2",
]
processed = df.loc[:, keep]

## S: Empowerment and Political Action

In [338]:
# Module S: read the raw Stata file into `df` and display it.
filename = "031_mod_s.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,s_01_1,s_02_1,s_03_1,s_04_1,s_05_1,s_06_1_1,s_06_2_1,s_06_3_1,s_06_4_1,...,s_05_2,s_06_1_2,s_06_2_2,s_06_3_2,s_06_4_2,s_06_5_2,s_06_6_2,s_07_2,s_08_2,s_09_2
0,1.0,1,1.0,5.0,1.0,,1.0,2.0,2.0,2.0,...,,1.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0
1,2.0,1,2.0,5.0,2.0,1.0,1.0,2.0,2.0,2.0,...,,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,3.0
2,3.0,1,2.0,5.0,3.0,2.0,1.0,2.0,2.0,1.0,...,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0
3,4.0,1,1.0,5.0,3.0,3.0,1.0,1.0,2.0,1.0,...,,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,3.0
4,5.0,99,,,,,,,,,...,,2.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,796.0,1,2.0,5.0,1.0,,1.0,2.0,2.0,2.0,...,,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,3.0
801,797.0,1,2.0,5.0,1.0,,1.0,2.0,1.0,2.0,...,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
802,798.0,1,2.0,5.0,2.0,2.0,1.0,1.0,2.0,2.0,...,,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0
803,799.0,8,1.0,4.0,1.0,,2.0,2.0,2.0,2.0,...,,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0


In [339]:
# Keep every column of the module except s_01_1 and s_01_2.
# errors="ignore" mirrors the original mask-based selection, which did not
# fail when one of the two columns was absent.
processed = df.drop(columns=["s_01_1", "s_01_2"], errors="ignore")

## T: Employment

In [375]:
# Module T: read the raw Stata file into `df` and display it.
filename = "032_mod_t.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,hhid,mid,t01,t02,t03,t04,t05,t06,t07,t08,t09,t10,t11,t12,t13,t14
0,1.0,1,1,,,,64.0,1.0,5.0,2.0,3.0,,,,,5500.0
1,1.0,1,1,,,,70.0,2.0,5.0,0.5,3.0,,,,,0.0
2,1.0,2,1,,,,69.0,3.0,7.0,0.5,3.0,,,,,150.0
3,1.0,2,2,,,,70.0,4.0,7.0,1.0,4.0,,,,,
4,1.0,3,1,,,,46.0,5.0,6.0,4.0,3.0,,,,,1000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4850,800.0,1,9,,,,,,,,,,,,,
4851,800.0,2,1,,,,64.0,1.0,3.0,3.0,3.0,,,,,1000.0
4852,800.0,2,1,,,,70.0,2.0,7.0,0.5,3.0,,,,,0.0
4853,800.0,2,1,,,,69.0,3.0,7.0,0.5,3.0,,,,,0.0


In [390]:
# Map each job category to the t05 codes it covers (np.arange excludes the
# upper bound). Codes 48-49 and anything outside 1-72 belong to no category
# and are therefore ignored by the counting below.
job_categories = {
    "job_wage_labor":np.arange(1,12),
    "job_salaried_worker":np.arange(12,22),
    "job_self_employment":np.arange(22,48),
    "job_trader":np.arange(50,55),
    "job_production":np.arange(55,58),
    "job_livestock_poultry_service":np.arange(58,64),
    "job_farming":np.arange(64,73)
}

# Collect the distinct t05 codes reported by each household in one pass,
# instead of re-filtering the whole frame once per household.
codes_by_hhid = {
    hhid: set(codes.dropna())
    for hhid, codes in df.groupby("hhid")["t05"]
}

# results[hhid][category] = number of DISTINCT t05 codes of that category that
# occur in the household's rows (not the number of rows or members).
# Households without any row in df keep a count of 0 for every category.
# NOTE(review): confirm that counting distinct codes is the intended measure.
results = {}
for hhid in unique_hhids:
    present = codes_by_hhid.get(hhid, set())
    results[hhid] = {
        c: sum(1 for n in numbers if n in present)
        for c, numbers in job_categories.items()
    }

# One row per household. Column names are taken from job_categories rather
# than from results[1], so this no longer breaks when hhid 1 is not part of
# unique_hhids (or when unique_hhids is empty).
processed = pd.DataFrame.from_dict({"hhid":list(unique_hhids)})
for column in job_categories:
    processed[column] = [results[hhid][column] for hhid in results]

## U: Time use of male and female members in the household

In [391]:
# TODO: discard?

## V: Anthropometry

In [392]:
# TODO: discard?

# Combine processed dataframes

In [186]:
# maintain list of all the processed dataframes
# make sure all dataframes have one row per household id
# add empty rows for missing household ids where necessary
# mind the treatment of MISSING VALUES!
    # do we really want to impute "0" when no information is present?
# combine all the dataframes into one dataframe (each hhid is a single feature vector/row)