In [1]:
import os
# Switch the working directory to ../quafing/ and echo the resolved location.
# Relative paths used further down (e.g. DATA_DIR) are resolved from here.
os.chdir('../quafing/')
print(f"Working directory: {os.getcwd()}")


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter


# Start from matplotlib's defaults, then layer the notebook-wide overrides on top.
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams.update(
    {
        "figure.figsize": (6, 5),
        "figure.dpi": 100,
        "font.size": 15,
        "legend.fontsize": 13,
        "axes.grid": True,
        "axes.axisbelow": True,
        "grid.alpha": 0.3,
    }
)

Working directory: /Users/charlesdupont/Desktop/Thesis/code/quafing


In [2]:
def load_data(path):
    """
    Reads the Stata (.dta) file located at `path` into a DataFrame.

    Categorical conversion is switched off, so labelled variables come back
    as their stored values rather than as pandas Categoricals.
    """
    frame = pd.read_stata(path, convert_categoricals=False)
    return frame


def plot_missing_prop(df):
    """
    Draws a bar chart of the fraction of missing (NaN) entries in every column of df.

    Columns appear left to right from most to least missing; columns with equal
    fractions keep their original relative order.
    """
    n_rows = len(df)
    fractions = {name: df[name].isna().sum() / n_rows for name in df.columns}

    # sorted() is stable, so ties stay in column order even with reverse=True.
    ranked = sorted(fractions, key=fractions.get, reverse=True)
    heights = [fractions[name] for name in ranked]
    positions = np.arange(len(ranked))

    plt.figure(figsize=(10, 4))
    plt.bar(positions, heights, color="dodgerblue")
    plt.xticks(positions, ranked, rotation=90)
    plt.ylabel("Proportion Missing")
    plt.show()
    
    
def plot_categorical_feature_frequency(df, col, sort=True, xlabel=None, xticks_dict=None,
                                       save=False, save_filename=None):
    """
    Plots a bar graph of how often each distinct non-missing value occurs in df[col].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to summarise.
    col : column label
        Column whose values are counted.
    sort : bool, default True
        If True, bars are ordered by descending frequency (ties keep first-seen
        order); otherwise bars are ordered by ascending value.
    xlabel : str, optional
        Label for the x axis. Falls back to `col` when not given (or empty).
    xticks_dict : dict, optional
        Maps raw values to the tick labels to display. Values that are not in
        the mapping are labelled with the raw value itself.
    save : bool, default False
        If True, the figure is written to `save_filename` before being shown.
    save_filename : str or path-like, optional
        Target passed to plt.savefig. Required when `save` is True.

    Raises
    ------
    ValueError
        If `save` is True but no `save_filename` was provided.
    """
    # Fail before any figure is created instead of deep inside savefig.
    if save and save_filename is None:
        raise ValueError("save_filename must be provided when save=True")

    # pd.isna also accepts strings and None, so text columns no longer raise
    # the TypeError that np.isnan produced on non-numeric values.
    freq = Counter(value for value in df[col] if not pd.isna(value))
    L = len(freq)

    if sort:
        ordered_keys = sorted(freq, key=freq.get, reverse=True)
    else:
        ordered_keys = sorted(freq)

    if xticks_dict:
        # A partial mapping is fine: unmapped values keep their raw label.
        tick_labels = [xticks_dict.get(k, k) for k in ordered_keys]
    else:
        tick_labels = ordered_keys

    positions = np.arange(L)
    plt.figure(figsize=(12,4))
    plt.bar(positions, [freq[k] for k in ordered_keys], color="dodgerblue", alpha=0.8)
    plt.xticks(positions, tick_labels, rotation=90)
    plt.ylabel("Frequency")
    plt.xlabel(xlabel if xlabel else col, labelpad=20)

    if save:
        # Must happen before plt.show(), which may clear the current figure.
        plt.savefig(save_filename)

    plt.show()

In [3]:
# Root folder of the data files, given relative to the current working directory.
DATA_DIR = "../../BCCASII/"

# Community

In [4]:
# Sub-folder (inside DATA_DIR) that the files loaded in this section are read from.
DATA_SUBDIR = "Community/"

## Module A: Location identification and details

## Module E: Major Income Earning Opportunities for the Villagers

In [53]:
# Read 001_com_mod_a.dta and display the resulting frame.
filename = "001_com_mod_a.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,sl,a01,a02,a03,a04,a05,a06,a07,a08,a09,...,e19,e21,e22,e23,e24,e25,e26,e27,e28,e29
0,1,,1,1,1,7,4,4.0,5.0,,...,,1,2,5.0,10.0,6.0,,,,
1,2,,2,2,2,7,2,2.0,4.0,2.0,...,,1,5,9.0,8.0,4.0,10.0,,,
2,3,,3,3,3,7,3,4.0,4.0,,...,,1,10,5.0,5.0,6.0,9.0,,,
3,4,,4,4,4,3,1,3.0,5.0,4.0,...,,1,11,11.0,,,,,,
4,5,,5,5,5,6,3,1.0,5.0,4.0,...,,1,10,8.0,,,,,,
5,6,,6,6,5,6,1,3.0,1.0,4.0,...,,1,10,,,,,,,
6,7,,7,7,6,4,2,4.0,3.0,,...,,1,8,,,,,,,
7,8,,8,8,6,3,1,3.0,4.0,,...,,1,6,7.0,3.0,4.0,,,,
8,9,,9,9,7,7,3,3.0,4.0,,...,,1,11,10.0,,,,,,
9,10,,10,10,8,7,1,2.0,4.0,3.0,...,,1,11,,,,,,,


In [55]:
# Collect the distinct (a02, a03, a04) combinations that occur in df.
# itertuples over just these three columns yields plain tuples directly and
# avoids building a (possibly dtype-upcast) Series for every row as iterrows does.
communities = set(
    df[["a02", "a03", "a04"]].itertuples(index=False, name=None)
)

In [60]:
# Show the distinct (a02, a03, a04) tuples collected from df.
communities

{(1, 1, 1),
 (2, 2, 2),
 (3, 3, 3),
 (4, 4, 4),
 (5, 5, 5),
 (6, 6, 5),
 (7, 7, 6),
 (8, 8, 6),
 (9, 9, 7),
 (10, 10, 8),
 (11, 11, 9),
 (12, 12, 10),
 (13, 13, 11),
 (14, 14, 11),
 (15, 14, 11),
 (16, 15, 12),
 (17, 16, 13),
 (18, 17, 14),
 (19, 18, 15),
 (20, 19, 16),
 (21, 20, 17),
 (22, 21, 17),
 (23, 22, 18),
 (24, 23, 19),
 (25, 24, 20),
 (26, 25, 21),
 (27, 26, 22),
 (28, 27, 23),
 (29, 28, 23),
 (30, 29, 24),
 (31, 30, 25),
 (32, 31, 26),
 (33, 32, 27),
 (34, 33, 28),
 (35, 34, 29),
 (36, 35, 29),
 (37, 36, 30),
 (38, 37, 30),
 (39, 38, 30),
 (40, 39, 31)}

## Module B1: Factor prices

In [17]:
# Read 002_com_mod_b_ag.dta and display the resulting frame.
filename = "002_com_mod_b_ag.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,sl,tasks,b101,b102,b103,b104,b105,b106,b107,b108,b109,b110,b111,b112,b113,b114,b115,b116
0,1,1,250,100,250,100,999,999,999,999,150,100,150,100,999,999,999,999
1,1,2,400,0,400,0,999,999,999,999,999,999,999,999,999,999,999,999
2,1,3,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999,999
3,1,4,250,150,250,100,999,999,999,999,150,100,150,100,999,999,999,999
4,1,5,250,100,250,100,999,999,999,999,150,100,150,100,999,999,999,999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,40,6,200,0,200,0,999,999,999,999,200,0,200,0,999,999,999,999
396,40,7,200,0,200,0,999,999,999,999,200,0,200,0,999,999,999,999
397,40,8,200,0,200,0,999,999,999,999,200,0,200,0,999,999,999,999
398,40,9,200,0,200,0,999,999,999,999,200,0,200,0,999,999,999,999


## Module B2: Non-agriculture: Wages of hired laborers employed in construction work

In [34]:
# Read 003_com_mod_b_nonag.dta and display the resulting frame.
filename = "003_com_mod_b_nonag.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,sl,work,b201,b202,b205,b206,b209,b210,b213,b214
0,28,11,300,0,999,999,999,999,250,0
1,28,12,280,0,150,0,999,999,200,0
2,28,13,200,50,120,50,999,999,150,50
3,1,12,400,0,999,999,300,0,999,999
4,1,13,150,100,100,100,100,100,50,80
...,...,...,...,...,...,...,...,...,...,...
115,39,12,200,0,999,999,200,0,999,999
116,39,13,100,100,999,999,100,100,60,60
117,40,11,250,0,999,999,250,0,999,999
118,40,12,350,0,999,999,300,0,999,999


## Module C: Characteristics of the village

In [37]:
# Read 004_com_mod_c.dta and display the resulting frame.
filename = "004_com_mod_c.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,sl,qid,c01,c02
0,1,2,2,3.0
1,1,3,1,
2,1,4,1,
3,1,8,1,
4,1,10,1,
...,...,...,...,...
1035,40,20,2,2.0
1036,40,23,2,8.0
1037,40,24,2,2.0
1038,40,25,2,8.0


## Module D: Natural disasters in past 5 years in community

In [39]:
# Read 005_com_mod_d.dta and display the resulting frame.
filename = "005_com_mod_d.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,sl,qid,d01,d02
0,1,1,1,2.0
1,1,2,1,2.0
2,1,3,1,3.0
3,1,4,1,4.0
4,1,5,1,5.0
...,...,...,...,...
355,40,5,2,
356,40,6,2,
357,40,7,2,
358,40,8,2,


## Module F: Group information

In [51]:
# Read 006_com_mod_e.dta and display the resulting frame.
# NOTE(review): the file name says "mod_e" while the section heading and the
# f-prefixed columns in the output say Module F - confirm this is the intended file.
filename = "006_com_mod_e.dta"
path = f"{DATA_DIR}{DATA_SUBDIR}{filename}"
df = load_data(path)
df

Unnamed: 0,sl,group,code,f01,f02,f03_1,f03_2,f03_3,f04,f05_1,f05_2,f05_3
0,1,1,1,2,,,,,,,,
1,1,1,2,2,,,,,,,,
2,1,1,3,1,Masjid committee,13.0,,,1.0,1.0,,
3,1,1,4,2,,,,,,,,
4,1,1,5,1,"ASA, BRAC",3.0,4.0,,1.0,2.0,8.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
715,40,1,14,2,,,,,,,,
716,40,1,15,2,,,,,,,,
717,40,1,16,1,,3.0,,,1.0,1.0,,
718,40,1,17,2,,,,,,,,


In [52]:
# Number of non-null entries in every column, for each value of sl.
df.groupby("sl").count()

Unnamed: 0_level_0,group,code,f01,f02,f03_1,f03_2,f03_3,f04,f05_1,f05_2,f05_3
sl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,18,18,18,18,6,4,1,6,6,2,0
2,18,18,18,18,6,4,0,6,4,2,1
3,18,18,18,18,4,0,0,4,3,1,0
4,18,18,18,18,4,3,2,4,3,2,0
5,18,18,18,18,2,0,0,2,1,1,1
6,18,18,18,18,3,1,0,3,3,1,0
7,18,18,18,18,3,3,1,3,2,1,1
8,18,18,18,18,4,0,0,4,4,0,0
9,18,18,18,18,4,3,1,4,4,2,0
10,18,18,18,18,3,1,0,2,2,1,0
