# How do household coping strategies evolve with increased food insecurity. An examination of Food Price Shock of 2015-2024.


Data cleaning is a crucial step in data analysis, ensuring that the data is accurate, consistent, and ready for modeling or further analysis.

In [1]:
# Import relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Set the columns to display
pd.set_option("display.max_columns", 500)

### Utility Functions

In [17]:
# Define a function that removes numbers and dot at the start of string values
def remove_number_prefix(df: pd.DataFrame):
    """
    Removes numbers and dots at the start of string values in all columns of a DataFrame.
    """
    return df.replace(to_replace=r'^\d+\.\s*', value='', regex=True)



def cleaning_pipeline(data: pd.DataFrame):
    df = (data
          .pipe(remove_number_prefix)
          .query("consent == 'NO'") # Select household data that provided consent
          .replace({888: np.nan, 999: np.nan, 888000: np.nan, 999000: np.nan})
          .dropna(axis=1, how="all")
         )
    return df
          



## Load data and Select Relavant Columns

### Individual Level Data

#### Head of Household (`sect1_harvestw5`)
This section captures a roster of **`individuals`** living in the household, including their relationship to the household head, gender, year of birth, age, marital status, spouse identification, religion, parental status, and details for new members such as the date and reason for joining the household, as well as migration information.

In [50]:
# Household Roster (Household Size, Head)
sect1_harvestw5 = pd.read_csv("../data/pre_harvest_w5/household/sect1_harvestw5.csv",
                             low_memory=False)

# Apply data cleaning pipeline
# sect1_harvestw5 = cleaning_pipeline(sect1_harvestw5)

columns = ["hhid",
           "zone",
           "ea",
           "indiv",
           "s1q2", # Gender
           "s1q3", # relationship with hoh
           "s1q4", # still a member
           "s1q6", # age
           "s1q16", # marital status
           "s1q28", # father highest education
           "s1q29", # mother main job
           "s1q33", # Mother highest education
           "s1q34", # Mother mian job
          ]

# Dataframe of household head
df_h = sect1_harvestw5.loc[:, columns].query("s1q3 == '1. HEAD'").reset_index(drop=True)
df_1 = sect1_harvestw5.loc[:, columns]
print(df_1.shape)
print("hh version of data", df_h.shape)
df_h.tail(2)

(28256, 13)
hh version of data (4716, 13)


Unnamed: 0,hhid,zone,ea,indiv,s1q2,s1q3,s1q4,s1q6,s1q16,s1q28,s1q29,s1q33,s1q34
4714,379154,1. North Central,1102,1,1. MALE,1. HEAD,1. YES,59.0,1. MARRIED (MONOGAMOUS),0. NONE,"1. AGRICULTURE, FORESTRY &amp; FISHING",0. NONE,8. WHOLESALE &amp; RETAIL TRADE
4715,379155,1. North Central,1102,1,1. MALE,1. HEAD,1. YES,52.0,1. MARRIED (MONOGAMOUS),0. NONE,"1. AGRICULTURE, FORESTRY &amp; FISHING",0. NONE,8. WHOLESALE &amp; RETAIL TRADE


#### Education (`sect2_harvestw5`)
his section records `individual-level` information on educational attainment, school characteristics, expenditures, and repetition

In [60]:
sect2_harvestw5 = pd.read_csv("../data/pre_harvest_w5/household/sect2_harvestw5.csv",
                             low_memory=False)

# Select columns to merge by and education
columns = ["hhid", 
           "indiv",
          "s2q6", # has attended any schoool
          "s2q9", # highest education completed
           "s2q10" # highest qualification attained
          ]

# Create head of hh education dataframe
df_edu = sect2_harvestw5.loc[:, columns]

# Print the shape and display
print(df_edu.shape)
df_edu.head(2)

(25662, 5)


Unnamed: 0,hhid,indiv,s2q6,s2q9,s2q10
0,10001,1,1. YES,43. HIGHER DEGREE,11. PHD/MASTERS
1,10001,2,1. YES,43. HIGHER DEGREE,11. PHD/MASTERS


#### Women Dietary Diversity (`sect3d_harvestw5`)
This section records **`women-level`** information on women’s nutrition diet for the previous day.

In [6]:
sect3d_harvestw5 = pd.read_csv("../data/pre_harvest_w5/household/sect3d_harvestw5.csv",
                             low_memory=False)
sect3d_harvestw5.shape

(25662, 40)

### Household Data

#### Engaged in Agricultural activities (`secta_harvestw5`)
This section captures **`household-level`** information. The dataset includes household identifiers, enumerator and supervisor identifiers, the date and time of the interview, questions to determine whether the household is engaged in agricultural activities, and observation notes recorded by the enumerator during the interview.

In [67]:
# Household level information; Agricultural activities
secta_harvestw5 = pd.read_csv("../data/pre_harvest_w5/household/secta_harvestw5.csv")

# selected required variables
columns = ["zone", 
           "sector", 
           "ea",
           "hhid", 
           "wt_wave5",
           "wt_longpanel_wave5", 
           "wt_cross_wave5",
           "old_new",
           "ag1", # cultivated crop
           "ag2", # own land
           "ag5", # fishing activities
           "consent"
          ]

dfa = secta_harvestw5.loc[:, columns]
print(dfa.shape)

(4771, 12)


#### Food Consumption (`sect5b_harvestw5`)
This section records `household-food` item level information on the quantity and value of food consumed within the household during the past 7 days, as well as the quantity and value of food purchased during the past 30 days.

* Compute the food expenditure per household.

In [90]:
sect5b_harvestw5 = pd.read_csv("../data/pre_harvest_w5/household/sect5b_harvestw5.csv")


columns = ["hhid", "ea", "item_cd", "item_cd_alt", "s5bq8"]

df_5b = sect5b_harvestw5.loc[:, columns]

df_food_exp = df_5b.groupby(by="hhid").agg({"s5bq8": "sum"}).rename(columns={"s5bq8": "food_exp"}).reset_index()
# df_5b.head()
print(df_food_exp.shape)
df_food_exp.head()

(4715, 2)


Unnamed: 0,hhid,food_exp
0,10001,25600.0
1,10002,20150.0
2,10004,19160.0
3,10005,15670.0
4,10008,8300.0


#### Household Non-Food Expenditure (Annual) (`sect6c_harvestw5`)
This section records **`household-nonfood`** item level information on non-food expenditure during the last 12 months.

In [94]:
sect6c_harvestw5 = pd.read_csv("../data/pre_harvest_w5/household/sect6c_harvestw5.csv")

# Select required variables.
columns = ["hhid",
           "s6q6", # total household spend
          ]
df6c = sect6c_harvestw5.loc[:, columns]
df6c = df6c.groupby("hhid").agg(non_food_exp=("s6q6", "sum")).reset_index()
print(df6c.shape)
df6c.head()

(4715, 2)


Unnamed: 0,hhid,non_food_exp
0,10001,400.0
1,10002,900.0
2,10004,11500.0
3,10005,26500.0
4,10008,100500.0


#### Food Security (`sect7_harvestw5`)
This section records **`household-level`** information on food security status of the household during the last 30 days/last 12 months

In [185]:
sect7_harvestw5 = pd.read_csv("../data/pre_harvest_w5/household/sect7_harvestw5.csv")

# 30 days recall
fi_status_30_cols = [col for col in sect7_harvestw5 if col.startswith("s7q1")]
# Food Insecurity in the previous 30 days
df_fi_coping = sect7_harvestw5.loc[:, fi_status_30_cols]

# Not enough food to eat in the past 12 months 
fi_status_12_col = ["hhid","s7q3"]
# Causes of 12 month recall
fi_status_cause_cols = [col for col in sect7_harvestw5 if col.startswith("s7q5")]
# Food Insecurity causes in the previous 12 months
df_f1_cause_12 = sect7_harvestw5.loc[:, fi_status_cause_cols]

# Subset all food Insecurity variables
df7 = sect7_harvestw5.loc[:, fi_status_12_col+fi_status_cause_cols+fi_status_30_cols ]

df7.shape
df7.head(2)

Unnamed: 0,hhid,s7q3,s7q5_1,s7q5_2,s7q5_3,s7q5_os,s7q1a,s7q1b,s7q1c,s7q1d,s7q1e,s7q1f,s7q1g,s7q1h,s7q1i,s7q1j
0,10001,2. NO,,,,,2. NO,2. NO,2. NO,1. YES,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO
1,10002,2. NO,,,,,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO


In [189]:
# Rename to columns names
df7.rename(columns={"s7q1a": "limit_type",
                   "s7q1b": "less_preferred",
                   "s7q1c": "limited_portion",
                   "s7q1d": "reduce",
                   "s7q1e": "ateless",
                   "s7q1f": "ranout",
                   "s7q1g": "none_did_not_eat",
                   "s7q1h": "whole_day",
                   "s7q1i": "restrict_for_children",
                   "s7q1j": "borrowed"})

Unnamed: 0,hhid,s7q3,s7q5_1,s7q5_2,s7q5_3,s7q5_os,limit_type,less_preferred,s7q1c,s7q1d,s7q1e,s7q1f,s7q1g,s7q1h,s7q1i,s7q1j
0,10001,2. NO,,,,,2. NO,2. NO,2. NO,1. YES,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO
1,10002,2. NO,,,,,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO
2,10004,2. NO,,,,,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO
3,10005,2. NO,,,,,1. YES,2. NO,1. YES,2. NO,1. YES,1. YES,1. YES,2. NO,2. NO,2. NO
4,10008,1. YES,5. FOOD IN THE MARKET WAS VERY EXPENSIVE,12. LACK OF MONEY,13. LACK OF ACCESS TO MONEY AT THE BANK DUE TO...,,1. YES,1. YES,2. NO,2. NO,1. YES,2. NO,2. NO,2. NO,1. YES,2. NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4710,379144,2. NO,,,,,1. YES,1. YES,1. YES,1. YES,1. YES,2. NO,2. NO,2. NO,1. YES,2. NO
4711,379146,2. NO,,,,,2. NO,1. YES,1. YES,2. NO,1. YES,2. NO,2. NO,2. NO,2. NO,2. NO
4712,379151,2. NO,,,,,2. NO,1. YES,1. YES,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO
4713,379154,2. NO,,,,,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO,2. NO


#### Non Farm Enterprise (`sect8a_harvestw5`)
This section records **`household-level`** information on non-farm enterprises/activities (NFE) run by members of the household.

In [9]:
sect8a_harvestw5 = pd.read_csv("../data/pre_harvest_w5/household/sect8a_harvestw5.csv")

columns = ["hhid", "s8q1__1",]
df_8a = sect8a_harvestw5.loc[:, columns]
df_8a["s8q1__1"] = df_8a["s8q1__1"].astype('category').cat.codes
print(df_8a.shape)
df_8a.head()

(4715, 2)


Unnamed: 0,hhid,s8q1__1
0,10001,0
1,10002,0
2,10004,0
3,10005,0
4,10008,0


#### Housing (`sect9_harvestw5`)
This section records **`household-level`** information on homeownership and characteristics of home (type of roof, floor, outside wall, number of rooms, type of cookstove and cooking fuel, electricity connection, drinking water source, sanitation facility, refuse collection)


In [10]:
sect9_harvestw5 = pd.read_csv("../data/pre_harvest_w5/household/sect9_harvestw5.csv",
                             low_memory=False)

columns = ["hhid",
           "s9q3", # own , rent ...
          ]

df_9 = sect9_harvestw5.loc[:, columns]
df_9.shape

(4715, 2)

#### Food Consumption Shocks (`sect10_harvestw5`)
This section records **`household-level`** information on the effects of external conditions on a household’s normal food consumption practices.

* Face fcs because of plant disease
* 

In [13]:
sect10_harvestw5 = pd.read_csv("../data/pre_harvest_w5/household/sect10_harvestw5.csv",
                             low_memory=False)
print(sect10_harvestw5.shape)
sect10_harvestw5.head(2)

(4715, 54)


Unnamed: 0,zone,state,lga,sector,ea,hhid,s10q1a,s10q1b,s10q2a,s10q2b,s10q3a,s10q3b,s10q4a,s10q4b,filter1,s10q5a,s10q5b,s10q6a,s10q6b,s10q7a,s10q7b,s10q8,s10q9a,s10q9b,s10q10a,s10q10b,filter2,s10q11a,s10q11b,s10q12a,s10q12c,s10q13a,s10q13b,s10q14a,s10q14b,s10q15a,s10q15b,s10q16a,s10q16b,s10q17a,s10q17b,s10q18a,s10q18b,s10q19a,s10q19b__1,s10q19b__2,s10q19b__3,s10q19b__4,s10q19c,s10q20a,s10q20b,filter3,s10q42,s10q43
0,South East,Abia,UMUAHIA NORTH,URBAN,670,10001,NO,,NO,,NO,,NO,,2,,,,,,,NO,,,,,2,,,,,NO,,NO,,NO,,NO,,NO,,NO,,NO,,,,,,NO,,2,,
1,South East,Abia,UMUAHIA NORTH,URBAN,670,10002,NO,,NO,,NO,,NO,,2,,,,,,,YES,NO,,NO,,1,YES,Somewhat severe,NO,,NO,,NO,,YES,Somewhat severe,NO,,NO,,NO,,NO,,,,,,NO,,1,Same as before,Better than before


#### Economic Shocks (`sect12_harvestw5`)
This section records `household-shock` level information on economic shocks affecting the household and coping mechanism adopted by the household.

* climatic shock: Floods, Droughts, Fi
* Increase in price of farming/business inputs (excluding petrol and other fuels)
* Fall in the price of farming/business output
* Increase in price of major food items usually consumed by the household 

In [172]:
sect12_harvestw5 = pd.read_csv("../data/pre_harvest_w5/household/sect12_harvestw5.csv",
                             low_memory=False)

In [163]:
# Utitily function to filter by shocks
def filter_and_select_columns(dataframe, query_str, new_col):
    """
    Filters a DataFrame based on a query condition and selects specific columns.
    """
    columns = ["hhid", "s12q1"]
    query_condition = "shock_cd == @query_str"
    filtered_df = dataframe.query(query_condition)
    selected_df = filtered_df.loc[:, columns].rename(columns={"s12q1": new_col}).reset_index(drop=True)
    return selected_df

In [179]:
# Climate Change
# 1. Floods 
# 3. Droughts
# 4. Fire  
# 6. Pests and Plant Diseases 
# 2. Irregular rains (including unexpected variation in timing and rainfall amount)

df_flood = filter_and_select_columns(sect12_harvestw5, "1. Floods", "shock_flood")
df_drought = filter_and_select_columns(sect12_harvestw5, "3. Droughts", "shock_drought")
df_fire = filter_and_select_columns(sect12_harvestw5, "4. Fire", "shock_fire")

df_pst_disease = filter_and_select_columns(sect12_harvestw5, 
                                           "6. Pests and Plant Diseases",
                                          "shock_pest_disease")
df_rainfall = filter_and_select_columns(sect12_harvestw5, 
                                       "2. Irregular rains (including unexpected variation in timing and rainfall amount)",
                                       "shock_irr_rain")

# Merge the dataframes
df_climate = pd.merge(pd.merge(pd.merge(pd.merge(df_flood, df_drought, on='hhid', how='left'), 
         df_fire, on='hhid', how='left'),df_pst_disease, on="hhid", how="left"),
        df_rainfall, on="hhid", how="left")

shock_columns = ["shock_flood", "shock_drought", "shock_fire", "shock_pest_disease", "shock_irr_rain"]
df_climate['shock_climate'] = df_climate[shock_columns].apply(lambda row: any(value == "1. YES" for value in row), axis=1)

df_climate.shape

(4715, 7)

In [166]:
## Energy
# 27. Shortage/scarcity of petrol 
# 26. Increase in prices of other fuels (excluding petrol) (e.g. cooking gas, kerosene, firewood, charcoal)
# 25. Increase in price of oil and fuel  

df_scarcity_petrol = filter_and_select_columns(sect12_harvestw5, 
                                               "27. Shortage/scarcity of petrol", 
                                               "shock_petrol_scarcity")
df_other_fuel = filter_and_select_columns(sect12_harvestw5, 
                          "26. Increase in prices of other fuels (excluding petrol) (e.g. cooking gas, kerosene, firewood, charcoal)", 
                          "shock_increase_other_fuel")
df_fuel = filter_and_select_columns(sect12_harvestw5,  
                                      "25. Increase in price of oil and fuel",
                                      "shock_incr_oil_fuel")

# Merge the dataframe
df_shock_energy = pd.merge(pd.merge(df_scarcity_petrol, df_other_fuel, 
                                    on='hhid', how='left'), 
                           df_fuel, on='hhid', how='left')


shock_columns = ["shock_petrol_scarcity", 
                 "shock_increase_other_fuel", 
                 "shock_incr_oil_fuel"]

df_shock_energy['shock_energy'] = (df_shock_energy[shock_columns]
                                   .apply(lambda row: any(value == "1. YES" for value in row), axis=1)
                                  )
df_shock_energy.shape

(4715, 5)

In [167]:
# 24. Increase in price of major food items usually consumed by the household 
df_shock_inc_price_food = filter_and_select_columns(sect12_harvestw5, 
                                               "24. Increase in price of major food items usually consumed by the household", 
                                               "shock_increase_price_food")

df_shock_inc_price_food.shape

(4715, 2)

In [None]:
# Output Related shocks
# 23. Fall in the price of farming/business output 


df_shock_fall_price_out = filter_and_select_columns(sect12_harvestw5, 
                                               "23. Fall in the price of farming/business output", 
                                               "shock_fall_price_out")

df_shock_fall_price_out.shape

In [177]:
# 22. Increase in price of farming/business inputs (excluding petrol and other fuels)
df_shock_incr_price_input = filter_and_select_columns(sect12_harvestw5, 
                                               "22. Increase in price of farming/business inputs (excluding petrol and other fuels)", 
                                               "shock_incr_price_input")

df_shock_incr_price_input.shape

(4715, 2)

In [171]:
# InSecurity
# 19. Kidnapping/Abduction for ransom 
# 18. Hijacking/robbery/assault of a household member
# 17. Theft/looting of cash and other property  


df_kidnap = filter_and_select_columns(sect12_harvestw5, 
                                      "19. Kidnapping/Abduction for ransom", 
                                      "shock_kidnap")
df_robbery = filter_and_select_columns(sect12_harvestw5, 
                                       "18. Hijacking/robbery/assault of a household member", 
                                       "shock_robbery")
df_theft = filter_and_select_columns(sect12_harvestw5,  
                                      "17. Theft/looting of cash and other property",
                                      "shock_theft")

# Merge the dataframe
df_shock_insecurity = pd.merge(pd.merge(df_kidnap, df_robbery, 
                                        on='hhid', how='left'), 
                               df_theft, on='hhid', how='left')


shock_columns = ["shock_kidnap", 
                 "shock_robbery", 
                 "shock_theft"]

df_shock_insecurity['shock_insecurity'] = (df_shock_insecurity[shock_columns]
                                   .apply(lambda row: any(value == "1. YES" for value in row), axis=1)
                                  )
df_shock_insecurity.shape

(4715, 5)

### Agriculture Data (Household level)

#### Agriculture (`sectaa_harvestw5`)
This section captures `household-level` information. The dataset includes household identifiers, enumerator and supervisor identifiers, the date and time of the interview, questions to determine whether the household is engaged in agricultural activities.


In [178]:
sectaa_harvestw5 = pd.read_csv("../data/pre_harvest_w5/agriculture/sectaa_harvestw5.csv",
                             low_memory=False)


columns = ["hhid", "ag1", "ag2", "ag5", "wt_wave5"]

print(sectaa_harvestw5.shape)
sectaa_harvestw5.head(2)

(4717, 14)


Unnamed: 0,zone,lga,state,ea,sector,cluster,strata,hhid,wt_wave5,old_new,InterviewStart,ag1,ag2,ag5
0,South East,UMUAHIA NORTH,Abia,670,URBAN,115-670,4,10001,15885.095703,1. Old Panel household,2024-02-28T18:16:12,NO,YES,NO
1,South East,UMUAHIA NORTH,Abia,670,URBAN,115-670,4,10002,16588.6875,1. Old Panel household,2024-02-29T17:53:09,NO,YES,NO


### Community Level Data

#### Infrastructure and Transportation (`sectc5_plantingw5`)
Data collected through Post Harvest `Community`.This data records the availability of infrastructure and the means of transportation to access it within the community.

In [9]:
sectc5_plantingw5 = pd.read_csv("../data/post_planting_w5/community/sectc5_plantingw5.csv",
                             low_memory=False)

# Main crops cultivated in this community
columns = ["zone", "state", "ea", "cluster_id", 
           "infra_code", "c5q1",
          "c5q3", # how far is the closest ;distance to market
          ]

sectc5_plantingw5.head(2)

Unnamed: 0,zone,state,lga,sector,ea,cluster_id,cluster,infra_code,c5q1,c5q3,c5q4,c5q4_os,c5q5,c5q5a,c5q6a,c5q6b,c5q7,c5q8
0,5. South South,10. Delta,1002. ANIOCHA SOUTH,1. Urban,114,1002-114,1002-114,201. Nursery school,1. YES,0. WITHIN THE COMMUNITY,,,,,,,2. NO,5.0
1,5. South South,10. Delta,1002. ANIOCHA SOUTH,1. Urban,114,1002-114,1002-114,202. Government primary school,1. YES,0. WITHIN THE COMMUNITY,,,,,,,2. NO,5.0


#### Community Main Crops (`sectc2a_harvestw5`)
Data collected through Post Harvest `Community`.
This data records the activities for which individuals in this community hire labor.

In [5]:
sectc2a_harvestw5 = pd.read_csv("../data/pre_harvest_w5/community/sectc2a_harvestw5.csv",
                             low_memory=False)

# Main crops cultivated in this community
columns = ["zone", "state", "lga", "sector", "c2q2b_1", "c2q2b_2"]

#### Key Events (`sectc5_harvestw5`)
This data records the key events in the `community` in the last 3 years.


In [31]:
sectc5_harvestw5 = pd.read_csv("../data/pre_harvest_w5/community/sectc5_harvestw5.csv",
                             low_memory=False)

columns = ["zone", "state", "lga", "sector","cluster_id", "c2q2b_1", "c2q2b_2"]

print(sectc5_harvestw5.shape)
sectc5_harvestw5.head(2)

(9652, 16)


Unnamed: 0,zone,state,lga,sector,ea,cluster_id,cluster,event_cd,c5q1,c5q1_os_1,c5q1_os_2,c5q2__2021,c5q2__2022,c5q2__2023,c5q2__2024,c5q3
0,5. South South,10. Delta,1002. ANIOCHA SOUTH,1. Urban,114,1002-114,1002-114,1. Drought,2. NO,,,,,,,
1,5. South South,10. Delta,1002. ANIOCHA SOUTH,1. Urban,114,1002-114,1002-114,2. Flood,2. NO,,,,,,,


#### Food Price (`sectc8_harvestw5`)
This section records prices of food items in the `community`.

In [32]:
sectc8_harvestw5 = pd.read_csv("../data/pre_harvest_w5/community/sectc8_harvestw5.csv",
                             low_memory=False)



print(sectc8_harvestw5.shape)
sectc8_harvestw5.head(2)

(56388, 15)


Unnamed: 0,zone,state,lga,sector,ea,cluster_id,cluster,item_cd,c8aq1,c8aq2_a,c8aq2_b,c8aq2_b_os,c8aq2_c,c8aq2_cvn,c8aq3
0,5. South South,10. Delta,1002. ANIOCHA SOUTH,1. Urban,114,1002-114,1002-114,10. Guinea corn/sorghum,2. NO,,,,,,
1,5. South South,10. Delta,1002. ANIOCHA SOUTH,1. Urban,114,1002-114,1002-114,11. Millet,1. YES,,13. CIGARETTE CUP,,,0.205,150.0


## Data Merging


### Merge Individual level data: 
Merge `df_h`, `df_edu` to include the education of head of household, to the other household head variables. Such as:
* marital status
* Gender
* age

In [183]:
df_hh_h = pd.merge(df_h, df_edu, how='left', on=["hhid", "indiv"])
df_hh_h.shape

(4716, 16)

### Merge Household level data on `hhid`
Merge the following dataframe


In [None]:
### Merge Community level data

## Data Preparation
Create the important `Household` variables required for statistical analysis

In [82]:
# identify members less than 18
sect1_harvestw5["under_18"] = sect1_harvestw5["s1q6"] < 18

# Identify members over 85
sect1_harvestw5["over_85"] = sect1_harvestw5["s1q6"] > 85

# n_elderly : Number of household members over the age of 85
# hh_size: Household size
# n_childrem: Number of household members less than 18
sect1_summary = sect1_harvestw5.groupby(by="hhid").agg(hh_size=("s1q6", "count"),
                                                       no_children=("under_18", "sum"),
                                                       n_elderly=("over_85", "sum")
                                                      ).reset_index()

# prop_children : Proportion of household members less than 18 years of age
sect1_summary["prop_children"] = sect1_summary["no_children"] \
/ sect1_summary["hh_size"]

print("Shape of the dataset",sect1_summary.shape )
print("---------------------")
sect1_summary.head(7)

Shape of the dataset (4716, 5)
---------------------


Unnamed: 0,hhid,hh_size,no_children,n_elderly,prop_children
0,10001,7,3,0,0.428571
1,10002,6,3,0,0.5
2,10004,3,0,0,0.0
3,10005,6,1,0,0.166667
4,10008,7,2,0,0.285714
5,10009,6,4,0,0.666667
6,10010,3,0,0,0.0


### Wealth Index (`wealth_index`)
This section records **`household-level`** information on household wealth index based on the characteristics of home (type of roof, floor, outside wall, number of rooms, type of cookstove and cooking fuel, electricity connection, drinking water source, sanitation facility, refuse collection)


In [8]:
# Create key variables
df_1["DOMESTIC"] = np.where(df_1["s1q3"] == "DOMESTIC HELP (RESIDENT)", 1, 0)
sect9_harvestw5["HOME"] = np.where(sect9_harvestw5["s9q3"] == "OWNED", 1, 0)
sect9_harvestw5["LAND"] = np.where((df["owns_land"] == "yes" | df["land_worked"] == "family_land"), 1, 0)


NameError: name 'df' is not defined

In [None]:
columns = ["hhid", "s9q3", "s9q10", "s9q11", "s9q12", "s9q12a", "s9q13",
          "s9q14", "s9q16_1", "s9q16_2", "s9q19_1", "s9q19_2",
          "s9q20", "s9q27", "s9q32", "s9q40", "s9q44", "s9q48"]

In [94]:
# hh_educ : Educational attainment of the household head: 
# No education; Senior Secondary School Certificate (SSSC) or less; 
# Undergraduate, professional or technical degree; Graduate degree

# Own_home : Home ownership = 1 and 0 otherwise

# Single_parent : Lone adult as head of household that is divorced, 
# separated or widowed and lives with children = 1, 0 otherwise

# Agriculture : Household member(s) involved in agriculture = 1 and 0 otherwise

# Create annual total expenditure (non food)
# sect6c_harvestw5["tot_exp"] = sect6c_harvestw5.groupby(by="hhid")["s6q6"].transform("sum")

In [None]:
# Explore the Food consumption shocks by involvement in agricultural activities or otherwise
# Explore economic Shock by involvement in agricultural activities or otherwise.

## Coping Strategies

sect5a_harvestw5