## Load and inspect dataset


In [39]:
import pandas as pd

# Replace with your full file path
df = pd.read_stata("../data/raw/CMHR22FL.DTA")

# See all column names
print(df.columns.tolist()[:50])   # show first 50 column names


['hhid', 'hv000', 'hv001', 'hv002', 'hv003', 'hv004', 'hv005', 'hv006', 'hv007', 'hv008', 'hv009', 'hv010', 'hv011', 'hv012', 'hv013', 'hv014', 'hv015', 'hv016', 'hv017', 'hv018', 'hv019', 'hv020', 'hv021', 'hv022', 'hv023', 'hv024', 'hv025', 'hv026', 'hv027', 'hv028', 'hv201', 'hv202', 'hv203', 'hv204', 'hv205', 'hv206', 'hv207', 'hv208', 'hv209', 'hv210', 'hv211', 'hv212', 'hv213', 'hv214', 'hv215', 'hv216', 'hv217', 'hv218', 'hv219', 'hv220']


## selecting relevant variables

In [49]:
# Select relevant columns using Cameroon 2022 DHS names
selected_cols = [
    'hv001',   # cluster number
    'hv024',   # region
    'hv025',   # urban/rural
    'hv009',   # household size
    'hv219',   # sex of household head
    'hv220',   # age of household head
    'hv206',   # electricity access
    'shniv_01',  # education level
    'shecr_01',  # years of education
    'sh30h',     # cooking fuel type
    'sh27' ,      # wealth category
    'sh30m'
]

# Filter the dataset
data = df[selected_cols].copy()
print(data.head())


   hv001                hv024  hv025  hv009 hv219 hv220 hv206 shniv_01  \
0      1  adam /nord/ext-nord  rural      5  male  36.0    no      NaN   
1      1  adam /nord/ext-nord  rural      6  male  35.0    no      NaN   
2      1  adam /nord/ext-nord  rural      4  male  50.0    no      NaN   
3      1  adam /nord/ext-nord  rural      8  male  48.0    no      NaN   
4      1  adam /nord/ext-nord  rural      5  male  45.0    no      NaN   

  shecr_01  sh30h  sh27  sh30m  
0       no   10.0   3.0   45.0  
1       no    9.0   6.0   30.0  
2       no    9.0   4.0   37.0  
3       no   10.0   8.0   50.0  
4       no    9.0   5.0   50.0  


## inapecting the sh30h variable

In [50]:
data['sh30h'].value_counts(dropna=False)
data['sh30h'].unique()



array([10.,  9., 17., 12., 13., 14., 15., 19., 16., 18., 11., nan,  7.,
        8., 20.,  6.])

In [51]:
data['sh30h'] = pd.to_numeric(data['sh30h'], errors='coerce')


## mapping numeric codes to readable texts

## simpliying into firewood vs gas category

In [52]:
print("sh30h unique:", df['sh30h'].unique()[:10])
print("sh30m unique:", df['sh30m'].unique()[:10])


sh30h unique: [10.  9. 17. 12. 13. 14. 15. 19. 16. 18.]
sh30m unique: [45. 30. 37. 50. 10. 13. 35. 40. 19. 32.]


In [53]:
df[['sh30h', 'sh30m']].value_counts().head(10)


sh30h  sh30m
10.0   20.0     34
       30.0     34
       0.0      31
       15.0     31
11.0   40.0     31
9.0    15.0     30
       30.0     29
10.0   45.0     28
12.0   10.0     28
11.0   10.0     28
Name: count, dtype: int64

In [54]:
print(df[['sh30h', 'sh30m']].dropna().head(20))


    sh30h  sh30m
0    10.0   45.0
1     9.0   30.0
2     9.0   37.0
3    10.0   50.0
4     9.0   50.0
5    17.0   10.0
6    17.0   13.0
7    12.0   45.0
8    13.0   35.0
9    12.0   50.0
10   12.0   40.0
11   13.0   19.0
12   12.0   32.0
13   12.0   35.0
14   14.0   35.0
15   15.0   12.0
16   15.0   46.0
17   14.0   42.0
18   15.0    5.0
19   14.0   35.0


In [56]:
import numpy as np

# Clean sh30m: convert to numeric and fill missing values temporarily
data['sh30m'] = pd.to_numeric(data['sh30m'], errors='coerce')

# Compute fuel_code only for valid rows
data['fuel_code'] = np.floor(data['sh30m'] / 10).astype('Int64')   # Int64 allows NaN


fuel_mapping = {
    1: 'electricity',
    2: 'gas',
    3: 'biogas',
    4: 'kerosene',
    5: 'charcoal',
    6: 'wood',
    7: 'crop_residue',
    8: 'animal_dung',
    9: 'other'
}

data['fuel_type'] = data['fuel_code'].map(fuel_mapping)


In [58]:
fuel_mapping = {
    1: 'electricity',
    2: 'gas',
    3: 'biogas',
    4: 'kerosene',
    5: 'charcoal',
    6: 'wood',
    7: 'crop_residue',
    8: 'animal_dung',
    9: 'other'
}

data['fuel_type'] = data['fuel_code'].map(fuel_mapping)

def classify_fuel(f):
    if f in ['wood', 'charcoal', 'crop_residue', 'animal_dung']:
        return 'Firewood'
    elif f in ['gas', 'biogas','kerosene']:
        return 'Gas'
    else:
        return 'Other'

data['fuel_category'] = data['fuel_type'].apply(classify_fuel)

print(data['fuel_category'].value_counts(normalize=True))


fuel_category
Gas         0.500283
Other       0.354438
Firewood    0.145280
Name: proportion, dtype: float64


## selecting relevant columns

In [59]:
predictor_cols = [
    'hv001',     # cluster/community
    'hv024',     # region
    'hv025',     # urban/rural
    'hv009',     # household size
    'hv206',     # electricity access
    'sh27',      # wealth category
    'shniv_01',  # education level (niveau)
    'shecr_01',  # years of education
    'hv219',     # sex of household head
    'hv220',     # age of household head
    'fuel_category'  # from your previous step
]

socio_df = data[predictor_cols].copy()
print(socio_df.head())


   hv001                hv024  hv025  hv009 hv206  sh27 shniv_01 shecr_01  \
0      1  adam /nord/ext-nord  rural      5    no   3.0      NaN       no   
1      1  adam /nord/ext-nord  rural      6    no   6.0      NaN       no   
2      1  adam /nord/ext-nord  rural      4    no   4.0      NaN       no   
3      1  adam /nord/ext-nord  rural      8    no   8.0      NaN       no   
4      1  adam /nord/ext-nord  rural      5    no   5.0      NaN       no   

  hv219 hv220 fuel_category  
0  male  36.0           Gas  
1  male  35.0           Gas  
2  male  50.0           Gas  
3  male  48.0      Firewood  
4  male  45.0      Firewood  


# clean and standardize predictors

In [62]:
# converting numeric codes to readable text
socio_df['urban_rural'] = socio_df['hv025'].replace({1: 'Urban', 2: 'Rural'})
socio_df['electricity'] = socio_df['hv206'].replace({1: 'Yes', 0: 'No'})
socio_df['sex_head'] = socio_df['hv219'].replace({1: 'Male', 2: 'Female'})


In [64]:
#handling education and wealth
socio_df['sh27'] = pd.to_numeric(socio_df['sh27'], errors='coerce')        # wealth
socio_df['shecr_01'] = pd.to_numeric(socio_df['shecr_01'], errors='coerce')# years of education
socio_df['shniv_01'] = pd.to_numeric(socio_df['shniv_01'], errors='coerce')# education level
socio_df['hv009'] = pd.to_numeric(socio_df['hv009'], errors='coerce')      # household size
socio_df['hv220'] = pd.to_numeric(socio_df['hv220'], errors='coerce')      # age

socio_df = socio_df.dropna(subset=['fuel_category'])


In [65]:
#aggregate to community level
agg_df = (
    socio_df
    .groupby(['hv024', 'urban_rural'])
    .agg(
        firewood_share=('fuel_category', lambda x: (x == 'Firewood').mean()),
        gas_share=('fuel_category', lambda x: (x == 'Gas').mean()),
        avg_household_size=('hv009', 'mean'),
        avg_education=('shecr_01', 'mean'),
        avg_education_level=('shniv_01', 'mean'),
        avg_wealth=('sh27', 'mean'),
        electricity_rate=('hv206', lambda x: (x == 1).mean()),
        avg_age=('hv220', 'mean')
    )
    .reset_index()
)

agg_df.rename(columns={'hv024': 'region'}, inplace=True)
print(agg_df.head())


                region urban_rural  firewood_share  gas_share  \
0      yaound /douala       urban        0.150538   0.494624   
1      yaound /douala       rural             NaN        NaN   
2  adam /nord/ext-nord       urban        0.155620   0.481268   
3  adam /nord/ext-nord       rural        0.154818   0.527646   
4      centre /sud/est       urban        0.081481   0.555556   

   avg_household_size  avg_education  avg_education_level  avg_wealth  \
0            5.634409            NaN                  NaN    2.997849   
1                 NaN            NaN                  NaN         NaN   
2            6.325648            NaN                  NaN    3.247839   
3            5.635071            NaN                  NaN    3.113924   
4            6.770370            NaN                  NaN    3.088889   

   electricity_rate    avg_age  
0               0.0  38.239177  
1               NaN        NaN  
2               0.0  42.406340  
3               0.0  43.370253  
4    

  .groupby(['hv024', 'urban_rural'])


In [66]:
agg_df.to_csv(r"C:\Users\Owner\OneDrive\Desktop\Data\cleaned_socioeconomic_cameroon.csv", index=False)
