In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Survey identifier; it doubles as the input file name and is embedded in the output file name.
country = "mwi_mics19-20"

In [3]:
# Load the survey table for the selected country from the current directory.
data = pd.read_csv(f"./{country}")

In [4]:
# Display the loaded table (the notebook shows only the first and last rows of a large frame).
data

Unnamed: 0,hh_id,psu,strata,weight,d_cm,d_nutr,d_satt,d_educ,d_elct,d_sani,...,cookingfuel,television,radio,telephone,refrigerator,car,bicycle,motorbike,animal_cart,computer
0,101,1,Chitipa Rural,0.432607,0.0,,0.0,0.0,1.0,0.0,...,WOOD,No,No,No,No,No,No,No,No,No
1,102,1,Chitipa Rural,0.432607,0.0,1.0,0.0,1.0,1.0,1.0,...,WOOD,No,No,Yes,No,No,Yes,No,No,No
2,103,1,Chitipa Rural,0.432607,0.0,1.0,0.0,0.0,1.0,1.0,...,WOOD,No,No,Yes,No,No,No,No,No,No
3,104,1,Chitipa Rural,0.432607,0.0,0.0,0.0,0.0,1.0,0.0,...,WOOD,No,Yes,Yes,No,No,Yes,No,No,No
4,106,1,Chitipa Rural,0.432607,0.0,0.0,0.0,1.0,1.0,1.0,...,WOOD,No,No,No,No,No,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25414,111220,1112,Blantyre City,1.204571,0.0,0.0,0.0,1.0,1.0,1.0,...,WOOD,No,Yes,No,No,No,No,No,No,No
25415,111221,1112,Blantyre City,1.204571,0.0,1.0,0.0,0.0,1.0,1.0,...,WOOD,Yes,Yes,Yes,No,No,Yes,No,No,No
25416,111222,1112,Blantyre City,1.204571,0.0,0.0,1.0,0.0,1.0,1.0,...,CHARCOAL,Yes,Yes,Yes,Yes,No,No,Yes,No,No
25417,111223,1112,Blantyre City,1.204571,0.0,0.0,0.0,0.0,1.0,1.0,...,WOOD,No,No,No,No,No,No,No,No,No


In [None]:
# One row per PSU: per-group maximum of weight, strata and region,
# with 'psu' restored as a regular column.
PSU = (
    data
    .groupby('psu')
    .agg(dict.fromkeys(['weight', 'strata', 'region'], 'max'))
    .reset_index()
)

In [6]:
# Cast every column of `data` to the pandas categorical dtype, in place, column by column.
for column_name in data.columns:
    data[column_name] = data[column_name].astype('category')

In [7]:
# Columns that must be numeric rather than categorical.
cols_to_float = ['weight', 'eduyears']
cols_to_int = [
    'sex_Female', 'sex_Male',
    'agec7_0-4', 'agec7_5-9', 'agec7_10-14',
    'agec7_15-17', 'agec7_18-59', 'agec7_60+',
    'hhsize', 'child_eligible', 'child_mortality',
]

# Nullable pandas dtypes ('Float32' / 'Int32') are used for the casts.
# Floats are converted first, then integers, one column at a time.
for target_dtype, column_group in (('Float32', cols_to_float), ('Int32', cols_to_int)):
    for column_name in column_group:
        data[column_name] = data[column_name].astype(target_dtype)

In [8]:
# Training table: `data` without the columns listed below.
# `columns=` already selects the column axis, so the redundant `axis=1`
# (ignored by pandas when `columns=` is given) has been removed.
trn_df = data.drop(columns=[
    'strata', 'no_missing_edu',
    'attendance_not currently attending', 'attendance_currently attending',
    'no_missing_atten', 'timetowater', 'underweight_0.0', 'underweight_1.0',
    'stunting_0.0', 'stunting_1.0', 'wasting_0.0', 'wasting_1.0',
])

# Experiments

The Electricity indicator has a $\phi_K$ value greater than 0.6 with the Housing indicator. According to their $\phi_K$ values, years of education is more strongly related to the Electricity, Assets and Housing indicators. We will use these variables in the value-imputation experiments.

In [9]:
# Contingency table of the electricity indicator (rows) against the housing indicator (columns).
pd.crosstab(index=data['d_elct'], columns=data['d_hsg'])

d_hsg,0,1
d_elct,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,3543,1551
1.0,3554,16736


In [10]:
# Print the contingency table of d_educ against each of the related indicators.
for indicator in ('d_elct', 'd_hsg', 'd_asst'):
    print("=== d_educ vs ", indicator, "===")
    print(pd.crosstab(data['d_educ'], data[indicator]))

=== d_educ vs  d_elct ===
d_elct   0.0    1.0
d_educ             
0.0     4554  13623
1.0      534   6630
=== d_educ vs  d_hsg ===
d_hsg      0      1
d_educ             
0.0     6420  11779
1.0      678   6498
=== d_educ vs  d_asst ===
d_asst     0     1
d_educ            
0.0     8974  9225
1.0     1528  5648


#### Select experiment

In [11]:
import Experiments

In [12]:
# Experiment settings are entered interactively and kept as strings: they are
# converted with int() when the experiment is selected and concatenated into the
# workspace and output file names. Surrounding whitespace is stripped so that an
# accidental space does not end up inside those names.
Experiment = input("Enter the experiment number: ").strip()
Percentage_of_NA = input("Enter the percentage of NA values: ").strip()

In [13]:
# Apply the selected experiment to the training table.
# NOTE(review): trn_df is rebound from itself, so re-running this cell applies
# the experiment to an already-processed table.
trn_df = Experiments.select_experiment(
    data=trn_df,
    experiment=int(Experiment),
    percentage=int(Percentage_of_NA),
)

In [14]:
# Deprivation indicators and their weights; the weighted score is added as a
# control for better MPI prediction.
# The first four indicators weigh 1/6 each, the remaining six 1/18 each.
indicators = [
    'd_cm', 'd_nutr', 'd_satt', 'd_educ',
    'd_elct', 'd_wtr', 'd_sani', 'd_hsg', 'd_ckfl', 'd_asst',
]
weights = np.array([1/6] * 4 + [1/18] * 6)

In [15]:
# Weighted deprivation score per row, rounded to 3 decimals.
# skipna=False makes the score missing whenever any indicator is missing.
trn_df['score'] = (
    trn_df[indicators]
    .astype(float)
    .mul(weights, axis='columns')
    .sum(axis=1, skipna=False)
    .round(3)
)

In [None]:
from mostlyai import engine
from pathlib import Path

In [None]:
# Workspace directory named after the experiment number and NA percentage.
ws = Path(f"temp-ws_EXP{Experiment}_{Percentage_of_NA}")
engine.init_logging()

In [None]:
# Split the training table, with the PSU table as context, into the workspace
# (PQT files for trn + val under {ws}/OriginalData/tgt-data).
engine.split(
    workspace_dir=ws,
    tgt_data=trn_df,
    tgt_primary_key="hh_id",
    tgt_context_key="psu",
    ctx_data=PSU,
    ctx_primary_key="psu",
    model_type="TABULAR",
)

In [None]:
# Analyze the split data; value protection is switched off.
engine.analyze(
    workspace_dir=ws,
    value_protection=False,
)

In [None]:
# Run the engine's encode step on the same workspace.
engine.encode(
    workspace_dir=ws,
)

In [None]:
    
# Train the model and store it under {ws}/ModelStore/model-data.
engine.train(
    workspace_dir=ws,
    model="MOSTLY_AI/Large",
    # The engine interprets max_training_time in minutes, so this caps training
    # at 10 minutes (the previous comment incorrectly said 1 minute).
    max_training_time=10,
    # None lets the engine pick the device itself (GPU when available, otherwise
    # CPU). The previously hard-coded 'cuda' had no fallback on CPU-only machines
    # although the comment promised one.
    device=None,
)

In [None]:
# Generate synthetic data from the trained workspace, requesting imputation for the d_hsg column.
engine.generate(
    workspace_dir=ws,
    imputation={"columns": ["d_hsg"]},
)

In [None]:
# Read the generated table back from the workspace's SyntheticData location.
gen_df = pd.read_parquet(ws.joinpath("SyntheticData"))

In [None]:
# Make the PSU key categorical before the PSU table is modelled on its own.
# NOTE(review): this mutates PSU in place after it was already passed as ctx_data above.
PSU["psu"] = PSU["psu"].astype("category")

In [None]:
# Generate synthetic context data: the PSU table gets its own workspace
# and is split there as a standalone tabular target.
ws_ctx = Path("temp-ws_ctx")
engine.init_logging()
engine.split(
    tgt_data=PSU,
    workspace_dir=ws_ctx,
    model_type="TABULAR",
)

In [None]:
# Analyze (value protection off) and then encode the context workspace.
engine.analyze(
    value_protection=False,
    workspace_dir=ws_ctx,
)
engine.encode(workspace_dir=ws_ctx)

In [None]:
# Train on the context workspace; a smaller model is used for the context data.
engine.train(model="MOSTLY_AI/Medium", workspace_dir=ws_ctx)

In [None]:
# Generate synthetic context data from the context workspace.
engine.generate(
    workspace_dir=ws_ctx,
)

In [None]:
# Read the generated context table back from the context workspace.
synthetic_context_data = pd.read_parquet(ws_ctx.joinpath("SyntheticData"))

In [None]:
# Save the synthetic table in the current directory; the file name carries the
# country, experiment number and NA percentage.
# index=False keeps the uninformative row index out of the file, so reloading it
# with pd.read_csv does not produce a spurious "Unnamed: 0" column.
gen_df.to_csv(f"./syn_{country}_EXP{Experiment}_{Percentage_of_NA}", index=False)

In [None]:
# Joint distribution of d_asst (rows) and d_hsg (columns) in the synthetic data, as proportions.
pd.crosstab(
    index=gen_df['d_asst'],
    columns=gen_df['d_hsg'],
    normalize=True,
)

In [None]:
# Joint distribution of d_asst (rows) and d_hsg (columns) in the original data, as proportions.
pd.crosstab(
    index=data['d_asst'],
    columns=data['d_hsg'],
    normalize=True,
)

In [None]:
from mostlyai import qa

In [None]:
# Build the QA report comparing the synthetic table with the training table.
# Only target tables are passed here; no context data is supplied.
report_path, metrics = qa.report(syn_tgt_data=gen_df, trn_tgt_data=trn_df)

In [None]:
# Pretty-print the QA metrics as JSON with a 4-space indent.
print(metrics.model_dump_json(indent=4))

In [None]:
# List the column labels of the synthetic table.
gen_df.columns