In [1]:
import numpy as np
import pandas as pd
from missmecha.generator import MissMechaGenerator
from missmecha.impute import SimpleSmartImputer

# Seed NumPy's legacy global RNG so the synthetic columns below are reproducible.
np.random.seed(42)

# Toy mixed-type table with 100 rows: an integer age, a float income and a 0/1
# gender flag. The draws happen in the order age -> income -> gender, so the
# seeded random stream is consumed in a fixed sequence.
data = pd.DataFrame(
    dict(
        age=np.random.randint(20, 65, size=100),      # integers in [20, 65)
        income=np.random.normal(60000, 10000, 100),   # mean 60000, std 10000
        gender=np.random.choice([0, 1], size=100),
    )
)

# Mask values using the "mcar" mechanism at a requested missing rate of 0.5.
mecha = MissMechaGenerator(
    mechanism="mcar",
    missing_rate=0.5,
)
mcar_missing = mecha.fit_transform(data)

# Preview the first rows of the masked frame.
mcar_missing.head()

Unnamed: 0,age,income,gender
0,58.0,69305.844008,
1,48.0,,
2,34.0,,1.0
3,,61736.020637,1.0
4,,66622.845136,


### Compute missing rate

In [2]:
from missmecha.analysis import compute_missing_rate

# Summarise missingness in the masked frame. The call prints an overall rate and a
# per-column table; whatever it returns is kept in `missing_rate`.
missing_rate = compute_missing_rate(mcar_missing)

Overall missing rate: 51.00%
153 / 300 total values are missing.

Top variables by missing rate:


Unnamed: 0_level_0,n_missing,missing_rate (%),n_unique,dtype,n_total
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
gender,55,55.0,2,float64,100
income,51,51.0,49,float64,100
age,47,47.0,32,float64,100


### Impute with SimpleSmartImputer

In [3]:
# Configure the imputer with "gender" declared as a categorical column.
imp = SimpleSmartImputer(cat_cols = ["gender"])
# Fit on the MCAR-masked frame and fill it in one step; the imputer logs the
# fill value it picked for each column.
data_imputed = imp.fit_transform(mcar_missing)

[SimpleSmartImputer] Column 'age' treated as numerical. Fill value = 41.45283018867924
[SimpleSmartImputer] Column 'income' treated as numerical. Fill value = 60593.133223389246
[SimpleSmartImputer] Column 'gender' treated as categorical. Fill value = 1.0


### Evaluate using RMSE and AvgErr for mixed-type data

In [4]:
from missmecha.analysis import evaluate_imputation

# Score the imputed frame against the original `data`, passing the generator's
# boolean mask and requesting the "rmse" method. The call prints a per-column table.
# NOTE(review): presumably `bool_mask` restricts scoring to the cells that were
# masked out — confirm against the missmecha docs.
eval_results = evaluate_imputation(
    data,
    data_imputed,
    mecha.bool_mask,
    method="rmse",
)


--------------------------------------------------
Column                 RMSE   Scaled (0-1)
--------------------------------------------------
age                  13.060          0.297
income             9781.570          0.210
gender                0.661          0.661
--------------------------------------------------
Overall            3265.097          0.389


In [5]:
# Repeat the evaluation without an explicit `method`, this time declaring
# "gender" as categorical; the printed table is then labelled "AvgErr".
# NOTE(review): how categorical columns are scored is not visible here — confirm
# against the missmecha docs.
# This overwrites the `eval_results` produced by the previous evaluation cell.
eval_results = evaluate_imputation(
    data,
    data_imputed,
    mecha.bool_mask,
    cat_cols=["gender"],
)

--------------------------------------------------
Column               AvgErr   Scaled (0-1)
--------------------------------------------------
age                  13.060          0.297
income             9781.570          0.210
gender                0.564          0.564
--------------------------------------------------
Overall            3265.065          0.357


### Run MCARTest

In [6]:
from missmecha.analysis import MCARTest


In [7]:
# Instantiate MCARTest with method="little" and call it on the MCAR-masked frame.
# As the cell's last expression, the returned value is shown as the cell output.
MCARTest(method = "little")(mcar_missing)

Method: Little's MCAR Test
Test Statistic p-value: 0.251537
Decision: Fail to reject the null hypothesis (α = 0.05)
→ There is insufficient evidence to reject MCAR.


0.25153689351029707

In [8]:
# Build a second masked copy of `data`, this time with the "mar" mechanism
# (mechanism_type=5) at a requested missing rate of 0.2.
# NOTE(review): this rebinds `mecha`; any cell that reads `mecha.bool_mask` and
# is re-run after this one will see this generator instead of the MCAR one.
mecha = MissMechaGenerator(
    mechanism="mar",
    mechanism_type=5,
    missing_rate=0.2,
)
mar_missing = mecha.fit_transform(data)

# Run the "little" MCAR test on the MAR-masked frame; the call is the cell's
# last expression, so its return value is displayed as the cell output.
MCARTest(method="little")(mar_missing)

[MARType5] Selected column 1 as dependency (xd).
Method: Little's MCAR Test
Test Statistic p-value: 0.017166
Decision: Reject the null hypothesis (α = 0.05)
→ The data is unlikely to be Missing Completely At Random (MCAR).


0.017165606302943948