## AutoGluonImputer

This package offers a sophisticated solution for handling missing data in datasets using the AutoGluon TabularPredictor. It's adept at working with both numerical and categorical data and provides a machine-learning-driven approach for imputation.


In [None]:
#!pip install --upgrade pandas numpy scikit-learn autogluon
%load_ext autoreload
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from autogluon.tabular import TabularDataset
from scripts.autogluonImputer import Imputer 
import importlib


In [None]:
# Show the interactive help for the Imputer class imported from
# scripts.autogluonImputer
help(Imputer)

# Print the full source code of the Imputer class so its implementation can
# be read directly in the notebook
import inspect
print(inspect.getsource(Imputer))

## Step 2: Prepare the Data


In [None]:

# Fetch the Titanic dataset from OpenML as pandas objects (features, label)
X, y = fetch_openml(
    "titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
    parser="pandas",
)

# Build a single frame holding the features plus the label in a 'target'
# column; assign() returns a new frame, so X itself is left untouched
df = X.assign(target=y)
df.head()

# 'name' and 'ticket' are removed before imputation
df = df.drop(columns=['name', 'ticket'])


In [None]:
# Wrap the frame in AutoGluon's TabularDataset
df = TabularDataset(df)
df.dtypes

# Normalise column dtypes in one pass over a snapshot of the current dtypes:
#   object -> category
#   int64  -> float64
# A column can match at most one branch, so this is equivalent to scanning
# the columns twice.
for column_name, column_dtype in df.dtypes.items():
    if column_dtype == 'object':
        df[column_name] = df[column_name].astype('category')
    elif column_dtype == 'int64':
        df[column_name] = df[column_name].astype('float64')

# Show the resulting dtypes as the cell output
df.dtypes


In [None]:

# Split the data into train (70%) and test (30%) sets with a fixed seed
train, test = train_test_split(df, test_size=0.3, random_state=42)

# Introduce missingness: every cell is independently blanked with
# probability 0.2. A seeded generator is used so the missingness pattern,
# like the split above, is identical on every run of the notebook.
rng = np.random.default_rng(42)
train_missing = train.mask(rng.random(train.shape) < 0.2)
test_missing = test.mask(rng.random(test.shape) < 0.2)


## Step 3: Impute Missing Values

In [None]:
# Build the imputer with num_iter=2 and time_limit=5; the meaning and units
# of both settings are defined by Imputer (see help(Imputer))
imputer = Imputer(num_iter=2, time_limit=5)
# Fit on the training data that contains the artificial gaps.
# NOTE(review): assumes fit() returns the imputed training frame — confirm
train_imputed = imputer.fit(train_missing)
# Apply the fitted imputer to the masked test data
test_imputed = imputer.transform(test_missing)


In [None]:
# Compare imputed values with original values for the target variable
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# Identify missing indices in test dataset
missing_indices_test = test_missing['age'].index[test_missing['age'].apply(np.isnan)]

# Plot imputed values against original values
plt.scatter(test_imputed['age'][missing_indices_test], test['age'][missing_indices_test])
plt.xlabel('Imputed Values')
plt.ylabel('Original Values')
plt.title('Imputed Values vs Original Values')
sns.regplot(x=test_imputed['age'][missing_indices_test], y=test['age'][missing_indices_test], scatter=False, color='red')
# Calculate and display the correlation coefficient
# put test_imputed['age'][missing_indices_test], test['age'][missing_indices_test] in a dataframe
df=pd.DataFrame({'imputed':test_imputed['age'][missing_indices_test], 'original':test['age'][missing_indices_test]})
# remove rows with missing values
df=df.dropna()
# calculate correlation coefficient
corr = np.corrcoef(df['imputed'], df['original'])[0,1]
plt.text(.6, .75, f'Correlation Coefficient = {round(corr, 2)}', horizontalalignment='center', verticalalignment='center', transform=plt.gca().transAxes, color='black')
plt.show()

In [None]:
# Preview the first rows of the imputed test set
test_imputed.head()

In [None]:
# Run the imputer's own evaluation on the unmasked training split.
# NOTE(review): presumably hides `percentage` of the values `ntimes` times
# and scores the imputations — confirm against help(Imputer)
imputer.evaluate_imputation(train, percentage=.2, ntimes=3)


## Multiple Imputation

In [None]:
from scripts.autogluonImputer import multiple_imputation
# Settings forwarded to multiple_imputation below
num_iter=2
time_limit=10
# Request 10 imputations of the masked training data. Note that this rebinds
# `train_imputed`, which previously held the result of imputer.fit().
# NOTE(review): presumably returns one imputed frame per imputation and
# fitonce=True reuses a single fitted model — confirm against the function's docs
train_imputed = multiple_imputation(train_missing, n_imputations=10, num_iter=num_iter, time_limit=time_limit, fitonce=True)


In [None]:
# Preview the first rows of the first element of the multiple-imputation result
train_imputed[0].head()

In [None]:
# Preview the first rows of the second element, for comparison with the first
train_imputed[1].head()

In [None]:
# Inspect the column dtypes of the second element of the result
train_imputed[1].dtypes