# Setup
Install the required packages.

In [None]:
!pip install miceforest catboost wget


# Setup 

 Load the dataset, drop the unused column, and split it into training and test sets.

 We then display the standard deviation, mean, and median of each variable in the training set.

In [None]:
# import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# read the raw data and discard the column that is not used here
data = pd.read_csv('energy_efficiency_data.csv').drop(columns='Cooling_Load')

# hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable
train_df, test_df = train_test_split(data, random_state=0, test_size=0.25)

In [None]:
# summary of the training data: spread, mean and median (50th percentile) per variable
original_desc = train_df.describe().loc[['std', 'mean', '50%']].T
original_desc

# Load in and instantiate the CatBoostRegressor model

In [None]:
# import required packages
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

# separate the predictors from the response for both splits
x_train, y_train = train_df.drop(columns='Heating_Load'), train_df['Heating_Load']
x_test, y_test = test_df.drop(columns='Heating_Load'), test_df['Heating_Load']

# fit a CatBoost regressor with its default settings on the complete training data
model = CatBoostRegressor()
model.fit(x_train, y_train)

# baseline error on the held-out test set
original_score = mean_absolute_error(y_true=y_test, y_pred=model.predict(x_test))
print(f'The MAE of the model using the original data is {original_score:.2f}')

# Simulating a Missing At Random (MAR) dataset


In [None]:
# import required packages
import torch
import wget
# NOTE(review): this fetches utils.py from the master branch of an external
# repository at run time and then imports it, so the code that runs is not
# pinned to a reviewed revision -- consider vendoring the file or pinning a commit.
wget.download('https://raw.githubusercontent.com/BorisMuzellec/MissingDataOT/master/utils.py')
# The star import pulls in every public name of the downloaded module;
# presumably this is where MAR_mask and the MNAR_* mask helpers called by
# produce_NA come from -- verify against utils.py.
from utils import *

# Function produce_NA for generating missing values

def produce_NA(X, p_miss, mecha="MAR", opt=None, p_obs=None, q=None):
    """Return a copy of ``X`` in which some entries have been replaced by NaN.

    Parameters
    ----------
    X : torch.Tensor or array-like
        Numeric data. Anything ``np.asarray`` can convert (ndarray, nested
        list, DataFrame) is accepted in addition to tensors. ``X`` itself is
        never modified.
    p_miss : float
        Missingness proportion. It is forwarded to the selected mask helper;
        in the fallback branch each entry is masked independently with this
        probability.
    mecha : str, default "MAR"
        ``"MAR"`` uses ``MAR_mask``; ``"MNAR"`` combined with ``opt`` selects
        one of the MNAR helpers. Any other combination (including ``"MNAR"``
        with an unrecognised ``opt``) falls back to a uniformly random mask.
    opt : {"logistic", "quantile", "selfmasked"}, optional
        MNAR variant; only consulted when ``mecha == "MNAR"``.
    p_obs : float, optional
        Forwarded to the MAR and MNAR-logistic helpers; the MNAR-quantile
        helper receives ``1 - p_obs``.
    q : float, optional
        Forwarded to the MNAR-quantile helper.

    Returns
    -------
    torch.Tensor
        Float64 tensor shaped like ``X`` with NaN at the masked positions.

    Raises
    ------
    TypeError
        If the MNAR-quantile mechanism is requested without ``p_obs``.
    """
    # Fail early with a clear message instead of the opaque error that
    # `1 - None` would otherwise produce.
    if mecha == "MNAR" and opt == "quantile" and p_obs is None:
        raise TypeError("p_obs is required when mecha='MNAR' and opt='quantile'")

    if torch.is_tensor(X):
        X_mask = X
        X_nas = X.clone().double()
    else:
        values = np.asarray(X)
        # The mask helpers are fed float32, as before ...
        X_mask = torch.from_numpy(values.astype(np.float32))
        # ... but the returned values are taken straight from the input in
        # float64, so they are not rounded through float32 on the way out.
        X_nas = torch.from_numpy(values.astype(np.float64))

    if mecha == "MAR":
        mask = MAR_mask(X_mask, p_miss, p_obs)
    elif mecha == "MNAR" and opt == "logistic":
        mask = MNAR_mask_logistic(X_mask, p_miss, p_obs)
    elif mecha == "MNAR" and opt == "quantile":
        mask = MNAR_mask_quantiles(X_mask, p_miss, q, 1 - p_obs)
    elif mecha == "MNAR" and opt == "selfmasked":
        mask = MNAR_self_mask_logistic(X_mask, p_miss)
    else:
        # fallback: every entry is masked independently with probability p_miss
        mask = torch.rand(X_mask.shape) < p_miss

    X_nas[mask.bool()] = np.nan

    return X_nas

In [None]:
# Seed the random number generators so the simulated missingness pattern (and
# every score derived from it) is the same on each run. Both torch and numpy
# are seeded because which generator the mask helpers draw from is not visible here.
torch.manual_seed(0)
np.random.seed(0)

# get sample MAR data
data_MAR = produce_NA(train_df.to_numpy(), p_miss=0.20, p_obs=0.75)

# load MAR data into dataframe
MAR_df = pd.DataFrame(data_MAR.numpy(), columns=train_df.columns)
MAR_df.sample(5)


In [None]:
# share of all cells (rows x columns) that are missing, as a percentage
total_miss_percent = (MAR_df.isnull().sum().sum() / MAR_df.size) * 100
print(f'Percentage of total missing data: {total_miss_percent:.0f}%')

In [None]:
# percentage of missing values in each column
percent_missing = MAR_df.isnull().sum().mul(100).div(len(MAR_df))
percent_missing.to_frame('Percent_Missing')

In [None]:
# percentage of rows that contain at least one missing value
Percent_miss_samples = (int(MAR_df.isnull().any(axis=1).sum()) / len(MAR_df)) * 100
print(f'Percentage of samples with missing data: {Percent_miss_samples:.0f}%')

# Applying techniques for handling missing data

## Listwise deletion

In [None]:
# listwise deletion: discard every sample that has at least one missing value
listwise_df = MAR_df.dropna()

# get data description after deletion
listwise_desc = listwise_df.describe().T[['std', 'mean', '50%']]

print(f'Number of samples before deletion: {len(MAR_df)}')
print(f'Number of samples after deletion: {len(listwise_df)}')
# message reworded: the original text ("after lost to") was garbled
print(f'Number of samples lost to listwise deletion: {len(MAR_df) - len(listwise_df)}')

In [None]:
## effect of listwise deletion on model performance

# predictors and response taken from the rows that survived deletion
x_train, y_train = listwise_df.drop(columns='Heating_Load'), listwise_df['Heating_Load']

# fit on the reduced training set and score against the untouched test set
model = CatBoostRegressor()
model.fit(x_train, y_train)
listwise_score = mean_absolute_error(y_true=y_test, y_pred=model.predict(x_test))
print(f'The MAE of the model using listwise deletion is {listwise_score :.2f}')

# Imputation

## simple imputation

In [None]:
# import required package
from sklearn.impute import SimpleImputer

# mean imputer: each NaN is replaced by the mean of its column
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# learn the column means from the MAR data and fill the gaps in one step
mean_imp_data = imputer.fit_transform(MAR_df.to_numpy())

# put the imputed array back into a labelled dataframe
mean_imp_df = pd.DataFrame(data=mean_imp_data, columns=MAR_df.columns)

# summary statistics after mean imputation
mean_imp_desc = mean_imp_df.describe().loc[['std', 'mean', '50%']].T

In [None]:
# predictors and response from the mean-imputed training data
x_train, y_train = mean_imp_df.drop(columns='Heating_Load'), mean_imp_df['Heating_Load']

# fit on the imputed data and score against the untouched test set
model = CatBoostRegressor()
model.fit(x_train, y_train)
mean_imp_score = mean_absolute_error(y_true=y_test, y_pred=model.predict(x_test))
print(f'The MAE of the model using mean imputation is {mean_imp_score :.2f}')

## Model-Based Imputation

In [None]:
# import required packages
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.neighbors import KNeighborsRegressor

# iterative imputer that predicts each incomplete column with a k-nearest-neighbours regressor
impute_estimator = KNeighborsRegressor()
imputer = IterativeImputer(
    estimator=impute_estimator,
    max_iter=25,
    tol=1e-1,
    random_state=0,
)

# fill the missing values and round the result to six decimals
model_imp_data = np.round(imputer.fit_transform(MAR_df), decimals=6)

# put the imputed array back into a labelled dataframe
model_imp_df = pd.DataFrame(data=model_imp_data, columns=MAR_df.columns)

# summary statistics after model-based imputation
model_imp_desc = model_imp_df.describe().loc[['std', 'mean', '50%']].T

In [None]:
# predictors and response from the model-imputed training data
x_train, y_train = model_imp_df.drop(columns='Heating_Load'), model_imp_df['Heating_Load']

# fit on the imputed data and score against the untouched test set
model = CatBoostRegressor()
model.fit(x_train, y_train)
model_imp_score = mean_absolute_error(y_true=y_test, y_pred=model.predict(x_test))
print(f'The MAE of the model using model based imputation is {model_imp_score :.2f}')

## Multiple Imputation

In [None]:
# import required package
import miceforest as mf

# multiple-imputation kernel holding 20 separately imputed copies of the MAR data
kernel = mf.ImputationKernel(MAR_df, random_state=0, datasets=20)

# five MICE iterations are run on every one of those copies
kernel.mice(5)

# report how many imputed datasets the kernel holds
print(f'Number of datasets with imputations: {kernel.dataset_count()}')

In [None]:
# Train one CatBoost regressor per imputed dataset and collect its test-set predictions
predictions = []
for i in range(kernel.dataset_count()):
    MICE_imp_df = kernel.complete_data(i)
    x_train, y_train = MICE_imp_df.drop(columns='Heating_Load'), MICE_imp_df['Heating_Load']
    model = CatBoostRegressor()
    model.fit(x_train, y_train)
    predictions += [model.predict(x_test)]  # one prediction vector per imputed dataset

# average the per-dataset predictions element-wise, then score the averaged prediction
mean_predictions = np.mean(predictions, axis=0)
MICE_imp_score = mean_absolute_error(y_true=y_test, y_pred=mean_predictions)
print(f'The MAE of the model using multiple imputation is {MICE_imp_score :.2f}')