In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# Parameters
n_samples = 1000
n_rows = 100
n_columns = 10
missing_rate_label_1 = 0.2
missing_rate_label_0 = 0.1
other_columns_missing_rate = 0.1

# Generate labels (0 or 1)
labels = np.random.randint(2, size=n_samples)

# Generate data with the specific missing pattern
def generate_data(n_samples, n_rows, n_columns, labels):

    data = np.random.normal(loc=0, scale=1, size=(n_samples, n_rows, n_columns))

    for i in range(n_samples):
        # Apply missing rate for the last column based on the label
        if labels[i] == 1:
            missing_mask = np.random.rand(n_rows) < missing_rate_label_1
        else:
            missing_mask = np.random.rand(n_rows) < missing_rate_label_0
        data[i, :, -1][missing_mask] = np.nan

        # Apply random missing rate for other columns
        for j in range(n_columns - 1):
            missing_mask = np.random.rand(n_rows) < other_columns_missing_rate
            data[i, :, j][missing_mask] = np.nan

    return data

# Generate dataset
data = generate_data(n_samples, n_rows, n_columns, labels)

# Flatten each matrix into a vector (to use with XGBoost)
data_flattened = data.reshape(n_samples, -1)

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(data_flattened, labels, test_size=0.2, random_state=42)

# XGBoost model
xgboost_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")

# Fit the model
xgboost_model.fit(X_train, y_train)

# Predict probabilities
y_pred_proba = xgboost_model.predict_proba(X_test)[:, 1]

# Calculate AUROC
auroc = roc_auc_score(y_test, y_pred_proba)

auroc




0.8456321490029355

In [33]:
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

# Zero Imputer
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)

# Mean Imputer
mean_imputer = SimpleImputer(strategy='mean')

# Linear Imputer (custom)
class LinearImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.means = np.nanmean(X, axis=0)
        self.single_values = [np.nan if np.sum(~np.isnan(X[:, i])) != 1 else X[~np.isnan(X[:, i]), i][0] for i in range(X.shape[1])]
        return self

    def transform(self, X):
        X_filled = X.copy()
        for i in range(X.shape[1]):
            if np.isnan(self.single_values[i]):
                X_filled[:, i] = np.where(np.isnan(X[:, i]), self.means[i], X[:, i])
            else:
                X_filled[:, i] = np.where(np.isnan(X[:, i]), self.single_values[i], X[:, i])
        return X_filled

linear_imputer = LinearImputer()

# Define a function to train and test using XGBoost, and return AUROC score
def train_and_test(X_train, y_train, X_test, y_test):
    xgboost_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")
    xgboost_model.fit(X_train, y_train)
    y_pred_proba = xgboost_model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_pred_proba)

# Fit imputers to training data and transform both train and test sets
X_train_zero = zero_imputer.fit_transform(X_train)
X_test_zero = zero_imputer.transform(X_test)
X_train_mean = mean_imputer.fit_transform(X_train)
X_test_mean = mean_imputer.transform(X_test)
X_train_linear = linear_imputer.fit_transform(X_train)
X_test_linear = linear_imputer.transform(X_test)

# Train and test using each imputer
auroc_zero = train_and_test(X_train_zero, y_train, X_test_zero, y_test)
auroc_mean = train_and_test(X_train_mean, y_train, X_test_mean, y_test)
auroc_linear = train_and_test(X_train_linear, y_train, X_test_linear, y_test)

auroc_zero, auroc_mean, auroc_linear




(0.5729324830448427, 0.6042109525255592, 0.6042109525255592)

In [59]:
# Parameters
n_samples = 1000
n_rows = 100
n_columns = 10
missing_rate = 0.1

# Generate data with the specific missing pattern
def generate_data2(n_samples, n_rows, n_columns, labels):

    # Oscillating missing rate function for class 1
    def oscillating_missing_rate(t):
        return missing_rate + 0.05 * np.sin(np.pi * t / (n_rows - 1))  # t is normalized to [0, 1]

    data = np.random.normal(loc=0, scale=1, size=(n_samples, n_rows, n_columns))

    for i in range(n_samples):
        # Apply missing rate for the last column based on the label
        if labels[i] == 1:
            # Oscillating missing rate for the last column
            for t in range(n_rows):
                if np.random.rand() < oscillating_missing_rate(t):
                    data[i, t, -1] = np.nan
        else:
            missing_mask = np.random.rand(n_rows) < missing_rate
            data[i, :, -1][missing_mask] = np.nan

        # Apply random missing rate for other columns
        for j in range(n_columns - 1):
            missing_mask = np.random.rand(n_rows) < missing_rate
            data[i, :, j][missing_mask] = np.nan

    return data

# Generate dataset
data = generate_data2(n_samples, n_rows, n_columns, labels)

# Flatten each matrix into a vector (to use with XGBoost)
data_flattened = data.reshape(n_samples, -1)

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(data_flattened, labels, test_size=0.2, random_state=42)

# XGBoost model
xgboost_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")

# Fit the model
xgboost_model.fit(X_train, y_train)

# Predict probabilities
y_pred_proba = xgboost_model.predict_proba(X_test)[:, 1]

# Calculate AUROC
auroc = roc_auc_score(y_test, y_pred_proba)

auroc



0.4907379289401761

In [60]:
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

# Zero Imputer
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)

# Mean Imputer
mean_imputer = SimpleImputer(strategy='mean')

# Linear Imputer (custom)
class LinearImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.means = np.nanmean(X, axis=0)
        self.single_values = [
            np.nan if np.sum(~np.isnan(X[:, i])) != 1 else X[~np.isnan(X[:, i]), i][0]
            for i in range(X.shape[1])
        ]
        return self

    def transform(self, X):
        X_filled = X.copy()
        for i in range(X.shape[1]):
            non_missing_indices = np.where(~np.isnan(X[:, i]))[0]
            missing_indices = np.where(np.isnan(X[:, i]))[0]
            
            # Case 1: At least two non-missing values -> Linear interpolation
            if len(non_missing_indices) >= 2:
                X_filled[:, i] = pd.Series(X[:, i]).interpolate(method='linear', limit_direction='both').to_numpy()

            # Case 2: Exactly one non-missing value -> Impute with that single value
            elif len(non_missing_indices) == 1:
                X_filled[missing_indices, i] = self.single_values[i]

            # Case 3: No non-missing values -> Impute with the column mean from training data
            else:
                X_filled[missing_indices, i] = self.means[i]

        return X_filled

linear_imputer = LinearImputer()

# Define a function to train and test using XGBoost, and return AUROC score
def train_and_test(X_train, y_train, X_test, y_test):
    xgboost_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")
    xgboost_model.fit(X_train, y_train)
    y_pred_proba = xgboost_model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_pred_proba)

# Fit imputers to training data and transform both train and test sets
X_train_zero = zero_imputer.fit_transform(X_train)
X_test_zero = zero_imputer.transform(X_test)
X_train_mean = mean_imputer.fit_transform(X_train)
X_test_mean = mean_imputer.transform(X_test)
X_train_linear = linear_imputer.fit_transform(X_train)
X_test_linear = linear_imputer.transform(X_test)

# Train and test using each imputer
auroc_zero = train_and_test(X_train_zero, y_train, X_test_zero, y_test)
auroc_mean = train_and_test(X_train_mean, y_train, X_test_mean, y_test)
auroc_linear = train_and_test(X_train_linear, y_train, X_test_linear, y_test)

auroc_zero, auroc_mean, auroc_linear




(0.48061544690758173, 0.5390221682356514, 0.4790970746026925)