In [1]:
import copy
import tqdm
import optuna
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from load_dataset import load
from classifier import *
from utils import *
from metrics import *  # include fairness and corresponding derivatives
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.neural_network import MLPClassifier
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.metrics import mutual_info_score, auc, roc_curve, roc_auc_score, f1_score, accuracy_score
from scipy.stats import wasserstein_distance
from optuna.samplers import *
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.experimental import enable_iterative_imputer
import torch
import torch.nn as nn
from torch.optim import SGD, Adam
from diffprep.prep_space import space
from diffprep.experiment.diffprep_experiment import DiffPrepExperiment
from diffprep.pipeline.diffprep_fix_pipeline import DiffPrepFixPipeline
from diffprep.trainer.diffprep_trainer import DiffPrepSGD
from diffprep.model import LogisticRegression
from diffprep.experiment.experiment_utils import min_max_normalize, set_random_seed

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the 'adult' dataset and keep untouched copies of the feature frames
# before any missing values are injected.
X_train, X_test, y_train, y_test = load('adult')
X_train_orig, X_test_orig = X_train.copy(), X_test.copy()

# Blank out 'education' on 10000 distinct, randomly chosen training rows.
# A locally seeded Generator is used (instead of the unseeded global
# np.random state) so the same rows are selected on every
# Restart & Run All and the reported metrics are reproducible.
nan_indices = np.random.default_rng(42).choice(
    X_train.index, size=10000, replace=False
)
X_train.loc[nan_indices, 'education'] = np.nan

# Split the (now partially missing) training data 50/50 into train/validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.5, random_state=42
)

# Drop the shuffled original labels so every split is indexed 0..n-1.
X_train, X_val = X_train.reset_index(drop=True), X_val.reset_index(drop=True)
y_train, y_val = y_train.reset_index(drop=True), y_val.reset_index(drop=True)

In [3]:
# Experiment configuration consumed by the seeding helper, the preprocessing
# pipeline, the optimizers and the DiffPrepSGD trainer below.
# NOTE: "patience" and "num_epochs" are overridden in the training cell
# right before the run starts.
params = dict(
    # training loop
    num_epochs=2000,
    batch_size=512,
    device="cpu",
    model_lr=0.01,
    weight_decay=0,
    model="log",
    # seeds
    train_seed=1,
    split_seed=1,
    # run bookkeeping
    method="diffprep_fix",
    save_model=True,
    logging=False,
    no_crash=False,
    patience=3,
    # optimizer settings
    momentum=0.9,
    prep_lr=None,  # None -> the training cell falls back to model_lr
    # preprocessing-pipeline settings
    temperature=0.1,
    grad_clip=None,
    pipeline_update_sample_size=512,
    init_method="default",
    diff_method="num_diff",
    sample=False,
)

In [4]:

# Rescale the three splits with diffprep's min_max_normalize helper and apply
# the run-specific overrides for early-stopping patience and epoch count.
X_train, X_val, X_test = min_max_normalize(X_train, X_val, X_test)
params["patience"] = 10
params["num_epochs"] = 30

# Seed before building the preprocessing pipeline.
set_random_seed(params)
prep_pipeline = DiffPrepFixPipeline(
    space,
    temperature=params["temperature"],
    use_sample=params["sample"],
    diff_method=params["diff_method"],
    init_method=params["init_method"],
)
prep_pipeline.init_parameters(X_train, X_val, X_test)
print("Train size: ({}, {})".format(X_train.shape[0], prep_pipeline.out_features))

# Downstream model: input width comes from the pipeline's output, output width
# is the number of distinct labels seen in y_train.
input_dim = prep_pipeline.out_features
output_dim = len(set(y_train.values.ravel()))

# Re-seed right before constructing the model (kept from the original cell
# order so model initialisation sees the same RNG state).
set_random_seed(params)
model = LogisticRegression(input_dim, output_dim)
model = model.to(params["device"])

# Loss
loss_fn = nn.CrossEntropyLoss()

# Optimizer for the downstream model.
model_optimizer = torch.optim.SGD(
    model.parameters(),
    lr=params["model_lr"],
    weight_decay=params["weight_decay"],
    momentum=params["momentum"]
)

# The pipeline optimizer reuses the model learning rate unless a dedicated
# "prep_lr" is configured.
prep_lr = params["model_lr"] if params["prep_lr"] is None else params["prep_lr"]

prep_pipeline_optimizer = torch.optim.Adam(
    prep_pipeline.parameters(),
    lr=prep_lr,
    betas=(0.5, 0.999),
    weight_decay=params["weight_decay"]
)

# No learning-rate schedulers are used in this run.
prep_pipeline_scheduler = None
model_scheduler = None

if params["logging"]:
    # SummaryWriter is not imported in the notebook's import cell; import it
    # here so that enabling logging does not fail with a NameError.
    from torch.utils.tensorboard import SummaryWriter
    logger = SummaryWriter()
else:
    logger = None

diff_prep = DiffPrepSGD(prep_pipeline, model, loss_fn, model_optimizer, prep_pipeline_optimizer,
            model_scheduler, prep_pipeline_scheduler, params, writer=logger)

# fit returns the metrics of the run plus a second value bound to best_model.
result, best_model = diff_prep.fit(X_train, y_train, X_val, y_val, X_test, y_test)

Train size: (15081, 8)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:05<00:00,  5.95it/s, next_eval_time=16s, tr_loss=0.408, val_loss=0.411]


In [5]:
# Start the "best run so far" bookkeeping from scratch: an infinite loss
# guarantees that the first comparison against a real run succeeds.
# Note that this also clears the best_model value bound by the fit call above.
best_val_loss = float("inf")
best_result = best_model = best_logger = best_params = None

In [6]:
# Promote the latest run to "best" only when its validation loss is strictly
# lower than the best one recorded so far.
if result["best_val_loss"] < best_val_loss:
    best_result, best_model, best_logger, best_params = result, model, logger, params
    best_val_loss = best_result["best_val_loss"]

In [7]:
# Display the metrics dict of the best run (the cell's last expression is
# rendered as its output).
best_result

{'best_epoch': 29,
 'best_val_loss': 0.4110908508300781,
 'best_tr_acc': 0.7972283005105762,
 'best_val_acc': 0.7966315231085471,
 'best_test_acc': 0.8083001328021249,
 'best_test_auc': 0.8554021221926151}

In [8]:
# Pull the held-out test metrics of the best run and report them with four
# decimal places, accuracy first.
accuracy, auc_score = best_result["best_test_acc"], best_result["best_test_auc"]

print(f"ACC: {accuracy:.4f}", f"AUC: {auc_score:.4f}", sep="\n")

ACC: 0.8083
AUC: 0.8554
