In [1]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from dataloader import load_data
from helpers import get_cat_dims

from models import WGANGP

import logging

# Lower the root logger's threshold to INFO so that INFO-level records
# emitted through the logging module are no longer filtered out at the root.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

# Key passed to load_data; 'homeeq' is the Home Equity data set (Baesens et al.).
dataset = 'homeeq'

# load_data returns the data frame together with the categorical column names,
# the numeric column names and the target column.
df, cat_cols, num_cols, target_col = load_data(dataset)

# Features are ordered numeric-first, then categorical.
feature_cols = num_cols + cat_cols
X = df.loc[:, feature_cols]
y = df.loc[:, target_col]

# Hold out 10% of the rows for testing; the fixed random_state keeps the split
# identical between runs.
split = train_test_split(X, y, test_size=0.1, random_state=2020)
X_train, X_test, y_train, y_test = split

# Computed on the training portion only.
# NOTE(review): presumably the number of levels per categorical column - confirm in helpers.get_cat_dims.
cat_dims = get_cat_dims(X_train, cat_cols)

# preprocess data
# Numeric columns: fill missing values with the column mean, then min-max scale.
num_prep = make_pipeline(SimpleImputer(strategy='mean'),
                         MinMaxScaler())

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old spelling, so `sparse=False` raises TypeError on
# current releases. Try the new name first and fall back for older versions.
try:
    onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only knows `sparse`
    onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode into a dense array. make_pipeline names the step after the
# class ('onehotencoder'), which is the key used to look the encoder up later.
cat_prep = make_pipeline(SimpleImputer(strategy='most_frequent'),
                         onehot)

# Apply each pipeline to its column group; columns in neither list are dropped.
prep = ColumnTransformer([
    ('num', num_prep, num_cols),
    ('cat', cat_prep, cat_cols)],
    remainder='drop')

# Fit on the training split only and transform it in one step.
X_train_trans = prep.fit_transform(X_train)

# Fitted one-hot encoder taken from the 'cat' pipeline of the column
# transformer; passing it to the GAN enables count plots of the categorical
# variables.
onehot_encoder = prep.named_transformers_['cat']['onehotencoder']

gan = WGANGP(
    # whether to create an output folder; plotting is suppressed if False
    write_to_disk=True,
    compute_metrics_every=1250,
    print_every=2500,
    plot_every=10000,
    num_cols=num_cols,
    cat_dims=cat_dims,
    transformer=onehot_encoder,
    # names of the categorical columns
    # NOTE(review): the original comment ("pass column names to enable") was
    # cut off - presumably these label the categorical plots; confirm in models.WGANGP.
    cat_cols=cat_cols,
    use_aux_classifier_loss=True,
    d_updates_per_g=3,
    gp_weight=15,
)

# Settings forwarded as netG_kwargs (presumably the generator network - confirm in models).
generator_kwargs = dict(
    hidden_layer_sizes=(128, 64),
    n_cross_layers=1,
    cat_activation='gumbel_softmax',
    num_activation='none',
    condition_num_on_cat=True,
    noise_dim=30,
    normal_noise=False,
    activation='leaky_relu',
    reduce_cat_dim=True,
    use_num_hidden_layer=True,
    layer_norm=False,
)

# Settings forwarded as netD_kwargs (presumably the discriminator/critic network - confirm in models).
discriminator_kwargs = dict(
    hidden_layer_sizes=(128, 64, 32),
    n_cross_layers=2,
    embedding_dims='auto',
    activation='leaky_relu',
    sigmoid_activation=False,
    noisy_num_cols=True,
    layer_norm=True,
)

# Train on the transformed training features; the labels are handed over as a
# plain array (y_train.values) with condition=True.
gan.fit(
    X_train_trans,
    y=y_train.values,
    condition=True,
    epochs=300,
    batch_size=64,
    netG_kwargs=generator_kwargs,
    netD_kwargs=discriminator_kwargs,
)

# Let the trained GAN resample the training set; it returns features and labels.
# NOTE(review): gan.fit was given y_train.values while resample receives the
# pandas Series itself - confirm that this difference is intentional.
X_res, y_res = gan.resample(X_train_trans, y=y_train)

# Mean of the labels before and after resampling
# (the share of positives, assuming 0/1 labels - TODO confirm).
ratio_before = y_train.mean()
ratio_after = y_res.mean()
print(f'Original imbalance ratio was:{ratio_before:.2f}\nAfter resampling it is:{ratio_after:.2f}')

# Transform the held-out features with the preprocessing fitted on the training split.
X_test_trans = prep.transform(X_test)

clf = RandomForestClassifier(
    n_estimators=300,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    random_state=2020,
    n_jobs=2,
)

# Train on the resampled data and keep the second predict_proba column for the test set.
preds_oversampled = clf.fit(X_res, y_res).predict_proba(X_test_trans)[:, 1]

# Refit the same estimator on the original training data. The earlier
# predictions are already stored, so overwriting the fitted state is fine.
preds_imbalanced = clf.fit(X_train_trans, y_train).predict_proba(X_test_trans)[:, 1]

# Compare both training regimes by ROC AUC on the same test set.
auc_balanced = roc_auc_score(y_test, preds_oversampled)
auc_imbalanced = roc_auc_score(y_test, preds_imbalanced)
print(f'AUC-ROC Random Forest:\n'
      f'Balanced data:\t\t{auc_balanced:.4f}\n'
      f'Imbalanced data:\t{auc_imbalanced:.4f}')

ModuleNotFoundError: No module named 'torch'

In [2]:
!pip install torch

Collecting torch
  Downloading torch-2.0.0-cp38-none-macosx_10_9_x86_64.whl (139.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.5/139.5 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting filelock
  Downloading filelock-3.10.0-py3-none-any.whl (9.9 kB)
Collecting sympy
  Downloading sympy-1.11.1-py3-none-any.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting networkx
  Downloading networkx-3.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting mpmath>=0.19
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.2/536.2 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: mpmath, sympy, networkx, filelo