In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

import pandas as pd
import numpy as np
import pickle
import random
import json
import gc
import os
import re

from imblearn.over_sampling import SMOTE
from datetime import datetime
import time

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    print('Setting Random Seed')
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also cover every additional GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the deterministic flag above; switch it off explicitly.
    torch.backends.cudnn.benchmark = False

In [None]:
# Label column read from the training frame.
TARGET = 'service_canceled'

# Columns standardised as numeric inputs.
fts_continuous = [
    'customer_age_appl',
    'time_start_process',
    'operator_count',
    'previous_customer_count',
]

# Columns that get one-hot encoded.
fts_categorical = [
    'date',
    'branch_name',
    'customer_gender',
    'customer_city',
    'service_name_organization',
    'service_name',
    'service_name_2',
]

# Echo the feature lists first, then their sizes.
for _caption, _columns in (('Categorical Features', fts_categorical),
                           ('Continuous Features', fts_continuous)):
    print(_caption, _columns)

for _caption, _columns in (('Categorical Feature Count', fts_categorical),
                           ('Continuous Feature Count', fts_continuous)):
    print(_caption, len(_columns))

## Configuration

In [None]:
PATH = "./"

SAVE_PATH = "./outputs/"
os.makedirs(SAVE_PATH, exist_ok=True)

# Because of the long run time it can be preferable to split training into
# sections of 600 epochs, each with its own run key.
run_key = 'DAE_TRIAL'

DAE_CFG = dict(
    debug=False,
    batch_size=384,
    init_lr=0.0003,
    # multiplicative decay factor of the learning rate
    lr_decay=0.998,
    # multiplicative decay factor of the noise level
    noise_decay=0.999,
    max_epochs=2000,
    save_freq=100,
    # hidden_size must equal embed_dim * num_subspaces
    hidden_size=1024,
    num_subspaces=8,
    embed_dim=128,
    num_heads=8,
    dropout=0,
    feedforward_dim=512,
    # loss weight given to 'corrupted' data points (varying it over time was
    # tried and did not show immediate improvement)
    emphasis=0.75,
    # weighting for categorical vs continuous reconstruction
    task_weights=[len(fts_categorical), len(fts_continuous)],
    # weighting of mask prediction vs reconstruction of the original values
    mask_loss_weight=2,
    # probability of noise in categoricals
    prob_categorical=0.2,
    # probability of noise in continuous
    prob_continuous=0.1,
    random_state=2021,
    run_key=run_key,
)

# Debug runs are cut down to a handful of epochs.
if DAE_CFG['debug']:
    DAE_CFG['max_epochs'] = 10

# Persist the configuration next to the other run artifacts, both as a pickle
# and as human-readable JSON.
with open(SAVE_PATH + f"{DAE_CFG['run_key']}_DAE_CFG.pickle", 'wb') as f:
    pickle.dump(DAE_CFG, f)
with open(SAVE_PATH + f"{DAE_CFG['run_key']}_DAE_CFG.json", 'w') as f:
    json.dump(DAE_CFG, f)

## Check Noise and Learning Rate

In [None]:
# One row per epoch: the planned schedules are filled in here, while the
# loss and timing columns are written by the training loop.
tracking_df = pd.DataFrame(index=range(DAE_CFG['max_epochs']),
                           columns=['loss', 'lr', 'run_code', 'time', 'elapsed', 'noise_categorical', 'noise_continuous'],
                           data=0.0)

# Exponential schedules: initial value * decay ** epoch.
tracking_df['lr'] = DAE_CFG['init_lr'] * (DAE_CFG['lr_decay']**tracking_df.index)
tracking_df['noise_categorical'] = DAE_CFG['prob_categorical'] * (DAE_CFG['noise_decay']**tracking_df.index)
tracking_df['noise_continuous'] = DAE_CFG['prob_continuous'] * (DAE_CFG['noise_decay']**tracking_df.index)
tracking_df['run_code'] = DAE_CFG['run_key']

sns.set(font_scale=1.4)
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

# (column, title, line colour, y-limits) for each panel; a top limit of None
# lets matplotlib choose the upper bound.
_panels = [
    ('lr', 'Learning Rate', 'Blue', (0, None)),
    ('noise_categorical', 'Categorical Noise Prob', 'Red', (0, 1)),
    ('noise_continuous', 'Continuous Noise Prob', 'Red', (0, 1)),
]
for _ax, (_column, _title, _color, _ylim) in zip(axes, _panels):
    _ax.plot(tracking_df.index, tracking_df[_column], color=_color)
    _ax.set_ylim(*_ylim)
    _ax.set_title(_title)
    # Every panel shares the same x-axis, so label all of them.
    _ax.set_xlabel('Epochs')

plt.tight_layout()

# Functions to Get Data

In [None]:
def feats_engineering(train, test):
    """Clean the raw train/test frames jointly and count categorical levels.

    The two frames are concatenated so that imputation values and level
    counts are computed over all rows, then split back into their original
    row ranges.

    Args:
        train: Raw training DataFrame.
        test: Raw test DataFrame.

    Returns:
        Tuple ``(df_train, df_test, unique_counts)`` where ``df_train`` holds
        the first ``train.shape[0]`` rows of the processed frame, ``df_test``
        the remaining rows with a fresh 0-based index, and ``unique_counts``
        is the number of distinct values of each column in
        ``fts_categorical`` (same order) over the combined frame.
    """
    n_train = train.shape[0]
    all_df = pd.concat([train, test]).reset_index(drop=True)

    # Fill missing values with the column mode. The result is assigned back:
    # calling fillna(inplace=True) on a column selection is chained
    # assignment and does not reliably update all_df (it is a no-op under
    # pandas Copy-on-Write).
    for col in ('customer_age_appl', 'time_start_process'):
        all_df[col] = all_df[col].fillna(all_df[col].mode()[0])

    def _range_midpoint(value):
        # 'lo-hi' string -> numeric midpoint of the two bounds.
        parts = value.split('-')
        return (int(parts[0]) + int(parts[1])) / 2

    all_df['customer_age_appl'] = all_df['customer_age_appl'].apply(_range_midpoint)
    # Keep the first two characters as an integer (presumably the hour of an
    # 'HH:MM'-style string -- confirm against the raw data).
    all_df['time_start_process'] = all_df['time_start_process'].apply(lambda x: int(x[:2]))

    unique_counts = [all_df[col].nunique() for col in fts_categorical]

    df_train = all_df[:n_train]
    df_test = all_df[n_train:].reset_index(drop=True)

    return df_train, df_test, unique_counts

In [None]:
def get_data():
    """Load the train/test CSVs and build the dense model input matrix.

    Reads both CSV files from ``PATH``, runs ``feats_engineering`` on them,
    standardises the continuous columns, one-hot encodes the categorical
    columns and concatenates both groups column-wise.

    Returns:
        Tuple ``(X, y, n_cat_cols, n_num_cols, unique_counts)``:
            X: array holding train rows followed by test rows; the one-hot
                categorical columns come first, then the standardised
                continuous columns.
            y: ``TARGET`` values of the training rows, shaped ``(-1, 1)``.
            n_cat_cols: number of one-hot columns in ``X``.
            n_num_cols: number of continuous columns in ``X``.
            unique_counts: number of one-hot columns produced for each entry
                of ``fts_categorical`` (same order).
    """
    train = pd.read_csv(PATH + "queue_dataset_train_small_sample.csv")
    test = pd.read_csv(PATH + "queue_dataset_test.csv")

    train_data, test_data, unique_counts = feats_engineering(train, test)

    # Combine train and test data vertically.
    X_nums = np.vstack([
        train_data.loc[:, fts_continuous].to_numpy(),
        test_data.loc[:, fts_continuous].to_numpy()
    ])
    # Standardise each column; a constant column has std 0, so divide by 1
    # there instead of producing NaN/inf.
    num_std = X_nums.std(0)
    num_std = np.where(num_std == 0, 1.0, num_std)
    X_nums = (X_nums - X_nums.mean(0)) / num_std

    # Stack the categorical data.
    X_cat = np.vstack([
        train_data.loc[:, fts_categorical].to_numpy(),
        test_data.loc[:, fts_categorical].to_numpy()
    ])
    # Encode the categoricals as a dense one-hot matrix. scikit-learn 1.2
    # renamed ``sparse`` to ``sparse_output`` and later removed the old
    # keyword, so try the new name first and fall back for older versions.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    X_cat = encoder.fit_transform(X_cat)

    # Report the widths the encoder actually produced so callers that expand
    # per-feature settings to one-hot columns always match X's layout (this
    # equals the nunique() counts whenever the two agree on the levels).
    unique_counts = [len(levels) for levels in encoder.categories_]

    # Join the categorical and continuous data horizontally.
    X = np.hstack([X_cat, X_nums])
    y = train_data[TARGET].to_numpy().reshape(-1, 1)

    return X, y, X_cat.shape[1], X_nums.shape[1], unique_counts

class SingleDataset(Dataset):
    """Dataset that serves individual rows of a feature matrix as float32.

    Args:
        x: Matrix supporting ``astype``, ``shape`` and row indexing.
        is_sparse: When true, every fetched row is densified through
            ``toarray().squeeze()`` before being returned.
    """

    def __init__(self, x, is_sparse=False):
        self.is_sparse = is_sparse
        # Cast once up front so every fetched row is already float32.
        self.x = x.astype('float32')

    def __len__(self):
        n_rows = self.x.shape[0]
        return n_rows

    def __getitem__(self, index):
        row = self.x[index]
        return row.toarray().squeeze() if self.is_sparse else row

# Losses

In [None]:
# Short aliases for the functional losses used by the autoencoder:
# binary cross-entropy on raw logits, and mean squared error.
bce_logits = F.binary_cross_entropy_with_logits
mse = F.mse_loss

# AutoEncoder

In [None]:
#torch docs

#embed_dim – total dimension of the model.
#num_heads – parallel attention heads.
#dropout – a Dropout layer on attn_output_weights. Default: 0.0.
#bias – add bias as module parameter. Default: True.
#add_bias_kv – add bias to the key and value sequences at dim=0.
#add_zero_attn – add a new batch of zeros to the key and value sequences at dim=1.
#kdim – total number of features in key. Default: None.
#vdim – total number of features in value. Default: None.

class TransformerEncoder(torch.nn.Module):
    """Self-attention block followed by a two-layer feed-forward block.

    Each sub-block is wrapped in a residual connection, and LayerNorm is
    applied after each residual addition.

    Args:
        embed_dim: Size of the last dimension of the input and output.
        num_heads: Number of attention heads.
        dropout: Dropout passed to ``MultiheadAttention``.
        feedforward_dim: Width of the hidden feed-forward layer.
    """

    def __init__(self, embed_dim, num_heads, dropout, feedforward_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        self.linear_1 = nn.Linear(embed_dim, feedforward_dim)
        self.linear_2 = nn.Linear(feedforward_dim, embed_dim)
        self.layernorm_1 = nn.LayerNorm(embed_dim)
        self.layernorm_2 = nn.LayerNorm(embed_dim)

    def forward(self, x_in):
        # Self-attention: the input is query, key and value at once; the
        # returned attention weights are not needed.
        attended = self.attn(x_in, x_in, x_in)[0]
        normed = self.layernorm_1(x_in + attended)

        # Position-wise feed-forward with its own residual connection.
        hidden = F.relu(self.linear_1(normed))
        return self.layernorm_2(normed + self.linear_2(hidden))

In [None]:
class TransformerAutoEncoder(torch.nn.Module):
    """Denoising autoencoder built from three stacked ``TransformerEncoder`` layers.

    A linear layer projects the input row to ``hidden_size``; that vector is
    cut into ``num_subspaces`` chunks of ``embed_dim`` values which the
    encoders attend over. Two linear heads then predict which inputs were
    corrupted (the mask) and reconstruct the original row.

    Args:
        num_inputs: Width of an input row.
        n_cats: Number of leading columns treated as categorical (BCE loss).
        n_nums: Number of trailing columns treated as continuous (MSE loss).
        hidden_size: Width of the projection; must equal
            ``embed_dim * num_subspaces``.
        num_subspaces: Number of chunks the hidden vector is split into.
        embed_dim: Width of each chunk.
        num_heads: Attention heads per encoder layer.
        dropout: Attention dropout per encoder layer.
        feedforward_dim: Feed-forward width per encoder layer.
        emphasis: Loss weight for cells whose mask is 1; cells whose mask is
            0 receive ``1 - emphasis``.
        task_weights: Relative weights of the categorical and continuous
            reconstruction losses; normalised to sum to 1.
        mask_loss_weight: Multiplier applied to the mask-prediction loss.
    """

    def __init__(
            self, 
            num_inputs, 
            n_cats, 
            n_nums, 
            hidden_size=1024, 
            num_subspaces=8,
            embed_dim=128, 
            num_heads=8, 
            dropout=0, 
            feedforward_dim=512, 
            emphasis=.75, 
            task_weights=[len(fts_categorical), len(fts_continuous)],
            mask_loss_weight=2,
        ):
        super().__init__()
        assert hidden_size == embed_dim * num_subspaces

        # Column layout and loss weighting.
        self.n_cats, self.n_nums = n_cats, n_nums
        self.num_subspaces = num_subspaces
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.emphasis = emphasis
        self.task_weights = np.array(task_weights) / sum(task_weights)
        self.mask_loss_weight = mask_loss_weight

        # Layers (creation order and attribute names are kept stable so
        # existing checkpoints keep loading).
        encoder_args = (embed_dim, num_heads, dropout, feedforward_dim)
        self.excite = torch.nn.Linear(in_features=num_inputs, out_features=hidden_size)
        self.encoder_1 = TransformerEncoder(*encoder_args)
        self.encoder_2 = TransformerEncoder(*encoder_args)
        self.encoder_3 = TransformerEncoder(*encoder_args)

        self.mask_predictor = torch.nn.Linear(in_features=hidden_size, out_features=num_inputs)
        self.reconstructor = torch.nn.Linear(in_features=hidden_size + num_inputs, out_features=num_inputs)

    def divide(self, x):
        """Reshape ``(batch, hidden)`` into ``(num_subspaces, batch, embed_dim)``."""
        n_rows = x.shape[0]
        chunks = x.reshape((n_rows, self.num_subspaces, self.embed_dim))
        return chunks.permute((1, 0, 2))

    def combine(self, x):
        """Inverse of ``divide``: flatten back to one vector per batch row."""
        n_rows = x.shape[1]
        return x.permute((1, 0, 2)).reshape((n_rows, -1))

    def forward(self, x):
        """Return the three encoder outputs and ``(reconstruction, predicted_mask)``."""
        hidden = torch.nn.functional.relu(self.excite(x))

        x1 = self.encoder_1(self.divide(hidden))
        x2 = self.encoder_2(x1)
        x3 = self.encoder_3(x2)
        flat = self.combine(x3)

        predicted_mask = self.mask_predictor(flat)
        # The reconstruction head also sees the predicted mask.
        reconstruction = self.reconstructor(torch.cat([flat, predicted_mask], dim=1))
        return (x1, x2, x3), (reconstruction, predicted_mask)

    def split(self, t):
        """Split columns into the categorical and the continuous part."""
        return torch.split(t, [self.n_cats, self.n_nums], dim=1)

    def feature(self, x):
        """Concatenate the flattened outputs of all three encoders.

        This is the representation extracted for downstream use.
        """
        attn_outs, _ = self.forward(x)
        flattened = [self.combine(layer_out) for layer_out in attn_outs]
        return torch.cat(flattened, dim=1)

    def loss(self, x, y, mask, reduction='mean'):
        """Weighted reconstruction loss plus mask-prediction loss.

        Args:
            x: Corrupted input batch.
            y: Clean target batch with the same column layout.
            mask: 0/1 tensor marking the corrupted cells of ``x``.
            reduction: With ``'mean'`` a single scalar is returned; any other
                value returns ``[reconstruction_loss, mask_loss]``, where the
                reconstruction part is per-cell only for ``'none'``.
        """
        _, (reconstruction, predicted_mask) = self.forward(x)

        x_cats, x_nums = self.split(reconstruction)
        y_cats, y_nums = self.split(y)

        # Per-cell weights: `emphasis` where mask == 1, `1 - emphasis` elsewhere.
        cell_weights = mask * self.emphasis + (1 - mask) * (1 - self.emphasis)
        w_cats, w_nums = self.split(cell_weights)

        # BCE for the categorical columns, MSE for the continuous ones.
        cat_weight, num_weight = self.task_weights[0], self.task_weights[1]
        cat_loss = cat_weight * torch.mul(w_cats, bce_logits(x_cats, y_cats, reduction='none'))
        num_loss = num_weight * torch.mul(w_nums, mse(x_nums, y_nums, reduction='none'))

        if reduction == 'none':
            reconstruction_loss = torch.cat([cat_loss, num_loss], dim=1)
        else:
            reconstruction_loss = cat_loss.mean() + num_loss.mean()

        # How well the model predicts which cells were corrupted (0/1 target).
        mask_loss = self.mask_loss_weight * bce_logits(predicted_mask, mask, reduction=reduction)

        if reduction == 'mean':
            return reconstruction_loss + mask_loss
        return [reconstruction_loss, mask_loss]

# Noise Masker

In [None]:
class SwapNoiseMasker(object):
    """Corrupts a batch by swapping cells with values from other rows.

    Args:
        probas: Per-column swap probability at epoch 0.
        decay_rate: Factor the probabilities are multiplied by per epoch.
    """

    def __init__(self, probas, decay_rate):
        self.decay_rate = decay_rate
        self.probas = torch.from_numpy(np.array(probas))

    def apply(self, X, epoch_number):
        """Return ``(corrupted_X, mask)`` for batch ``X`` at the given epoch.

        ``mask`` is 1.0 wherever the corrupted value differs from the
        original one and 0.0 elsewhere.
        """
        decay = self.decay_rate ** epoch_number
        epoch_probas = self.probas * decay

        # Draw an independent swap / don't-swap decision for every cell.
        cell_probas = epoch_probas.to(X.device) * torch.ones((X.shape)).to(X.device)
        should_swap = torch.bernoulli(cell_probas)

        # Donor values come from a random permutation of the batch rows.
        donor_rows = X[torch.randperm(X.shape[0])]
        corrupted_X = torch.where(should_swap.bool(), donor_rows, X)

        # A swap that happens to pick an identical value (common for one-hot
        # columns) leaves the cell unchanged, so the mask is derived from the
        # actual difference rather than from the swap decision.
        mask = (corrupted_X != X).float()
        return corrupted_X, mask

# Prepare Data

In [None]:
# Load the full (train + test) feature matrix X, the training targets Y, the
# number of one-hot / continuous columns and the per-feature level counts.
X, Y, n_cats, n_nums, unique_counts = get_data()

# Seed all RNGs before the shuffling DataLoader is created.
seed_everything(DAE_CFG['random_state'])

# Training batches: rows are reshuffled every epoch and the last incomplete
# batch is dropped, so every batch has exactly batch_size rows.
train_dl = DataLoader(
    dataset=SingleDataset(X),
    batch_size=DAE_CFG['batch_size'],
    shuffle=True,
    pin_memory=True,
    drop_last=True
)

# Quick sanity check of the shapes and of the categorical cardinalities.
print(X.shape, Y.shape, n_cats, n_nums)
print(unique_counts)

# Define Column Noise Probabilities

In [None]:
# Number of one-hot columns produced for each categorical feature, followed
# by a single column for every continuous feature.
repeats = list(unique_counts) + [1] * len(fts_continuous)

# Swap probability of each original (pre-encoding) column.
probas = ([DAE_CFG['prob_categorical']] * len(fts_categorical)
          + [DAE_CFG['prob_continuous']] * len(fts_continuous))

# Expand to one probability per column of X. A flat comprehension avoids the
# quadratic list copying of sum(list_of_lists, []).
swap_probas = [p for p, r in zip(probas, repeats) for _ in range(r)]

# The noise masker broadcasts this vector against each batch, so it must
# contain exactly one entry per column of X.
assert len(swap_probas) == X.shape[1], (
    f"swap_probas has {len(swap_probas)} entries but X has {X.shape[1]} columns"
)

print('length', len(swap_probas))
print('examples', swap_probas[0:10], swap_probas[-len(fts_continuous):])

# Prepare DAE Model

In [None]:
# setup model

# Every architecture / loss hyper-parameter comes from DAE_CFG. task_weights
# is forwarded explicitly so that editing the config actually affects the
# model instead of silently falling back to the class default.
model_params = dict(
    hidden_size=DAE_CFG['hidden_size'],
    num_subspaces=DAE_CFG['num_subspaces'],
    embed_dim=DAE_CFG['embed_dim'],
    num_heads=DAE_CFG['num_heads'],
    dropout=DAE_CFG['dropout'],
    feedforward_dim=DAE_CFG['feedforward_dim'],
    emphasis=DAE_CFG['emphasis'],
    task_weights=DAE_CFG['task_weights'],
    mask_loss_weight=DAE_CFG['mask_loss_weight']
)

# The input width is the number of columns of X (one-hot + continuous).
dae = TransformerAutoEncoder(
    num_inputs=X.shape[1],
    n_cats=n_cats,
    n_nums=n_nums,
    **model_params
).cuda()
model_checkpoint = 'model_checkpoint.pth'

# Adam with an exponentially decaying learning rate; the scheduler is stepped
# by the training loop.
optimizer = torch.optim.Adam(dae.parameters(), lr=DAE_CFG['init_lr'])
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=DAE_CFG['lr_decay'])

# Training DAE Model

In [None]:
class AverageMeter(object):
    """Tracks the most recent value together with a running weighted mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the last value, the running mean, the weighted sum and the count."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
def _save_checkpoint(path):
    """Write the model, optimizer and scheduler state dicts to ``path``."""
    torch.save({
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "model": dae.state_dict()
        }, path
    )


noise_maker = SwapNoiseMasker(swap_probas, DAE_CFG['noise_decay'])
s0 = time.time()  # start of the whole run

for epoch in range(DAE_CFG['max_epochs']):

    t0 = time.time()  # start of this epoch
    dae.train()
    meter = AverageMeter()

    for x in train_dl:
        x = x.cuda()
        # The epoch is passed in so the noise level decreases over time.
        x_corrupted, mask = noise_maker.apply(x, epoch)
        optimizer.zero_grad()

        # The model learns to recover the clean batch from the corrupted one.
        loss = dae.loss(x_corrupted, x, mask)

        loss.backward()
        optimizer.step()

        # Accumulate a plain Python float rather than a 0-d numpy array.
        meter.update(loss.item())

    delta1 = (time.time() - s0)  # seconds since the run started
    delta2 = (time.time() - t0)  # seconds spent on this epoch
    scheduler.step()

    # Remaining time estimated from the duration of the latest epoch.
    remain = (DAE_CFG['max_epochs'] - (epoch+1)) * delta2
    print(f"\r epoch {epoch:5d} - loss {meter.avg:.6f} - {delta2:4.2f} sec per epoch - {delta1:6.2f} sec elapsed - {remain:6.2f} sec remaining", end="")

    tracking_df.loc[epoch, 'loss'] = meter.avg
    tracking_df.loc[epoch, 'time'] = round(delta2, 2)
    tracking_df.loc[epoch, 'elapsed'] = round(delta1, 2)

    # Periodic checkpoint (this includes epoch 0).
    if epoch % DAE_CFG['save_freq'] == 0:
        model_checkpoint = SAVE_PATH + f"{DAE_CFG['run_key']}_model_checkpoint_{epoch}.pth"
        _save_checkpoint(model_checkpoint)


model_checkpoint = SAVE_PATH + f"{DAE_CFG['run_key']}_model_checkpoint_final.pth"
_save_checkpoint(model_checkpoint)

# Keep the loss log next to the checkpoints and config in SAVE_PATH.
tracking_df.to_csv(SAVE_PATH + f"{DAE_CFG['run_key']}_tracking_loss.csv")

In [None]:
# Feature extraction iterates over every row of X exactly once and in the
# original order: no shuffling, and the last partial batch is kept.
dl = DataLoader(dataset=SingleDataset(X), batch_size=DAE_CFG['batch_size'], shuffle=False, pin_memory=True, drop_last=False)

In [None]:
# Switch the model to evaluation mode and collect its features batch by
# batch without tracking gradients; the per-batch arrays are stacked
# row-wise so that row i of `features` corresponds to row i of X.
dae.eval()

with torch.no_grad():
    features = np.vstack([
        dae.feature(batch.cuda()).detach().cpu().numpy()
        for batch in dl
    ])

In [None]:
# Show the shape of the extracted feature matrix, then store it in SAVE_PATH
# under a name that records the run key and the configured number of epochs.
display(features.shape)
np.save(SAVE_PATH + f"{DAE_CFG['run_key']}_dae_features_epoch{DAE_CFG['max_epochs']}.npy", features)