In [1]:
%reload_ext autoreload
%reload_ext autoreload

In [2]:
import time
import os
import sys
import copy
import time
import datetime
import random
import math
import warnings
from functools import partial
# warnings.filterwarnings('ignore')

In [3]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torch import Tensor
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn

In [4]:
import numpy as np
import scipy
import seaborn as sns
import pandas as pd
from tqdm.notebook import tqdm, trange

In [5]:
sys.path.append("../")

In [6]:
from chaosmining.data_utils import read_formulas, create_simulation_data
from chaosmining.simulation.models import MLPRegressor
from chaosmining.simulation.functions import abs_argmax_topk
from chaosmining.utils import radar_factory

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, precision_score, recall_score, accuracy_score, roc_curve, auc, balanced_accuracy_score

In [8]:
from captum.attr import IntegratedGradients, Saliency, DeepLift, FeatureAblation

In [9]:
import matplotlib
# mpl.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt 
from matplotlib.colors import ListedColormap

matplotlib.rcParams['lines.linewidth'] = 1
matplotlib.rcParams['lines.markersize'] = 5
plt.rcParams['figure.figsize'] = [4, 4]

# Santander customer dataset

In [10]:
house_train_path = '../data/house_prices_advanced_regression/train.csv'
house_test_path = '../data/house_prices_advanced_regression/test.csv'

In [11]:
customer_train_path = '../data/santander_customer_satisfaction/train.csv'
customer_test_path = '../data/santander_customer_satisfaction/test.csv'
sample_path = '../data/santander_customer_satisfaction/sample_submission.csv'

In [13]:
train  = pd.read_csv(customer_train_path,index_col=0)
test   = pd.read_csv(customer_test_path, index_col=0)
sample = pd.read_csv(sample_path)

In [14]:
train.dtypes.value_counts()

int64      259
float64    111
Name: count, dtype: int64

In [15]:
train.isnull().values.any()

False

In [16]:
train.select_dtypes(include=['int64']).nunique()

var3                             208
var15                            100
ind_var1_0                         2
ind_var1                           2
ind_var2_0                         1
                                ... 
num_var45_ult3                   172
saldo_var2_ult1                    1
saldo_medio_var13_medio_hace3      1
saldo_medio_var13_medio_ult1       3
TARGET                             2
Length: 259, dtype: int64

a good many of the integer features have one single value. Such columns have zero variance and thus have no predictive value, In https://www.kaggle.com/code/carlmcbrideellis/tabular-classification-with-neural-networks-keras/notebook they drop these columns from the train, as well as the test data to maintain consistency.

In [None]:
# features_to_drop = train.nunique()
# features_to_drop = features_to_drop.loc[features_to_drop.values==1].index
# # now drop these columns from both the training and the test datasets
# train = train.drop(features_to_drop,axis=1)
# test  = test.drop(features_to_drop,axis=1)

In [17]:
train.dtypes.value_counts()

int64      259
float64    111
Name: count, dtype: int64

In [18]:
train.shape

(76020, 370)

In [19]:
X = train.iloc[:,:-1]
y = train['TARGET']

In [20]:
from imblearn.under_sampling import RandomUnderSampler

In [21]:
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)
X_resampled = X_resampled.to_numpy()
y_resampled = y_resampled.to_numpy().reshape(y_resampled.shape[0],-1)

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, 
                                                  train_size=0.8,
                                                  test_size=0.2, 
                                                  random_state=42, 
                                                  shuffle=True)

In [23]:
from sklearn.preprocessing import MinMaxScaler
scaler  = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test   = scaler.transform(X_test)
# test    = scaler.transform(test)

# Training and Test a MLP

In [24]:
train_set = TensorDataset(Tensor(X_train), Tensor(y_train))
train_loader = DataLoader(train_set, batch_size=1000, shuffle=True)
test_set = TensorDataset(Tensor(X_test), Tensor(y_test))
test_loader = DataLoader(test_set, batch_size=y_test.shape[0])

In [25]:
hidden_layer_sizes = (100,100,100)
num_epochs = 500

In [26]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

In [27]:
from typing import List

class LinearBlock(nn.Module):
    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_channels, out_channels),
            nn.ReLU(),
        )

    def forward(self, x: Tensor)-> Tensor:
        x = self.net(x)
        return x
    
class MLPClassifier(nn.Module):
    def __init__(self, in_channels: int, sizes: List[int], p: float=0.0):
        super().__init__()
        self.blocks = nn.ModuleList([
            LinearBlock(in_channels, sizes[0]),
            *[LinearBlock(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)]
        ])
        self.dropout = nn.Dropout(p)
        self.project = nn.Sequential(
            nn.Linear(sizes[-1], 1),
            nn.Sigmoid()
        )
    def forward(self, x: Tensor)-> Tensor:
        for block in self.blocks:
            x = block(x)
        x = self.dropout(x)
        x = self.project(x)
        return x

In [28]:
model = MLPClassifier(X_train.shape[-1], hidden_layer_sizes, p=0.0)
model.to(device)
model.train()

MLPClassifier(
  (blocks): ModuleList(
    (0): LinearBlock(
      (net): Sequential(
        (0): Linear(in_features=369, out_features=100, bias=True)
        (1): ReLU()
      )
    )
    (1-2): 2 x LinearBlock(
      (net): Sequential(
        (0): Linear(in_features=100, out_features=100, bias=True)
        (1): ReLU()
      )
    )
  )
  (dropout): Dropout(p=0.0, inplace=False)
  (project): Sequential(
    (0): Linear(in_features=100, out_features=1, bias=True)
    (1): Sigmoid()
  )
)

In [29]:
# criterion = nn.MSELoss(reduction='mean')
criterion = nn.BCELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), 0.001)

In [30]:
def train(model, dataloader, num_epochs, optimizer):
    pbar = trange(num_epochs, desc='Train', unit='epoch', initial=0, disable=False)
    for epoch in pbar:  # loop over the dataset multiple times
        running_loss = 0.0
        for inputs, targets in dataloader:
            # zero the parameter gradients
            optimizer.zero_grad()
            inputs = inputs.to(device)
            targets = targets.to(device)
            # forward pass
            outputs = model(inputs)
            # defining loss
            # print(outputs.dtype,targets.dtype)
            # print(outputs.shape,targets.shape)
            # print(outputs[0],targets[0])
            loss = criterion(outputs, targets)

            # computing gradients
            loss.backward()
            # accumulating running loss
            running_loss += loss.item()
            # updated weights based on computed gradients
            optimizer.step()
        pbar.set_postfix(loss = '%.3f' % running_loss)
        # print(loss.item())
    print('train loss:', running_loss)

In [31]:
train(model, train_loader, num_epochs, optimizer)

Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 1.679726392030716


In [32]:
y_pred = model(Tensor(X_test).to(device)).detach().cpu().numpy()

In [33]:
y_test =y_test.astype(int)

In [34]:
fpr, tpr, thresholds = roc_curve(y_test,y_pred)
print('Test auc is', auc(fpr, tpr))

Test auc is 0.7670218999082438


In [35]:
balanced_accuracy_score(y_test,np.around(y_pred))

0.7256071945786395

In [36]:
y_train_pred = model(Tensor(X_train[0:1]).to(device))

In [37]:
fpr, tpr, thresholds =  roc_curve(y_train, y_train_pred)
print('Train auc is', auc(fpr, tpr))

ValueError: Found input variables with inconsistent numbers of samples: [4812, 1]

# Iterative Method

In [None]:
for xai_method in [Saliency,DeepLift,FeatureAblation,IntegratedGradients]:
    print(xai_method.get_name())
    reduce_rate = 0.8
    best_score = 0
    num_cur_features = X_train.shape[-1]
    select_arr = np.ones(num_cur_features)
    remaining_inds = np.nonzero(select_arr)[0]
    num_select = 73
    while num_cur_features>num_select:
        bool_arr = np.array(select_arr, dtype='bool') 
        print(X_train[...,bool_arr].shape)
        train_set = TensorDataset(Tensor(X_train[...,bool_arr]), Tensor(y_train))
        train_loader = DataLoader(train_set, batch_size=y_train.shape[0], shuffle=True)
        test_set = TensorDataset(Tensor(X_test[...,bool_arr]), Tensor(y_test))
        test_loader = DataLoader(test_set, batch_size=y_test.shape[0])
    
        model = MLPClassifier(int(np.sum(select_arr)), hidden_layer_sizes, p=0.0)
        model.to(device)
        model.train()
    
        # criterion = nn.MSELoss(reduction='mean')
        criterion = nn.BCELoss(reduction='mean')
        optimizer = torch.optim.Adam(model.parameters(), 0.001)
    
        train(model, train_loader, num_epochs, optimizer)
    
        model.eval()
        y_pred = model(Tensor(X_test[...,bool_arr]).to(device)).detach().cpu().numpy()
        fpr, tpr, thresholds = roc_curve(y_test,y_pred)
        score = auc(fpr, tpr)
        auc_score = auc(fpr, tpr)
        print('Test auc is', auc_score)
        ACC = balanced_accuracy_score(y_test,np.around(y_pred))
        
        print('Test ACC is', ACC)
    
        xai = xai_method(model)
        # xai = DeepLift(model)
        # xai = FeatureAblation(model)
        # xai = IntegratedGradients(model)
    
        num_remove = int(num_cur_features*(1-reduce_rate))
        if num_cur_features - num_remove<num_select:
            num_remove = num_cur_features - num_select
        print('num_remove', num_remove)
        xai_attr_test = xai.attribute(Tensor(X_test[...,bool_arr]).to(device))
        abs_xai_attr_test = np.abs(xai_attr_test.detach().cpu().numpy()).mean(0)
        inds = np.argpartition(abs_xai_attr_test, num_remove)[:num_remove]
        inds_to_remove = remaining_inds[inds]
        select_arr[inds_to_remove] = 0
        
        remaining_inds = np.nonzero(select_arr)[0]
        num_cur_features -= num_remove
        print('remaining', len(remaining_inds), num_cur_features)
    # print('The best score is:', best_score)
    # print('best features:', np.where(select_arr==1)[0])
    np.save(f'./{xai_method.get_name()}_feature.npy',np.where(select_arr==1)[0])

In [None]:
feature_saliency = np.load('Saliency_feature.npy')

In [None]:
feature_saliency.shape

In [None]:
bool_arr = np.array(select_arr, dtype='bool')

In [None]:
X_test[...,bool_arr].shape

In [None]:
select_arr.shape

In [None]:
print('The best score is:', best_score)
print('best features:', np.where(select_arr==1)[0])

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# XGboost and SVM

In [None]:
clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, random_state=156).fit(X_train, np.squeeze(y_train))
clf.score(X_test, np.squeeze(y_test))

In [None]:
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba)
print('Test auc is', auc(fpr, tpr))
acc = balanced_accuracy_score(y_test,np.around(y_pred_proba))
print(f'balanced accuarcy score:{acc}')

In [None]:
from sklearn.svm import SVC

In [None]:
clf = SVC(gamma='auto')

In [None]:
clf.fit(X_train, np.squeeze(y_train))
y_pred_proba = clf.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba)
print('Test auc is', auc(fpr, tpr))
acc = balanced_accuracy_score(y_test,np.around(y_pred_proba))
print(f'balanced accuarcy score:{acc}')

In [None]:
y_pred_proba

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf = LogisticRegression(penalty='l1',solver='liblinear')
clf.fit(X_train, np.squeeze(y_train))
y_pred_proba = clf.predict_proba(X_test)[:, 1]
print(y_pred_proba)
fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba)
print('Test auc is', auc(fpr, tpr))
acc = balanced_accuracy_score(y_test,np.around(y_pred_proba))
print(f'balanced accuarcy score:{acc}')

In [None]:
coef = clf.coef_

In [None]:
coef[:,334]

In [None]:
np.where(coef!=0)[1]

In [None]:
non_zero_coef = coef[:,np.where(coef!=0)[1]]

In [None]:
non_zero_coef.shape

# RFE logstic regression

In [None]:
estimator = LogisticRegression(penalty=None)
selector = RFE(estimator, n_features_to_select=73, step=1)
selector = selector.fit(X_train, np.squeeze(y_train))

In [None]:
selected_RFE = selector.ranking_
selected_RFE[selected_RFE != 1] = 0

In [None]:
bool_arr = np.array(selected_RFE, dtype='bool') 
X_train_selected = X_train[...,bool_arr]
X_test_selected = X_test[...,bool_arr]

In [None]:
X_train_selected.shape

In [None]:
clf = LogisticRegression(penalty=None)
clf.fit(X_train_selected, np.squeeze(y_train))

y_pred_proba_linear = clf.predict_proba(X_test_selected)[:,1]
fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba_linear)
print('Test auc is', auc(fpr, tpr))
acc = balanced_accuracy_score(y_test,np.around(y_pred_proba_linear))
print(f'balanced accuarcy score:{acc}')

# RFE svc

In [None]:
estimator = SVC(                    
            kernel = 'linear',
            probability = True,
            random_state = 42) 
selector = RFE(estimator, n_features_to_select=73, step=1)
selector = selector.fit(X_train, np.squeeze(y_train))

In [None]:
selected_RFE = selector.ranking_
selected_RFE[selected_RFE != 1] = 0

In [None]:
bool_arr = np.array(selected_RFE, dtype='bool') 
X_train_selected = X_train[...,bool_arr]
X_test_selected = X_test[...,bool_arr]

In [None]:
X_train_selected.shape

In [None]:
clf = LogisticRegression(penalty=None)
clf.fit(X_train_selected, np.squeeze(y_train))

y_pred_proba_linear = clf.predict_proba(X_test_selected)[:,1]
fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba_linear)
print('Test auc is', auc(fpr, tpr))
acc = balanced_accuracy_score(y_test,np.around(y_pred_proba_linear))
print(f'balanced accuarcy score:{acc}')