In [1]:
%reload_ext autoreload
%reload_ext autoreload

In [2]:
import time
import os
import sys
import copy
import time
import datetime
import random
import math
import warnings
from functools import partial
# warnings.filterwarnings('ignore')

In [3]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torch import Tensor
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn

In [4]:
import numpy as np
import scipy
import seaborn as sns
import pandas as pd
from tqdm.notebook import tqdm, trange

In [5]:
sys.path.append("../")

In [6]:
from chaosmining.data_utils import read_formulas, create_simulation_data
from chaosmining.simulation.models import MLPRegressor
from chaosmining.simulation.functions import abs_argmax_topk
from chaosmining.utils import radar_factory

In [358]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, precision_score, recall_score, accuracy_score, roc_curve, auc, balanced_accuracy_score

In [8]:
from captum.attr import IntegratedGradients, Saliency, DeepLift, FeatureAblation

In [9]:
import matplotlib
# mpl.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt 
from matplotlib.colors import ListedColormap

matplotlib.rcParams['lines.linewidth'] = 1
matplotlib.rcParams['lines.markersize'] = 5
plt.rcParams['figure.figsize'] = [4, 4]

# Santander customer dataset

In [10]:
house_train_path = '../data/house_prices_advanced_regression/train.csv'
house_test_path = '../data/house_prices_advanced_regression/test.csv'

In [13]:
customer_train_path = '../data/santander_customer_satisfaction/train.csv'
customer_test_path = '../data/santander_customer_satisfaction/test.csv'
sample_path = '../data/santander_customer_satisfaction/sample_submission.csv'

In [14]:
train  = pd.read_csv(customer_train_path,index_col=0)
test   = pd.read_csv(customer_test_path, index_col=0)
sample = pd.read_csv(sample_path)

In [15]:
sample['TARGET']

0        0
1        0
2        0
3        0
4        0
        ..
75813    0
75814    0
75815    0
75816    0
75817    0
Name: TARGET, Length: 75818, dtype: int64

In [16]:
train.dtypes.value_counts()

int64      259
float64    111
Name: count, dtype: int64

In [17]:
train.isnull().values.any()

False

In [18]:
train.select_dtypes(include=['int64']).nunique()

var3                             208
var15                            100
ind_var1_0                         2
ind_var1                           2
ind_var2_0                         1
                                ... 
num_var45_ult3                   172
saldo_var2_ult1                    1
saldo_medio_var13_medio_hace3      1
saldo_medio_var13_medio_ult1       3
TARGET                             2
Length: 259, dtype: int64

a good many of the integer features have one single value. Such columns have zero variance and thus have no predictive value, In https://www.kaggle.com/code/carlmcbrideellis/tabular-classification-with-neural-networks-keras/notebook they drop these columns from the train, as well as the test data to maintain consistency.

In [19]:
features_to_drop = train.nunique()
features_to_drop = features_to_drop.loc[features_to_drop.values==1].index
# now drop these columns from both the training and the test datasets
train = train.drop(features_to_drop,axis=1)
test  = test.drop(features_to_drop,axis=1)

In [98]:
train.dtypes.value_counts()

int64      225
float64    111
Name: count, dtype: int64

In [20]:
X = train.iloc[:,:-1]
y = train['TARGET']

In [29]:
from imblearn.under_sampling import RandomUnderSampler

In [89]:
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)
X_resampled = X_resampled.to_numpy()
y_resampled = y_resampled.to_numpy().reshape(y_resampled.shape[0],-1)

In [227]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, 
                                                  train_size=0.8,
                                                  test_size=0.2, 
                                                  random_state=42, 
                                                  shuffle=True)

In [228]:
from sklearn.preprocessing import MinMaxScaler
scaler  = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test   = scaler.transform(X_test)
# test    = scaler.transform(test)

# Training and Test a MLP

In [346]:
train_set = TensorDataset(Tensor(X_train), Tensor(y_train))
train_loader = DataLoader(train_set, batch_size=1000, shuffle=True)
test_set = TensorDataset(Tensor(X_test), Tensor(y_test))
test_loader = DataLoader(test_set, batch_size=y_test.shape[0])

In [347]:
hidden_layer_sizes = (100,100,100)
num_epochs = 500

In [348]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

In [349]:
from typing import List

class LinearBlock(nn.Module):
    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_channels, out_channels),
            nn.ReLU(),
        )

    def forward(self, x: Tensor)-> Tensor:
        x = self.net(x)
        return x
    
class MLPClassifier(nn.Module):
    def __init__(self, in_channels: int, sizes: List[int], p: float=0.0):
        super().__init__()
        self.blocks = nn.ModuleList([
            LinearBlock(in_channels, sizes[0]),
            *[LinearBlock(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)]
        ])
        self.dropout = nn.Dropout(p)
        self.project = nn.Sequential(
            nn.Linear(sizes[-1], 1),
            nn.Sigmoid()
        )
    def forward(self, x: Tensor)-> Tensor:
        for block in self.blocks:
            x = block(x)
        x = self.dropout(x)
        x = self.project(x)
        return x

In [350]:
model = MLPClassifier(X_train.shape[-1], hidden_layer_sizes, p=0.0)
model.to(device)
model.train()

MLPClassifier(
  (blocks): ModuleList(
    (0): LinearBlock(
      (net): Sequential(
        (0): Linear(in_features=335, out_features=100, bias=True)
        (1): ReLU()
      )
    )
    (1-2): 2 x LinearBlock(
      (net): Sequential(
        (0): Linear(in_features=100, out_features=100, bias=True)
        (1): ReLU()
      )
    )
  )
  (dropout): Dropout(p=0.0, inplace=False)
  (project): Sequential(
    (0): Linear(in_features=100, out_features=1, bias=True)
    (1): Sigmoid()
  )
)

In [351]:
# criterion = nn.MSELoss(reduction='mean')
criterion = nn.BCELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), 0.001)

In [352]:
def train(model, dataloader, num_epochs, optimizer):
    pbar = trange(num_epochs, desc='Train', unit='epoch', initial=0, disable=False)
    for epoch in pbar:  # loop over the dataset multiple times
        running_loss = 0.0
        for inputs, targets in dataloader:
            # zero the parameter gradients
            optimizer.zero_grad()
            inputs = inputs.to(device)
            targets = targets.to(device)
            # forward pass
            outputs = model(inputs)
            # defining loss
            # print(outputs.dtype,targets.dtype)
            # print(outputs.shape,targets.shape)
            # print(outputs[0],targets[0])
            loss = criterion(outputs, targets)

            # computing gradients
            loss.backward()
            # accumulating running loss
            running_loss += loss.item()
            # updated weights based on computed gradients
            optimizer.step()
        pbar.set_postfix(loss = '%.3f' % running_loss)
        # print(loss.item())
    print('train loss:', running_loss)

In [353]:
train(model, train_loader, num_epochs, optimizer)

Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 1.6411129534244537


In [375]:
y_pred = model(Tensor(X_test).to(device)).detach().cpu().numpy()

In [376]:
y_test =y_test.astype(int)

In [377]:
fpr, tpr, thresholds = roc_curve(y_test,y_pred)
print('Test auc is', auc(fpr, tpr))

Test auc is 0.7596952143007175


In [378]:
balanced_accuracy_score(y_test,np.around(y_pred))

0.7250737919674541

In [268]:
y_train_pred = model(Tensor(X_train[0:1]).to(device))

In [289]:
fpr, tpr, thresholds =  roc_curve(y_train, y_train_pred)
print('Train auc is', auc(fpr, tpr))

Train auc is 0.8424127457992557


# Iterative Method

In [473]:
for xai_method in [Saliency,DeepLift,FeatureAblation,IntegratedGradients]:
    print(xai_method.get_name())
    reduce_rate = 0.8
    best_score = 0
    num_cur_features = X_train.shape[-1]
    select_arr = np.ones(num_cur_features)
    num_select = 73
    while num_cur_features>num_select:
        bool_arr = np.array(select_arr, dtype='bool') 
        print(X_train[...,bool_arr].shape)
        train_set = TensorDataset(Tensor(X_train[...,bool_arr]), Tensor(y_train))
        train_loader = DataLoader(train_set, batch_size=y_train.shape[0], shuffle=True)
        test_set = TensorDataset(Tensor(X_test[...,bool_arr]), Tensor(y_test))
        test_loader = DataLoader(test_set, batch_size=y_test.shape[0])
    
        model = MLPClassifier(int(np.sum(select_arr)), hidden_layer_sizes, p=0.0)
        model.to(device)
        model.train()
    
        # criterion = nn.MSELoss(reduction='mean')
        criterion = nn.BCELoss(reduction='mean')
        optimizer = torch.optim.Adam(model.parameters(), 0.001)
    
        train(model, train_loader, num_epochs, optimizer)
    
        model.eval()
        y_pred = model(Tensor(X_test[...,bool_arr]).to(device)).detach().cpu().numpy()
        fpr, tpr, thresholds = roc_curve(y_test,y_pred)
        score = auc(fpr, tpr)
        auc_score = auc(fpr, tpr)
        print('Test auc is', auc_score)
        ACC = balanced_accuracy_score(y_test,np.around(y_pred))
        
        print('Test ACC is', ACC)
    
        xai = xai_method(model)
        # xai = DeepLift(model)
        # xai = FeatureAblation(model)
        # xai = IntegratedGradients(model)
    
        num_remove = int(num_cur_features*(1-reduce_rate))
        if num_cur_features - num_remove<num_select:
            num_remove = num_cur_features - num_select
        print('num_remove', num_remove)
        xai_attr_test = xai.attribute(Tensor(X_test[...,bool_arr]).to(device))
        abs_xai_attr_test = np.abs(xai_attr_test.detach().cpu().numpy()).mean(0)
        inds = np.argpartition(abs_xai_attr_test, num_remove)[:num_remove]
        print('inds', inds.shape)
        select_arr[inds] = 0
        num_cur_features -= num_remove
        print(num_cur_features)
        print(np.where(select_arr==1)[0].shape)
        print(inds.shape)
        print(inds)
    # print('The best score is:', best_score)
    print('best features:', np.where(select_arr==1)[0])
    np.save(f'./{xai_method.get_name()}_feature.npy',np.where(select_arr==1)[0])

Saliency
(4812, 335)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.38576629757881165
Test auc is 0.7743527311319192
Test ACC is 0.7215417270084128
num_remove 66
inds (66,)
269
(269,)
(66,)
[ 63 231 168 329 158 173 174 325 175 323 322 178 319 317 149 316 315 182
 146 184 185 287 186 190  24 194 197 132 131 130 198 123 122 278  34 200
 276 206 208 109 107  41  42 215 218 219 101 100 221  94  93 226 228  83
  82  55 237 239  66  65  64 163 222 284  29 106]
(4812, 269)




Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3938729763031006
Test auc is 0.7747976939319234
Test ACC is 0.7142813712592724
num_remove 53
inds (53,)
216
(227,)
(53,)
[258 164 116 149 174  60 229 128 139 260  92  89 206 227  78 109  32  29
 262 186 159 230  46 162 243  20 187  27 157 263  42 228  84  81 210  49
 261  75  48  23 173  62  83  44  22 267  71  50 137  35 185 156 169]
(4812, 227)




Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.387012243270874
Test auc is 0.7703936677095194
Test ACC is 0.7141542390306996
num_remove 43
inds (43,)
173
(200,)
(43,)
[ 34  59 187 188 125 224 221 220 219 218 168 167 126 186 110 104 103 128
 131  88  20  87 133 141  72  25 151  57  49  48  38  37  36 119  40  83
  93  50  47 142 162  68  69]
(4812, 200)




Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3904813230037689
Test auc is 0.7776581690748089
Test ACC is 0.71242966271267
num_remove 34
inds (34,)
139
(182,)
(34,)
[161 112  81 160  31  38 125  56 194 193 124 116 108 101 100 140  95 158
  36 183 141  32 192  39  43  61  37  64 106  74 135  48  82  30]
(4812, 182)




Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3680095672607422
Test auc is 0.7716774268436936
Test ACC is 0.717951623423284
num_remove 27
inds (27,)
112
(174,)
(27,)
[ 28  90 175 173 171 143 142 140 107 106  98  94  88  85  68  57  31   0
  63 130  20 101  82  47  27 117  83]
(4812, 174)




Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.37432578206062317
Test auc is 0.774781111467327
Test ACC is 0.7198862442928684
num_remove 22
inds (22,)
90
(164,)
(22,)
[ 19  42  82 134  80  75  98 114  30 115  26  90  99 168 167  29  96 133
  54 109  34 166]
(4812, 164)




Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3682009279727936
Test auc is 0.7781155687232608
Test ACC is 0.7224095426556264
num_remove 17
inds (17,)
73
(160,)
(17,)
[ 65  72 125  23 124 123 155  36 158 122  52  88  64  89  70  67  26]
best features: [  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  21  33  45  51  53  58  73  76  77  79  86  91  97 102 105 111 113 118
 120 121 127 129 136 138 144 145 147 148 150 152 153 154 165 170 172 176
 177 179 180 181 189 191 195 196 199 201 202 203 204 205 207 209 211 212
 213 214 216 217 223 225 232 233 234 235 236 238 240 241 242 244 245 246
 247 248 249 250 251 252 253 254 255 256 257 259 264 265 266 268 269 270
 271 272 273 274 275 277 279 280 281 282 283 285 286 288 289 290 291 292
 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310
 311 312 313 314 318 320 321 324 326 327 328 330 331 332 333 334]
Deep Lift
(4812, 335)




Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.385741263628006
Test auc is 0.7830156870115081
Test ACC is 0.7212764075748699
num_remove 66
inds (66,)
269
(269,)
(66,)
[196 198  83  93  94 100 101 111 122 123 128 129 130 131 132 133 325 324
 323 322 149 154 167 163 166 168 174 175 178 181 182 183 184 185 186 190
 193 194 195 326 197 240 239 316 315 289 288 287 286 285 284 283 278 277
 276 319 260 259 257 241 158 207 206 213 208 215]
(4812, 269)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3839403986930847
Test auc is 0.7702734448411953
Test ACC is 0.714693169130084
num_remove 53
inds (53,)
216
(228,)
(53,)
[ 34  62 163  82 261 164 167 262 263 173  66  65  64  63  61  60  55  54
 176 177 178 179 180 181 182  42  41 184  35 257  24  23 141 140 195 258
 259 264 265 170  40 267 266 151  95 168 254 162 157 183 260 232  76]
(4812, 228)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3921399712562561
Test auc is 0.7794988226450137
Test ACC is 0.7236090075947688
num_remove 43
inds (43,)
173
(196,)
(43,)
[132 136 222 146 147 218 148 223 224 149 216 220 150 151 153 142 217 139
 221 163 145 226 225 133  76 213  79 131 164 126 119 118  77 219 191 152
 212 166  37  38 137  78 122]
(4812, 196)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.40740177035331726
Test auc is 0.7765830726201399
Test ACC is 0.7182141791127277
num_remove 34
inds (34,)
139
(177,)
(34,)
[194 187 185 184 181 188 131 121 119 116 113 112 189 190 191 192 193 186
 107  99  71 111  72  36 132 180 100 106  73 159  74 134 103 182]
(4812, 177)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3914170563220978
Test auc is 0.7777797738151828
Test ACC is 0.7186757243773285
num_remove 27
inds (27,)
112
(165,)
(27,)
[175 162 161 113 112 165 102 100  97  94  93  92 166  85 167 169 170 171
 172 173 174 168  88 115  35 140  91]
(4812, 165)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3980121314525604
Test auc is 0.7815191195816797
Test ACC is 0.716019766297799
num_remove 22
inds (22,)
90
(159,)
(22,)
[162  81  82 154 153  61 150 149  80 156 157 158 159  88  85  90 155 160
 100 161 101 163]
(4812, 159)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.38927581906318665
Test auc is 0.7794877676686159
Test ACC is 0.7240788440916679
num_remove 17
inds (17,)
73
(157,)
(17,)
[153 151 148  94  84  76  75  79 152 147 155 156  82 157 149 154 150]
best features: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  25  26  27  28  29  30  31  32  33  39  43  44  45
  46  47  48  49  50  51  52  53  56  57  58  59  67  68  69  70  86  87
  89  96  98 104 105 108 109 110 114 117 120 124 125 127 135 138 143 144
 199 200 201 202 203 204 205 209 210 211 214 227 228 229 230 231 233 234
 235 236 237 238 242 243 244 245 246 247 248 249 250 251 252 253 255 256
 268 269 270 271 272 273 274 275 279 280 281 282 290 291 292 293 294 295
 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313
 314 317 318 320 321 327 328 329 330 331 332 333 334]
Feature Ablation
(4812, 335)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3799121379852295
Test auc is 0.771271156461081
Test ACC is 0.7182888002034116
num_remove 66
inds (66,)
269
(269,)
(66,)
[200 201 100 101 111 122 123 128 129 130 131 132 133 325 324 323 149 154
 167 163 166 322 168 174 175 178 180 181 182 183 184 185 186 190 192 193
 194 195 196 197 198  93 257 241 316 315 289 288 287 286 285 284 283 278
 277 276 275 268 267 319 260 259 158 208 212 214]
(4812, 269)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.37432581186294556
Test auc is 0.7750975601667092
Test ACC is 0.7256652332047271
num_remove 53
inds (53,)
216
(232,)
(53,)
[ 42  83 165 162 158 168 170 172 173 256 257 176 177 178 179 180 181 182
 183 184 195  93 197  82 260 261 262 263 198 200  66  65  64  63  62  61
  60 264 265  55  54 167  41  34 231  24  23  35 252 253 144  40  97]
(4812, 232)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.39292439818382263
Test auc is 0.7801897586698652
Test ACC is 0.7250765557115535
num_remove 43
inds (43,)
173
(196,)
(43,)
[137 230 226 225 224 223 169 168 166 156  75 154 153 152 151 150 149 148
 145 144 143 220 229 219 140 139 228 113 227  59 142 134 124 155 216 130
 171 215  38 194  37  81  80]
(4812, 196)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3800888657569885
Test auc is 0.7742076345667002
Test ACC is 0.7215389632643134
num_remove 34
inds (34,)
139
(181,)
(34,)
[123 116 188  69 191 192 187 106 110 113 115 184 118 193 120 122 189 124
 183 158 132 102 133 130 135 190 180  36  35  75 179  74 181 182]
(4812, 181)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.38691002130508423
Test auc is 0.7787443205058757
Test ACC is 0.726936555490454
num_remove 27
inds (27,)
112
(170,)
(27,)
[177 168 118 117 115 109 108 107 105 103 101 100  98 172  66 173 174 175
 176 169   7   8  95 120 143 164 165]
(4812, 170)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.39333227276802063
Test auc is 0.7785715864996628
Test ACC is 0.7202786959549841
num_remove 22
inds (22,)
90
(160,)
(22,)
[157  94 163 162  64 161 167 158  87  89  90 164  96 166  92  98 104 106
 107  97 165  84]
(4812, 160)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.38809263706207275
Test auc is 0.7832506052599577
Test ACC is 0.7188857689288833
num_remove 17
inds (17,)
73
(155,)
(17,)
[ 84  86 153 152 151  79  77 148  80 147 154  87  97  96  88  94 158]
best features: [  0   1   2   3   4   5   6   9  10  11  12  13  14  15  16  17  18  19
  20  21  22  25  26  27  28  29  30  31  32  33  39  43  44  45  46  47
  48  49  50  51  52  53  56  57  58  67  68  70  71  72  73  76  78  85
  91  99 112 114 119 121 125 126 127 136 138 141 146 159 160 199 202 203
 204 205 206 207 209 210 211 213 217 218 221 222 232 233 234 235 236 237
 238 239 240 242 243 244 245 246 247 248 249 250 251 254 255 258 266 269
 270 271 272 273 274 279 280 281 282 290 291 292 293 294 295 296 297 298
 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 317 318
 320 321 326 327 328 329 330 331 332 333 334]
Integrated Gradients
(4812, 335)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.38588470220565796
Test auc is 0.7724733851443227
Test ACC is 0.7272626772941839
num_remove 66
inds (66,)
269
(269,)
(66,)
[195 197  83  93  94 100 101 111 122 123 324 128 129 130 131 132 133 323
 322 149 154 167 158 163 166 168 319 174 175 178 181 182 183 184 185 186
 190 193 194 325 196 239 237 289 288 287 286 285 284 283 278 277 276 317
 260 259 257 318 241 240 315 206 204 208 207 214]
(4812, 269)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3926178216934204
Test auc is 0.7814182429220514
Test ACC is 0.7264031528792686
num_remove 53
inds (53,)
216
(226,)
(53,)
[184  66 257 142 254 157 162 163 164 165 170 232  82 260 261 262 263 173
 174  65  64  63  62  23  24 253  61 198  60 177 264 265  55  54  34  35
 185 178 179 180 181  41  42 182 183 255  45 267 266 256  98 211  97]
(4812, 226)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.4032813012599945
Test auc is 0.7784624186077362
Test ACC is 0.719543540024542
num_remove 43
inds (43,)
173
(195,)
(43,)
[140  79 211 189 210 214 217 218 219 220 166 221 222 121 126 154 153 152
 151 150 128 224 223 149 148 147 146 143 142 133 134 135  78 212  60 163
 120 132  38 213  59 112 184]
(4812, 195)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3926112949848175
Test auc is 0.7823896989729928
Test ACC is 0.7252036879401262
num_remove 34
inds (34,)
139
(178,)
(34,)
[ 37 181 180 179 158 183 135 123 122 120 115 114 113 109 107 102 186  75
  74 187 188 193 192 189 190 191 182  99 184 145 108 132 101  70]
(4812, 178)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.39070290327072144
Test auc is 0.7665244259703506
Test ACC is 0.7112910001437147
num_remove 27
inds (27,)
112
(164,)
(27,)
[163  97  96 141 174  98  69  94 103 169  89 105 170 166 165 118 171 172
 106 173 164 176 175 162  36  66   5]
(4812, 164)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.39716386795043945
Test auc is 0.7784430723990404
Test ACC is 0.7281995865438827
num_remove 22
inds (22,)
90
(154,)
(22,)
[162 152  82  83  84 155 156 157 158 159 160  89 127  91  92 149 161 148
 104 150  79 151]
(4812, 154)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3999764025211334
Test auc is 0.7762293133754159
Test ACC is 0.7176890677338403
num_remove 17
inds (17,)
73
(149,)
(17,)
[142  79 146  82 145 150 151 147 148  73  74 149 152  72 139  81 117]
best features: [  0   1   2   3   4   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  25  26  27  28  29  30  31  32  33  39  40  43  44  46
  47  48  49  50  51  52  53  56  57  58  67  68  71  76  77  80  85  86
  87  88  90  95 110 116 119 124 125 136 137 138 144 199 200 201 202 203
 205 209 215 216 225 226 227 228 229 230 231 233 234 235 236 238 242 243
 244 245 246 247 248 249 250 251 252 258 268 269 270 271 272 273 274 275
 279 280 281 282 290 291 292 293 294 295 296 297 298 299 300 301 302 303
 304 305 306 307 308 309 310 311 312 313 314 316 320 321 326 327 328 329
 330 331 332 333 334]


In [467]:
feature_saliency = np.load('Saliency_feature.npy')

In [469]:
feature_saliency.shape

(160,)

In [465]:
bool_arr = np.array(select_arr, dtype='bool')

In [466]:
X_test[...,bool_arr].shape

(1204, 164)

In [464]:
select_arr.shape

(335,)

In [296]:
print('The best score is:', best_score)
print('best features:', np.where(select_arr==1)[0])

The best score is: 0.7959417181644317
best features: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  25  26  27  28  29  30  31  32  33  36  37  38  39
  40  43  44  45  46  47  48  49  50  51  52  53  56  57  58  59  60  61
  62  67  68  69  70  71  72  74  75  76  77  78  79  80  81  82  83  84
  85  86  87  88  89  90  91  92  95  96  97  98  99 101 102 103 104 105
 106 107 108 109 110 112 113 114 115 116 117 118 119 120 121 124 125 126
 127 128 129 131 134 135 136 137 138 139 140 141 142 143 144 145 146 147
 148 150 151 152 153 154 155 156 157 159 160 161 162 164 165 166 169 170
 171 172 173 176 177 179 180 181 183 187 188 189 191 192 193 199 200 202
 203 204 205 209 210 211 212 213 214 215 216 217 218 220 221 224 225 226
 227 228 229 230 231 232 233 234 235 236 237 238 240 241 242 243 244 245
 246 247 248 249 250 251 252 253 254 255 256 258 259 261 262 263 264 265
 266 267 268 269 270 271 272 273 274 275 277 279 280 281 282 287 289 29

In [313]:
from sklearn.ensemble import GradientBoostingClassifier

In [365]:
clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, random_state=156).fit(X_train, np.squeeze(y_train))
clf.score(X_test, np.squeeze(y_test))

0.7425249169435216

# XGboost and SVM

In [383]:
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba)
print('Test auc is', auc(fpr, tpr))
acc = balanced_accuracy_score(y_test,np.around(y_pred_proba))
print(f'balanced accuarcy score:{acc}')

Test auc is 0.8266496788529356
balanced accuarcy score:0.7435079651104946


In [398]:
from sklearn.svm import SVC

In [403]:
clf = SVC(gamma='auto')

In [404]:
clf.fit(X_train, np.squeeze(y_train))
y_pred_proba = clf.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba)
print('Test auc is', auc(fpr, tpr))
acc = balanced_accuracy_score(y_test,np.around(y_pred_proba))
print(f'balanced accuarcy score:{acc}')

Test auc is 0.6958637805808284
balanced accuarcy score:0.6958637805808284


In [423]:
# # estimator = SVC(kernel="linear")
# estimiator = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, random_state=156)
# selector = RFE(estimator, n_features_to_select=5, step=1)
# selector = selector.fit(X_train, np.squeeze(y_train))

In [425]:
from sklearn.linear_model import LogisticRegression

In [428]:
clf = LogisticRegression(penalty='l1',solver='liblinear')
clf.fit(X_train, np.squeeze(y_train))
y_pred_proba = clf.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba)
print('Test auc is', auc(fpr, tpr))
acc = balanced_accuracy_score(y_test,np.around(y_pred_proba))
print(f'balanced accuarcy score:{acc}')

Test auc is 0.7073664835225577
balanced accuarcy score:0.7073664835225577


In [439]:
coef = clf.coef_

In [447]:
coef[:,334]

array([-4.66449368])

In [445]:
np.where(coef!=0)[1]

array([  0,   1,   2,  12,  13,  17,  20,  21,  22,  25,  26,  27,  28,
        30,  36,  43,  44,  46,  47,  48,  52,  53,  56,  57,  58,  67,
        68,  69,  70,  72,  74,  78,  80,  84,  85, 102, 104, 105, 134,
       136, 137, 140, 142, 144, 148, 150, 164, 173, 176, 188, 202, 229,
       233, 235, 245, 246, 248, 249, 252, 253, 256, 261, 262, 263, 279,
       280, 282, 292, 294, 295, 297, 299, 334])

In [450]:
non_zero_coef = coef[:,np.where(coef!=0)[1]]

In [451]:
non_zero_coef.shape

(1, 73)