In [1]:
%reload_ext autoreload
%reload_ext autoreload

In [2]:
import time
import os
import sys
import copy
import time
import datetime
import random
import math
import warnings
from functools import partial
# warnings.filterwarnings('ignore')

In [3]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torch import Tensor
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn

In [4]:
import numpy as np
import scipy
import seaborn as sns
import pandas as pd
from tqdm.notebook import tqdm, trange

In [5]:
sys.path.append("../")

In [6]:
from chaosmining.data_utils import read_formulas, create_simulation_data
from chaosmining.simulation.models import MLPRegressor
from chaosmining.simulation.functions import abs_argmax_topk
from chaosmining.utils import radar_factory

In [358]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, precision_score, recall_score, accuracy_score, roc_curve, auc, balanced_accuracy_score

In [8]:
from captum.attr import IntegratedGradients, Saliency, DeepLift, FeatureAblation

In [9]:
import matplotlib
# mpl.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt 
from matplotlib.colors import ListedColormap

matplotlib.rcParams['lines.linewidth'] = 1
matplotlib.rcParams['lines.markersize'] = 5
plt.rcParams['figure.figsize'] = [4, 4]

# Santander customer dataset

In [10]:
house_train_path = '../data/house_prices_advanced_regression/train.csv'
house_test_path = '../data/house_prices_advanced_regression/test.csv'

In [13]:
customer_train_path = '../data/santander_customer_satisfaction/train.csv'
customer_test_path = '../data/santander_customer_satisfaction/test.csv'
sample_path = '../data/santander_customer_satisfaction/sample_submission.csv'

In [14]:
train  = pd.read_csv(customer_train_path,index_col=0)
test   = pd.read_csv(customer_test_path, index_col=0)
sample = pd.read_csv(sample_path)

In [15]:
sample['TARGET']

0        0
1        0
2        0
3        0
4        0
        ..
75813    0
75814    0
75815    0
75816    0
75817    0
Name: TARGET, Length: 75818, dtype: int64

In [16]:
train.dtypes.value_counts()

int64      259
float64    111
Name: count, dtype: int64

In [17]:
train.isnull().values.any()

False

In [18]:
train.select_dtypes(include=['int64']).nunique()

var3                             208
var15                            100
ind_var1_0                         2
ind_var1                           2
ind_var2_0                         1
                                ... 
num_var45_ult3                   172
saldo_var2_ult1                    1
saldo_medio_var13_medio_hace3      1
saldo_medio_var13_medio_ult1       3
TARGET                             2
Length: 259, dtype: int64

a good many of the integer features have one single value. Such columns have zero variance and thus have no predictive value, In https://www.kaggle.com/code/carlmcbrideellis/tabular-classification-with-neural-networks-keras/notebook they drop these columns from the train, as well as the test data to maintain consistency.

In [19]:
features_to_drop = train.nunique()
features_to_drop = features_to_drop.loc[features_to_drop.values==1].index
# now drop these columns from both the training and the test datasets
train = train.drop(features_to_drop,axis=1)
test  = test.drop(features_to_drop,axis=1)

In [98]:
train.dtypes.value_counts()

int64      225
float64    111
Name: count, dtype: int64

In [20]:
X = train.iloc[:,:-1]
y = train['TARGET']

In [29]:
from imblearn.under_sampling import RandomUnderSampler

In [89]:
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)
X_resampled = X_resampled.to_numpy()
y_resampled = y_resampled.to_numpy().reshape(y_resampled.shape[0],-1)

In [227]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, 
                                                  train_size=0.8,
                                                  test_size=0.2, 
                                                  random_state=42, 
                                                  shuffle=True)

In [228]:
from sklearn.preprocessing import MinMaxScaler
scaler  = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test   = scaler.transform(X_test)
# test    = scaler.transform(test)

# Training and Test a MLP

In [536]:
train_set = TensorDataset(Tensor(X_train), Tensor(y_train))
train_loader = DataLoader(train_set, batch_size=1000, shuffle=True)
test_set = TensorDataset(Tensor(X_test), Tensor(y_test))
test_loader = DataLoader(test_set, batch_size=y_test.shape[0])

In [537]:
hidden_layer_sizes = (100,100,100)
num_epochs = 500

In [538]:
from typing import List

class LinearBlock(nn.Module):
    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_channels, out_channels),
            nn.ReLU(),
        )

    def forward(self, x: Tensor)-> Tensor:
        x = self.net(x)
        return x
    
class MLPClassifier(nn.Module):
    def __init__(self, in_channels: int, sizes: List[int], p: float=0.0):
        super().__init__()
        self.blocks = nn.ModuleList([
            LinearBlock(in_channels, sizes[0]),
            *[LinearBlock(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)]
        ])
        self.dropout = nn.Dropout(p)
        self.project = nn.Sequential(
            nn.Linear(sizes[-1], 1),
            nn.Sigmoid()
        )
    def forward(self, x: Tensor)-> Tensor:
        for block in self.blocks:
            x = block(x)
        x = self.dropout(x)
        x = self.project(x)
        return x

In [539]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

In [540]:
model = MLPClassifier(X_train.shape[-1], hidden_layer_sizes, p=0.0)
model.to(device)
model.train()

MLPClassifier(
  (blocks): ModuleList(
    (0): LinearBlock(
      (net): Sequential(
        (0): Linear(in_features=335, out_features=100, bias=True)
        (1): ReLU()
      )
    )
    (1-2): 2 x LinearBlock(
      (net): Sequential(
        (0): Linear(in_features=100, out_features=100, bias=True)
        (1): ReLU()
      )
    )
  )
  (dropout): Dropout(p=0.0, inplace=False)
  (project): Sequential(
    (0): Linear(in_features=100, out_features=1, bias=True)
    (1): Sigmoid()
  )
)

In [541]:
def train(model, dataloader, num_epochs, optimizer):
    pbar = trange(num_epochs, desc='Train', unit='epoch', initial=0, disable=False)
    for epoch in pbar:  # loop over the dataset multiple times
        running_loss = 0.0
        for inputs, targets in dataloader:
            # zero the parameter gradients
            optimizer.zero_grad()
            inputs = inputs.to(device)
            targets = targets.to(device)
            # forward pass
            outputs = model(inputs)
            # defining loss
            # print(outputs.dtype,targets.dtype)
            # print(outputs.shape,targets.shape)
            # print(outputs[0],targets[0])
            loss = criterion(outputs, targets)

            # computing gradients
            loss.backward()
            # accumulating running loss
            running_loss += loss.item()
            # updated weights based on computed gradients
            optimizer.step()
        pbar.set_postfix(loss = '%.3f' % running_loss)
        # print(loss.item())
    print('train loss:', running_loss)

In [542]:
# criterion = nn.MSELoss(reduction='mean')
criterion = nn.BCELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), 0.001)

In [543]:
train(model, train_loader, num_epochs, optimizer)

Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 1.6107119023799896


In [544]:
X_test.shape

(1204, 335)

In [545]:
y_pred = model(Tensor(X_test).to(device)).detach().cpu().numpy()

In [546]:
y_test =y_test.astype(int)

In [547]:
fpr, tpr, thresholds = roc_curve(y_test,y_pred)
print('Test auc is', auc(fpr, tpr))

Test auc is 0.7746443061344064


In [548]:
balanced_accuracy_score(y_test,np.around(y_pred))

0.7359905811601093

# Iterative Method

In [523]:
for xai_method in [Saliency,DeepLift,FeatureAblation,IntegratedGradients]:
    print(xai_method.get_name())
    reduce_rate = 0.8
    best_score = 0
    num_cur_features = X_train.shape[-1]
    select_arr = np.ones(num_cur_features)
    remaining_inds = np.nonzero(select_arr)[0]
    num_select = 73
    while num_cur_features>num_select:
        bool_arr = np.array(select_arr, dtype='bool') 
        print(X_train[...,bool_arr].shape)
        train_set = TensorDataset(Tensor(X_train[...,bool_arr]), Tensor(y_train))
        train_loader = DataLoader(train_set, batch_size=y_train.shape[0], shuffle=True)
        test_set = TensorDataset(Tensor(X_test[...,bool_arr]), Tensor(y_test))
        test_loader = DataLoader(test_set, batch_size=y_test.shape[0])
    
        model = MLPClassifier(int(np.sum(select_arr)), hidden_layer_sizes, p=0.0)
        model.to(device)
        model.train()
    
        # criterion = nn.MSELoss(reduction='mean')
        criterion = nn.BCELoss(reduction='mean')
        optimizer = torch.optim.Adam(model.parameters(), 0.001)
    
        train(model, train_loader, num_epochs, optimizer)
    
        model.eval()
        y_pred = model(Tensor(X_test[...,bool_arr]).to(device)).detach().cpu().numpy()
        fpr, tpr, thresholds = roc_curve(y_test,y_pred)
        score = auc(fpr, tpr)
        auc_score = auc(fpr, tpr)
        print('Test auc is', auc_score)
        ACC = balanced_accuracy_score(y_test,np.around(y_pred))
        
        print('Test ACC is', ACC)
    
        xai = xai_method(model)
        # xai = DeepLift(model)
        # xai = FeatureAblation(model)
        # xai = IntegratedGradients(model)
    
        num_remove = int(num_cur_features*(1-reduce_rate))
        if num_cur_features - num_remove<num_select:
            num_remove = num_cur_features - num_select
        print('num_remove', num_remove)
        xai_attr_test = xai.attribute(Tensor(X_test[...,bool_arr]).to(device))
        abs_xai_attr_test = np.abs(xai_attr_test.detach().cpu().numpy()).mean(0)
        inds = np.argpartition(abs_xai_attr_test, num_remove)[:num_remove]
        inds_to_remove = remaining_inds[inds]
        select_arr[inds_to_remove] = 0
        
        remaining_inds = np.nonzero(select_arr)[0]
        num_cur_features -= num_remove
        print('remaining', len(remaining_inds), num_cur_features)
    # print('The best score is:', best_score)
    # print('best features:', np.where(select_arr==1)[0])
    np.save(f'./{xai_method.get_name()}_feature.npy',np.where(select_arr==1)[0])

Saliency
(4812, 335)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3880941569805145
Test auc is 0.7773486297356755
Test ACC is 0.7180842831400555
num_remove 66
remaining 269 269
(4812, 269)




Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.38835984468460083
Test auc is 0.7801054644748334
Test ACC is 0.7193500779375837
num_remove 53
remaining 216 216
(4812, 216)




Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.39570337533950806
Test auc is 0.7765360889704501
Test ACC is 0.7222022618481709
num_remove 43
remaining 173 173
(4812, 173)




Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3783181309700012
Test auc is 0.7787235924251301
Test ACC is 0.7240677891152703
num_remove 34
remaining 139 139
(4812, 139)




Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.39901003241539
Test auc is 0.7678261494411711
Test ACC is 0.734119526404811
num_remove 27
remaining 112 112
(4812, 112)




Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.4211292266845703
Test auc is 0.7635202361342959
Test ACC is 0.7249328410183844
num_remove 22
remaining 90 90
(4812, 90)




Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.4621064066886902
Test auc is 0.7748336226052157
Test ACC is 0.7238715632842124
num_remove 17




remaining 73 73
Deep Lift
(4812, 335)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.38652315735816956
Test auc is 0.7775821661120754
Test ACC is 0.7237361398233415
num_remove 66
remaining 269 269
(4812, 269)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3859478831291199
Test auc is 0.7736908144201112
Test ACC is 0.7222077893363698
num_remove 53
remaining 216 216
(4812, 216)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.36902034282684326
Test auc is 0.7739588975977536
Test ACC is 0.7237416673115402
num_remove 43
remaining 173 173
(4812, 173)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3836003839969635
Test auc is 0.7676534154349581
Test ACC is 0.7179433321909858
num_remove 34
remaining 139 139
(4812, 139)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.37878358364105225
Test auc is 0.7708179024287785
Test ACC is 0.7202814596990835
num_remove 27
remaining 112 112
(4812, 112)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3826960325241089
Test auc is 0.7602756005615927
Test ACC is 0.7179460959350852
num_remove 22
remaining 90 90
(4812, 90)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3766317367553711
Test auc is 0.7767102048487128
Test ACC is 0.7206075815028135
num_remove 17
remaining 73 73
Feature Ablation
(4812, 335)


               activations. The hooks and attributes will be removed
            after the attribution is finished


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3898269236087799
Test auc is 0.7685253766983207
Test ACC is 0.7196098698829279
num_remove 66
remaining 269 269
(4812, 269)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3918795585632324
Test auc is 0.7645428214510761
Test ACC is 0.7227439556916546
num_remove 53
remaining 216 216
(4812, 216)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.37716081738471985
Test auc is 0.7795914080723437
Test ACC is 0.71522380799717
num_remove 43
remaining 173 173
(4812, 173)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3746878504753113
Test auc is 0.7771399670561703
Test ACC is 0.7246758128171397
num_remove 34
remaining 139 139
(4812, 139)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.37539368867874146
Test auc is 0.7705042174734956
Test ACC is 0.7152818466232574
num_remove 27
remaining 112 112
(4812, 112)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3762601613998413
Test auc is 0.7864040372773804
Test ACC is 0.7178161999624131
num_remove 22
remaining 90 90
(4812, 90)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.39044657349586487
Test auc is 0.7853690151121527
Test ACC is 0.7182805089711133
num_remove 17
remaining 73 73
Integrated Gradients
(4812, 335)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3845120668411255
Test auc is 0.769404247321932
Test ACC is 0.7126949821462132
num_remove 66
remaining 269 269
(4812, 269)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.37709057331085205
Test auc is 0.7799714228860122
Test ACC is 0.7288020827575533
num_remove 53
remaining 216 216
(4812, 216)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.378109335899353
Test auc is 0.7811183766872658
Test ACC is 0.728868412615939
num_remove 43
remaining 173 173
(4812, 173)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.37879496812820435
Test auc is 0.7787926860276154
Test ACC is 0.7282659164022685
num_remove 34
remaining 139 139
(4812, 139)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3684869110584259
Test auc is 0.7606155410858199
Test ACC is 0.7121560520468289
num_remove 27
remaining 112 112
(4812, 112)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.37968119978904724
Test auc is 0.7745323744983804
Test ACC is 0.7192782205909991
num_remove 22
remaining 90 90
(4812, 90)


Train:   0%|          | 0/500 [00:00<?, ?epoch/s]

train loss: 0.3998943269252777
Test auc is 0.7907859535469892
Test ACC is 0.7277297500469837
num_remove 17
remaining 73 73


In [467]:
feature_saliency = np.load('Saliency_feature.npy')

In [469]:
feature_saliency.shape

(160,)

In [465]:
bool_arr = np.array(select_arr, dtype='bool')

In [466]:
X_test[...,bool_arr].shape

(1204, 164)

In [464]:
select_arr.shape

(335,)

In [296]:
print('The best score is:', best_score)
print('best features:', np.where(select_arr==1)[0])

The best score is: 0.7959417181644317
best features: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  25  26  27  28  29  30  31  32  33  36  37  38  39
  40  43  44  45  46  47  48  49  50  51  52  53  56  57  58  59  60  61
  62  67  68  69  70  71  72  74  75  76  77  78  79  80  81  82  83  84
  85  86  87  88  89  90  91  92  95  96  97  98  99 101 102 103 104 105
 106 107 108 109 110 112 113 114 115 116 117 118 119 120 121 124 125 126
 127 128 129 131 134 135 136 137 138 139 140 141 142 143 144 145 146 147
 148 150 151 152 153 154 155 156 157 159 160 161 162 164 165 166 169 170
 171 172 173 176 177 179 180 181 183 187 188 189 191 192 193 199 200 202
 203 204 205 209 210 211 212 213 214 215 216 217 218 220 221 224 225 226
 227 228 229 230 231 232 233 234 235 236 237 238 240 241 242 243 244 245
 246 247 248 249 250 251 252 253 254 255 256 258 259 261 262 263 264 265
 266 267 268 269 270 271 272 273 274 275 277 279 280 281 282 287 289 29

In [313]:
from sklearn.ensemble import GradientBoostingClassifier

# XGboost and SVM

In [506]:
clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, random_state=156).fit(X_train, np.squeeze(y_train))
clf.score(X_test, np.squeeze(y_test))

0.7425249169435216

In [507]:
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba)
print('Test auc is', auc(fpr, tpr))
acc = balanced_accuracy_score(y_test,np.around(y_pred_proba))
print(f'balanced accuarcy score:{acc}')

Test auc is 0.8266496788529356
balanced accuarcy score:0.7435079651104946


In [398]:
from sklearn.svm import SVC

In [508]:
clf = SVC(gamma='auto')

In [509]:
clf.fit(X_train, np.squeeze(y_train))
y_pred_proba = clf.predict(X_test)
fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba)
print('Test auc is', auc(fpr, tpr))
acc = balanced_accuracy_score(y_test,np.around(y_pred_proba))
print(f'balanced accuarcy score:{acc}')

Test auc is 0.6958637805808284
balanced accuarcy score:0.6958637805808284


In [510]:
y_pred_proba

array([1, 0, 1, ..., 1, 0, 0])

In [425]:
from sklearn.linear_model import LogisticRegression

In [513]:
clf = LogisticRegression(penalty='l1',solver='liblinear')
clf.fit(X_train, np.squeeze(y_train))
y_pred_proba = clf.predict_proba(X_test)[:, 1]
print(y_pred_proba)
fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba)
print('Test auc is', auc(fpr, tpr))
acc = balanced_accuracy_score(y_test,np.around(y_pred_proba))
print(f'balanced accuarcy score:{acc}')

[0.57961394 0.27623552 0.6748607  ... 0.82667629 0.65256925 0.20646396]
Test auc is 0.7894331008103297
balanced accuarcy score:0.7065677614778292


In [439]:
coef = clf.coef_

In [447]:
coef[:,334]

array([-4.66449368])

In [445]:
np.where(coef!=0)[1]

array([  0,   1,   2,  12,  13,  17,  20,  21,  22,  25,  26,  27,  28,
        30,  36,  43,  44,  46,  47,  48,  52,  53,  56,  57,  58,  67,
        68,  69,  70,  72,  74,  78,  80,  84,  85, 102, 104, 105, 134,
       136, 137, 140, 142, 144, 148, 150, 164, 173, 176, 188, 202, 229,
       233, 235, 245, 246, 248, 249, 252, 253, 256, 261, 262, 263, 279,
       280, 282, 292, 294, 295, 297, 299, 334])

In [450]:
non_zero_coef = coef[:,np.where(coef!=0)[1]]

In [451]:
non_zero_coef.shape

(1, 73)

# RFE logstic regression

In [478]:
estimator = LogisticRegression(penalty=None)
selector = RFE(estimator, n_features_to_select=73, step=1)
selector = selector.fit(X_train, np.squeeze(y_train))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [483]:
selected_RFE = selector.ranking_
selected_RFE[selected_RFE != 1] = 0

In [496]:
bool_arr = np.array(selected_RFE, dtype='bool') 
X_train_selected = X_train[...,bool_arr]
X_test_selected = X_test[...,bool_arr]

In [497]:
X_train_selected.shape

(4812, 73)

In [516]:
clf = LogisticRegression(penalty=None)
clf.fit(X_train_selected, np.squeeze(y_train))

y_pred_proba_linear = clf.predict_proba(X_test_selected)[:,1]
fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba_linear)
print('Test auc is', auc(fpr, tpr))
acc = balanced_accuracy_score(y_test,np.around(y_pred_proba_linear))
print(f'balanced accuarcy score:{acc}')

Test auc is 0.7915238732215308
balanced accuarcy score:0.7116917430381287


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# RFE svc

In [None]:
estimator = SVC(                    
            kernel = 'linear',
            probability = True,
            random_state = 42) 
selector = RFE(estimator, n_features_to_select=73, step=1)
selector = selector.fit(X_train, np.squeeze(y_train))

In [None]:
selected_RFE = selector.ranking_
selected_RFE[selected_RFE != 1] = 0

In [None]:
bool_arr = np.array(selected_RFE, dtype='bool') 
X_train_selected = X_train[...,bool_arr]
X_test_selected = X_test[...,bool_arr]

In [None]:
X_train_selected.shape

In [None]:
clf = LogisticRegression(penalty=None)
clf.fit(X_train_selected, np.squeeze(y_train))

y_pred_proba_linear = clf.predict_proba(X_test_selected)[:,1]
fpr, tpr, thresholds = roc_curve(y_test,y_pred_proba_linear)
print('Test auc is', auc(fpr, tpr))
acc = balanced_accuracy_score(y_test,np.around(y_pred_proba_linear))
print(f'balanced accuarcy score:{acc}')