In [1]:
import numpy as np
import pandas as pd
import yaml
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import balanced_accuracy_score
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.svm import SVC, NuSVC

from sklearn.multiclass import OneVsRestClassifier

In [2]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import Normalizer, StandardScaler, RobustScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest

from imblearn.pipeline import Pipeline
from imblearn import FunctionSampler
from sklearn.ensemble import IsolationForest

In [3]:
## SPLIT FUNCTIONS FOR CV ##

def split_df(df, n) :
    '''
    df: dataframe to shuffle and split
    n: amount of to be splitted pieces
    splits: list of dataframes (splits)
    '''
    # defining list with optimal cutting points
    split_points = list(map( lambda x: int(x*len(df)/n), (list(range(1,n)))))  
    # shuffling the df, then splitting it from split_points
    splits = list(np.split(df.sample(frac=1), split_points))
    return splits

def make_train_and_test(splits, index) :
    '''
    Take splits from splitDf, and return into test set (splits[index])
    and training set (the rest)
    '''
    
    # index is zero based, so range 0-9 for 10 fold split
    test = splits[index]

    left_lst = splits[:index]
    right_lst = splits[index+1:]

    train = pd.concat(left_lst+right_lst)
    return train, test

In [4]:
X_train_df = pd.read_csv('raw/X_train.csv')
y_train_df = pd.read_csv('raw/y_train.csv')
X_test_df = pd.read_csv('raw/X_test.csv')
sample_df = pd.read_csv('raw/sample.csv')

train_data_df = pd.concat([X_train_df, y_train_df.drop(['id'], axis=1)], axis=1)
columns = np.array(train_data_df.columns)
train_data = train_data_df.to_numpy()

In [5]:
#classes, counts = np.unique(y_tr, return_counts=True)
#d = {k:v for k,v in zip(classes, counts)}
#least_count = counts.min()
#weights = np.array([least_count/d[i] for i in y_tr])

In [53]:
X_tr = X_train_df.drop(['id'], axis=1).to_numpy()
y_tr = y_train_df['y'].to_numpy()

In [85]:
train_df = pd.concat([X_train_df, y_train_df], axis=1)
splits = split_df(train_df, 5)
train, test = make_train_and_test(splits, 0)
X_tr = train.drop(['id', 'y'], axis=1).to_numpy()
y_tr = train['y'].to_numpy()
X_te = test.drop(['id', 'y'], axis=1).to_numpy()
y_te = test['y'].to_numpy()

In [86]:
def isof(x, y):
    model = IsolationForest()
    y_pred = model.fit_predict(x)
    return x[y_pred == 1], y[y_pred == 1]

In [87]:
from sklearn.feature_selection import VarianceThreshold, GenericUnivariateSelect

In [88]:
thresh = VarianceThreshold(threshold=0)
X_tr = thresh.fit_transform(X_tr)

In [89]:
X_tr, y_tr = isof(X_tr, y_tr)

In [90]:
#from imblearn.under_sampling import RandomUnderSampler
#sampler = RandomUnderSampler()
#X_tr, y_tr = sampler.fit_resample(X_tr, y_tr)

In [91]:
#from imblearn.over_sampling import RandomOverSampler
#sampler = RandomOverSampler()
#X_tr, y_tr = sampler.fit_resample(X_tr, y_tr)

In [92]:
from imblearn.combine import SMOTEENN, SMOTETomek
sampler = SMOTEENN()
X_tr, y_tr = sampler.fit_resample(X_tr, y_tr)

In [93]:
selection = GenericUnivariateSelect(mode='fdr', param=0.1)#, param=0.1, mode='fwe')
X_tr = selection.fit_transform(X_tr, y_tr)

In [94]:
scale = StandardScaler()
X_tr = scale.fit_transform(X_tr)

In [95]:
X_te = thresh.transform(X_te)
X_te = selection.transform(X_te)
X_te = scale.transform(X_te)

In [17]:
# ----------------------

In [35]:
param_grid_xgb = [
    {
        'estimator__max_depth':[3]
    }
]

In [36]:
# initial test XGBClassifier
clf = OneVsRestClassifier(estimator = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                                colsample_bynode=1, colsample_bytree=1, gamma=0,
                                importance_type='gain', learning_rate=0.1, max_delta_step=0,
                                max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
                                n_jobs=1, nthread=None, random_state=0,
                                reg_alpha=0, reg_lambda=1, seed=None,
                                silent=None, subsample=1, verbosity=1))
search = GridSearchCV(clf, param_grid=param_grid_xgb, n_jobs=-1, scoring='balanced_accuracy', cv=3, verbose=10)

In [37]:
search.fit(X_tr, y_tr)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   17.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   17.8s finished


GridSearchCV(cv=3,
             estimator=OneVsRestClassifier(estimator=XGBClassifier(base_score=0.5,
                                                                   booster='gbtree',
                                                                   colsample_bylevel=1,
                                                                   colsample_bynode=1,
                                                                   colsample_bytree=1,
                                                                   gamma=0,
                                                                   gpu_id=None,
                                                                   importance_type='gain',
                                                                   interaction_constraints=None,
                                                                   learning_rate=0.1,
                                                                   max_delta_step=0,
                                    

In [38]:
print('Best', search.scoring, 'score: ', search.best_score_)
print("Best parameters set:", search.best_params_)

Best balanced_accuracy score:  0.6773322808354655
Best parameters set: {'estimator__max_depth': 3}


In [39]:
from sklearn.metrics import balanced_accuracy_score

X_te = thresh.transform(X_te)
X_te = selection.transform(X_te)
X_te = scale.transform(X_te)

predicted = search.predict(X_te)
test_score = balanced_accuracy_score(y_te, predicted)
print('Test BMAC: {}'.format(test_score))

Test BMAC: 0.6835214675680171


In [None]:
# ----------------------

In [96]:
from sklearn.neural_network import MLPClassifier

In [97]:
param_grid_nn = [
    {
        'estimator__hidden_layer_sizes':[(60,)]#, tuple(150*np.ones(3, dtype=int))]
    } # best (70,70,70)
]

In [103]:
# initial test NN
clf = OneVsRestClassifier(estimator=MLPClassifier(activation='relu', alpha=1))
search = GridSearchCV(clf, param_grid=param_grid_nn, n_jobs=-1, scoring='balanced_accuracy', cv=3, verbose=10)

In [104]:
search.fit(X_tr, y_tr)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   20.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   20.0s finished


GridSearchCV(cv=3,
             estimator=OneVsRestClassifier(estimator=MLPClassifier(alpha=1)),
             n_jobs=-1, param_grid=[{'estimator__hidden_layer_sizes': [(60,)]}],
             scoring='balanced_accuracy', verbose=10)

In [105]:
print('Best', search.scoring, 'score: ', search.best_score_)
print("Best parameters set:", search.best_params_)

Best balanced_accuracy score:  0.9925861697772516
Best parameters set: {'estimator__hidden_layer_sizes': (60,)}


In [106]:
search.cv_results_

{'mean_fit_time': array([18.25391976]),
 'std_fit_time': array([1.33633071]),
 'mean_score_time': array([0.03503831]),
 'std_score_time': array([0.00107297]),
 'param_estimator__hidden_layer_sizes': masked_array(data=[(60,)],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'estimator__hidden_layer_sizes': (60,)}],
 'split0_test_score': array([0.986411]),
 'split1_test_score': array([0.99674481]),
 'split2_test_score': array([0.9946027]),
 'mean_test_score': array([0.99258617]),
 'std_test_score': array([0.00445322]),
 'rank_test_score': array([1], dtype=int32)}

In [107]:
predicted = search.predict(X_te)
test_score = balanced_accuracy_score(y_te, predicted)
print('Test BMAC: {}'.format(test_score))

Test BMAC: 0.6914304333659173


In [None]:
#--------------------

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
param_grid_rf = [
    {
        'estimator__min_samples_split':[2]
    }
]

In [None]:
# initial test RandomForest
clf = OneVsRestClassifier(estimator=RandomForestClassifier())
search = GridSearchCV(clf, param_grid=param_grid_rf, n_jobs=-1, scoring='balanced_accuracy', cv=3, verbose=10)

In [None]:
search.fit(X_tr, y_tr)

In [None]:
print('Best', search.scoring, 'score: ', search.best_score_)
print("Best parameters set:", search.best_params_)

In [None]:
# -----------------

In [None]:
param_grid_svc = [
    {
        'kernel':['rbf']
    }
]

In [None]:
# initial test SVC try 'fdr' with svc
clf = SVC()
search = GridSearchCV(clf, param_grid=param_grid_svc, n_jobs=-1, scoring='balanced_accuracy', cv=3, verbose=10)

In [None]:
search.fit(X_tr, y_tr)

In [None]:
print('Best', search.scoring, 'score: ', search.best_score_)
print("Best parameters set:", search.best_params_)

In [None]:
predicted = search.predict(X_te)
test_score = balanced_accuracy_score(y_te, predicted)
print('Test BMAC: {}'.format(test_score))

In [None]:
# -----------------

In [None]:
param_grid_nusvc = [
    {
        'estimator__nu':[0.2,0.25,0.3,0.35]
    }
]

In [None]:
clf = OneVsRestClassifier(estimator=NuSVC())
search = GridSearchCV(clf, param_grid=param_grid_nusvc, n_jobs=-1, scoring='balanced_accuracy', cv=3, verbose=10)

In [None]:
search.fit(X_tr, y_tr)

In [None]:
print('Best', search.scoring, 'score: ', search.best_score_)
print("Best parameters set:", search.best_params_)

# TRYING NN

In [47]:
import numpy as np
import pandas as pd
#import seaborn as sns
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from sklearn.preprocessing import MinMaxScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [299]:
train_df = pd.concat([X_train_df, y_train_df], axis=1)
splits = split_df(train_df, 5)
train, test = make_train_and_test(splits, 0)
X_tr = train.drop(['id', 'y'], axis=1).to_numpy()
y_tr = train['y'].to_numpy()
X_te = test.drop(['id', 'y'], axis=1).to_numpy()
y_te = test['y'].to_numpy()

thresh = VarianceThreshold(threshold=0)
X_tr = thresh.fit_transform(X_tr)

X_tr, y_tr = isof(X_tr, y_tr)

#sampler = SMOTEENN()
sampler = RandomUnderSampler()
X_tr, y_tr = sampler.fit_resample(X_tr, y_tr)

selection = GenericUnivariateSelect(mode='fwe', param=0.1)#, param=0.1)
X_tr = selection.fit_transform(X_tr, y_tr)

scale = StandardScaler()
X_tr = scale.fit_transform(X_tr)

In [300]:
#X_tr = X_train_df.drop(['id'], axis=1).to_numpy()
#y_tr = y_train_df['y'].to_numpy()

# Split into train+val and test
X_trainval, X_test, y_trainval, y_test = train_test_split(X_tr, y_tr, test_size=0.1, stratify=y_tr, random_state=69)

# Split train into train-val
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1, stratify=y_trainval, random_state=21)

In [301]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
X_train, y_train = np.array(X_train), np.array(y_train)
X_val, y_val = np.array(X_val), np.array(y_val)
X_test, y_test = np.array(X_test), np.array(y_test)

In [302]:
class ClassifierDataset(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


train_dataset = ClassifierDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).long())
val_dataset = ClassifierDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).long())
test_dataset = ClassifierDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).long())

In [303]:
target_list = []
for _, t in train_dataset:
    target_list.append(t)
    
target_list = torch.tensor(target_list)
target_list = target_list[torch.randperm(len(target_list))]

In [304]:
class_count = [len(y_train[y_train == 0]), len(y_train[y_train == 1]), len(y_train[y_train == 2])]
class_weights = 1./torch.tensor(class_count, dtype=torch.float) 
class_weights_all = class_weights[target_list]

In [305]:
weighted_sampler = WeightedRandomSampler(
    weights=class_weights_all,
    num_samples=len(class_weights_all),
    replacement=True
)

In [351]:
EPOCHS = 70
BATCH_SIZE = 64
LEARNING_RATE = 0.0007
NUM_FEATURES = X_tr.shape[1]
NUM_CLASSES = 3

In [364]:
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          sampler=weighted_sampler
)
val_loader = DataLoader(dataset=val_dataset, batch_size=20)
test_loader = DataLoader(dataset=test_dataset, batch_size=20)

In [390]:
class MulticlassClassification(nn.Module):
    def __init__(self, num_feature, num_class):
        super(MulticlassClassification, self).__init__()
        
        self.layer_1 = nn.Linear(num_feature, 30)
        #self.layer_2 = nn.Linear(20, 20)
        #self.layer_2b = nn.Linear(256, 128)
        #self.layer_3 = nn.Linear(128, 64)
        
        self.layer_out = nn.Linear(30, num_class) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm1 = nn.BatchNorm1d(512)
        self.batchnorm2 = nn.BatchNorm1d(128)
        self.batchnorm3 = nn.BatchNorm1d(64)
        self.batchnorm0 = nn.BatchNorm1d(30)
        
    def forward(self, x):
        #x = self.layer_1(x)
        #x = self.batchnorm2(x)
        #x = self.relu(x)
        
        #x = self.layer_3(x)
        #x = self.batchnorm3(x)
        #x = self.relu(x)
        #x = self.dropout(x)
        
        x = self.layer_1(x)
        #x = self.batchnorm0(x)
        x = self.relu(x)
        
        #x = self.layer_2(x)
        #x = self.batchnorm0(x)
        #x = self.relu(x)
        #x = self.dropout(x)
        
        x = self.layer_out(x)
        
        return x

In [391]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [392]:
model = MulticlassClassification(num_feature = NUM_FEATURES, num_class=NUM_CLASSES)
model.to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
print(model)

MulticlassClassification(
  (layer_1): Linear(in_features=869, out_features=30, bias=True)
  (layer_out): Linear(in_features=30, out_features=3, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (batchnorm1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm0): BatchNorm1d(30, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [393]:
def multi_acc(y_pred, y_test):
    y_pred_softmax = torch.log_softmax(y_pred, dim = 1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim = 1)    
    
    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)
    
    acc = torch.round(acc) * 100
    
    return acc

In [394]:
accuracy_stats = {
    'train': [],
    "val": []
}
loss_stats = {
    'train': [],
    "val": []
}

In [395]:
print("Begin training.")
for e in tqdm(range(1, EPOCHS+1)):
    
    # TRAINING
    train_epoch_loss = 0
    train_epoch_acc = 0
    model.train()
    for X_train_batch, y_train_batch in train_loader:
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        optimizer.zero_grad()
        
        y_train_pred = model(X_train_batch)
        
        train_loss = criterion(y_train_pred, y_train_batch)
        train_acc = multi_acc(y_train_pred, y_train_batch)
        
        train_loss.backward()
        optimizer.step()
        
        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()
        
        
    # VALIDATION    
    with torch.no_grad():
        
        val_epoch_loss = 0
        val_epoch_acc = 0
        
        model.eval()
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            
            y_val_pred = model(X_val_batch)
                        
            val_loss = criterion(y_val_pred, y_val_batch)
            val_acc = multi_acc(y_val_pred, y_val_batch)
            
            val_epoch_loss += val_loss.item()
            val_epoch_acc += val_acc.item()
    loss_stats['train'].append(train_epoch_loss/len(train_loader))
    loss_stats['val'].append(val_epoch_loss/len(val_loader))
    accuracy_stats['train'].append(train_epoch_acc/len(train_loader))
    accuracy_stats['val'].append(val_epoch_acc/len(val_loader))
                              
    
    print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | Val Loss: {val_epoch_loss/len(val_loader):.5f} | Train Acc: {train_epoch_acc/len(train_loader):.3f}| Val Acc: {val_epoch_acc/len(val_loader):.3f}')

Begin training.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=70.0), HTML(value='')))

Epoch 001: | Train Loss: 1.04000 | Val Loss: 0.94785 | Train Acc: 61.111| Val Acc: 71.429
Epoch 002: | Train Loss: 0.94702 | Val Loss: 0.87040 | Train Acc: 55.556| Val Acc: 71.429
Epoch 003: | Train Loss: 0.87899 | Val Loss: 0.82154 | Train Acc: 100.000| Val Acc: 71.429
Epoch 004: | Train Loss: 0.88414 | Val Loss: 0.81991 | Train Acc: 77.778| Val Acc: 100.000
Epoch 005: | Train Loss: 0.85153 | Val Loss: 0.79550 | Train Acc: 88.889| Val Acc: 85.714
Epoch 006: | Train Loss: 0.82707 | Val Loss: 0.82128 | Train Acc: 88.889| Val Acc: 71.429
Epoch 007: | Train Loss: 0.83552 | Val Loss: 0.78789 | Train Acc: 94.444| Val Acc: 100.000
Epoch 008: | Train Loss: 0.84112 | Val Loss: 0.77135 | Train Acc: 94.444| Val Acc: 85.714
Epoch 009: | Train Loss: 0.79953 | Val Loss: 0.76575 | Train Acc: 100.000| Val Acc: 85.714
Epoch 010: | Train Loss: 0.79725 | Val Loss: 0.77196 | Train Acc: 100.000| Val Acc: 100.000
Epoch 011: | Train Loss: 0.83835 | Val Loss: 0.80262 | Train Acc: 94.444| Val Acc: 85.714
Epoc

In [396]:
y_pred_list = []
with torch.no_grad():
    model.eval()
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_pred_softmax = torch.log_softmax(y_test_pred, dim = 1)
        _, y_pred_tags = torch.max(y_pred_softmax, dim = 1)
        y_pred_list.append(y_pred_tags.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [397]:
#balanced_accuracy_score(y_test, np.array(y_pred_list))

In [398]:
X_te.shape

(960, 869)

In [399]:
X_te = test.drop(['id', 'y'], axis=1).to_numpy()
y_te = test['y'].to_numpy()
X_te = thresh.transform(X_te)
X_te = selection.transform(X_te)
X_te = scale.fit_transform(X_te)

y_pred_list = []
y_test_pred = model(torch.Tensor(X_te))
y_pred_softmax = torch.log_softmax(y_test_pred, dim = 1)
_, y_pred_tags = torch.max(y_pred_softmax, dim = 1)
y_pred_list.append(y_pred_tags.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [400]:
balanced_accuracy_score(y_te, np.array(y_pred_list).reshape(960))

0.6315275354717081

In [None]:
# initial test
splits = split_df(train_data_df, 10)
BMACs = []

for i in range(10): # 10-fold cross-validation
    train, test = make_train_and_test(splits, i)
    X_train = train.drop(columns=['id','y']).to_numpy()
    y_train = train['y'].to_numpy()
    X_validate = test.drop(columns=['id','y']).to_numpy()
    y_validate = test['y'].to_numpy()
    
    clf = OneVsRestClassifier(xgb.XGBClassifier(n_jobs=-1, max_depth=4))
    
    clf.fit(X_train, y_train)
            
    y_predicted = clf.predict(X_validate)
    current_BMAC = balanced_accuracy_score(y_validate, y_predicted)
    BMACs.append(current_BMAC)
    print('{i}th iteration done'.format(i=i+1))

mean_BMAC = (1./len(BMACs)) * np.sum(BMACs)
print('--------------')
print('mean BMAC score: {m}'.format(m=mean_BMAC))

In [None]:
# initial test
splits = split_df(train_data_df, 10)
BMACs = []

for i in range(10): # 10-fold cross-validation
    train, test = make_train_and_test(splits, i)
    X_train = train.drop(columns=['id','y']).to_numpy()
    y_train = train['y'].to_numpy()
    X_validate = test.drop(columns=['id','y']).to_numpy()
    y_validate = test['y'].to_numpy()
    
    clf = SVC()
    
    clf.fit(X_train, y_train)
            
    y_predicted = clf.predict(X_validate)
    current_BMAC = balanced_accuracy_score(y_validate, y_predicted)
    BMACs.append(current_BMAC)
    print('{i}th iteration done'.format(i=i+1))
    print('----> BMAC score: {s}'.format(s=current_BMAC))

mean_BMAC = (1./len(BMACs)) * np.sum(BMACs)
print('--------------')
print('mean BMAC score: {m}'.format(m=mean_BMAC))

In [None]:
## test the same pipeline of task1 (feature selection, outlier detection etc.)

# --------------------------

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import Normalizer, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVR
from sklearn.feature_selection import SelectKBest, f_regression

from imblearn.pipeline import Pipeline
from imblearn import FunctionSampler

from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope

# FINAL PIPELINE:
### - 1) Feature selection
### - 2) Outlier detection
### - 3) Scale (Normalization)
### - 4) Parameter Tuning XGBoost

In [None]:
## 2) FEATURE SELECTION ##

from sklearn.feature_selection import SelectFromModel

splits = split_df(train_data_df, 5)
important_indices_dic = {}

for i in range(5): # 5-fold cross-validation
    train, test = make_train_and_test(splits, i)
    X_train = train.drop(columns=['id','y']).to_numpy()
    y_train = train['y'].to_numpy()
    X_validate = test.drop(columns=['id','y']).to_numpy()
    y_validate = test['y'].to_numpy()
        
    model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                                colsample_bynode=1, colsample_bytree=1, gamma=0,
                                importance_type='gain', learning_rate=0.1, max_delta_step=0,
                                max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
                                n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
                                reg_alpha=0, reg_lambda=1, seed=None,
                                silent=None, subsample=1, verbosity=1)
    model.fit(X_train, y_train)
    #feature_selector = SelectFromModel(model, prefit=True)
    #X_new = feature_selector.transform(X_train)
    #print(X_new.shape)
    
    # feature selection
    thresholds = np.sort(model.feature_importances_)[-300:]
    important_indices =  model.feature_importances_.argsort()[-300:]
    for indice in important_indices:
        if indice in important_indices_dic.keys():
            important_indices_dic[indice] += 1
        else:
            important_indices_dic[indice] = 1
            
    print('{i}th iteration done'.format(i=i+1))

In [None]:
'''
import yaml
    
important_indices_dic_str = {}
for item in important_indices_dic.items():
    important_indices_dic_str[str(item[0])] = item[1]
    
with open('feature_importances.yaml', 'w') as file:
    yaml.dump(important_indices_dic_str, file, default_flow_style=False'
'''

In [None]:
import yaml

with open('feature_importances.yaml') as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
    important_indices_dic_str = yaml.load(file, Loader=yaml.FullLoader)

In [None]:
number_of_features = [50,70,100,150,200]
#number_of_refined_features = [65,70,75,80,85,90]

BMAC_dic = {}
for number in number_of_features:
    most_frequent = sorted(important_indices_dic, key=important_indices_dic.get, reverse=True)[:number]
    drop_list = ['x{}'.format(index) for index in range(832) if index not in most_frequent]
    selected_features = train_data_df.drop(drop_list, axis=1)

    splits = split_df(selected_features, 5)
    BMACs = []

    for i in range(5): # 5-fold cross-validation
        train, test = make_train_and_test(splits, i)
        X_train = train.drop(columns=['id','y']).to_numpy()
        y_train = train['y'].to_numpy()
        X_validate = test.drop(columns=['id','y']).to_numpy()
        y_validate = test['y'].to_numpy()
        
        model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                                colsample_bynode=1, colsample_bytree=1, gamma=0,
                                importance_type='gain', learning_rate=0.1, max_delta_step=0,
                                max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
                                n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
                                reg_alpha=0, reg_lambda=1, seed=None,
                                silent=None, subsample=1, verbosity=1)
        eval_set = [(X_validate, y_validate)]
        #model.fit(X_train, y_train, early_stopping_rounds=30, eval_metric="rmse", eval_set=eval_set, verbose=False)
        model.fit(X_train, y_train)
            
        y_predicted = model.predict(X_validate)
        current_BMAC = balanced_accuracy_score(y_validate, y_predicted)
        BMACs.append(current_BMAC)
        print('{i}th iteration done'.format(i=i+1))
        print('----> BMAC score: {s}'.format(s=current_BMAC))

    mean_BMAC = (1./len(BMACs)) * np.sum(BMACs)
    print('------------------')
    print('Mean BMAC score for number {n} is: {bmac}'.format(n=number, bmac=mean_BMAC))
    BMAC_dic[number] = mean_BMAC
print('------------------')

In [None]:
# go with 100 features
most_frequent = sorted(important_indices_dic, key=important_indices_dic.get, reverse=True)[:100]
drop_list = ['x{}'.format(index) for index in range(832) if index not in most_frequent]
selected_features = train_data_df.drop(drop_list, axis=1)

### -----------------------------------------

In [None]:
## 3) OUTLIER DETECTION ##

outlier_detect_sf = selected_features.copy()

#specify the 12 metrics column names to be modelled
to_model_columns = outlier_detect_sf.drop(['id', 'y'], axis=1).columns#[1:13]
from sklearn.ensemble import IsolationForest
clf=IsolationForest(n_estimators=100, max_samples='auto', contamination=float(.1), \
                        max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
clf.fit(outlier_detect_sf[to_model_columns])
pred = clf.predict(outlier_detect_sf[to_model_columns])
outlier_detect_sf['anomaly']=pred
outliers=outlier_detect_sf.loc[outlier_detect_sf['anomaly']==-1]
outlier_index=list(outliers.index)
#print(outlier_index)
#Find the number of anomalies and normal points here points classified -1 are anomalous
print(outlier_detect_sf['anomaly'].value_counts())

In [None]:
# visualization
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
pca = PCA(n_components=3)  # Reduce to k=3 dimensions
scaler = StandardScaler()
#normalize the metrics
processed_train_df = train_data_df.copy()
X = scaler.fit_transform(processed_train_df[to_model_columns])
X_reduce = pca.fit_transform(X)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.set_zlabel("x_composite_3")
# Plot the compressed data points
ax.scatter(X_reduce[:, 0], X_reduce[:, 1], zs=X_reduce[:, 2], s=4, lw=1, label="inliers",c="green")
# Plot x's for the ground truth outliers
ax.scatter(X_reduce[outlier_index,0],X_reduce[outlier_index,1], X_reduce[outlier_index,2],
           lw=2, s=60, marker="x", c="red", label="outliers")
ax.legend()
plt.show()

In [None]:
# testing possible outlier ratios
ratios = [0.02,0.03,0.04,0.05,0.06,0.07,0.08]
BMAC_dic = {}
for ratio in ratios:
    outlier_detect_sf = selected_features.copy()

    #specify the 12 metrics column names to be modelled
    to_model_columns = outlier_detect_sf.drop(['id', 'y'], axis=1).columns#[1:13]
    from sklearn.ensemble import IsolationForest
    clf=IsolationForest(n_estimators=100, max_samples='auto', contamination=float(ratio), \
                            max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
    #clf=IsolationForest(n_estimators=100, max_samples='auto', contamination='auto',
    #                        max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
    clf.fit(outlier_detect_sf[to_model_columns])
    pred = clf.predict(outlier_detect_sf[to_model_columns])
    outlier_detect_sf['anomaly']=pred
    outliers=outlier_detect_sf.loc[outlier_detect_sf['anomaly']==-1]
    outlier_index=list(outliers.index)

    outlier_detect_sf_ = outlier_detect_sf[outlier_detect_sf['anomaly'] == 1]
    outlier_detect_sf_ = outlier_detect_sf_.drop(['anomaly'], axis = 1)
    splits = split_df(outlier_detect_sf_, 5)
    BMACs = []

    for i in range(5): # 5-fold cross-validation
        train, test = make_train_and_test(splits, i)
        X_train = train.drop(columns=['id','y']).to_numpy()
        y_train = train['y'].to_numpy()
        X_validate = test.drop(columns=['id','y']).to_numpy()
        y_validate = test['y'].to_numpy()
            
        model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                                colsample_bynode=1, colsample_bytree=1, gamma=0,
                                importance_type='gain', learning_rate=0.1, max_delta_step=0,
                                max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
                                n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
                                reg_alpha=0, reg_lambda=1, seed=None,
                                silent=None, subsample=1, verbosity=1)
    
        eval_set = [(X_validate, y_validate)]
        #model.fit(X_train, y_train, early_stopping_rounds=50, eval_metric="rmse", eval_set=eval_set, verbose=False)
        model.fit(X_train, y_train)
            
        y_predicted = model.predict(X_validate)
        current_BMAC = balanced_accuracy_score(y_validate, y_predicted)
        BMACs.append(current_BMAC)
        print('{i}th iteration done'.format(i=i+1))
        print('----> BMAC score: {s}'.format(s=current_BMAC))

    mean_BMAC = (1./len(BMACs)) * np.sum(BMACs)
    print('------------------')
    print('Mean BMAC score for number {n} is: {bmac}'.format(n=number, bmac=mean_BMAC))
    BMAC_dic[number] = mean_BMAC
print('------------------')