In [8]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from scipy.stats import uniform, norm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, f_classif, mutual_info_classif, VarianceThreshold
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import CategoricalNB, ComplementNB, MultinomialNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import svm
from XLB import *

import warnings
# Install a blanket "ignore" filter: every warning category from every module
# is suppressed from this point on.
# NOTE(review): this also hides any convergence or deprecation warnings the
# estimators used later in the notebook may emit - confirm that is intended.
warnings.filterwarnings('ignore')

In [11]:
def train_all(filename_train,filename_val,selector,model_selector,name="",oversample_seed=3454132):
    """Preprocess one train/validation pair and report results for three resampling variants.

    Steps, in order:
      1. Load both files with ``extract_data``.
      2. Fit a ``MinMaxScaler`` on the training features and apply it to both sets.
      3. Fit ``VarianceThreshold()`` followed by ``selector`` on the training set
         and push the validation set through the same two fitted transforms.
      4. Build two oversampled copies of the training set (``RandomOverSampler``
         and ``SMOTE``, both with ``sampling_strategy="not majority"``).
      5. For the untouched, randomly oversampled and SMOTE training sets, call
         ``print_res`` on the training data and then ``test_res`` on the
         validation data, passing ``model_selector`` to both.

    ``print_res`` and ``test_res`` are not defined in this notebook cell;
    presumably ``print_res`` fits ``model_selector`` and ``test_res`` scores the
    fitted object - TODO confirm against their definitions.

    NOTE(review): resampling happens before ``model_selector`` receives the data.
    If ``model_selector`` cross-validates internally (the notebook passes a
    ``RandomizedSearchCV``), resampled copies of one original sample can end up
    in both the fitting and held-out folds, which would inflate the reported CV
    score. Consider resampling inside an ``imblearn`` pipeline instead.

    Parameters
    ----------
    filename_train : passed unchanged to ``extract_data`` for the training set.
    filename_val : passed unchanged to ``extract_data`` for the validation set.
    selector : feature selector exposing ``fit_transform(X, y)`` and ``transform(X)``.
    model_selector : object forwarded to ``print_res`` / ``test_res``.
    name : label printed in the ``"<name> Results:"`` header line.
    oversample_seed : ``random_state`` given to both oversamplers. The default
        is the value that used to be hard-coded, so existing calls are unchanged.

    Returns
    -------
    None. Results are reported through ``print``, ``print_res`` and ``test_res``.
    """
    # extract data from files
    x_train, y_train = extract_data(filename_train)
    x_val, y_val = extract_data(filename_val)

    # scale data values (statistics come from the training set only)
    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_val = scaler.transform(x_val)

    # feature selection: both selectors are fitted on training data only and
    # then replayed, in the same order, on the validation data
    feat_sel = VarianceThreshold()
    x_train = feat_sel.fit_transform(x_train)
    x_train = selector.fit_transform(x_train,y_train)
    x_val = selector.transform(feat_sel.transform(x_val))

    # Build every training variant up front so that a resampling failure is
    # raised before any result is printed.
    variants = [("Vanilla", x_train, y_train)]
    for label, sampler_cls in (("Random Oversampling", RandomOverSampler), ("SMOTE", SMOTE)):
        oversampler = sampler_cls(sampling_strategy="not majority",random_state=oversample_seed)
        x_res, y_res = oversampler.fit_resample(x_train,y_train)
        variants.append((label, x_res, y_res))

    print("{} Results:".format(name))
    for label, x_fit, y_fit in variants:
        # The validation set is never resampled; only the training data differs.
        print_res(label,x_fit,y_fit,model_selector)
        test_res(label,x_val,y_val,model_selector)
    

In [None]:
# ---- Feature-selection candidates ------------------------------------------
num_features = 45
SELECTOR_SEED = 481516234  # shared random_state for the model-based selectors

# Univariate filters: each keeps the `num_features` best-scoring columns.
selectors = {
    "chi2": SelectKBest(chi2, k=num_features),
    "f_classif": SelectKBest(f_classif, k=num_features),
    "mutual_info_classif": SelectKBest(mutual_info_classif, k=num_features),
}

# Model-based filters: wrap each estimator in SelectFromModel, appended after
# the univariate filters so the iteration order below stays the same.
selector_estimators = {
    "FromModel DT": tree.DecisionTreeClassifier(random_state=SELECTOR_SEED),
    "RandForest": RandomForestClassifier(n_estimators=50, random_state=SELECTOR_SEED),
    "LogReg": LogisticRegression(random_state=SELECTOR_SEED),
    "LinearSVC": svm.LinearSVC(C=0.25, penalty="l1", dual=False, random_state=SELECTOR_SEED),
}
for selector_name, selector_estimator in selector_estimators.items():
    selectors[selector_name] = SelectFromModel(estimator=selector_estimator)

# ---- MLPClassifier, RandomizedSearch ----------------------------------------
random_search_iterations = 80
k_folds = 10
rand_seed = 3249807

parameters = {
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': [3e-4],
    # Powers of two 16, 32, 64, 128 (all below the former cap of 200).
    'batch_size': [2 ** exponent for exponent in range(4, 8)],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'max_iter': [750],
    'shuffle': [True, False],
    # scipy's uniform(loc, scale) draws from [loc, loc + scale], i.e. [0.2, 1.0].
    'momentum': uniform(loc=0.2, scale=0.8),
    'nesterovs_momentum': [True, False],
    'early_stopping': [True, False],
}

mlp = MLPClassifier(random_state=rand_seed)
random_search_mlp = RandomizedSearchCV(
    mlp,
    parameters,
    cv=k_folds,
    n_iter=random_search_iterations,
    random_state=rand_seed,
    n_jobs=-1,
)

# Run the full pipeline once per feature selector, reusing the same search object.
for selector_name, selector in selectors.items():
    train_all("FinalTrainingSet.csv", "Validation Set.csv", selector, random_search_mlp, selector_name)


chi2 Results:
Best Score: 0.3466
activation => relu
alpha => 0.0003
batch_size => 32
early_stopping => False
learning_rate => adaptive
max_iter => 750
momentum => 0.339628718776361
nesterovs_momentum => False
shuffle => False
solver => sgd
Splits: 10
Vanilla Train Accuracy: 38.6534%
Vanilla Validation Accuracy: 14.29%
Vanilla F1-score: 0.08
Best Score: 0.7893
activation => tanh
alpha => 0.0003
batch_size => 128
early_stopping => False
learning_rate => adaptive
max_iter => 750
momentum => 0.3554610967202623
nesterovs_momentum => False
shuffle => False
solver => lbfgs
Splits: 10
Random Oversampling Train Accuracy: 99.8667%
Random Oversampling Validation Accuracy: 28.57%
Random Oversampling F1-score: 0.25
Best Score: 0.7067
activation => relu
alpha => 0.0003
batch_size => 16
early_stopping => False
learning_rate => adaptive
max_iter => 750
momentum => 0.8511013917812551
nesterovs_momentum => False
shuffle => True
solver => lbfgs
Splits: 10
SMOTE Train Accuracy: 99.8667%
SMOTE Validation A