In [12]:
# utilities
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# classification models
#from sklearn.preprocessing import StandardScalar
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, IsolationForest
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.datasets import make_classification
from sklearn.utils import class_weight
from collections import Counter

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit

# neural network
#import keras.backend as Kb
#from keras.models import Sequential
#from keras.layers import Dense

#scaling methods
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer


# resampling methods modules
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE

from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek

# cross-validation
from sklearn.model_selection import train_test_split, KFold

# metrics
from sklearn.metrics import recall_score
from sklearn.metrics import balanced_accuracy_score

# feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV



In [13]:
# Load the task-2 CSV files and convert each DataFrame to a NumPy array.
# NOTE(review): the paths are absolute and machine-specific, so this cell only
# runs where the files exist at exactly this location.
train_x_raw = pd.read_csv('/Users/charlotteout/Documents/AML/task2/X_train.csv').values
train_y_raw = pd.read_csv('/Users/charlotteout/Documents/AML/task2/y_train.csv').values
test_x_raw = pd.read_csv("/Users/charlotteout/Documents/AML/task2/X_test.csv").values

# Column count of the first training row minus one; column 0 is dropped below
# (presumably a sample id -- confirm against the CSV header).
N_FEATURES = train_x_raw[0].size-1

# Keep columns 1..N_FEATURES, i.e. every column except column 0.
X_train_data = train_x_raw[:,1:N_FEATURES+1]
# Column 1 of the label file is used as the target vector.
y_train_data = train_y_raw[:,1]

# Same column selection for the test features (based on the training column count).
X_test_data = test_x_raw[:,1:N_FEATURES+1]

In [14]:
def Standardscaler(data):
    """Fit a ``StandardScaler`` on ``data`` and return the fitted scaler.

    ``data`` itself is not transformed; call ``.transform`` on the result.
    """
    # ``fit`` hands back the estimator itself, so this is the fitted scaler.
    return StandardScaler().fit(data)

def PowTrans(data):
    """Fit a standardising Yeo-Johnson ``PowerTransformer`` on ``data``.

    Returns the fitted transformer; ``data`` is not transformed here.
    """
    transformer = PowerTransformer(method='yeo-johnson', standardize=True)
    transformer.fit(data)
    return transformer

def Robustnorm(data):
    """Fit a ``RobustScaler`` on ``data`` and return the fitted scaler."""
    # ``fit`` returns the scaler instance, which is what callers receive.
    return RobustScaler().fit(data)

def Quanttrans(data):
    """Fit a 10-quantile ``QuantileTransformer`` (seed 0) on ``data``.

    Returns the fitted transformer; ``data`` is not transformed here.
    """
    quantile_params = dict(n_quantiles=10, random_state=0)
    return QuantileTransformer(**quantile_params).fit(data)
    



# svm classifier model
def svc(data_set_X, data_set_y):
    """Train and return an RBF-kernel ``SVC`` with balanced class weights.

    Hyper-parameters are fixed: ``C=100``, ``gamma=0.00001``, ``random_state=0``.
    """
    svm_params = dict(
        class_weight='balanced',
        gamma=0.00001,
        kernel='rbf',
        C=100,
        random_state=0,
    )
    # ``fit`` returns the fitted estimator itself.
    return SVC(**svm_params).fit(data_set_X, data_set_y)

# mlp classifier model
def mlp(data_set_X, data_set_y):
    """Train and return an ``MLPClassifier`` with default settings (seed 0)."""
    network = MLPClassifier(random_state=0)
    # ``fit`` returns the fitted estimator itself.
    return network.fit(data_set_X, data_set_y)



# ada resampling
def r_ada(data_set_X, data_set_Y):
    """Resample the data set with ADASYN.

    Parameters
    ----------
    data_set_X : array-like
        Feature matrix passed to ``ADASYN.fit_resample``.
    data_set_Y : array-like
        Target labels passed to ``ADASYN.fit_resample``.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as returned by ``fit_resample``.
    """
    # The seed was previously the undefined global ``SEED`` (NameError on call);
    # use 0 to match every other resampling helper in this notebook.
    ada = ADASYN(random_state=0)
    X_res, y_res = ada.fit_resample(data_set_X, data_set_Y)
    return X_res, y_res

# bsm resampling
def r_bsmote(data_set_X, data_set_Y):
    """Resample with ``BorderlineSMOTE`` (seed 0); returns ``(X_res, y_res)``."""
    sampler = BorderlineSMOTE(random_state=0)
    features_res, labels_res = sampler.fit_resample(data_set_X, data_set_Y)
    return features_res, labels_res

# smote resampling
def r_smote(data_set_X, data_set_Y):
    """Resample with plain ``SMOTE`` (seed 0); returns ``(X_res, y_res)``."""
    sampler = SMOTE(random_state=0)
    features_res, labels_res = sampler.fit_resample(data_set_X, data_set_Y)
    return features_res, labels_res


def SVM_smote(data_set_X, data_set_Y):
    """Resample with ``SVMSMOTE`` (seed 0); returns ``(X_res, y_res)``."""
    sampler = SVMSMOTE(random_state=0)
    features_res, labels_res = sampler.fit_resample(data_set_X, data_set_Y)
    return features_res, labels_res


def smote_enn(data_set_X, data_set_Y):
    """Resample with the combined ``SMOTEENN`` method (seed 0).

    Returns the resampled ``(X_res, y_res)`` pair.
    """
    # Local is named ``sampler`` so it does not shadow this function's own name.
    sampler = SMOTEENN(random_state=0)
    features_res, labels_res = sampler.fit_resample(data_set_X, data_set_Y)
    return features_res, labels_res


def smote_tomek(data_set_X, data_set_Y):
    """Resample with the combined ``SMOTETomek`` method (seed 0).

    Returns the resampled ``(X_res, y_res)`` pair.
    """
    # Local is named ``sampler`` so it does not shadow this function's own name.
    sampler = SMOTETomek(random_state=0)
    features_res, labels_res = sampler.fit_resample(data_set_X, data_set_Y)
    return features_res, labels_res




In [10]:
# Fit the standard scaler on the training features only, then apply the same
# fitted transform to both the training and the test features.
# NOTE(review): X_train_data / X_test_data are overwritten, so re-running this
# cell re-fits the scaler on data that has already been scaled.
scaler= Standardscaler(X_train_data)
X_train_data = scaler.transform(X_train_data)
X_test_data =scaler.transform(X_test_data)

In [5]:
# Log-spaced candidates: 13 values of C in [1e-2, 1e10] and 13 values of gamma
# in [1e-9, 1e3], giving a 13 x 13 grid.
C_range = np.logspace(-2,10,13)
gamma_range = np.logspace(-9,3,13)
param_grid = dict(gamma=gamma_range, C=C_range)
# Five stratified random splits, each holding out 20% of the samples.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# NOTE(review): no `scoring` is given and SVC() keeps its default class_weight,
# while svc() above trains with class_weight='balanced' and the later evaluation
# uses balanced accuracy -- confirm the selected parameters transfer.
grid = GridSearchCV(SVC(), param_grid, cv=cv)
grid.fit(X_train_data, y_train_data)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=0, test_size=0.2,
            train_size=None),
             error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
       1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10]),
                         'gamma': array([1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02,
       1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [6]:
# Show the best (C, gamma) combination found by the grid search and its score.
print(grid.best_params_,grid.best_score_)

{'C': 100.0, 'gamma': 1e-05} 0.815


In [15]:
def do_kfold(classifier, X, y, k, score1, score2):
    """Evaluate ``classifier`` with k-fold cross-validation.

    For every fold the features are standardised with a scaler fitted on the
    training part only, a model is trained via ``classifier(X_train, y_train)``
    and both metrics are computed on the held-out part. The running score
    lists are printed after each fold.

    Parameters
    ----------
    classifier : callable
        ``classifier(X, y)`` must return a fitted model exposing ``predict``.
    X, y : arrays indexable with integer index arrays
        Features and targets.
    k : int
        Number of folds.
    score1 : callable
        Called as ``score1(y_true, y_pred)``.
    score2 : callable
        Called as ``score2(y_true, y_pred, average='weighted')``.

    Returns
    -------
    tuple
        ``(mean(score1), std(score1), mean(score2), std(score2))`` over folds.
    """
    # Honour the requested fold count; it used to be hard-coded to 10, which
    # silently ignored ``k``.
    kf = KFold(n_splits=k)

    scores_eval = []
    scores_weighted = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit the scaler on the training fold only so that no statistics of
        # the held-out fold leak into training.
        scaler = Standardscaler(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        model = classifier(X_train, y_train)
        y_pred_test = model.predict(X_test)

        scores_eval.append(score1(y_test, y_pred_test))
        scores_weighted.append(score2(y_test, y_pred_test, average='weighted'))
        print(scores_eval)
        print(scores_weighted)

    return (
        np.mean(scores_eval),
        np.std(scores_eval),
        np.mean(scores_weighted),
        np.std(scores_weighted),
    )

In [16]:
# Cross-validate the fixed-parameter SVC: score1 is balanced accuracy and
# score2 is recall (do_kfold calls it with average='weighted').
mean1, std1, mean2, std2 = do_kfold(svc,X_train_data, y_train_data, 10, balanced_accuracy_score, recall_score)


# These are k-fold estimates, so it makes sense that they are lower
# (presumably than the grid-search score reported earlier -- confirm).
print('K-fold cross-validation for balanced accuracy score: mean % 5.5f, std % 5.5f' %(mean1, std1))
print('K-fold cross-validation for recall score: mean % 5.5f, std % 5.5f' %(mean2, std2))

[0.7050717852684145]
[0.7291666666666666]
[0.7050717852684145, 0.7112112723080374]
[0.7291666666666666, 0.7395833333333334]
[0.7050717852684145, 0.7112112723080374, 0.726713400041457]
[0.7291666666666666, 0.7395833333333334, 0.7458333333333333]
[0.7050717852684145, 0.7112112723080374, 0.726713400041457, 0.6943155651785408]
[0.7291666666666666, 0.7395833333333334, 0.7458333333333333, 0.7541666666666667]
[0.7050717852684145, 0.7112112723080374, 0.726713400041457, 0.6943155651785408, 0.6644697243288793]
[0.7291666666666666, 0.7395833333333334, 0.7458333333333333, 0.7541666666666667, 0.7104166666666667]
[0.7050717852684145, 0.7112112723080374, 0.726713400041457, 0.6943155651785408, 0.6644697243288793, 0.6926736043501149]
[0.7291666666666666, 0.7395833333333334, 0.7458333333333333, 0.7541666666666667, 0.7104166666666667, 0.7270833333333333]
[0.7050717852684145, 0.7112112723080374, 0.726713400041457, 0.6943155651785408, 0.6644697243288793, 0.6926736043501149, 0.6716126063952151]
[0.729166666

In [17]:

'''
Write predictions into csv file
'''

# function to write csv file
def csv_write(prediction, filename='predictionEIGHT.csv'):
    """Write predictions to a two-column CSV file with header ``id,y``.

    Row ``i`` holds the id ``float(i)`` (written as ``0.0``, ``1.0``, ...) and
    the ``i``-th predicted value.

    Parameters
    ----------
    prediction : array-like
        Predicted values; flattened to one dimension before writing, so lists
        and column vectors are accepted as well as 1-D arrays.
    filename : str, optional
        Output path. Defaults to ``'predictionEIGHT.csv'``, the name that was
        previously hard-coded.
    """
    values = np.asarray(prediction).ravel()

    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows.
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['id', 'y'])
        writer.writeheader()
        writer.writerows(
            {'id': float(index), 'y': value}
            for index, value in enumerate(values)
        )



In [18]:
# Fit the scaler on the full training set and apply it to both splits.
# NOTE(review): an earlier cell performs this same scaling on the same globals;
# when both cells run, the scaler here is fitted on already-scaled data.
scaler= Standardscaler(X_train_data)
X_train_data = scaler.transform(X_train_data)
X_test_data =scaler.transform(X_test_data)

# Train the fixed-parameter SVC on all training data and predict the test set.
model = svc(X_train_data, y_train_data)
y_pred_test = model.predict(X_test_data)

# Persist the test predictions as a CSV file.
csv_write(y_pred_test)