<a href="https://colab.research.google.com/github/dbssn/DMML2019_Team_Microsoft/blob/master/feature_selection_and_model_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier,AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import xgboost as xgb
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from keras.models import Sequential
from keras import layers
from keras import optimizers
import matplotlib.pyplot as plt
import warnings
from sklearn.linear_model import Lasso,Ridge
from sklearn.model_selection import GridSearchCV
from keras.wrappers import scikit_learn
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2,SelectKBest,mutual_info_classif,SelectFromModel

# Silence DeprecationWarnings raised from modules whose name matches 'sklearn*'.
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)

# Placeholders: fill in the CSV path and the name of the target column.
df=pd.read_csv("")
y=df[""]
x=df.drop([""],axis=1)

# Collects the rounded mean cross-validation score that each classifier helper
# below appends; it has to exist as a list before any helper is called.
out = []

#feature selection 1, univariate method  → chi2 or mutual_info_classif
x_new = SelectKBest(chi2, k=2).fit_transform(x, y)

#feature selection 2, select from model including svc,logit,decision tree (trial and error)
# NOTE: this rebinds x_new, replacing the result of feature selection 1 above;
# keep only the method you actually want to use.
clf = RandomForestClassifier(n_estimators=50)
clf = clf.fit(x, y)
model = SelectFromModel(clf, prefit=True)
x_new = model.transform(x)


# Shuffled 5-fold splitter shared by every cross-validation below.
# random_state=None means the folds are not seeded.
kf = KFold(n_splits=5, random_state=None, shuffle=True)

#classification algorithm

# I personally like to pick the three best classifiers found in experiments and combine them in the voting ensemble.
def vote(x,y):
    """Cross-validate a voting ensemble of KNN, logistic regression and SVC.

    Prints the mean cross-validation score over the folds of the global
    ``kf`` (rounded to 3 decimals) and appends the same value to the
    global ``out``.
    """
    members = [
        ('1', KNeighborsClassifier()),
        ('2', LogisticRegression()),
        ('3', SVC(probability=True)),
    ]
    # NOTE(review): no ``voting`` argument is passed, so the library default
    # applies; probability=True on the SVC is only needed for soft voting --
    # confirm which mode is intended.
    ensemble = VotingClassifier(estimators=members)
    mean_score = round(cross_val_score(ensemble, x, y, cv=kf).mean(), 3)
    print("vote:", mean_score)
    out.append(mean_score)
    
def gbc(x,y):
    """Cross-validate a default GradientBoostingClassifier on (x, y).

    Prints the mean score over the folds of the global ``kf`` (3 decimals)
    and appends it to the global ``out``.
    """
    fold_scores = cross_val_score(GradientBoostingClassifier(), x, y, cv=kf)
    mean_score = round(fold_scores.mean(), 3)
    print("gbc:", mean_score)
    out.append(mean_score)
    
def ada(x,y):
    """Cross-validate a default AdaBoostClassifier on (x, y).

    Prints the mean score over the folds of the global ``kf`` (3 decimals)
    and appends it to the global ``out``.
    """
    fold_scores = cross_val_score(AdaBoostClassifier(), x, y, cv=kf)
    mean_score = round(fold_scores.mean(), 3)
    print("ada:", mean_score)
    out.append(mean_score)
    
def qda(x,y):
    """Cross-validate a default QuadraticDiscriminantAnalysis model on (x, y).

    Prints the mean score over the folds of the global ``kf`` (3 decimals)
    and appends it to the global ``out``.
    """
    fold_scores = cross_val_score(QuadraticDiscriminantAnalysis(), x, y, cv=kf)
    mean_score = round(fold_scores.mean(), 3)
    print("qda:", mean_score)
    out.append(mean_score)
    
def mlp(x,y):
    """Cross-validate an MLPClassifier with one hidden layer of 200 units.

    Prints the mean score over the folds of the global ``kf`` (3 decimals)
    and appends it to the global ``out``.
    """
    network = MLPClassifier(hidden_layer_sizes=(200, ))
    fold_scores = cross_val_score(network, x, y, cv=kf)
    mean_score = round(fold_scores.mean(), 3)
    print("mlp:", mean_score)
    out.append(mean_score)
    
def bayes(x,y):
    """Cross-validate a Gaussian naive Bayes classifier on (x, y).

    Prints the mean score over the folds of the global ``kf`` (3 decimals)
    and appends it to the global ``out``.
    """
    fold_scores = cross_val_score(GaussianNB(), x, y, cv=kf)
    mean_score = round(fold_scores.mean(), 3)
    print("bayes:", mean_score)
    out.append(mean_score)
    
def knn(x,y):
    """Cross-validate a default KNeighborsClassifier on (x, y).

    Prints the mean score over the folds of the global ``kf`` (3 decimals)
    and appends it to the global ``out``.
    """
    fold_scores = cross_val_score(KNeighborsClassifier(), x, y, cv=kf)
    mean_score = round(fold_scores.mean(), 3)
    print("knn:", mean_score)
    out.append(mean_score)
    
def logistic(x,y):
    """Cross-validate a default LogisticRegression model on (x, y).

    Prints the mean score over the folds of the global ``kf`` (3 decimals)
    and appends it to the global ``out``.
    """
    fold_scores = cross_val_score(LogisticRegression(), x, y, cv=kf)
    mean_score = round(fold_scores.mean(), 3)
    print("logit:", mean_score)
    out.append(mean_score)
    
def forest(x,y):
    """Cross-validate a default RandomForestClassifier on (x, y).

    Prints the mean score over the folds of the global ``kf`` (3 decimals)
    and appends it to the global ``out``.
    """
    random_forest = RandomForestClassifier()
    fold_scores = cross_val_score(random_forest, x, y, cv=kf)
    mean_score = round(fold_scores.mean(), 3)
    print("forest:", mean_score)
    out.append(mean_score)
    
def xg(x,y):
    """Cross-validate a default XGBClassifier on (x, y).

    ``x`` is converted to a NumPy array before being handed to the
    cross-validation. Prints the mean score over the folds of the global
    ``kf`` (3 decimals) and appends it to the global ``out``.
    """
    features = np.array(x)
    booster = xgb.XGBClassifier()
    fold_scores = cross_val_score(booster, features, y, cv=kf)
    mean_score = round(fold_scores.mean(), 3)
    print("xgb:", mean_score)
    out.append(mean_score)
    
def tr(x,y):
    """Cross-validate a default DecisionTreeClassifier on (x, y).

    Prints the mean score over the folds of the global ``kf`` (3 decimals)
    and appends it to the global ``out``.
    """
    decision_tree = tree.DecisionTreeClassifier()
    fold_scores = cross_val_score(decision_tree, x, y, cv=kf)
    mean_score = round(fold_scores.mean(), 3)
    print("DecisionTree:", mean_score)
    out.append(mean_score)
    
def sv(x,y):
    """Cross-validate an SVC with ``gamma='auto'`` on (x, y).

    Prints the mean score over the folds of the global ``kf`` (3 decimals)
    and appends it to the global ``out``.
    """
    svc_model = SVC(gamma='auto')
    fold_scores = cross_val_score(svc_model, x, y, cv=kf)
    mean_score = round(fold_scores.mean(), 3)
    print("svc:", mean_score)
    out.append(mean_score)
    
def pipe(x,y):
    """Run the classifier evaluations on (x, y) one after another.

    Each helper prints its own result and appends it to the global ``out``,
    so after this call ``out`` holds the scores in the order listed here.
    ``qda`` and ``mlp`` are not part of this run.
    """
    evaluations = (
        bayes,
        tr,
        knn,
        xg,
        forest,
        gbc,
        sv,
        vote,
        ada,
        logistic,
    )
    for evaluate in evaluations:
        evaluate(x, y)

# Plot the training history of the neural network
def plot(history):
    """Plot training vs. validation accuracy and loss, one point per epoch.

    Draws a single figure with two side-by-side subplots: accuracy on the
    left, loss on the right.

    Args:
        history: object whose ``history`` attribute is a dict of per-epoch
            value lists (such as the return value of Keras ``model.fit``).
            It must contain 'loss' and 'val_loss', plus the accuracy under
            either 'acc'/'val_acc' or 'accuracy'/'val_accuracy'.

    Raises:
        KeyError: if one of the required series is missing.
    """
    logs = history.history
    # Keras releases differ in the key used for the accuracy metric:
    # older ones log 'acc', newer ones log the name given to compile()
    # ('accuracy'). Accept whichever is present.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'
    acc = logs[acc_key]
    val_acc = logs['val_' + acc_key]
    loss = logs['loss']
    val_loss = logs['val_loss']
    epochs = range(1, len(acc) + 1)  # 1-based epoch numbers for the x axis

    plt.style.use('ggplot')
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(epochs, acc, 'b', label='Training acc')
    plt.plot(epochs, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(epochs, loss, 'b', label='Training loss')
    plt.plot(epochs, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    
# Neural network using Keras
# Hold out 20% of the rows; the held-out part is used both as validation data
# during fitting and for the final "Testing Accuracy" figure.
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2)
model = Sequential()
a=optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999,amsgrad=False)
# Small feed-forward net: 6 -> 3 -> 1 units, with a sigmoid output paired
# with binary cross-entropy (NOTE(review): presumes y is a 0/1 label -- confirm).
model.add(layers.Dense(6, activation='relu'))
model.add(layers.Dense(3, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=a, metrics=['accuracy'])
# Stored as `history`, not `out`: the classifier helpers append their scores
# to the global `out`, so rebinding that name here would break them.
history = model.fit(xtrain,ytrain,epochs=30,verbose=True,validation_data=(xtest, ytest),batch_size=5)
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()
loss, accuracy = model.evaluate(xtrain, ytrain, verbose=False)
# Format the raw value: rounding to 2 decimals first made the third printed digit always 0.
print("Training Accuracy:  {:.3f}".format(accuracy))
loss, accuracy = model.evaluate(xtest, ytest, verbose=False)
print("Testing Accuracy:  {:.3f}".format(accuracy))
plot(history)

# Grid search for hyperparameter tuning; the entries of param_grid depend on the chosen model
# Estimator to tune and the hyperparameter grid to search over.
# The grid is an empty placeholder: fill it with {parameter_name: [values, ...]}.
model = LogisticRegression()
param_grid = {}

# Search scored by accuracy, using the shared splitter `kf` for the folds.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kf,
    scoring="accuracy",
)
grid_result = grid.fit(x, y)

# Report the winning parameter combination and its score.
print(grid_result.best_params_)
print(grid_result.best_score_)