# Cup classification with pre-extracted features
* **Date** 11/28/2016
* **Author** [Haonan Chen](https://chaonan99.github.io/)
* **Student No.** 2013011449
* **Class** 32

## Highlight
* Multiple classifiers
* Selecting different numbers of features
* P-value based feature selection
* Model based feature selection
* PCA decomposition

In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import *
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC, LinearSVC

In [4]:
def read_pickle(file_path):
    """Load and return the object stored in the pickle file at ``file_path``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever ``pickle.load`` reconstructs from the file.

    Notes
    -----
    Errors raised by ``open`` or ``pickle.load`` propagate to the caller
    unchanged.
    """
    # SECURITY: unpickling can execute arbitrary code, so this must only be
    # used on files from a trusted source.
    with open(file_path, "rb") as fin:
        return pickle.load(fin)
# Load both pickled splits; each file is unpacked as a two-element
# (features, labels) pair. The "valid" file is what the rest of the notebook
# uses as its test set.
(X_train_raw, y_train), (X_test_raw, y_test) = (
    read_pickle(pickle_path)
    for pickle_path in ("train_forstu.pickle", "valid_forstu.pickle")
)

In [10]:
# Column labels for the result tables; the order must match `classifiers`.
classifier_names = ["Bayes", "LDA", "LSVM", "RBF SVM", "MLP", "DTree"]
# The randomized estimators share the seed already used for the MLP so that
# re-running the notebook reproduces the same accuracy tables.
classifiers = [
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    LinearSVC(C=0.025, random_state=1),
    SVC(gamma=0.001, C=100),
    MLPClassifier(alpha=1e-5, hidden_layer_sizes=256, random_state=1, max_iter=3000),
    ExtraTreesClassifier(random_state=1),  # shown as "DTree" in the tables
]
# Feature selectors compared in the experiments; one table row per entry.
# Commented-out entries are the alternative experiments (number of features,
# RFE, model-based selection) and can be re-enabled as needed.
selectors = [
#     number of feature experiment
    # Baseline that keeps every feature: k="all" does so for any feature count
    # instead of relying on the data having exactly 256 columns.
    SelectKBest(f_classif, k="all"),
#     SelectKBest(f_classif, k=8),
#     SelectKBest(f_classif, k=7),
#     SelectKBest(f_classif, k=6),
#     SelectKBest(f_classif, k=5),
#     SelectKBest(f_classif, k=4),
#     SelectKBest(f_classif, k=3),
#     SelectKBest(f_classif, k=2),
#     SelectKBest(f_classif, k=1),
#     p-value experiment
    # NOTE(review): chi2 is documented for non-negative features only --
    # confirm the pre-extracted features satisfy this.
    SelectKBest(chi2, k=100),
    SelectKBest(f_classif, k=100),
    # NOTE(review): mutual_info_classif has its own random_state and is left
    # unseeded here, so this row may still vary slightly between runs.
    SelectKBest(mutual_info_classif, k=100),
#     RFE(LogisticRegression(), 3),
#     RFE(LinearSVC(C=0.005), 3),
#     RFE(LinearSVC(C=0.005, penalty="l1", dual=False), 3),
#     RFE(ExtraTreesClassifier(), 3),
#     SelectFromModel(LogisticRegression()),
#     SelectFromModel(LinearSVC(C=0.005)),
#     SelectFromModel(LinearSVC(C=0.005, penalty="l1", dual=False)),
#     SelectFromModel(ExtraTreesClassifier()),
]

def do_prediction(clf, X_train, X_test, y_train, y_test):
    """Standardize the features, fit ``clf`` and score it on the test split.

    The scaler is fitted on ``X_train`` only and then applied to both splits,
    so the test data never influences the preprocessing.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so any previous fit of the same object is overwritten.
    X_train, X_test : array-like
        Feature matrices of the training and evaluation splits.
    y_train, y_test : array-like
        Labels matching ``X_train`` and ``X_test``.

    Returns
    -------
    float
        The value returned by ``accuracy_score`` for the predictions on
        ``X_test`` (a fraction, not a percentage).
    """
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    clf.fit(X_train_scaled, y_train)
    # accuracy_score expects (y_true, y_pred); keeping that order matters as
    # soon as the metric is swapped for a non-symmetric one.
    return accuracy_score(y_test, clf.predict(X_test_scaled))

def do_selection_and_prediction(sel, clf, X_train, X_test, y_train, y_test):
    """Fit the selector on the training split, then train and score ``clf``.

    ``sel`` is fitted on ``(X_train, y_train)`` only; both feature matrices are
    then passed through ``sel.transform`` before being handed to
    ``do_prediction``, whose return value is returned unchanged.
    """
    sel.fit(X_train, y_train)
    X_train_selected = sel.transform(X_train)
    X_test_selected = sel.transform(X_test)
    return do_prediction(clf, X_train_selected, X_test_selected, y_train, y_test)

In [11]:
print("Training accuracy")
# One row per selector, one column per classifier. The training split is
# passed as both the fitting data and the evaluation data, so each cell is
# the accuracy on the data the model was trained on.
train_res = pd.DataFrame(
    [
        [
            "{0:.2f}%".format(
                100 * do_selection_and_prediction(
                    sel, clf, X_train_raw, X_train_raw, y_train, y_train
                )
            )
            for clf in classifiers
        ]
        for sel in selectors
    ],
    columns=classifier_names,
)
train_res

Training accuracy


Unnamed: 0,Bayes,LDA,LSVM,RBF SVM,MLP,DTree
0,73.85%,89.67%,94.96%,99.79%,99.79%,99.82%
1,68.78%,78.75%,82.60%,96.22%,99.79%,99.82%
2,69.34%,78.61%,82.74%,96.29%,99.79%,99.82%
3,68.08%,76.86%,81.13%,96.95%,99.79%,99.82%


In [12]:
print("Test accuracy")
# One row per selector, one column per classifier. Selector, scaler and
# classifier are fitted on the training split and scored on the held-out
# split.
test_res = pd.DataFrame(
    [
        [
            "{0:.2f}%".format(
                100 * do_selection_and_prediction(
                    sel, clf, X_train_raw, X_test_raw, y_train, y_test
                )
            )
            for clf in classifiers
        ]
        for sel in selectors
    ],
    columns=classifier_names,
)
test_res

Test accuracy


Unnamed: 0,Bayes,LDA,LSVM,RBF SVM,MLP,DTree
0,58.78%,74.48%,76.46%,76.24%,81.55%,53.92%
1,52.60%,64.42%,68.07%,67.40%,72.15%,53.92%
2,55.47%,64.20%,67.40%,69.39%,70.39%,56.13%
3,52.82%,62.65%,68.62%,66.85%,74.70%,52.82%


In [14]:
# Project both splits onto 200 principal components learned from the training
# features only, then evaluate every classifier on the projected data.
pca = PCA(n_components=200)
pca.fit(X_train_raw)
X_train = pca.transform(X_train_raw)
X_test = pca.transform(X_test_raw)
# One table row per evaluation split: accuracy on the training data itself
# ("train") and on the held-out data ("test"); models are always fitted on
# the projected training split.
pca_res = pd.DataFrame(
    [
        [
            "{0:.2f}%".format(
                100 * do_prediction(clf, X_train, X_eval, y_train, y_eval)
            )
            for clf in classifiers
        ]
        for X_eval, y_eval in ((X_train, y_train), (X_test, y_test))
    ],
    index=["train", "test"],
    columns=classifier_names,
)
# pca_res.to_latex("tmp.tex")
pca_res

Unnamed: 0,Bayes,LDA,LSVM,RBF SVM,MLP,DTree
train,64.37%,87.68%,92.33%,99.79%,99.79%,99.82%
test,48.40%,72.93%,76.24%,72.93%,76.35%,41.10%


In [17]:
# Export the test-accuracy table as a LaTeX file. The output folder is created
# first so the export also works on a fresh checkout where "doc/" is missing.
os.makedirs("doc", exist_ok=True)
test_res.to_latex("doc/tmp.tex")