In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix
from itertools import product

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from collections import defaultdict

# Binary Classification

In [2]:
# load processed data, no transformation included
data = pd.read_csv("binary_data_processed.csv")

In [3]:
data

Unnamed: 0,subject,activity,F1,F2,F3,F4,F5,F6,F7,F8,...,F552,F553,F554,F555,F556,F557,F558,F559,F560,F561
0,1,0,0.043580,-0.005970,-0.035054,-0.995381,-0.988366,-0.937382,-0.995007,-0.988816,...,-0.012236,-0.314848,-0.713308,-0.112754,0.030400,-0.464761,-0.018446,-0.841559,0.179913,-0.051718
1,1,0,0.039480,-0.002131,-0.029067,-0.998348,-0.982945,-0.971273,-0.998702,-0.983315,...,0.202804,-0.603199,-0.860677,0.053477,-0.007435,-0.732626,0.703511,-0.845092,0.180261,-0.047436
2,1,0,0.039978,-0.005153,-0.022651,-0.995482,-0.977314,-0.984760,-0.996415,-0.975835,...,0.440079,-0.404427,-0.761847,-0.118559,0.177899,0.100699,0.808529,-0.849230,0.180610,-0.042271
3,1,0,0.039785,-0.011809,-0.028916,-0.996194,-0.988569,-0.993256,-0.996994,-0.988526,...,0.430891,-0.138373,-0.491604,-0.036788,-0.012892,0.640011,-0.485366,-0.848947,0.181907,-0.040826
4,1,0,0.038758,-0.002289,-0.023863,-0.998241,-0.986774,-0.993115,-0.998216,-0.986479,...,0.137735,-0.366214,-0.702490,0.123320,0.122542,0.693578,-0.615971,-0.848164,0.185124,-0.037080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7762,30,1,0.048048,-0.042445,-0.065884,-0.195448,-0.278326,-0.219954,-0.282233,-0.305861,...,-0.008381,-0.596760,-0.879026,-0.190437,0.829718,0.206972,-0.425619,-0.792292,0.238580,0.056020
7763,30,1,0.037639,0.006430,-0.044345,-0.235372,-0.302680,-0.232843,-0.322483,-0.354464,...,0.209452,-0.404418,-0.684496,0.064907,0.875679,-0.879033,0.400219,-0.772288,0.252653,0.056252
7764,30,1,0.037451,-0.002724,0.021009,-0.218281,-0.378082,-0.076950,-0.304446,-0.400661,...,0.237003,0.000207,-0.317314,0.052806,-0.266724,0.864404,0.701169,-0.779566,0.249121,0.047071
7765,30,1,0.044011,-0.004536,-0.051242,-0.219202,-0.383350,-0.081035,-0.310419,-0.380233,...,0.069366,0.037919,-0.356579,-0.101360,0.700740,0.936674,-0.589479,-0.785603,0.246409,0.031700


In [4]:
X_train = data.drop(columns=['subject', 'activity'])
Y_train = data['activity']

In [5]:

X_train_np = X_train.to_numpy()
Y_train_np = Y_train.to_numpy()
n_obs = X_train_np.shape[0]
total_idx = np.array(list(range(n_obs)))

In [6]:
def Model_Selection(X_train, Y_train,X_valid,Y_valid,pca=False,n_comp_min=10,  n_comp_max=150, step = 10,verbose = True):
    history = defaultdict(list)

    #print(X_train.shape)
    #print(Y_train[1:5])
    #print(X_valid.shape)
    #print(Y_valid[1:5])
    if pca:
        for i in range(n_comp_min,n_comp_max+1,step):
            model_svm = svm.SVC()
            model_log = LogisticRegression()
            model_forest = RandomForestClassifier(n_estimators=700)
            model_lda = LinearDiscriminantAnalysis()
            model_nb = GaussianNB()
            pca_model = PCA(n_components=i)
            pca_model.fit(X_train)
            X_train_pca = pca_model.transform(X_train)
            pca_model_valid = PCA(n_components=i)
            pca_model_valid.fit(X_valid)
            X_valid_pca = pca_model_valid.transform(X_valid)
            if verbose:
                print("now the n_pc is:" + str(i) )
            model_svm.fit(X_train_pca, Y_train)
            model_log.fit(X_train_pca, Y_train)
            model_forest .fit(X_train_pca, Y_train)
            model_lda.fit(X_train_pca, Y_train)
            model_nb.fit(X_train_pca, Y_train)
            pred_svm = np.mean(model_svm.predict(X_valid_pca)==Y_valid)
            pred_log = np.mean(model_log.predict(X_valid_pca)==Y_valid)
            pred_forest = np.mean(model_forest.predict(X_valid_pca)==Y_valid)
            pred_lda =  np.mean(model_lda.predict(X_valid_pca)==Y_valid)
            pred_nb = np.mean(model_nb.predict(X_valid_pca)==Y_valid)
            history["svm"].append(pred_svm)
            history["log"].append(pred_log)
            history["forest"].append(pred_forest)
            history["lda"].append(pred_lda)
            history["nb"].append(pred_nb)
    else:
            model_svm = svm.SVC()
            model_log = LogisticRegression()
            model_forest = RandomForestClassifier(n_estimators=700)
            model_lda = LinearDiscriminantAnalysis()
            model_nb = GaussianNB()

            model_svm.fit(X_train, Y_train)
            model_log.fit(X_train, Y_train)
            model_forest .fit(X_train, Y_train)
            model_lda.fit(X_train, Y_train)
            model_nb.fit(X_train, Y_train)
            #print(model_log)
            pred_svm = np.mean(model_svm.predict(X_valid)==Y_valid)
            pred_log = np.mean(model_log.predict(X_valid)==Y_valid)
            pred_forest = np.mean(model_forest.predict(X_valid)==Y_valid)
            pred_lda =  np.mean(model_lda.predict(X_valid)==Y_valid)
            pred_nb = np.mean(model_nb.predict(X_valid)==Y_valid)
            history["svm"].append(pred_svm)
            history["log"].append(pred_log)
            history["forest"].append(pred_forest)
            history["lda"].append(pred_lda)
            history["nb"].append(pred_nb)
        
    return history

In [7]:
def extarct_results(model,data):
    data = data[model]
    mean_list = []
    std_list = []
    for mean, std in data:
        mean_list.append(mean)
        std_list.append(std)
    return mean_list, std_list

In [8]:
# 5 holdouts test
results = {}
for i in range(5):
    np.random.seed(123+i)
    idx_train = np.random.choice(total_idx,int(n_obs*0.2),replace = False)
    idx_valid = np.setdiff1d(total_idx,idx_train)
    X_train_sub = X_train_np[idx_train]
    X_valid_sub = X_train_np[idx_valid]
    Y_train_sub = Y_train_np [idx_train]
    Y_valid_sub = Y_train_np [idx_valid]
    results[i] = Model_Selection(X_train_sub ,Y_train_sub,X_valid_sub,Y_valid_sub ,n_comp_min=10,
                            n_comp_max = 50, step = 10,pca = True )

now the n_pc is:10
now the n_pc is:20
now the n_pc is:30
now the n_pc is:40
now the n_pc is:50
now the n_pc is:10
now the n_pc is:20
now the n_pc is:30
now the n_pc is:40
now the n_pc is:50
now the n_pc is:10
now the n_pc is:20
now the n_pc is:30
now the n_pc is:40
now the n_pc is:50
now the n_pc is:10
now the n_pc is:20
now the n_pc is:30
now the n_pc is:40
now the n_pc is:50
now the n_pc is:10
now the n_pc is:20
now the n_pc is:30
now the n_pc is:40
now the n_pc is:50


In [9]:
results_pca = results

In [10]:
# 5 holdouts test, no PCA
results = {}
for i in range(5):
    np.random.seed(123+i)
    idx_train = np.random.choice(total_idx,int(n_obs*0.2),replace = False)
    idx_valid = np.setdiff1d(total_idx,idx_train)
    X_train_sub = X_train_np[idx_train]
    X_valid_sub = X_train_np[idx_valid]
    Y_train_sub = Y_train_np [idx_train]
    Y_valid_sub = Y_train_np [idx_valid]
    results[i] = Model_Selection(X_train_sub ,Y_train_sub,X_valid_sub,Y_valid_sub ,n_comp_min=10,
                            n_comp_max = 50, step = 10,pca = False )
results_raw = results

In [11]:
#pc_list = [10,20,30,40,50]
model_list = ["svm","log","forest","lda","nb"]
summary_dict = defaultdict(list)
for model in model_list:
    for i in range(5):
        #current_pc = pc_list[i]
        tmp = [results_pca[0][model][i],results_pca[1][model][i],results_pca[2][model][i],results_pca[3][model][i], 
                         results_pca[4][model][i]]
        mean_acc = np.mean(tmp)
        sd_acc = np.std(tmp)
        summary_dict[model].append((mean_acc,sd_acc))
        
        
    

In [12]:
for model in model_list:
    tmp = [results_raw[0][model][0],results_raw[1][model][0],results_raw[2][model][0],results_raw[3][model][0], 
                         results_raw[4][model][0]]
    mean_acc = np.mean(tmp)
    sd_acc = np.std(tmp)
    summary_dict[model].append((mean_acc,sd_acc))

In [13]:
svm_mean,svm_std = extarct_results("svm",summary_dict)
log_mean,log_std = extarct_results("log",summary_dict)
forest_mean,forest_std = extarct_results("forest",summary_dict)
lda_mean,lda_std = extarct_results("lda",summary_dict)
nb_mean,nb_std = extarct_results("nb",summary_dict)

In [14]:
summary_binary = pd.DataFrame({"PCs":[10,20,30,40,50,"no_pc"],"svm_mean":svm_mean, "svm_std":svm_std,
                                "log_mean":log_mean,"log_std":log_std,"forest_mean":forest_mean,"forest_std":forest_std,
                                "lda_mean":lda_mean, "lda_std":lda_std,"nb_mean":nb_mean, "nb_std":nb_std})

In [15]:
summary_binary.max(axis=0)

  """Entry point for launching an IPython kernel.


svm_mean       0.998487
svm_std        0.011832
log_mean       0.999067
log_std        0.006747
forest_mean    0.996846
forest_std     0.008726
lda_mean       0.998648
lda_std        0.010322
nb_mean        0.986804
nb_std         0.001926
dtype: float64

In [16]:
summary_binary

Unnamed: 0,PCs,svm_mean,svm_std,log_mean,log_std,forest_mean,forest_std,lda_mean,lda_std,nb_mean,nb_std
0,10,0.983167,0.010856,0.98085,0.006747,0.986707,0.008726,0.977213,0.009335,0.974348,0.00107
1,20,0.987126,0.010855,0.98793,0.00475,0.987351,0.007893,0.986064,0.010322,0.975861,0.001926
2,30,0.986386,0.011593,0.986772,0.005373,0.987255,0.007802,0.985581,0.010039,0.971999,0.000755
3,40,0.986386,0.011832,0.986643,0.005633,0.987287,0.00691,0.985066,0.009692,0.972288,0.00177
4,50,0.986482,0.011549,0.987029,0.005713,0.98571,0.008513,0.985163,0.010079,0.971194,0.001478
5,no_pc,0.998487,0.000415,0.999067,0.000503,0.996846,0.001398,0.998648,0.000745,0.986804,0.000455


logestic regression without pca seems to have the best performance

In [47]:
summary_binary.to_csv("summary_binary.csv",index=False)

**we choose logestic regression without pca as our final model**

In [19]:
test_data = pd.read_csv("test_data.csv")
X_test = test_data.drop(columns=['subject']).to_numpy()

In [21]:
model_log = LogisticRegression()
model_log.fit(X_train_np, Y_train_np)

LogisticRegression()

In [22]:
prediction_bin = pd.DataFrame(model_log.predict(X_test))

In [25]:
prediction_bin.to_csv("binary_fine.txt",index=False,header = False)

# Multiclass Task

In [26]:
training_mul_data = pd.read_csv("multi_processed.csv")

In [29]:
X_train = training_mul_data.drop(columns=['subject', 'activity'])
Y_train = training_mul_data['activity']
X_train_np = X_train.to_numpy()
Y_train_np = Y_train.to_numpy()

In [30]:
test_data = pd.read_csv("test_data.csv")
X_test = test_data.drop(columns=['subject']).to_numpy()

In [31]:
# 5 holdouts test
results = {}
for i in range(5):
    np.random.seed(123+i)
    idx_train = np.random.choice(total_idx,int(n_obs*0.2),replace = False)
    idx_valid = np.setdiff1d(total_idx,idx_train)
    X_train_sub = X_train_np[idx_train]
    X_valid_sub = X_train_np[idx_valid]
    Y_train_sub = Y_train_np [idx_train]
    Y_valid_sub = Y_train_np [idx_valid]
    results[i] = Model_Selection(X_train_sub ,Y_train_sub,X_valid_sub,Y_valid_sub ,n_comp_min=10,
                            n_comp_max = 50, step = 10,pca = True )

now the n_pc is:10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:20


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:40


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:50


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:20


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:40


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:50


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:20


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:40


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:50


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:20


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:40


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:50


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:20


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:40


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


now the n_pc is:50


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [32]:
results_pca = results

In [33]:
# 5 holdouts test, no PCA
results = {}
for i in range(5):
    np.random.seed(123+i)
    idx_train = np.random.choice(total_idx,int(n_obs*0.2),replace = False)
    idx_valid = np.setdiff1d(total_idx,idx_train)
    X_train_sub = X_train_np[idx_train]
    X_valid_sub = X_train_np[idx_valid]
    Y_train_sub = Y_train_np [idx_train]
    Y_valid_sub = Y_train_np [idx_valid]
    results[i] = Model_Selection(X_train_sub ,Y_train_sub,X_valid_sub,Y_valid_sub ,n_comp_min=10,
                            n_comp_max = 50, step = 10,pca = False )
results_raw = results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [34]:
model_list = ["svm","log","forest","lda","nb"]
summary_dict = defaultdict(list)
for model in model_list:
    for i in range(5):
        #current_pc = pc_list[i]
        tmp = [results_pca[0][model][i],results_pca[1][model][i],results_pca[2][model][i],results_pca[3][model][i], 
                         results_pca[4][model][i]]
        mean_acc = np.mean(tmp)
        sd_acc = np.std(tmp)
        summary_dict[model].append((mean_acc,sd_acc))
for model in model_list:
    tmp = [results_raw[0][model][0],results_raw[1][model][0],results_raw[2][model][0],results_raw[3][model][0], 
                         results_raw[4][model][0]]
    mean_acc = np.mean(tmp)
    sd_acc = np.std(tmp)
    summary_dict[model].append((mean_acc,sd_acc))
svm_mean,svm_std = extarct_results("svm",summary_dict)
log_mean,log_std = extarct_results("log",summary_dict)
forest_mean,forest_std = extarct_results("forest",summary_dict)
lda_mean,lda_std = extarct_results("lda",summary_dict)
nb_mean,nb_std = extarct_results("nb",summary_dict)

In [35]:
summary_multi = pd.DataFrame({"PCs":[10,20,30,40,50,"no_pc"],"svm_mean":svm_mean, "svm_std":svm_std,
                                "log_mean":log_mean,"log_std":log_std,"forest_mean":forest_mean,"forest_std":forest_std,
                                "lda_mean":lda_mean, "lda_std":lda_std,"nb_mean":nb_mean, "nb_std":nb_std})

In [36]:
summary_multi

Unnamed: 0,PCs,svm_mean,svm_std,log_mean,log_std,forest_mean,forest_std,lda_mean,lda_std,nb_mean,nb_std
0,10,0.705439,0.048965,0.670518,0.064091,0.717509,0.033424,0.693981,0.024422,0.694689,0.027052
1,20,0.713808,0.054227,0.656646,0.071138,0.731413,0.028689,0.689218,0.035064,0.715481,0.029434
2,30,0.710492,0.046422,0.650241,0.057469,0.730769,0.027155,0.678436,0.049893,0.712037,0.033829
3,40,0.714387,0.039527,0.660251,0.038832,0.730415,0.028429,0.678436,0.047094,0.709527,0.032805
4,50,0.71281,0.043077,0.658159,0.038394,0.729063,0.029857,0.675443,0.050947,0.707757,0.032978
5,no_pc,0.93505,0.004313,0.969102,0.002376,0.958964,0.003465,0.972256,0.001286,0.704892,0.084601


In [42]:
summary_multi.to_csv("summary_multi.csv",index=False)

**lda without pca has the best performance**

In [38]:
model_lda = LinearDiscriminantAnalysis()
model_lda .fit(X_train_np, Y_train_np)
multi_pred = model_lda.predict(X_test)
prediction_multi = pd.DataFrame(multi_pred)

In [None]:
prediction_multi.to_csv("multiclass_fine.txt",index=False,header = False)