In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

data = pd.read_csv("data/38_feature.csv",index_col=0)
data_norm = pd.read_csv("data/38_feature_norm.csv",index_col=0)
data_std_scale = pd.read_csv("data/38_feature_std_scale.csv",index_col=0)

X = data.iloc[:,2:]
y2 = data['class2']
y4 = data['class4']
X_norm = data_norm.iloc[:,2:]
X_std_scale = data_std_scale.iloc[:,2:]

## 2-class

In [2]:
# define dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_norm, y2, stratify=y2 ,test_size=0.20, random_state=42)

# LR
from sklearn.linear_model import LogisticRegression
LR_clf =LogisticRegression(C=20,penalty='l2',solver='liblinear',random_state=1,max_iter=1000).fit(X_train, y_train)

# SVC
from sklearn.svm import SVC
SVC_clf =SVC(C=1.0,kernel='poly',probability=True).fit(X_train, y_train)

# GB
from sklearn.ensemble import GradientBoostingClassifier
GB_clf = GradientBoostingClassifier(n_estimators = 1000,learning_rate=0.01,subsample=0.5,max_depth=7).fit(X_train, y_train)

# RF
from sklearn.ensemble import RandomForestClassifier
RF_clf = RandomForestClassifier(max_features = 'sqrt',n_estimators=100).fit(X_train, y_train)

# softvoting ensemble

from sklearn.ensemble import VotingClassifier
eclf_2 = VotingClassifier(
    estimators=[\
    ('LR', LR_clf),
    ('SVC_clf', SVC_clf),
    ('RF_clf', RF_clf),
    ('GB_clf',GB_clf)], 
    voting='soft', 
    weights=[1, 1, 1, 1]).fit(X_train, y_train)



## 4-class

In [3]:
# define dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_norm, y4, stratify=y4 ,test_size=0.20, random_state=42)

# LR
from sklearn.linear_model import LogisticRegression
LR_clf =LogisticRegression(C=20,penalty='l2',solver='liblinear',random_state=1,max_iter=1000).fit(X_train, y_train)

# RF
from sklearn.ensemble import RandomForestClassifier
RF_clf = RandomForestClassifier(max_features = 'sqrt',n_estimators=1000)
RF_clf.fit(X_train, y_train)

# GB
from sklearn.ensemble import GradientBoostingClassifier
GB_clf = GradientBoostingClassifier(n_estimators = 1000,learning_rate=0.01,subsample=0.7,max_depth=7)
GB_clf.fit(X_train, y_train)

from sklearn.ensemble import VotingClassifier

eclf_4 = VotingClassifier(
    estimators=[
        ('RF_clf', RF_clf),
        ('GB_clf', GB_clf),
        ('LR_clf', LR_clf)],
    voting='hard').fit(X_train, y_train)


# test_hidden data

In [4]:
X_hidden = pd.read_csv("data/npf_test_hidden.csv")
X_hidden = X_hidden[X_hidden.columns.intersection(X.columns)]
X_hidden_norm = (X_hidden - X_hidden.min())/(X_hidden.max()-X_hidden.min())
X_hidden_norm

Unnamed: 0,CO2168.mean,CO2168.std,H2O168.mean,H2O168.std,NO168.std,NO336.std,NO42.std,NO504.mean,NO504.std,NO672.std,...,RPAR.mean,RPAR.std,SO2168.mean,SO2168.std,SWS.mean,SWS.std,T84.mean,T84.std,CS.mean,CS.std
0,0.278938,0.018998,0.183163,0.022930,0.009311,0.012234,0.085353,0.011402,0.017434,0.011019,...,0.018861,0.028178,0.022406,0.015538,0.963095,0.003594,0.426378,0.028074,0.025721,0.015359
1,0.504000,0.015784,0.022105,0.007552,0.193844,0.278596,0.125013,0.121995,0.281247,0.279674,...,0.190066,0.239194,0.194391,0.035191,0.941012,0.010179,0.129581,0.087790,0.222128,0.014814
2,0.257581,0.050786,0.193513,0.136942,0.020796,0.030065,0.009843,0.010364,0.025180,0.028995,...,0.159068,0.211310,0.035964,0.058267,0.976103,0.003496,0.559029,0.381819,0.089357,0.064752
3,0.375806,0.137629,0.264998,0.177378,0.009751,0.015562,0.008744,0.008264,0.018324,0.019838,...,0.149487,0.197698,0.009928,0.014079,0.944857,0.005313,0.615985,0.743413,0.143422,0.077947
4,0.357351,0.188850,0.614028,0.584217,0.026347,0.041739,0.013478,0.015932,0.039720,0.039329,...,0.060249,0.101572,0.066991,0.130748,0.765239,0.835360,0.709012,0.475913,0.589092,0.705933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,0.460974,0.373973,0.527525,0.348726,0.012121,0.018362,0.013141,0.012451,0.020573,0.019234,...,0.103321,0.182883,0.025239,0.016320,0.814373,0.253355,0.748788,0.440532,0.175363,0.120263
961,0.352716,0.003427,0.161315,0.010795,0.012184,0.014054,0.011086,0.015633,0.016119,0.020180,...,0.040901,0.039028,0.016575,0.023179,0.865881,0.121413,0.375970,0.031627,0.145090,0.012543
962,0.396262,0.101765,0.150565,0.042608,0.104519,0.165757,0.052669,0.037540,0.158977,0.158696,...,0.272364,0.324432,0.116948,0.098149,0.977993,0.004935,0.616752,0.565716,0.399598,0.133447
963,0.325693,0.048607,0.161042,0.052766,0.004566,0.003033,0.004284,0.004835,0.003868,0.006606,...,0.279905,0.294204,0.005643,0.010843,0.979402,0.008850,0.522074,0.565589,0.037580,0.032726


In [5]:
y_4_pred = eclf_4.predict(X_hidden_norm)
y_2_pred = eclf_2.predict(X_hidden_norm)
y_2_pred_prob = eclf_2.predict_proba(X_hidden_norm)

In [6]:
answer = pd.DataFrame({'class4':y_4_pred,'p':y_2_pred_prob[:,0]},)

from sklearn.model_selection import cross_val_score
score = cross_val_score(eclf_2,X_norm,y2,cv=10,n_jobs=-1,scoring='accuracy').mean()


In [8]:
answer = answer.T
answer.reset_index(inplace=True)
answer = answer.T
answer.columns = [score,""]
answer.to_csv('answer/answers.csv',index =False)