In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import make_scorer
from sklearn.impute import SimpleImputer

In [None]:
def fit_model_and_pred(X_train, y_train, X_val, y_val, X_test):
    """Fit a gradient-boosting classifier, score it, and predict on ``X_test``.

    Parameters
    ----------
    X_train, y_train : features and labels the classifier is fitted on.
    X_val, y_val : held-out features and labels used for the returned score.
    X_test : features the fitted classifier predicts on.

    Returns
    -------
    (score, y_pred) : the micro-averaged F1 score on the validation data and
        the classifier's predictions for ``X_test``.
    """
    # Previous configuration (was kept here as commented-out code):
    # n_estimators=250, max_depth=5, learning_rate=0.1, max_features=60,
    # random_state=0.
    booster_params = dict(
        learning_rate=0.05,
        n_estimators=500,
        max_depth=7,
        min_samples_split=60,
        min_samples_leaf=9,
        subsample=1,
        max_features=50,
        random_state=0,  # fixed seed for the classifier itself
    )
    model = GradientBoostingClassifier(**booster_params)
    model.fit(X_train, y_train)

    val_score = f1_score(y_val, model.predict(X_val), average='micro')
    test_pred = model.predict(X_test)
    return val_score, test_pred

def train_k_fold(X, y, fold_num=10):
    """Report the cross-validated score of ``fit_model_and_pred``.

    The data is split into ``fold_num`` consecutive (un-shuffled) folds; for
    each split a model is fitted on the training part and scored on the
    held-out part. Unlike the ``*_pred`` variants, the training part is used
    as-is (no call to ``expand_dataset``).

    Parameters
    ----------
    X : array indexable by integer index arrays (``X[train_index]``).
    y : labels, indexable the same way as ``X``.
    fold_num : number of folds.

    Returns
    -------
    float : mean of the per-fold validation scores (also printed).
    """
    kf = KFold(n_splits=fold_num, shuffle=False)
    test_score = 0.0

    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # The validation fold is passed as the "test" input as well; the
        # resulting predictions are not needed here and are discarded.
        score, _ = fit_model_and_pred(X_train, y_train, X_val, y_val, X_val)

        # NOTE: the message says "r1" although the metric is micro-F1; the
        # text is left unchanged so previously recorded outputs still match.
        print('The obtained validation r1 score is : ',score)
        test_score += score

    mean_score = test_score/fold_num
    print("Validation score: %f"%(mean_score))
    return mean_score

def train_k_fold_pred(X, y, X_test, fold_num=10, num_class=4, random_state=None):
    """Train one model per shuffled K-fold split and majority-vote on ``X_test``.

    For every split the training part is oversampled with ``expand_dataset``
    (the validation part is left untouched), a model is fitted, scored on the
    validation part and used to predict ``X_test``. The final prediction for
    each test sample is the class predicted by most of the fold models; ties
    go to the lowest class index.

    Parameters
    ----------
    X : array indexable by integer index arrays (``X[train_index]``).
    y : labels, indexable the same way; classes are expected to be the
        integers ``0 .. num_class - 1``.
    X_test : features to predict.
    fold_num : number of folds (and therefore of voting models).
    num_class : number of classes taking part in the vote and in the
        oversampling step.
    random_state : forwarded to ``KFold`` to make the shuffling reproducible;
        ``None`` keeps the previous non-deterministic behaviour.

    Returns
    -------
    (mean_score, y_test_predict) : mean validation score over all folds and
        an integer array with the voted class per test sample.
    """
    kf = KFold(n_splits=fold_num, shuffle=True, random_state=random_state)
    test_score = 0.0
    y_pred_list = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        # Only the training split is oversampled.
        X_train, y_train = expand_dataset(X_train, y_train, num_class)

        score, y_pred = fit_model_and_pred(X_train, y_train, X_val, y_val, X_test)
        y_pred_list.append(y_pred)
        # NOTE: the message says "r1" although the metric is micro-F1; the
        # text is left unchanged so previously recorded outputs still match.
        print('The obtained validation r1 score is : ',score)
        test_score += score
    print("Validation score: %f"%(test_score/fold_num))

    # Majority vote: rows are fold models, columns are test samples.
    y_pred_list = np.array(y_pred_list)
    votes = np.stack([(y_pred_list == label).sum(axis=0) for label in range(num_class)])
    # argmax returns the first maximum, i.e. the lowest class index on ties.
    y_test_predict = votes.argmax(axis=0)
    return test_score/fold_num, y_test_predict

def train_k_fold_pred_trick(X, y, X_test, fold_num=10, score_threshold=0.81,
                            num_class=4, random_state=None):
    """K-fold ensemble in which only well-scoring fold models vote on ``X_test``.

    Works like ``train_k_fold_pred``, except that a fold model's test
    predictions take part in the majority vote only when its validation score
    is strictly greater than ``score_threshold``. The returned mean score
    still covers every fold.

    If no fold exceeds the threshold, all fold models vote instead (the
    previous implementation raised ``IndexError`` in that situation).

    Parameters
    ----------
    X : array indexable by integer index arrays (``X[train_index]``).
    y : labels, indexable the same way; classes are expected to be the
        integers ``0 .. num_class - 1``.
    X_test : features to predict.
    fold_num : number of folds.
    score_threshold : minimum (exclusive) validation score a fold model needs
        in order to vote.
    num_class : number of classes taking part in the vote and in the
        oversampling step.
    random_state : forwarded to ``KFold`` to make the shuffling reproducible;
        ``None`` keeps the previous non-deterministic behaviour.

    Returns
    -------
    (mean_score, y_test_predict) : mean validation score over all folds and
        an integer array with the voted class per test sample.
    """
    kf = KFold(n_splits=fold_num, shuffle=True, random_state=random_state)
    test_score = 0.0
    all_preds = []
    selected_preds = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        # Only the training split is oversampled.
        X_train, y_train = expand_dataset(X_train, y_train, num_class)

        score, y_pred = fit_model_and_pred(X_train, y_train, X_val, y_val, X_test)
        # NOTE: the message says "r1" although the metric is micro-F1; the
        # text is left unchanged so previously recorded outputs still match.
        print('The obtained validation r1 score is : ',score)
        test_score += score
        all_preds.append(y_pred)
        if score > score_threshold:
            selected_preds.append(y_pred)
    print("Validation score: %f"%(test_score/fold_num))

    if not selected_preds:
        # Without this fallback the vote below would index an empty array.
        print("No fold scored above %s; voting over all %d folds instead."
              % (score_threshold, len(all_preds)))
        selected_preds = all_preds

    # Majority vote: rows are the selected fold models, columns are test samples.
    y_pred_list = np.array(selected_preds)
    votes = np.stack([(y_pred_list == label).sum(axis=0) for label in range(num_class)])
    # argmax returns the first maximum, i.e. the lowest class index on ties.
    y_test_predict = votes.argmax(axis=0)
    return test_score/fold_num, y_test_predict

def expand_dataset(X_train, y_train, num_class = 4):
    """Oversample minority classes by repeating their rows a whole number of times.

    Each class ``i`` in ``0 .. num_class - 1`` is repeated
    ``round(largest_class_size / size_of_class_i)`` times, so the class sizes
    end up roughly equal. The output is grouped by class (class 0 first).
    Rows whose label is not in ``0 .. num_class - 1`` are dropped, and classes
    without any sample are skipped instead of causing a division by zero.

    Parameters
    ----------
    X_train : 2-D feature array, one row per sample.
    y_train : labels, one per row of ``X_train``.
    num_class : number of classes to look for.

    Returns
    -------
    (X_expanded, y_expanded) : the oversampled features and labels. Features
        and labels are stacked into one matrix internally, so the labels come
        back with that matrix's dtype (e.g. float when ``X_train`` is float).
    """
    # Keep each label attached to its row while rows are filtered and repeated.
    sample = np.c_[X_train, y_train]
    sample_by_class = [sample[sample[:, -1] == i] for i in range(num_class)]
    sample_sizes = np.array([sample_class.shape[0] for sample_class in sample_by_class])

    # Repeat counts must be integers for np.repeat; empty classes get 0
    # (repeating an empty block is a no-op) instead of a division by zero.
    expand_ratio = np.zeros(num_class, dtype=int)
    present = sample_sizes > 0
    expand_ratio[present] = np.round(sample_sizes.max() / sample_sizes[present]).astype(int)

    expanded_sample_by_class = [
        np.repeat(sample_by_class[i], int(expand_ratio[i]), axis=0)
        for i in range(num_class)
    ]
    expanded_sample = np.concatenate(expanded_sample_by_class)
    return expanded_sample[:, :-1], expanded_sample[:, -1]

In [3]:
# Read the pre-computed feature tables. Train and test tables are listed in
# the same order: fft, wavelet, pnn, half, rm.
(
    X_train_fft_dataset,
    X_train_wavelet_dataset,
    X_train_pnn_dataset,
    X_train_half_dataset,
    X_train_rm_dataset,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'X_train_fft_processed.csv',
        'X_train_wavelet_processed.csv',
        'X_train_pnn_processed.csv',
        'X_train_half_processed.csv',
        '../X_train_feature_rm.csv',
    )
)

(
    X_test_fft_dataset,
    X_test_wavelet_dataset,
    X_test_pnn_dataset,
    X_test_half_dataset,
    X_test_rm_dataset,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'X_test_fft_processed.csv',
        'X_test_wavelet_processed.csv',
        'X_test_pnn_processed.csv',
        'X_test_half_processed.csv',
        '../X_test_feature_rm.csv',
    )
)

In [4]:
# Column boundaries of the "half" table: columns [0, 16) and [466, end) are
# kept, and the block in between is replaced by the "rm" table.
_HALF_HEAD_END = 16
_HALF_TAIL_START = 466


def _swap_in_rm_features(half_features, rm_features):
    """Return ``half_features`` with its middle column block replaced by ``rm_features``."""
    return np.concatenate([
        half_features[:, 0:_HALF_HEAD_END],
        rm_features,
        half_features[:, _HALF_TAIL_START:],
    ], axis = 1)


# NOTE: this cell overwrites the ``*_half_dataset`` names with the spliced
# result, so running it a second time would splice already-spliced arrays.
# Re-run the loading cell first if this cell has to be executed again.

# Training tables: DataFrame -> ndarray, then splice.
X_train_fft_dataset = np.array(X_train_fft_dataset)
X_train_wavelet_dataset = np.array(X_train_wavelet_dataset)
X_train_pnn_dataset = np.array(X_train_pnn_dataset)
X_train_half_dataset = np.array(X_train_half_dataset)
X_train_rm_dataset = np.array(X_train_rm_dataset)
X_train_half_dataset = _swap_in_rm_features(X_train_half_dataset, X_train_rm_dataset)

# Test tables: same treatment.
X_test_fft_dataset = np.array(X_test_fft_dataset)
X_test_wavelet_dataset = np.array(X_test_wavelet_dataset)
X_test_pnn_dataset = np.array(X_test_pnn_dataset)
X_test_half_dataset = np.array(X_test_half_dataset)
X_test_rm_dataset = np.array(X_test_rm_dataset)
X_test_half_dataset = _swap_in_rm_features(X_test_half_dataset, X_test_rm_dataset)

In [5]:
# Sanity check: every training feature table should have the same row count.
for feature_table in (
    X_train_fft_dataset,
    X_train_wavelet_dataset,
    X_train_pnn_dataset,
    X_train_half_dataset,
):
    print(feature_table.shape)

(5117, 15)
(5117, 190)
(5117, 2)
(5117, 823)


In [6]:
# Stack the feature groups column-wise; train and test must use the same
# group order so that columns line up.
train_feature_groups = [
    X_train_fft_dataset,
    X_train_wavelet_dataset,
    X_train_pnn_dataset,
    X_train_half_dataset,
]
test_feature_groups = [
    X_test_fft_dataset,
    X_test_wavelet_dataset,
    X_test_pnn_dataset,
    X_test_half_dataset,
]
X_train = np.concatenate(train_feature_groups, axis=1)
X_test = np.concatenate(test_feature_groups, axis=1)
print(X_train.shape, X_test.shape)

(5117, 1030) (3411, 1030)


In [7]:
# Fill missing values with per-column medians learned from the TRAINING data
# only, and apply those same medians to the test data. Fitting a second
# imputer on the test set (as was done before) uses test-set statistics and
# can drop a different set of all-NaN columns than the training imputer,
# which would misalign train and test features.
# `fill_value` is only used with strategy='constant', so it is omitted.
impute1 = SimpleImputer(strategy = 'median')
X_train = impute1.fit_transform(X_train)
impute2 = impute1  # name kept for compatibility; same fitted imputer
X_test = impute2.transform(X_test)

#rescaling data (statistics likewise come from the training data only)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Load the label table and keep its second column as the target vector.
# (presumably column 0 is a sample id and column 1 the class label — TODO confirm)
y_train_data = pd.read_csv('../y_train.csv')
label_table = np.array(y_train_data)
y_train = label_table[:, 1]

In [9]:
# One 5-fold run of the score-filtered ensemble; the mean validation score is
# discarded and only the voted test predictions are kept.
_, y_pred_half_wavelet_fft = train_k_fold_pred_trick(
    X=X_train,
    y=y_train,
    X_test=X_test,
    fold_num=5,
)

The obtained validation r1 score is :  0.830078125
The obtained validation r1 score is :  0.826171875
The obtained validation r1 score is :  0.8504398826979471
The obtained validation r1 score is :  0.8181818181818182
The obtained validation r1 score is :  0.823069403714565
Validation score: 0.829588


In [None]:
# Repeat the 5-fold ensemble five times. Each pass overwrites
# `y_pred_half_wavelet_fft`, so only the predictions of the last pass remain.
for run_idx in range(5):
    _, y_pred_half_wavelet_fft = train_k_fold_pred_trick(
        X=X_train,
        y=y_train,
        X_test=X_test,
        fold_num=5,
    )

The obtained validation r1 score is :  0.830078125
The obtained validation r1 score is :  0.826171875
The obtained validation r1 score is :  0.8504398826979471
The obtained validation r1 score is :  0.8181818181818182
The obtained validation r1 score is :  0.823069403714565
Validation score: 0.829588
The obtained validation r1 score is :  0.830078125
The obtained validation r1 score is :  0.826171875
The obtained validation r1 score is :  0.8504398826979471
The obtained validation r1 score is :  0.8181818181818182
The obtained validation r1 score is :  0.823069403714565
Validation score: 0.829588
The obtained validation r1 score is :  0.830078125
The obtained validation r1 score is :  0.826171875
The obtained validation r1 score is :  0.8504398826979471
The obtained validation r1 score is :  0.8181818181818182
The obtained validation r1 score is :  0.823069403714565
Validation score: 0.829588
The obtained validation r1 score is :  0.830078125
The obtained validation r1 score is :  0.82