In [11]:
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

In [12]:
def fill_missing_values(X, n_neighbors = 75, method="KNN"):
    """Standardize the columns of X and impute its missing entries.

    Each column is centred and scaled with its NaN-ignoring mean and standard
    deviation, then the NaN entries are filled in. The returned matrix is in
    the standardized scale, not the original one.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric data in which missing entries are encoded as ``np.nan``.
    n_neighbors : int, default 75
        Number of neighbours for the KNN imputer; unused by the median imputer.
    method : str, default "KNN"
        ``"KNN"`` selects a distance-weighted ``KNNImputer``; any other value
        selects a median ``SimpleImputer``.

    Returns
    -------
    numpy.ndarray
        The standardized data with missing values imputed.
    """
    X = np.asarray(X, dtype=float)

    # normalization (NaN-aware so missing entries do not distort the statistics)
    X_ave = np.nanmean(X, axis=0, keepdims=True)
    X_std = np.nanstd(X, axis=0, keepdims=True)
    # A constant column has std 0; dividing by it would turn every entry into
    # NaN/inf, and the scikit-learn imputers discard columns that are entirely
    # missing, silently changing the feature count. Scale such columns by 1.
    X_std = np.where(X_std == 0, 1.0, X_std)
    X_norma = (X - X_ave) / X_std

    # pick the imputer requested by `method`
    if method == "KNN":
        imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights='distance')
    else:
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')

    return imputer.fit_transform(X_norma)

def expand_dataset(X_train, y_train, num_class = 4):
    """Oversample minority classes by repeating their rows.

    Every class ``c`` in ``range(num_class)`` is repeated
    ``round(largest_class_size / size_of_c)`` times, so that all classes end
    up with roughly as many rows as the largest one. Rows are grouped by class
    in the output (class 0 first), and each row's copies are adjacent.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Feature matrix.
    y_train : array-like of shape (n_samples,)
        Labels; only samples whose label is in ``range(num_class)`` are kept.
    num_class : int, default 4
        Number of classes, labelled ``0 .. num_class - 1``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The expanded features and labels. Both keep the dtype of their input
        (the labels are no longer upcast to the feature dtype).
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train).ravel()

    # Work with row indices rather than a stacked [X | y] matrix so that the
    # features and labels each keep their own dtype.
    rows_by_class = [np.flatnonzero(y_train == label) for label in range(num_class)]
    largest = max((rows.size for rows in rows_by_class), default=0)

    expanded_rows = []
    for rows in rows_by_class:
        if rows.size == 0:
            # An absent class has nothing to repeat (and would divide by zero).
            continue
        # np.repeat requires an integer count; np.round alone returns a float.
        repeats = int(np.round(largest / rows.size))
        expanded_rows.append(np.repeat(rows, repeats))

    if not expanded_rows:
        return X_train[:0], y_train[:0]

    order = np.concatenate(expanded_rows)
    return X_train[order], y_train[order]

In [13]:
# Read the fused feature tables and the training labels from disk.
X_train_data = pd.read_csv('X_train_feature_fusion2.csv')
y_train_data = pd.read_csv('y_train.csv')
X_test_data = pd.read_csv('X_test_feature_fusion2.csv')

# Convert each feature frame to an array once, then split off the first
# column (presumably a sample id -- TODO confirm) from the feature columns.
train_matrix = np.array(X_train_data)
test_matrix = np.array(X_test_data)

indices_test, X_test = test_matrix[:, 0], test_matrix[:, 1:]
indices_train, X_train = train_matrix[:, 0], train_matrix[:, 1:]

# The labels are taken from the second column of the label file.
y_train = np.array(y_train_data)[:, 1]

for loaded in (X_train, y_train, X_test):
    print(loaded.shape)

(5117, 690)
(5117,)
(3411, 690)


In [14]:
# X_train = fill_missing_values(X_train, method="median")
# X_test = fill_missing_values(X_test, method="median")

In [19]:
def fit_model_and_pred(X_train, y_train, X_val, y_val, X_test):
    """Fit a stacked classifier and predict on the validation and test sets.

    The stack combines a gradient-boosting model and a single-hidden-layer
    MLP, with a logistic regression as the final estimator.

    Parameters
    ----------
    X_train, y_train : training features and labels used to fit the stack.
    X_val, y_val : held-out features and labels used only for scoring.
    X_test : features to predict after scoring.

    Returns
    -------
    (float, array)
        The micro-averaged F1 score on the validation set and the predicted
        labels for ``X_test``.
    """
    # Disabled base learner, kept for reference:
    # ('rf', RandomForestClassifier(random_state=0, min_samples_leaf=2, n_estimators=1000))
    gradient_boosting = GradientBoostingClassifier(
        n_estimators=250,
        max_depth=5,
        learning_rate=0.1,
        max_features=60,
        random_state=0,
    )
    perceptron = MLPClassifier(
        hidden_layer_sizes=(900,),
        solver='sgd',
        max_iter=200,
        random_state=0,
        verbose=False,
    )
    stack = StackingClassifier(
        estimators=[('gb', gradient_boosting), ('mlp', perceptron)],
        final_estimator=LogisticRegression(),
    )
    stack.fit(X_train, y_train)

    # Score on the validation fold first, then predict the test set.
    score = f1_score(y_val, stack.predict(X_val), average='micro')
    y_pred = stack.predict(X_test)
    return score, y_pred

def _cross_validate_folds(X, y, X_test, fold_num):
    """Fit one model per fold; return (mean score, per-fold scores, per-fold test predictions)."""
    X = np.asarray(X)
    y = np.asarray(y)
    # Folds are contiguous and unshuffled, so they follow the row order of X.
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=False)

    scores = []
    predictions = []
    for train_index, val_index in kf.split(X):
        score, y_pred = fit_model_and_pred(
            X[train_index], y[train_index], X[val_index], y[val_index], X_test
        )
        print('The obtained validation f1 score is : ', score)
        scores.append(score)
        predictions.append(y_pred)

    mean_score = sum(scores) / fold_num
    print("Validation score: %f" % mean_score)
    return mean_score, scores, predictions


def _majority_vote(fold_predictions, num_class):
    """Return, per test sample, the label in range(num_class) predicted by the most folds."""
    votes = np.asarray(fold_predictions)  # shape (n_folds, n_test)
    counts = np.stack([(votes == label).sum(axis=0) for label in range(num_class)])
    # argmax returns the first maximum, so ties go to the smallest label.
    return np.argmax(counts, axis=0)


def train_k_fold_pred(X, y, X_test, fold_num=10, num_class=4):
    """Train one model per K-fold split and majority-vote their test predictions.

    Parameters
    ----------
    X, y : training features and labels (converted to numpy arrays).
    X_test : features to predict with every fold's model.
    fold_num : int, default 10
        Number of cross-validation folds.
    num_class : int, default 4
        Number of classes, labelled ``0 .. num_class - 1``, considered by the vote.

    Returns
    -------
    (float, numpy.ndarray)
        The mean validation score over the folds and the voted test labels.
    """
    mean_score, _, predictions = _cross_validate_folds(X, y, X_test, fold_num)
    return mean_score, _majority_vote(predictions, num_class)


def train_k_fold_pred_trick(X, y, X_test, fold_num=10, score_threshold=0.81, num_class=4):
    """Like ``train_k_fold_pred``, but only folds scoring above a threshold vote.

    Parameters
    ----------
    X, y : training features and labels (converted to numpy arrays).
    X_test : features to predict with every fold's model.
    fold_num : int, default 10
        Number of cross-validation folds.
    score_threshold : float, default 0.81
        A fold's test predictions take part in the vote only if its validation
        score is strictly greater than this value.
    num_class : int, default 4
        Number of classes, labelled ``0 .. num_class - 1``, considered by the vote.

    Returns
    -------
    (float, numpy.ndarray)
        The mean validation score over ALL folds and the voted test labels.
    """
    mean_score, scores, predictions = _cross_validate_folds(X, y, X_test, fold_num)

    selected = [pred for score, pred in zip(scores, predictions) if score > score_threshold]
    if not selected:
        # Without this fallback an empty selection has nothing to vote on and
        # the function would fail after all the training work is done.
        print("No fold scored above %s; voting over all %d folds instead."
              % (score_threshold, len(predictions)))
        selected = predictions

    return mean_score, _majority_vote(selected, num_class)

In [20]:
# Run 5-fold cross-validation. The returned mean validation score is discarded
# (it is already printed); only the fold-voted test predictions are kept.
_, y_pred = train_k_fold_pred(X_train, y_train, X_test, fold_num=5) 



The obtained validation r1 score is :  0.8037109375




The obtained validation r1 score is :  0.8125




The obtained validation r1 score is :  0.8396871945259042




The obtained validation r1 score is :  0.8054740957966764




The obtained validation r1 score is :  0.793743890518084
Validation score: 0.811023


In [21]:
# Write the predictions into the "y" column of the sample file and save the
# result as output.csv without the DataFrame index.
# NOTE(review): assumes sample.csv has one row per test sample, in the same
# order as X_test -- confirm.
sample =  pd.read_csv("sample.csv")
sample["y"] = y_pred
sample.to_csv("output.csv", index = False)