In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

import xgboost as xgb

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import make_scorer
from sklearn.impute import SimpleImputer

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
def fill_missing_values(X, n_neighbors = 75, method="KNN"):
    """Standardise every column of ``X`` and impute its missing entries.

    Each column is centred and scaled with its own NaN-aware mean and
    standard deviation, then the remaining NaNs are filled in.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric data in which missing entries are encoded as ``np.nan``.
    n_neighbors : int, default 75
        Neighbour count handed to ``KNNImputer``; ignored for median
        imputation.
    method : str, default "KNN"
        ``"KNN"`` selects distance-weighted k-nearest-neighbour imputation;
        any other value selects per-column median imputation.

    Returns
    -------
    The standardised, imputed matrix returned by the imputer's
    ``fit_transform``.

    Notes
    -----
    The statistics are computed from ``X`` itself, so two matrices passed in
    separate calls are scaled independently of each other.
    """
    X = np.asarray(X, dtype=float)

    # normalization (NaN-aware so missing entries do not poison the stats)
    X_ave = np.nanmean(X, axis=0, keepdims=True)
    X_std = np.nanstd(X, axis=0, keepdims=True)
    # A constant column has std == 0; dividing by it would turn the whole
    # column into NaN/inf. Scale such columns by 1 so they are centred to 0.
    X_std = np.where(X_std > 0, X_std, 1.0)
    X_norma = (X - X_ave) / X_std

    if method == "KNN":
        # Imported here because the notebook's import cell does not provide
        # KNNImputer and only this branch needs it.
        from sklearn.impute import KNNImputer
        imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights='distance')
    else:
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')

    return imputer.fit_transform(X_norma)

In [3]:
# Read the three CSV inputs (train features, train labels, test features).
X_train_data, y_train_data, X_test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('X_train_feature.csv', 'y_train.csv', 'X_test_feature.csv')
)

# Convert each frame once, then slice: column 0 is set aside as indices_*,
# everything after it is used as the feature matrix.
train_matrix = np.array(X_train_data)
test_matrix = np.array(X_test_data)

indices_test, X_test = test_matrix[:, 0], test_matrix[:, 1:]
indices_train, X_train = train_matrix[:, 0], train_matrix[:, 1:]
# The labels are taken from column 1 of the label file.
y_train = np.array(y_train_data)[:, 1]

# Sanity check: one shape per line, in the order train X, train y, test X.
print(*(arr.shape for arr in (X_train, y_train, X_test)), sep="\n")

(5117, 936)
(5117,)
(3411, 936)


In [4]:
# Standardise and median-impute both matrices. Each call only sees the matrix
# it is given, so each one is scaled with its own column statistics.
# NOTE(review): the test matrix is therefore scaled with test-set statistics
# rather than the training ones -- confirm this is intended.
X_train, X_test = (
    fill_missing_values(matrix, method="median")
    for matrix in (X_train, X_test)
)

In [5]:
def fit_model_and_pred(X_train, y_train, X_val, y_val, X_test, num_class=4, num_round=80):
    """Train a multi-class XGBoost booster, score it, and predict ``X_test``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and class labels.
    X_val, y_val : array-like
        Held-out features and labels; monitored during training and used for
        the returned score.
    X_test : array-like
        Features to predict with the trained booster.
    num_class : int, default 4
        Number of classes passed to XGBoost.
    num_round : int, default 80
        Number of boosting rounds.

    Returns
    -------
    score : float
        Micro-averaged F1 of the booster on ``(X_val, y_val)``.
    y_pred : ndarray
        Booster predictions for every row of ``X_test``.
    """
    xg_train = xgb.DMatrix(X_train, label=y_train)
    xg_eval = xgb.DMatrix(X_val, label=y_val)

    # setup parameters for xgboost
    param = {
        'objective': 'multi:softmax',
        'eta': 0.2,
        'gamma': 1.0,
        'max_depth': 6,
        # NOTE(review): newer XGBoost releases appear to replace 'silent'
        # with 'verbosity' -- confirm against the installed version.
        'silent': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.9,
        'min_child_weight': 20,
        'num_class': num_class,
    }

    def micro_f1(preds, dmatrix):
        """Custom watchlist metric: micro-F1 of ``preds`` against the labels."""
        preds = np.asarray(preds)
        # Presumably some XGBoost versions hand the metric per-class scores of
        # shape (n_rows, num_class) instead of class ids -- TODO confirm.
        # Reduce those to class ids so f1_score always gets labels.
        if preds.ndim > 1:
            preds = preds.argmax(axis=1)
        # f1_score expects (y_true, y_pred): labels first, predictions second.
        return "f1", f1_score(dmatrix.get_label(), preds, average='micro')

    watchlist = [(xg_train, 'train'), (xg_eval, 'eval')]
    clf = xgb.train(param,
                    xg_train,
                    num_round,
                    watchlist,
                    feval=micro_f1)

    y_val_pred = clf.predict(xg_eval)
    score = f1_score(y_val, y_val_pred, average='micro')

    y_pred = clf.predict(xgb.DMatrix(X_test))

    return score, y_pred

def train_k_fold(X, y, fold_num=10):
    """Cross-validate the XGBoost model with unshuffled K-fold splits.

    Parameters
    ----------
    X : ndarray
        Feature matrix, indexable by integer index arrays.
    y : ndarray
        Labels aligned with the rows of ``X``.
    fold_num : int, default 10
        Number of folds.

    Returns
    -------
    float
        Mean validation F1 over the folds (also printed).
    """
    splitter = KFold(n_splits=fold_num, random_state=None, shuffle=False)
    fold_scores = []

    for train_index, val_index in splitter.split(X):
        X_fit, X_val = X[train_index], X[val_index]
        y_fit, y_val = y[train_index], y[val_index]

        # fit_model_and_pred always needs a matrix to predict; the validation
        # fold is passed again and those predictions are discarded.
        score, _ = fit_model_and_pred(X_fit, y_fit, X_val, y_val, X_val)

        print('The obtained validation f1 score is : ', score)
        fold_scores.append(score)

    mean_score = sum(fold_scores) / fold_num
    print("Validation score: %f" % mean_score)
    return mean_score

def _majority_vote(fold_predictions):
    """Return, per column, the class id predicted most often across the rows.

    ``fold_predictions`` holds one row of class-id predictions per fold.
    Ties go to the lowest class id.
    """
    votes = np.asarray(fold_predictions)
    if votes.size == 0:
        return np.array([], dtype=int)
    # Predictions are cast to int so they can be compared with class ids.
    votes = votes.astype(int)
    n_classes = int(votes.max()) + 1
    # counts[c, i] = number of folds that predicted class c for sample i
    counts = np.stack([(votes == cls).sum(axis=0) for cls in range(n_classes)])
    # argmax returns the first maximum, i.e. the lowest class id on a tie
    return counts.argmax(axis=0)


def train_k_fold_pred(X, y, X_test, fold_num=10):
    """Cross-validate the model and predict ``X_test`` by majority vote.

    One model is trained per unshuffled K-fold split; every fold model
    predicts ``X_test`` and the final label of each test row is the class
    predicted by the most folds.

    Parameters
    ----------
    X : ndarray
        Training feature matrix, indexable by integer index arrays.
    y : ndarray
        Labels aligned with the rows of ``X``.
    X_test : array-like
        Features to predict.
    fold_num : int, default 10
        Number of folds (and therefore of voting models).

    Returns
    -------
    mean_score : float
        Mean validation F1 over the folds (also printed).
    y_test_predict : ndarray of int
        Majority-vote class id for every row of ``X_test``.
    """
    splitter = KFold(n_splits=fold_num, random_state=None, shuffle=False)
    total_score = 0.0
    y_pred_list = []

    for train_index, val_index in splitter.split(X):
        X_fit, X_val = X[train_index], X[val_index]
        y_fit, y_val = y[train_index], y[val_index]
        score, y_pred = fit_model_and_pred(X_fit, y_fit, X_val, y_val, X_test)
        y_pred_list.append(y_pred)
        print('The obtained validation f1 score is : ', score)
        total_score += score

    mean_score = total_score / fold_num
    print("Validation score: %f" % mean_score)

    return mean_score, _majority_vote(y_pred_list)

In [None]:
# 5-fold run: the helper prints the per-fold and mean validation scores; only
# the majority-vote test predictions are kept.
_, y_pred = train_k_fold_pred(
    X=X_train,
    y=y_train,
    X_test=X_test,
    fold_num=5,
)

In [None]:
# Load the sample file, set its "y" column to the predictions, and write the
# result without the index column.
sample = pd.read_csv("sample.csv").assign(y=y_pred)
sample.to_csv("output.csv", index=False)