In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
def fill_missing_values(X, n_neighbors = 75, method="KNN"):
    """Standardise every column of ``X`` and impute its missing entries.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix in which missing values are encoded as ``np.nan``.
    n_neighbors : int, default=75
        Number of neighbours used by the KNN imputer. Ignored when
        ``method`` is not ``"KNN"``.
    method : str, default="KNN"
        ``"KNN"`` selects a distance-weighted ``KNNImputer``; any other value
        falls back to a median ``SimpleImputer``.

    Returns
    -------
    ndarray
        The standardised matrix with missing entries filled in. Note that the
        result stays in standardised units; it is not scaled back.
    """
    # normalization (NaN-aware, so missing entries do not poison the statistics)
    X_std = np.nanstd(X, axis=0, keepdims=True)
    X_ave = np.nanmean(X, axis=0, keepdims=True)

    # A constant column has zero spread. Dividing by it would turn every entry
    # of that column into 0/0 = NaN, i.e. an all-missing column handed to the
    # imputer. Use a unit scale instead so the column simply becomes zeros.
    X_std = np.where(X_std == 0, 1.0, X_std)
    X_norma = (X - X_ave) / X_std

    # pick the imputer
    if method == "KNN":
        imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights='distance')
    else:
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')

    return imputer.fit_transform(X_norma)

def expand_dataset(X_train, y_train, num_class = 4):
    """Oversample minority classes by repeating their rows.

    Every class ``i`` in ``range(num_class)`` has each of its rows repeated
    ``round(largest_class_size / size_of_class_i)`` times, so that all classes
    end up with roughly as many rows as the largest one.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Feature matrix.
    y_train : array-like of shape (n_samples,)
        Labels; only rows whose label equals one of ``0 .. num_class - 1``
        are kept.
    num_class : int, default=4
        Number of consecutive integer labels to balance.

    Returns
    -------
    (ndarray, ndarray)
        The expanded feature matrix and the matching label vector, grouped by
        class in increasing label order. Features and labels are stacked into
        one array internally, so both come back in that array's common dtype.
    """
    # Glue the labels on as the last column so rows and labels stay together.
    sample = np.c_[X_train, y_train]
    labels = sample[:, -1]
    sample_by_class = [sample[labels == i] for i in range(num_class)]
    sample_sizes = np.array([rows.shape[0] for rows in sample_by_class])

    # np.repeat needs integer repeat counts, and a class without any rows must
    # not take part in the division (it would produce an inf/NaN ratio).
    expand_ratio = np.zeros(num_class, dtype=int)
    populated = sample_sizes > 0
    if populated.any():
        expand_ratio[populated] = np.round(
            sample_sizes.max() / sample_sizes[populated]).astype(int)

    expanded_sample = np.concatenate(
        [np.repeat(rows, ratio, axis=0)
         for rows, ratio in zip(sample_by_class, expand_ratio)])
    return expanded_sample[:, :-1], expanded_sample[:, -1]

In [3]:
# Read the pre-computed feature tables and the training labels.
X_train_data = pd.read_csv('X_train_feature_fusion2.csv')
y_train_data = pd.read_csv('y_train.csv')
X_test_data = pd.read_csv('X_test_feature_fusion2.csv')

# Convert each frame to a plain array once, then slice it.
train_matrix = np.array(X_train_data)
test_matrix = np.array(X_test_data)

# Column 0 of each feature table is split off from the features
# (presumably a sample id -- verify against the CSV headers).
indices_train, X_train = train_matrix[:, 0], train_matrix[:, 1:]
indices_test, X_test = test_matrix[:, 0], test_matrix[:, 1:]

# The label is taken from column 1 of y_train.csv.
y_train = np.array(y_train_data)[:, 1]

for loaded_array in (X_train, y_train, X_test):
    print(loaded_array.shape)

(5117, 692)
(5117,)
(3411, 692)


In [4]:
# class HierarchyClassifier:
#     def __init__(self, first_estimator, second_estimator):
#         self.first_estimator = first_estimator
#         self.second_estimator = second_estimator
        
#     def fit(self, X_train, y_train):
#         y_train_first = y_train.copy()
#         mask = (y_train_first != 0)
#         y_train_first [mask] = 1

#         X_train_second = X_train[mask]
#         y_train_second = y_train[mask]-1
        
#         X_train, y_train_first = expand_dataset(X_train, y_train_first, num_class = 2)
#         self.first_estimator.fit(X_train, y_train_first)
#         X_train_second, y_train_second = expand_dataset(X_train_second, y_train_second, num_class = 3)
#         self.second_estimator.fit(X_train_second, y_train_second)
        
#     def predict(self, X_test):
#         y_pred_first = self.first_estimator.predict(X_test)
#         mask = (y_pred_first != 0)
#         X_test_second = X_test[mask]
#         y_pred_second = self.second_estimator.predict(X_test_second) + 1
#         y_pred_first[mask] = y_pred_second
#         return y_pred_first

class HierarchyClassifier:
    """Two-stage classifier that first separates class 3 from everything else.

    Stage one is a binary model trained on "is the label 3?" (encoded as 0)
    versus "any other label" (encoded as 1). Stage two is trained only on the
    rows whose label is not 3 and decides between the remaining labels.

    Parameters
    ----------
    first_estimator : estimator with ``fit(X, y)`` / ``predict(X)``
        Binary model for the "class 3 vs. rest" decision.
    second_estimator : estimator with ``fit(X, y)`` / ``predict(X)``
        Model that assigns one of the remaining labels.
    """

    def __init__(self, first_estimator, second_estimator):
        self.first_estimator = first_estimator
        self.second_estimator = second_estimator

    def fit(self, X_train, y_train):
        """Fit both stages on class-balanced copies of the training data.

        ``X_train`` and ``y_train`` are indexed with boolean masks, so they
        are expected to be NumPy arrays. Returns ``self`` so the call can be
        chained.
        """
        # Stage-one target: 1 for "not class 3", 0 for "class 3".
        mask = (y_train != 3)
        y_train_first = y_train.copy()
        y_train_first[mask] = 1
        y_train_first[~mask] = 0

        # Stage two only ever sees the rows that are not class 3.
        X_train_second = X_train[mask]
        y_train_second = y_train[mask]

        # Balance each stage separately before fitting.
        X_train, y_train_first = expand_dataset(X_train, y_train_first, num_class = 2)
        self.first_estimator.fit(X_train, y_train_first)
        X_train_second, y_train_second = expand_dataset(X_train_second, y_train_second, num_class = 3)
        self.second_estimator.fit(X_train_second, y_train_second)
        return self

    def predict(self, X_test):
        """Predict labels by chaining the two stages.

        Rows that stage one labels 0 are reported as class 3; all other rows
        receive the label produced by stage two.
        """
        y_pred = self.first_estimator.predict(X_test)
        mask = (y_pred != 0)
        y_pred[~mask] = 3

        # Stage one may route every row to class 3. Skip stage two in that
        # case instead of asking it to predict on an empty matrix.
        if mask.any():
            y_pred[mask] = self.second_estimator.predict(X_test[mask])
        return y_pred
        
    
def fit_model_and_pred(X_train, y_train, X_val, y_val, X_test):
    """Train one hierarchical model, score it on the validation split and
    predict the test set.

    Returns
    -------
    (float, ndarray)
        The micro-averaged F1 score on ``(X_val, y_val)`` and the predictions
        for ``X_test``.
    """
    # Both stages use the same gradient-boosting configuration.
    gbm_params = dict(learning_rate=0.05, n_estimators=500, max_depth=7,
                      min_samples_split=60, min_samples_leaf=9, subsample=1,
                      max_features=60, random_state=0)
    stage_one = GradientBoostingClassifier(**gbm_params)
    stage_two = GradientBoostingClassifier(**gbm_params)

    hierarchy = HierarchyClassifier(stage_one, stage_two)
    hierarchy.fit(X_train, y_train)

    val_score = f1_score(y_val, hierarchy.predict(X_val), average='micro')
    test_pred = hierarchy.predict(X_test)
    return val_score, test_pred

def train_k_fold(X, y, fold_num=10):
    """Cross-validate the hierarchical model on contiguous (unshuffled) folds.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix.
    y : ndarray of shape (n_samples,)
        Labels.
    fold_num : int, default=10
        Number of folds.

    Returns
    -------
    float
        Mean validation score over the folds (the same value that is printed).
    """
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=False)
    test_score = 0.0

    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        # Only the score is needed here; the validation split is passed again
        # in the test slot and the resulting prediction is discarded.
        score, _ = fit_model_and_pred(X_train, y_train, X_val, y_val, X_val)

        print('The obtained validation r1 score is : ',score)
        test_score += score

    mean_score = test_score/fold_num
    print("Validation score: %f"%(mean_score))
    return mean_score

def train_k_fold_pred(X, y, X_test, fold_num=10, num_class=4, random_state=None):
    """Cross-validate on shuffled folds and ensemble the test predictions.

    One hierarchical model is trained per fold. Each model predicts the whole
    of ``X_test`` and the final label of a test sample is the majority vote
    over the per-fold predictions.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Training features.
    y : ndarray of shape (n_samples,)
        Training labels.
    X_test : ndarray of shape (n_test, n_features)
        Samples to predict.
    fold_num : int, default=10
        Number of folds, and therefore of ensemble members.
    num_class : int, default=4
        Labels ``0 .. num_class - 1`` take part in the vote.
    random_state : int or None, default=None
        Seed for the fold shuffling. ``None`` keeps the historical unseeded
        behaviour; pass an integer for reproducible splits.

    Returns
    -------
    (float, ndarray)
        Mean validation score over the folds and the voted test labels.
    """
    kf = KFold(n_splits=fold_num, random_state=random_state, shuffle=True)
    test_score = 0.0
    y_pred_list = []

    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        score, y_pred = fit_model_and_pred(X_train, y_train, X_val, y_val, X_test)
        y_pred_list.append(y_pred)
        print('The obtained validation r1 score is : ',score)
        test_score += score

    mean_score = test_score/fold_num
    print("Validation score: %f"%(mean_score))

    y_test_predict = _majority_vote(np.array(y_pred_list), num_class)
    return mean_score, y_test_predict


def _majority_vote(y_pred_list, num_class):
    """Return, per column of a (n_models, n_samples) array, the most voted label.

    Ties go to the smallest label, and predictions outside
    ``0 .. num_class - 1`` are ignored.
    """
    # votes[j, c] = number of models that predicted label c for sample j
    votes = (y_pred_list[:, :, None] == np.arange(num_class)).sum(axis=0)
    return votes.argmax(axis=1)

In [7]:
# Run the full 5-fold train/predict procedure five times and print the scores
# of every run. `y_pred` is overwritten on each pass, so only the prediction
# of the last run is still available afterwards; the earlier ones are dropped.
for i in range(5):
    _, y_pred = train_k_fold_pred(X_train, y_train, X_test, fold_num=5) 

The obtained validation r1 score is :  0.8427734375
The obtained validation r1 score is :  0.796875
The obtained validation r1 score is :  0.841642228739003
The obtained validation r1 score is :  0.8289345063538613
The obtained validation r1 score is :  0.844574780058651
Validation score: 0.830960
The obtained validation r1 score is :  0.859375
The obtained validation r1 score is :  0.8193359375
The obtained validation r1 score is :  0.8406647116324536
The obtained validation r1 score is :  0.83088954056696
The obtained validation r1 score is :  0.8269794721407625
Validation score: 0.835449
The obtained validation r1 score is :  0.818359375
The obtained validation r1 score is :  0.8291015625
The obtained validation r1 score is :  0.8220918866080157
The obtained validation r1 score is :  0.855327468230694
The obtained validation r1 score is :  0.8347996089931574
Validation score: 0.831936
The obtained validation r1 score is :  0.83984375
The obtained validation r1 score is :  0.82617187

In [8]:
# Put the predictions into the "y" column of the table read from sample.csv
# (presumably the submission template -- verify) and save it without the index.
sample = pd.read_csv("sample.csv").assign(y=y_pred)
sample.to_csv("output_hierarchy.csv", index=False)