In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import make_scorer
from sklearn.impute import SimpleImputer

In [2]:
def fill_missing_values(X, n_neighbors = 75, method="KNN"): 
    # normalization
    X_std = np.nanstd(X,axis=0,keepdims=True)
    X_ave = np.nanmean(X,axis=0,keepdims=True)
    X_norma = (X-X_ave)/X_std
    
    # use KNNImputer
    imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights = 'distance') if method=="KNN"\
        else SimpleImputer(missing_values=np.nan, strategy='median')
    X_norma_fixed = imputer.fit_transform(X_norma)
    return X_norma_fixed

def expand_dataset(X_train, y_train, num_class = 4):
    sample = np.c_[X_train, y_train]
    sample_by_class = [sample[(sample[:,-1] == i)] for i in range(num_class)]
    sample_sizes = [ sample_class.shape[0] for sample_class in sample_by_class]
    expand_ratio = np.round(np.max(np.array(sample_sizes))/sample_sizes)
    expanded_sample_by_class = [ np.repeat(sample_by_class[i], expand_ratio[i], axis = 0) for i in range(num_class)]
    expanded_sample = np.concatenate(expanded_sample_by_class)
    return expanded_sample[:, :-1], expanded_sample[:, -1]

In [3]:
X_train_data = pd.read_csv('X_train_feature_fusion2.csv')
y_train_data = pd.read_csv('y_train.csv')
X_test_data = pd.read_csv('X_test_feature_fusion2.csv')

indices_test = np.array(X_test_data)[:,0]
indices_train = np.array(X_train_data)[:,0]
X_test = np.array(X_test_data)[:,1:]
y_train = np.array(y_train_data)[:,1]
X_train = np.array(X_train_data)[:,1:]
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

(5117, 692)
(5117,)
(3411, 692)


In [4]:
class FusedClassifier:
    def __init__(self, estimators, weights):
        assert(len(estimators)==len(weights))
        self.estimators = estimators
        self.weights = weights
        
    def fit(self, X_train, y_train):
        for estimator in self.estimators:
             estimator.fit(X_train, y_train)
    
    def predict(self, X_test):
        final_predict_list = []
        for i in range(len(self.estimators)):
            y_pred = self.estimators[i].predict(X_test)
            for j in range(self.weights[i]):
                final_predict_list.append(y_pred)
        final_predict_list = np.array(final_predict_list)
        final_predict = []
        for i in range(final_predict_list.shape[1]):
            item = final_predict_list[:,i]
            a = item[item==0].shape
            b = item[item==1].shape
            c = item[item==2].shape
            d = item[item==3].shape
            candidate = [a, b, c, d]
            final_predict.append(np.argmax(candidate))
        final_predict = np.array(final_predict)
        return final_predict
def fit_model_and_pred(clfs, weights, X_train, y_train, X_val, y_val, X_test):
    
    
#     clf = GradientBoostingClassifier(learning_rate=0.05, n_estimators=500, max_depth=7, 
#                                     min_samples_split=60, min_samples_leaf=9, subsample=1,
#                                     max_features=50, random_state=0)
    clf = FusedClassifier(clfs, weights)
    clf.fit(X_train, y_train)
    y_val_pred = clf.predict(X_val)
    score = f1_score(y_val, y_val_pred, average='micro')
    y_pred = clf.predict(X_test) 
    return score, y_pred

def train_k_fold(clfs, weights, X, y, fold_num=10):
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=False)
    kf.get_n_splits(X)
    test_score = 0.0
    
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        score, _ = fit_model_and_pred(clfs, weights, X_train, y_train, X_val, y_val, X_val)

        print('The obtained validation r1 score is : ',score)
        test_score += score
    print("Validation score: %f"%(test_score/fold_num))

def train_k_fold_pred(clfs, weights, X, y, X_test, fold_num=10):
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=True)
    kf.get_n_splits(X)
    test_score = 0.0
    y_pred_list = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]
        X_train, y_train = expand_dataset(X_train, y_train)
        
        score, y_pred = fit_model_and_pred(clfs, weights, X_train, y_train, X_val, y_val, X_test)
        y_pred_list.append(y_pred)
        print('The obtained validation r1 score is : ',score)
        test_score += score
    print("Validation score: %f"%(test_score/fold_num))
    
    y_pred_list = np.array(y_pred_list)
    y_test_predict = []
    for i in range(y_pred_list.shape[1]):
        item = y_pred_list[:,i]
        a = item[item==0].shape
        b = item[item==1].shape
        c = item[item==2].shape
        d = item[item==3].shape
        candidate = [a, b, c, d]
        y_test_predict.append(np.argmax(candidate))
    y_test_predict = np.array(y_test_predict)
    return test_score/fold_num, y_test_predict

def train_k_fold_pred_trick(clfs, weights, X, y, X_test, fold_num=10):
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=False)
    kf.get_n_splits(X)
    test_score = 0.0
    y_pred_list = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]
        X_train, y_train = expand_dataset(X_train, y_train)
        
        score, y_pred = fit_model_and_pred(clfs, weights, X_train, y_train, X_val, y_val, X_test)
        print('The obtained validation r1 score is : ',score)
        test_score += score
        if score > 0.82:        
            y_pred_list.append(y_pred)
    print("Validation score: %f"%(test_score/fold_num))
    
    y_pred_list = np.array(y_pred_list)
    y_test_predict = []
    for i in range(y_pred_list.shape[1]):
        item = y_pred_list[:,i]
        a = item[item==0].shape
        b = item[item==1].shape
        c = item[item==2].shape
        d = item[item==3].shape 
        candidate = [a, b, c, d]
        y_test_predict.append(np.argmax(candidate))
    y_test_predict = np.array(y_test_predict)
    return test_score/fold_num, y_test_predict

In [5]:
clfs = [
    GradientBoostingClassifier(learning_rate=0.05, n_estimators=500, max_depth=7, 
                                    min_samples_split=60, min_samples_leaf=9, subsample=1,
                                    max_features=50, random_state=0),
    HistGradientBoostingClassifier(learning_rate=0.05, max_depth=7, min_samples_leaf=9, random_state=0),
    RandomForestClassifier(random_state=0, min_samples_leaf=2, n_estimators=1000)
]
weights = [5, 4, 3]
y_pred_list = []
for i in range(5):
    score, y_pred = train_k_fold_pred(clfs, weights, X_train, y_train, X_test, fold_num=5) 
    if(score > 0.833):
        y_pred_list.append(y_pred)

The obtained validation r1 score is :  0.8310546875
The obtained validation r1 score is :  0.8203125
The obtained validation r1 score is :  0.8357771260997068
The obtained validation r1 score is :  0.8191593352883676
The obtained validation r1 score is :  0.8426197458455523
Validation score: 0.829785
The obtained validation r1 score is :  0.8603515625
The obtained validation r1 score is :  0.8212890625
The obtained validation r1 score is :  0.8064516129032258
The obtained validation r1 score is :  0.8328445747800586
The obtained validation r1 score is :  0.8220918866080157
Validation score: 0.828606
The obtained validation r1 score is :  0.81640625
The obtained validation r1 score is :  0.822265625
The obtained validation r1 score is :  0.8299120234604106
The obtained validation r1 score is :  0.8299120234604106
The obtained validation r1 score is :  0.8367546432062561
Validation score: 0.827050
The obtained validation r1 score is :  0.8310546875
The obtained validation r1 score is :  

In [6]:
y_pred_list = np.array(y_pred_list)
y_test_predict = []
for i in range(y_pred_list.shape[1]):
    item = y_pred_list[:,i]
    a = item[item==0].shape
    b = item[item==1].shape
    c = item[item==2].shape
    d = item[item==3].shape 
    candidate = [a, b, c, d]
    y_test_predict.append(np.argmax(candidate))
y_test_predict = np.array(y_test_predict)

IndexError: tuple index out of range

In [None]:
sample =  pd.read_csv("sample.csv")
sample["y"] = y_test_predict
sample.to_csv("output_fusion_834.csv", index = False)