In [77]:
import numpy as np
import pandas as pd

from sklearn.impute import (SimpleImputer,KNNImputer)
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import ExtraTreesRegressor
import lightgbm as lgb

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, DotProduct, WhiteKernel, RationalQuadratic
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt

%matplotlib inline

In [78]:
def fill_missing_values(X, n_neighbors = 75, method="KNN"): 
    
    # normalization
    X_std = np.nanstd(X,axis=0,keepdims=True)
    X_ave = np.nanmean(X,axis=0,keepdims=True)
    X_norma = (X-X_ave)/X_std
    
    # use KNNImputer
    imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights = 'distance') if method=="KNN"\
        else SimpleImputer(missing_values=np.nan, strategy='median')
    
    X_norma_fixed = imputer.fit_transform(X_norma)
    
    return X_norma_fixed

def remove_outliers(X, y):
    print("IsolationForest-Traing data shape before removed: {}".format(X.shape))
    Z = np.c_[X, y]
    iforest = IsolationForest(max_samples=200, random_state=1, contamination=0.005)
    iforest.fit(Z)
    iforest_outlier_pred = iforest.predict(Z)
    
    Z = np.c_[X, y]
    local = LocalOutlierFactor(n_neighbors=150, contamination=0.005)
    local.fit(Z)
    local_outlier_pred = local.fit_predict(Z)

    mask = np.logical_and((iforest_outlier_pred!=-1), (local_outlier_pred!=-1))
    X , y = X[mask, :], y[mask]
    print("IsolationForest-Traing data shape after removed: {}".format(X.shape))
    return X, y

def select_features(X, y, X_test, feature_num=50, n_estimators=80):
    rf = RandomForestRegressor(n_jobs=-1, n_estimators=80, random_state=1)
    rf.fit(X, y)
    indices = np.asarray(list(rf.feature_importances_)).argsort()[-feature_num:][::-1]
    
    X = np.take(X, indices, axis = 1)
    X_test = np.take(X_test, indices, axis = 1)
    return X, X_test

In [79]:

X_train_data = pd.read_csv('X_train.csv')
y_train_data = pd.read_csv('y_train.csv')
X_test_data = pd.read_csv('X_test.csv')

indices_test = np.array(X_test_data)[:,0]
X_test = np.array(X_test_data)[:,1:]
y_train = np.array(y_train_data)[:,1]
X_train = np.array(X_train_data)[:,1:]

## 1. Imputation of Missing Values
* [Reference](https://scikit-learn.org/stable/modules/impute.html)
* We use median of column instead of mean

In [80]:
X_train_missing_indices = X_train[X_train==np]
X_train = fill_missing_values(X_train, n_neighbors=75)
X_test_processed = fill_missing_values(X_test)
print(X_train.shape)
print(X_test_processed.shape)

  X_norma = (X-X_ave)/X_std
  X_norma = (X-X_ave)/X_std


(1212, 828)
(776, 828)


## 2. Outlier Detection
* [reference_sklearn](https://scikit-learn.org/stable/modules/outlier_detection.html)
* [reference_in_detail](https://practicaldatascience.co.uk/machine-learning/how-to-use-the-isolation-forest-model-for-outlier-detection)

In [81]:
X_processed,y_processed = remove_outliers(X_train,y_train)

IsolationForest-Traing data shape before removed: (1212, 828)
IsolationForest-Traing data shape after removed: (1200, 828)


## 3. Feature Selection

In [82]:
# pearsonr feature selection
from scipy.stats import pearsonr
from sklearn.decomposition import PCA

def pearsonr_feature_selection(X, y, X_test,degree,weights):
    feature_idx =  np.array([i for i in range(0,X.shape[1])])
    correlations = {}
#     print("feature_idx: ", feature_idx.T)
    for idx in feature_idx:
        correlation = 0
        for degree_idx in range(0,len(weights)):
            correlation += abs(weights[degree_idx]*pearsonr(X[:,idx],y**degree[degree_idx])[0])
        correlations[idx] = correlation
    data_correlations = pd.DataFrame(correlations, index=['correlation']).T
    indices_desc = data_correlations['correlation'].abs().sort_values(ascending=False).index
#     print(indices_desc)
    return data_correlations,indices_desc

def pearsonr_feature_selection_correlation(X, X_test,threshold):
    feature_idx =  np.array([i for i in range(0,X.shape[1])])
    correlation_select = np.array([True for i in range(0,X.shape[1])])
    for f_idx in feature_idx:
        if(correlation_select[f_idx]):
            feature_co,indices_desc = pearsonr_feature_selection(X[:,(f_idx+1):], X[:,f_idx], X_test_processed,np.array([1]),np.array([1]))
            reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
            remove_index = reomove_features.index+(f_idx+1)
            correlation_select[remove_index] = False
    
    return X[:,correlation_select],X_test[:,correlation_select]

In [83]:
def LGB_feature_selection(X_processed,X_test_processed,y_processed):
    data_relation,indices_desc = pearsonr_feature_selection(X_processed, y_processed, X_test_processed,np.array([1,-3,3]),np.array([0.9,0.05,0.05]))
    selected_features = data_relation.loc[indices_desc][abs(data_relation['correlation']) >= 0.025]
    X_selection = X_processed[:,selected_features.index]
    X_test_selection = X_test_processed[:,selected_features.index]
    X_selection_tree, X_test_selection_tree = select_features(X_selection, y_processed, X_test_selection,feature_num = 105)
    X_selection_tree_cor, X_test_selection_tree_cor = pearsonr_feature_selection_correlation(X_selection_tree, X_test_selection_tree,0.93)
    return X_selection_tree_cor,X_test_selection_tree_cor

In [89]:
def GP_feature_selection(X_processed,X_test_processed,y_processed, tree_num = 43):
    data_relation,indices_desc = pearsonr_feature_selection(X_processed, y_processed, X_test_processed,np.array([1,-3,3]),np.array([0.95,0.025,0.025]))
    selected_features = data_relation.loc[indices_desc][abs(data_relation['correlation']) >= 0.001]
    X_selection = X_processed[:,selected_features.index]
    X_test_selection = X_test_processed[:,selected_features.index]
    X_selection_tree, X_test_selection_tree = select_features(X_selection, y_processed, X_test_selection,feature_num = tree_num)
    X_selection_tree_cor, X_test_selection_tree_cor = pearsonr_feature_selection_correlation(X_selection_tree, X_test_selection_tree,0.96)
    return X_selection_tree_cor,X_test_selection_tree_cor

In [85]:
def GP_rbf_feature_selection(X_processed,X_test_processed,y_processed):
    data_relation,indices_desc = pearsonr_feature_selection(X_processed, y_processed, X_test_processed,np.array([1,-3,3]),np.array([0.95,0.025,0.025]))
    selected_features = data_relation.loc[indices_desc][abs(data_relation['correlation']) >= 0.001]
    X_selection = X_processed[:,selected_features.index]
    X_test_selection = X_test_processed[:,selected_features.index]
    X_selection_tree, X_test_selection_tree = select_features(X_selection, y_processed, X_test_selection,feature_num = 54)
    X_selection_tree_cor, X_test_selection_tree_cor = pearsonr_feature_selection_correlation(X_selection_tree, X_test_selection_tree,0.96)
    return X_selection_tree_cor,X_test_selection_tree_cor

In [86]:
def Tree_feature_selection(X_processed,X_test_processed,y_processed):
    data_relation,indices_desc = pearsonr_feature_selection(X_processed, y_processed, X_test_processed,np.array([1,-3,3]),np.array([0.95,0.025,0.025]))
    selected_features = data_relation.loc[indices_desc][abs(data_relation['correlation']) >= 0.1]
    X_selection = X_processed[:,selected_features.index]
    X_test_selection = X_test_processed[:,selected_features.index]
    X_selection_tree, X_test_selection_tree = select_features(X_selection, y_processed, X_test_selection,feature_num = 50, n_estimators= 100)
    X_selection_tree_cor, X_test_selection_tree_cor = pearsonr_feature_selection_correlation(X_selection_tree, X_test_selection_tree,0.96)
    return X_selection_tree_cor,X_test_selection_tree_cor

In [87]:
X_lgb,X_test_lgb = LGB_feature_selection(X_processed,X_test_processed,y_processed)
print(X_lgb.shape)
print(X_test_lgb.shape)

  selected_features = data_relation.loc[indices_desc][abs(data_relation['correlation']) >= 0.025]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_des

  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][

(1200, 98)
(776, 98)


  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][

In [91]:
X_gp_quad,X_test_gp_quad = GP_feature_selection(X_processed,X_test_processed,y_processed)
print(X_gp_quad.shape)
print(X_test_gp_quad.shape)

  selected_features = data_relation.loc[indices_desc][abs(data_relation['correlation']) >= 0.001]


(1200, 42)
(776, 42)


  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][

In [92]:
X_gp_rbf,X_test_gp_rbf = GP_rbf_feature_selection(X_processed,X_test_processed,y_processed)
print(X_gp_rbf.shape)
print(X_test_rbf.shape)

  selected_features = data_relation.loc[indices_desc][abs(data_relation['correlation']) >= 0.001]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_des

(1200, 50)
(776, 50)


  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]


In [51]:
x_tree,X_test_tree = Tree_feature_selection(X_processed,X_test_processed,y_processed)
print(x_tree.shape)
print(X_test_tree.shape)

  selected_features = data_relation.loc[indices_desc][abs(data_relation['correlation']) >= 0.1]


(1200, 49)
(776, 49)


  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][

## 4. Gaussian Process and lgb

In [93]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, DotProduct, WhiteKernel

def custom_r2(prediction, train_data):
    """Regular r2 cost function returned as a tuple to be used with lgb"""
    labels = train_data.get_label()
    return 'r2', r2_score(labels, prediction), True

def fit_model_and_pred(X_train, y_train, X_val, y_val, X_test):
#     kernel = RBF(0.5, (1e-4, 10))
#     gpr = GaussianProcessRegressor(kernel=kernel, random_state=0, n_restarts_optimizer=1, alpha=0.07, normalize_y=True)
#     gpr.fit(X_train, y_train)
#     score = gpr.score(X_val, y_val)   
#     y_pred = gpr.predict(X_test) 
    kernel = RationalQuadratic(length_scale=0.5, alpha=1.0, length_scale_bounds=(1e-4, 10))
    gpr = GaussianProcessRegressor(kernel=kernel, random_state=0, n_restarts_optimizer=1, alpha=0.07, normalize_y=True)
    gpr.fit(X_train, y_train)
    score = gpr.score(X_val, y_val)
    y_val_pred = gpr.predict(X_val)   
    y_pred = gpr.predict(X_test)
    
    return score, y_pred

def fit_model_and_pred_gp2(X_train, y_train, X_val, y_val, X_test):
    kernel = RBF(0.5, (1e-4, 10))
    gpr = GaussianProcessRegressor(kernel=kernel, random_state=0, n_restarts_optimizer=1, alpha=0.07, normalize_y=True)
    gpr.fit(X_train, y_train)
    score = gpr.score(X_val, y_val)   
    y_pred = gpr.predict(X_test) 
    
    return score, y_pred

def fit_model_and_pred_tree(X_train, y_train, X_val, y_val, X_test, random_state=0):
    model = ExtraTreesRegressor(n_jobs=1, max_depth=None, n_estimators=190, random_state=random_state, min_samples_split=3, max_features=None)
    model.fit(X_train, y_train)
    
    y_val_pred = model.predict(X_val)
    score = r2_score(y_val, y_val_pred)
    
    y_pred = model.predict(X_test) 
    
    return score, y_pred

def fit_model_and_pred_lgb(X_train, y_train, X_val, y_val, X_test):
   
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'num_leaves': 1800,
        'learning_rate': 0.025,
        'max_depth': 11,
        'n_estimators': 1000,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'num_iterations':1000,
    }  
    
    
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=500,
                    feval=custom_r2,
                    valid_sets={lgb_train, lgb_eval},
                    early_stopping_rounds=100,
                    verbose_eval=False
                   )
    
    y_val_pred = gbm.predict(X_val)
    score = r2_score(y_val, y_val_pred)
    y_pred = gbm.predict(X_test) 
    return score, y_pred

def train_k_fold_combine(X_lgb,X_test_lgb,X_gp,X_test_gp,X_tree,X_test_tree, y , weight1 = 0.7, weight2 = 0.75, fold_num=10):
    kf = KFold(n_splits=fold_num, shuffle=False)
    kf.get_n_splits(X_lgb)
    test_score = 0.0
    train_score = 0.0
    y_test_pred = np.zeros((X_test_lgb.shape[0]))
#     print("shape of y_test_pred:", y_test_pred.shape)
    i = 0
    for train_index, test_index in kf.split(X_lgb):
        X_lgb_train, X_lgb_val = X_lgb[train_index], X_lgb[test_index]
        X_gp_train, X_gp_val = X_gp[train_index], X_gp[test_index]
        X_tree_train, X_tree_val = X_tree[train_index], X_tree[test_index]
        y_train, y_val = y[train_index], y[test_index]
        

        _, y_pred1 = fit_model_and_pred(X_gp_train, y_train, X_gp_val, y_val, X_gp_val)
        _, y_pred2 = fit_model_and_pred_lgb(X_lgb_train, y_train, X_lgb_val, y_val, X_lgb_val)
        _, y_pred3 = fit_model_and_pred_tree(X_tree_train, y_train, X_tree_val, y_val, X_tree_val)
        y_pred = weight2*(weight1*y_pred1 + (1-weight1)*y_pred2)+(1-weight2)*y_pred3

        _, y_test_pred1 = fit_model_and_pred(X_gp_train, y_train, X_gp_val, y_val, X_test_gp)
        _, y_test_pred2 = fit_model_and_pred_lgb(X_lgb_train, y_train, X_lgb_val, y_val, X_test_lgb)
        _, y_test_pred3 = fit_model_and_pred_tree(X_tree_train, y_train, X_tree_val, y_val, X_test_tree)
        y_test_pred += weight2*(weight1*y_test_pred1 + (1-weight1)*y_test_pred2)+(1-weight2)*y_test_pred3
        
        score = r2_score(y_val, y_pred)
        print(i+1, 'th . the obtained validation r2 score is : ',score)
        test_score += score
        i += 1

    return test_score/fold_num,y_test_pred/fold_num

def train_k_fold_2GP_LGB(X_lgb,X_test_lgb,X_gp,X_test_gp,X_tree,X_test_tree, y , weight1 = 0.5, weight2 = 0.75, fold_num=10):
# weight1 is the ratio of gp_rbf to lgb, weight2 is the combine of previous to gp.quadric
    kf = KFold(n_splits=fold_num, shuffle=False)
    kf.get_n_splits(X_lgb)
    test_score = 0.0
    train_score = 0.0
    y_test_pred = np.zeros((X_test_lgb.shape[0]))
#     print("shape of y_test_pred:", y_test_pred.shape)
    i = 0
    for train_index, test_index in kf.split(X_lgb):
        X_lgb_train, X_lgb_val = X_lgb[train_index], X_lgb[test_index]
        X_gp_train, X_gp_val = X_gp[train_index], X_gp[test_index]
        X_tree_train, X_tree_val = X_tree[train_index], X_tree[test_index]
        y_train, y_val = y[train_index], y[test_index]
        

        _, y_pred1 = fit_model_and_pred(X_gp_train, y_train, X_gp_val, y_val, X_gp_val)
        _, y_pred2 = fit_model_and_pred_lgb(X_lgb_train, y_train, X_lgb_val, y_val, X_lgb_val)
        _, y_pred3 = fit_model_and_pred_gp2(X_tree_train, y_train, X_tree_val, y_val, X_tree_val)
        y_pred = weight2*(weight1*y_pred3+(1-weight1)*y_pred2) + (1-weight2)*y_pred1

        _, y_test_pred1 = fit_model_and_pred(X_gp_train, y_train, X_gp_val, y_val, X_test_gp)
        _, y_test_pred2 = fit_model_and_pred_lgb(X_lgb_train, y_train, X_lgb_val, y_val, X_test_lgb)
        _, y_test_pred3 = fit_model_and_pred_gp2(X_tree_train, y_train, X_tree_val, y_val, X_test_tree)
        y_test_pred += weight2*(weight1*y_test_pred3+(1-weight1)*y_test_pred2) + (1-weight2)*y_test_pred1
        
        score = r2_score(y_val, y_pred)
        print(i+1, 'th . the obtained validation r2 score is : ',score)
        test_score += score
        i += 1

    return test_score/fold_num,y_test_pred/fold_num

def train_k_fold_predict(X, y, X_test, fold_num=10):
    kf = KFold(n_splits=fold_num)
    kf.get_n_splits(X)
    y_test_predict = np.zeros(X_test.shape[0])
    cnt = 0
    val_score = 0.0
    train_score = 0.0
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        score, y_pred = fit_model_and_pred(X_train, y_train, X_val, y_val, X_test) 
        val_score += score
        y_test_predict += y_pred
        
        score, _ = fit_model_and_pred(X_train, y_train, X_train, y_train, X_test)
        train_score += score
    return train_score/fold_num, val_score/fold_num, y_test_predict/fold_num

In [94]:
score,prediction = train_k_fold_2GP_LGB(X_lgb,X_test_lgb,X_gp_quad,X_test_gp_quad,X_gp_rbf,X_test_gp_rbf, y_processed , weight1 = 0.578, weight2 = 0.68, fold_num=10)







1 th . the obtained validation r2 score is :  0.792082806872864








2 th . the obtained validation r2 score is :  0.8007089386194455








3 th . the obtained validation r2 score is :  0.6230468385931184








4 th . the obtained validation r2 score is :  0.7362094459018682








5 th . the obtained validation r2 score is :  0.6766446576609021








6 th . the obtained validation r2 score is :  0.6814799387934302








7 th . the obtained validation r2 score is :  0.6165303650935852








8 th . the obtained validation r2 score is :  0.6453161026346551








9 th . the obtained validation r2 score is :  0.7572087813010993








10 th . the obtained validation r2 score is :  0.6167032358834279


In [95]:
score

0.6945931111354395

In [96]:
prediction

array([67.69633865, 69.24752423, 68.88168906, 70.540496  , 70.94428903,
       74.22264261, 62.45109883, 59.98574933, 80.34934564, 78.5080765 ,
       57.87408918, 80.63058967, 72.72791523, 81.55647858, 55.27308652,
       80.36900814, 69.87395007, 74.17983802, 71.66095016, 69.66490821,
       69.79135965, 65.91879067, 80.05247976, 75.41781537, 62.34961265,
       70.10909757, 54.94007516, 71.46661414, 66.02635235, 73.69749874,
       64.09343384, 69.62229206, 57.50310361, 71.13925372, 69.02689627,
       65.60375219, 74.64776384, 55.98263182, 71.58994496, 69.63315581,
       71.60080553, 74.68406993, 71.81584727, 79.51244508, 72.7932702 ,
       68.53453198, 69.06964882, 72.82826439, 77.15070854, 70.0993279 ,
       76.85037346, 66.11151894, 61.06773032, 82.63447171, 76.68939058,
       72.7701355 , 64.67581883, 71.51023123, 72.85647308, 60.80259538,
       65.44911451, 74.2835017 , 64.82070008, 72.68574263, 72.33558138,
       58.03099551, 69.62826399, 78.89853203, 62.24375577, 74.44

In [97]:
final_res = np.vstack((indices_test, prediction)).T
df_res = pd.DataFrame(final_res)
df_res.to_csv("our_result_rbf_quad_lgb2.csv", header = ["id", "y"], index=False)

In [61]:
# weight_trial = [0.68+0.01*i for i in range(0,15)]
# y_pred_trial = np.zeros((X_test_gp.shape[0],len(weight_trial)))
# score_trial = np.zeros(len(weight_trial))

# for i in range(0,len(weight_trial)):
#     score_trial[i],y_pred_trial[:,i] = train_k_fold_2GP_LGB(X_lgb,X_test_lgb,X_gp_quad,X_test_gp_quad,X_gp_rbf,X_test_gp_rbf, y_processed , weight1 = 0.578, weight2 = weight_trial[i], fold_num=10)







1 th . the obtained validation r2 score is :  0.7913689644235541








2 th . the obtained validation r2 score is :  0.8008280325426509








3 th . the obtained validation r2 score is :  0.6224255988049037








4 th . the obtained validation r2 score is :  0.735493737943419








5 th . the obtained validation r2 score is :  0.6763323915532709








6 th . the obtained validation r2 score is :  0.6816949487880756








7 th . the obtained validation r2 score is :  0.6160961048599536








8 th . the obtained validation r2 score is :  0.6435453994012088








9 th . the obtained validation r2 score is :  0.7571429170946076








10 th . the obtained validation r2 score is :  0.6159740476033899








1 th . the obtained validation r2 score is :  0.7913025429528895








2 th . the obtained validation r2 score is :  0.800962443145826








3 th . the obtained validation r2 score is :  0.6227631521838368








4 th . the obtained validation r2 score is :  0.7351908962018977








5 th . the obtained validation r2 score is :  0.6763005891206646








6 th . the obtained validation r2 score is :  0.6816866001876283








7 th . the obtained validation r2 score is :  0.6160652780690132








8 th . the obtained validation r2 score is :  0.6434624949744063








9 th . the obtained validation r2 score is :  0.7571317835946907








10 th . the obtained validation r2 score is :  0.616076607245754








1 th . the obtained validation r2 score is :  0.7912309573470453








2 th . the obtained validation r2 score is :  0.8010925546001458








3 th . the obtained validation r2 score is :  0.6230955928961857








4 th . the obtained validation r2 score is :  0.7348832267943254








5 th . the obtained validation r2 score is :  0.6762641179359961








6 th . the obtained validation r2 score is :  0.6816750705849519








7 th . the obtained validation r2 score is :  0.6160301761246254








8 th . the obtained validation r2 score is :  0.6433750926001698








9 th . the obtained validation r2 score is :  0.7571164934814382








10 th . the obtained validation r2 score is :  0.6161748501891255








1 th . the obtained validation r2 score is :  0.7911542076060216








2 th . the obtained validation r2 score is :  0.8012183669056104








3 th . the obtained validation r2 score is :  0.6234229209419504








4 th . the obtained validation r2 score is :  0.7345707297207017








5 th . the obtained validation r2 score is :  0.6762229779992657








6 th . the obtained validation r2 score is :  0.6816603599800468








7 th . the obtained validation r2 score is :  0.6159907990267901








8 th . the obtained validation r2 score is :  0.6432831922784997








9 th . the obtained validation r2 score is :  0.7570970467548501








10 th . the obtained validation r2 score is :  0.616268776433505








1 th . the obtained validation r2 score is :  0.7910722937298182








2 th . the obtained validation r2 score is :  0.8013398800622197








3 th . the obtained validation r2 score is :  0.623745136321131








4 th . the obtained validation r2 score is :  0.734253404981027








5 th . the obtained validation r2 score is :  0.6761771693104732








6 th . the obtained validation r2 score is :  0.6816424683729128








7 th . the obtained validation r2 score is :  0.6159471467755073








8 th . the obtained validation r2 score is :  0.6431867940093954








9 th . the obtained validation r2 score is :  0.7570734434149267








10 th . the obtained validation r2 score is :  0.6163583859788924








1 th . the obtained validation r2 score is :  0.7909852157184354








2 th . the obtained validation r2 score is :  0.8014570940699737








3 th . the obtained validation r2 score is :  0.6240622390337275








4 th . the obtained validation r2 score is :  0.7339312525753008








5 th . the obtained validation r2 score is :  0.6761266918696187








6 th . the obtained validation r2 score is :  0.68162139576355








7 th . the obtained validation r2 score is :  0.6158992193707771








8 th . the obtained validation r2 score is :  0.6430858977928573








9 th . the obtained validation r2 score is :  0.7570456834616677








10 th . the obtained validation r2 score is :  0.6164436788252877








1 th . the obtained validation r2 score is :  0.790892973571873








2 th . the obtained validation r2 score is :  0.8015700089288723








3 th . the obtained validation r2 score is :  0.6243742290797396








4 th . the obtained validation r2 score is :  0.7336042725035234








5 th . the obtained validation r2 score is :  0.6760715456767021








6 th . the obtained validation r2 score is :  0.6815971421519582








7 th . the obtained validation r2 score is :  0.6158470168125993








8 th . the obtained validation r2 score is :  0.6429805036288854








9 th . the obtained validation r2 score is :  0.7570137668950734








10 th . the obtained validation r2 score is :  0.6165246549726906








1 th . the obtained validation r2 score is :  0.790795567290131








2 th . the obtained validation r2 score is :  0.8016786246389158








3 th . the obtained validation r2 score is :  0.6246811064591677








4 th . the obtained validation r2 score is :  0.7332724647656951








5 th . the obtained validation r2 score is :  0.6760117307317235








6 th . the obtained validation r2 score is :  0.6815697075381375








7 th . the obtained validation r2 score is :  0.615790539100974








8 th . the obtained validation r2 score is :  0.6428706115174796








9 th . the obtained validation r2 score is :  0.7569776937151436








10 th . the obtained validation r2 score is :  0.6166013144211017








1 th . the obtained validation r2 score is :  0.7906929968732095








2 th . the obtained validation r2 score is :  0.8017829412001038








3 th . the obtained validation r2 score is :  0.6249828711720113








4 th . the obtained validation r2 score is :  0.7329358293618153








5 th . the obtained validation r2 score is :  0.6759472470346829








6 th . the obtained validation r2 score is :  0.681539091922088








7 th . the obtained validation r2 score is :  0.6157297862359012








8 th . the obtained validation r2 score is :  0.6427562214586398








9 th . the obtained validation r2 score is :  0.7569374639218782








10 th . the obtained validation r2 score is :  0.6166736571705203








1 th . the obtained validation r2 score is :  0.7905852623211085








2 th . the obtained validation r2 score is :  0.8018829586124367








3 th . the obtained validation r2 score is :  0.6252795232182711








4 th . the obtained validation r2 score is :  0.7325943662918843








5 th . the obtained validation r2 score is :  0.6758780945855802








6 th . the obtained validation r2 score is :  0.6815052953038095








7 th . the obtained validation r2 score is :  0.6156647582173809








8 th . the obtained validation r2 score is :  0.6426373334523661








9 th . the obtained validation r2 score is :  0.7568930775152773








10 th . the obtained validation r2 score is :  0.6167416832209469








1 th . the obtained validation r2 score is :  0.7904723636338278








2 th . the obtained validation r2 score is :  0.8019786768759143








3 th . the obtained validation r2 score is :  0.6255710625979465








4 th . the obtained validation r2 score is :  0.7322480755559023








5 th . the obtained validation r2 score is :  0.6758042733844154








6 th . the obtained validation r2 score is :  0.6814683176833023








7 th . the obtained validation r2 score is :  0.6155954550454131








8 th . the obtained validation r2 score is :  0.6425139474986586








9 th . the obtained validation r2 score is :  0.7568445344953412








10 th . the obtained validation r2 score is :  0.6168053925723813








1 th . the obtained validation r2 score is :  0.7903543008113676








2 th . the obtained validation r2 score is :  0.8020700959905365








3 th . the obtained validation r2 score is :  0.6258574893110378








4 th . the obtained validation r2 score is :  0.7318969571538689








5 th . the obtained validation r2 score is :  0.6757257834311887








6 th . the obtained validation r2 score is :  0.6814281590605661








7 th . the obtained validation r2 score is :  0.6155218767199979








8 th . the obtained validation r2 score is :  0.6423860635975172








9 th . the obtained validation r2 score is :  0.7567918348620695








10 th . the obtained validation r2 score is :  0.6168647852248237








1 th . the obtained validation r2 score is :  0.7902310738537279








2 th . the obtained validation r2 score is :  0.8021572159563033








3 th . the obtained validation r2 score is :  0.6261388033575447








4 th . the obtained validation r2 score is :  0.7315410110857845








5 th . the obtained validation r2 score is :  0.6756426247258998








6 th . the obtained validation r2 score is :  0.6813848194356009








7 th . the obtained validation r2 score is :  0.6154440232411351








8 th . the obtained validation r2 score is :  0.6422536817489418








9 th . the obtained validation r2 score is :  0.7567349786154622








10 th . the obtained validation r2 score is :  0.6169198611782737








1 th . the obtained validation r2 score is :  0.7901026827609087








2 th . the obtained validation r2 score is :  0.802240036773215








3 th . the obtained validation r2 score is :  0.6264150047374675








4 th . the obtained validation r2 score is :  0.7311802373516485








5 th . the obtained validation r2 score is :  0.6755547972685487








6 th . the obtained validation r2 score is :  0.681338298808407








7 th . the obtained validation r2 score is :  0.6153618946088247








8 th . the obtained validation r2 score is :  0.6421168019529326








9 th . the obtained validation r2 score is :  0.7566739657555197








10 th . the obtained validation r2 score is :  0.6169706204327319








1 th . the obtained validation r2 score is :  0.7899691275329098








2 th . the obtained validation r2 score is :  0.8023185584412714








3 th . the obtained validation r2 score is :  0.6266860934508061








4 th . the obtained validation r2 score is :  0.7308146359514613








5 th . the obtained validation r2 score is :  0.6754623010591359








6 th . the obtained validation r2 score is :  0.6812885971789843








7 th . the obtained validation r2 score is :  0.6152754908230669








8 th . the obtained validation r2 score is :  0.6419754242094894








9 th . the obtained validation r2 score is :  0.7566087962822414








10 th . the obtained validation r2 score is :  0.6170170629881975


In [69]:
# feature_trial = [43 ]
# y_pred_trial = np.zeros((X_test_gp.shape[0],len(feature_trial)))
# score_trial = np.zeros(len(weight_trial))

# for i in range(0,len(weight_trial)):
#     X_gp_quad,X_test_gp_quad = GP_feature_selection(X_processed,X_test_processed,y_processed, tree_num = feature_trial[i])
#     score_trial[i],y_pred_trial[:,i] = train_k_fold_2GP_LGB(X_lgb,X_test_lgb,X_gp_quad,X_test_gp_quad,X_gp_rbf,X_test_gp_rbf, y_processed , weight1 = 0.578, weight2 = 0.69, fold_num=10)


  selected_features = data_relation.loc[indices_desc][abs(data_relation['correlation']) >= 0.001]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
  reomove_features = feature_co.loc[indices_des







1 th . the obtained validation r2 score is :  0.7919953181106536








2 th . the obtained validation r2 score is :  0.8008487384764256








3 th . the obtained validation r2 score is :  0.6233681317783641








4 th . the obtained validation r2 score is :  0.7358868431378822








5 th . the obtained validation r2 score is :  0.6766072806967387








6 th . the obtained validation r2 score is :  0.6814800303741555








7 th . the obtained validation r2 score is :  0.6164911690261252








8 th . the obtained validation r2 score is :  0.645188691363417








9 th . the obtained validation r2 score is :  0.7571980454147821








10 th . the obtained validation r2 score is :  0.6167866431832831


IndexError: list index out of range

In [48]:
# weight_trial = [0.57+0.002*i for i in range(0,6)]
# y_pred_trial = np.zeros((X_test_gp.shape[0],len(weight_trial)))
# score_trial = np.zeros(len(weight_trial))

# for i in range(0,len(weight_trial)):
#     score_trial[i],y_pred_trial[:,i] = train_k_fold_combine(X_lgb,X_test_lgb,X_gp,X_test_gp, y_processed , weight_trial[i], fold_num=10)

































































































































































































































































































































































































































































































Experiment shows that the ratio of GP(with RBF kernals) to lgb is 0.578, the ratio of combination of above 2 to GP(quadratic) is 0.68. Experiment shows that good feature num for GP with quadratic is 43-45