In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import (SimpleImputer,KNNImputer)
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.neighbors import LocalOutlierFactor

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import lightgbm as lgb

from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings('ignore')


import skopt


%matplotlib inline

In [2]:
def fill_missing_values(X, n_neighbors = 75, method="KNN"): 
    
    # normalization
    X_std = np.nanstd(X,axis=0,keepdims=True)
    X_ave = np.nanmean(X,axis=0,keepdims=True)
    X_norma = (X-X_ave)/X_std
    
    # use KNNImputer
    imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights = 'distance') if method=="KNN"\
        else SimpleImputer(missing_values=np.nan, strategy='median')
    
    X_norma_fixed = imputer.fit_transform(X_norma)
    
    return X_norma_fixed

def remove_outliers(X, y):
    print("IsolationForest-Traing data shape before removed: {}".format(X.shape))
    Z = np.c_[X, y]
    iforest = IsolationForest(max_samples=200, random_state=1, contamination=0.005)
    iforest.fit(Z)
    iforest_outlier_pred = iforest.predict(Z)
    
    Z = np.c_[X, y]
    local = LocalOutlierFactor(n_neighbors=150, contamination=0.005)
    local.fit(Z)
    local_outlier_pred = local.fit_predict(Z)

    mask = np.logical_and((iforest_outlier_pred!=-1), (local_outlier_pred!=-1))
    X , y = X[mask, :], y[mask]
    print("IsolationForest-Traing data shape after removed: {}".format(X.shape))
    return X, y

def select_features(X, y, X_test, feature_num=50):
    rf = RandomForestRegressor(n_jobs=-1, n_estimators=80, random_state=1)
    rf.fit(X, y)
    indices = np.asarray(list(rf.feature_importances_)).argsort()[-feature_num:][::-1]
    
    X = np.take(X, indices, axis = 1)
    X_test = np.take(X_test, indices, axis = 1)
    return X, X_test

In [3]:
X_train_data = pd.read_csv('X_train.csv')
y_train_data = pd.read_csv('y_train.csv')
X_test_data = pd.read_csv('X_test.csv')

indices_test = np.array(X_test_data)[:,0]
X_test = np.array(X_test_data)[:,1:]
y_train = np.array(y_train_data)[:,1]
X_train = np.array(X_train_data)[:,1:]

## 1. Imputation of Missing Values
* [Reference](https://scikit-learn.org/stable/modules/impute.html)
* We use median of column instead of mean

In [4]:
X_train_missing_indices = X_train[X_train==np]
X_train = fill_missing_values(X_train, n_neighbors=75)
X_test = fill_missing_values(X_test)
print(X_train.shape)
print(X_test.shape)

(1212, 828)
(776, 828)


## 2. Outlier Detection
* [reference_sklearn](https://scikit-learn.org/stable/modules/outlier_detection.html)
* [reference_in_detail](https://practicaldatascience.co.uk/machine-learning/how-to-use-the-isolation-forest-model-for-outlier-detection)

In [5]:
X_train,y_train = remove_outliers(X_train,y_train)

IsolationForest-Traing data shape before removed: (1212, 828)
IsolationForest-Traing data shape after removed: (1199, 828)


## 3. Feature Selection

In [6]:
X_train, X_test = select_features(X_train, y_train, X_test,feature_num = 50)
print("Traing data shape after selection: {}".format(X_train.shape))
print("Testing data shape after selection: {}".format(X_test.shape))

Traing data shape after selection: (1199, 50)
Testing data shape after selection: (776, 50)


## 4. LGB

In [7]:
def custom_r2(prediction, train_data):
    """Regular r2 cost function returned as a tuple to be used with lgb"""
    labels = train_data.get_label()
    return 'r2', r2_score(labels, prediction), True

FIXED_PARAMS={
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'verbose': -1,
}
SEARCH_PARAMS = {
    # overfitting
    'lambda_l1':0.0,
    'lambda_l2':0.0,
    # control the model complexity
    'num_leaves': 1700,
    'max_depth': 11,
    # randomly select feature to be used for each round
    'feature_fraction': 0.5,
    # percentage of rows used per tree building iteration
    'subsample': 0.8,
    # Small max_bin causes faster speed and large value improves accuracy
    'max_bin':255,
    # recommended to use smaller learning rate with larger iteration
    'learning_rate': 0.025,
    'num_iterations':600,
    # Start with a lower number of trees to build a baseline and increase it later
    'n_estimators': 1000,
        
    'min_child_weight': 1,
    'colsample_bytree': 0.8,
        
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}
    

def fit_model_and_pred(X_train, y_train, X_val, y_val, X_test, params):

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)
    
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=params['num_iterations'],
                    feval=custom_r2,
                    valid_sets={lgb_train, lgb_eval},
                    early_stopping_rounds=params['num_iterations']//10,
                    verbose_eval=False
                   )
    
    y_val_pred = gbm.predict(X_val)
    score = r2_score(y_val, y_val_pred)
    y_pred = gbm.predict(X_test) 
    
    return score, y_pred

def train_k_fold(X, y, search_params, fold_num=10):
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=False)
    kf.get_n_splits(X)
    test_score = 0.0
    params = {
        'task': FIXED_PARAMS['task'],
        'boosting_type': FIXED_PARAMS['boosting_type'],
        'objective': FIXED_PARAMS['objective'],
        'verbose': FIXED_PARAMS['verbose'],
        **search_params
    }
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        score, y_pred = fit_model_and_pred(X_train, y_train, X_val, y_val, X_val, params)
        test_score += score
    print('The obtained validation r2 score is : ',test_score/fold_num)
    return test_score/fold_num



In [8]:
SPACE = [
    skopt.space.Real(0.01, 0.05, name='lambda_l1', prior='log-uniform'),
    skopt.space.Real(0.01, 0.05, name='lambda_l2', prior='log-uniform'),
    skopt.space.Real(0.4, 0.6, name='feature_fraction', prior='log-uniform'),
    skopt.space.Real(0.7, 0.9, name='subsample', prior='log-uniform'),
    skopt.space.Real(0.01, 0.05, name='learning_rate', prior='log-uniform'),
    
    skopt.space.Integer(1600, 1800, name='num_leaves', prior='uniform'),
    skopt.space.Integer(8, 14, name='max_depth', prior='uniform'),
    skopt.space.Integer(400, 1000, name='num_iterations', prior='uniform'),
    skopt.space.Integer(700, 1300, name='n_estimators', prior='uniform'),

]
@skopt.utils.use_named_args(SPACE)
def objective(**params):
   return -1.0 * train_k_fold(X_train, y_train, params)

results = skopt.forest_minimize(objective, SPACE, 
                                n_calls=1000, n_random_starts=10,)

The obtained validation r2 score is :  0.6258885111157608
The obtained validation r2 score is :  0.6337959358946312
The obtained validation r2 score is :  0.6320824100468916
The obtained validation r2 score is :  0.6253049549416712
The obtained validation r2 score is :  0.630025337741556
The obtained validation r2 score is :  0.6258414492909632
The obtained validation r2 score is :  0.6328355159791341
The obtained validation r2 score is :  0.6293513422387409
The obtained validation r2 score is :  0.63425587387056
The obtained validation r2 score is :  0.6331769641517645
The obtained validation r2 score is :  0.6302406338721571
The obtained validation r2 score is :  0.6312758305537515
The obtained validation r2 score is :  0.6276730953190339
The obtained validation r2 score is :  0.6335099600425739
The obtained validation r2 score is :  0.6295828789325556
The obtained validation r2 score is :  0.631092529134732
The obtained validation r2 score is :  0.6295345585620542
The obtained valid

The obtained validation r2 score is :  0.6333737897972211
The obtained validation r2 score is :  0.6272496358221242
The obtained validation r2 score is :  0.6297362092956241
The obtained validation r2 score is :  0.6288378379428846
The obtained validation r2 score is :  0.6264593355823923
The obtained validation r2 score is :  0.632197109983927
The obtained validation r2 score is :  0.6276759868137092
The obtained validation r2 score is :  0.6274302154843755
The obtained validation r2 score is :  0.6341499819935008
The obtained validation r2 score is :  0.6372053831979758
The obtained validation r2 score is :  0.6302143703623322
The obtained validation r2 score is :  0.6312428311064844
The obtained validation r2 score is :  0.6253578700442541
The obtained validation r2 score is :  0.6284009438785662
The obtained validation r2 score is :  0.6336175126484572
The obtained validation r2 score is :  0.6360407601035758
The obtained validation r2 score is :  0.6343284674133634
The obtained va

The obtained validation r2 score is :  0.6342165161459006
The obtained validation r2 score is :  0.6342417385292871
The obtained validation r2 score is :  0.6358635519110966
The obtained validation r2 score is :  0.62714812460848
The obtained validation r2 score is :  0.632551498827476
The obtained validation r2 score is :  0.635884858980701
The obtained validation r2 score is :  0.6329897097243495
The obtained validation r2 score is :  0.6336106486604727
The obtained validation r2 score is :  0.6347075332091527
The obtained validation r2 score is :  0.6313347933659159
The obtained validation r2 score is :  0.6366249499025677
The obtained validation r2 score is :  0.6296312409243863
The obtained validation r2 score is :  0.6306150042758822
The obtained validation r2 score is :  0.6294061310937205
The obtained validation r2 score is :  0.6325051709129149
The obtained validation r2 score is :  0.6359194106495355
The obtained validation r2 score is :  0.6349114037730237
The obtained valid

The obtained validation r2 score is :  0.6292572593422103
The obtained validation r2 score is :  0.6328021967095697
The obtained validation r2 score is :  0.6353264867052668
The obtained validation r2 score is :  0.6337749715375317
The obtained validation r2 score is :  0.6344578978715014
The obtained validation r2 score is :  0.6346068719774374
The obtained validation r2 score is :  0.6363171232320977
The obtained validation r2 score is :  0.6283126004506668
The obtained validation r2 score is :  0.6342391997726596
The obtained validation r2 score is :  0.6310391626482863
The obtained validation r2 score is :  0.6288686602840718
The obtained validation r2 score is :  0.6352562168043936
The obtained validation r2 score is :  0.6325970927251944
The obtained validation r2 score is :  0.6337237111935861
The obtained validation r2 score is :  0.6374619247690132
The obtained validation r2 score is :  0.6257393414997235
The obtained validation r2 score is :  0.6300870530621874
The obtained v

The obtained validation r2 score is :  0.6260602301540237
The obtained validation r2 score is :  0.6377461372185758
The obtained validation r2 score is :  0.6292204270330397
The obtained validation r2 score is :  0.6330628195120143
The obtained validation r2 score is :  0.6333139513439793
The obtained validation r2 score is :  0.6317630983165119
The obtained validation r2 score is :  0.6300886878034999
The obtained validation r2 score is :  0.6251745282733521
The obtained validation r2 score is :  0.6309970693951577
The obtained validation r2 score is :  0.6326240447018499
The obtained validation r2 score is :  0.6345952071575677
The obtained validation r2 score is :  0.634185423308388
The obtained validation r2 score is :  0.6301889015065822
The obtained validation r2 score is :  0.6301203787719978
The obtained validation r2 score is :  0.6256594444345037
The obtained validation r2 score is :  0.6322431389589681
The obtained validation r2 score is :  0.6349961648037641
The obtained va

The obtained validation r2 score is :  0.6321606073265841
The obtained validation r2 score is :  0.638460261879608
The obtained validation r2 score is :  0.6351293782747307
The obtained validation r2 score is :  0.6293804020643052
The obtained validation r2 score is :  0.6302748033425944
The obtained validation r2 score is :  0.6279753911717797
The obtained validation r2 score is :  0.6306912714913402
The obtained validation r2 score is :  0.6317071189604591
The obtained validation r2 score is :  0.6318045935713497
The obtained validation r2 score is :  0.6300505632822198
The obtained validation r2 score is :  0.6336054523952285
The obtained validation r2 score is :  0.6265209127494241
The obtained validation r2 score is :  0.6370955799844014
The obtained validation r2 score is :  0.6366470956328564
The obtained validation r2 score is :  0.6268586457387193
The obtained validation r2 score is :  0.6334261863494804
The obtained validation r2 score is :  0.636249030211752
The obtained val

The obtained validation r2 score is :  0.6344694644488225
The obtained validation r2 score is :  0.6240233359520546
The obtained validation r2 score is :  0.6369277154692264
The obtained validation r2 score is :  0.6332859223516041
The obtained validation r2 score is :  0.6297565023122568
The obtained validation r2 score is :  0.6334538730592133
The obtained validation r2 score is :  0.6234685264037809
The obtained validation r2 score is :  0.632057076706831
The obtained validation r2 score is :  0.6260265719100968
The obtained validation r2 score is :  0.6319610820166115
The obtained validation r2 score is :  0.6372993030509381
The obtained validation r2 score is :  0.6295534885348438
The obtained validation r2 score is :  0.6289413682555706
The obtained validation r2 score is :  0.6336630483053836
The obtained validation r2 score is :  0.6353512022075191
The obtained validation r2 score is :  0.6287393851587375
The obtained validation r2 score is :  0.6298082563143945
The obtained va

The obtained validation r2 score is :  0.6324273656142914
The obtained validation r2 score is :  0.6252023325005378
The obtained validation r2 score is :  0.6305503112798743
The obtained validation r2 score is :  0.6315487081760314
The obtained validation r2 score is :  0.6358567210397177
The obtained validation r2 score is :  0.630808054974766


In [9]:

for x in results.x:
    print(x)
    

0.022311098951506406
0.024242524864550277
0.5005532443822837
0.8587066270194157
0.04199888269541904
1760
14
483
1250


NameError: name 'results' is not defined