In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import (SimpleImputer,KNNImputer)
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.neighbors import LocalOutlierFactor

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import lightgbm as lgb

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
def normalize_data(X):
    """Standardize ``X`` along axis 0 to zero mean and unit standard deviation.

    The statistics are computed with the NaN-aware reducers, so missing
    entries are ignored when estimating mean/std and remain NaN in the
    result. A column whose standard deviation is 0 is divided by zero and
    therefore does not produce finite values.

    Args:
        X: Numeric array; statistics are taken over its first axis.

    Returns:
        Array with the same shape as ``X`` holding ``(X - mean) / std``.
    """
    column_mean = np.nanmean(X, axis=0, keepdims=True)
    column_std = np.nanstd(X, axis=0, keepdims=True)
    centered = X - column_mean
    return centered / column_std

def fill_missing_values(X, n_neighbors = 75, method="KNN"):
    """Impute the NaN entries of ``X`` and return the completed array.

    Args:
        X: Numeric array in which missing entries are encoded as ``np.nan``.
        n_neighbors: Neighbor count handed to ``KNNImputer``; not used by the
            median strategy.
        method: ``"KNN"`` selects a distance-weighted ``KNNImputer``; every
            other value falls back to a column-median ``SimpleImputer``.

    Returns:
        The output of fitting the chosen imputer on ``X`` and transforming
        ``X`` with it.
    """
    if method == "KNN":
        imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights='distance')
    else:
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    return imputer.fit_transform(X)

def remove_outliers(X, y, contamination=0.005):
    """Build a boolean mask of the rows that both outlier detectors keep.

    An ``IsolationForest`` and a ``LocalOutlierFactor`` are each run on the
    features with the target appended as an extra column. A row is kept only
    if neither detector labels it ``-1``.

    Args:
        X: Feature array without NaNs (both detectors are fitted on it).
        y: Target values, one per row of ``X``.
        contamination: Expected outlier fraction passed to both detectors.

    Returns:
        Boolean array with one entry per row; ``True`` marks an inlier.
    """
    print("IsolationForest-Traing data shape before removed: {}".format(X.shape))

    # Both detectors look at the joint (features, target) space; build it once.
    Z = np.c_[X, y]

    iforest = IsolationForest(max_samples=200, random_state=1, contamination=contamination)
    iforest_outlier_pred = iforest.fit(Z).predict(Z)

    # fit_predict already fits the model, so no separate fit() call is needed.
    local = LocalOutlierFactor(n_neighbors=150, contamination=contamination)
    local_outlier_pred = local.fit_predict(Z)

    mask = np.logical_and((iforest_outlier_pred != -1), (local_outlier_pred != -1))
    return mask

def find_missing_value_and_move_outliers(X, y, X_test):
    """Drop training outliers, then standardize and KNN-impute train and test.

    Steps:
      1. Median-impute a copy of the training data so the outlier detectors
         can run, and keep only the rows ``remove_outliers`` marks as inliers.
      2. Put the original NaNs back into the surviving training rows.
      3. Standardize train AND test with the training column mean/std.
      4. Fit one KNN imputer on the training data and apply it to both sets.

    Using the training statistics and the training-fitted imputer for the test
    set keeps both sets in the same feature space; the model is trained on
    one and predicts on the other.

    Args:
        X: Training features, NaN where a value is missing.
        y: Training targets, one per row of ``X``.
        X_test: Test features with the same columns as ``X``.

    Returns:
        ``(X_KNN, X_test_KNN, y)``: imputed training features (outlier rows
        removed), imputed test features, and the targets of the kept rows.
    """
    print("Traing data shape before impute and outlier remove: {}".format(X.shape))
    print("Testing data shape before impute and outlier remove: {}".format(X_test.shape))
    X_nan = np.isnan(X)
    X_median = fill_missing_values(X, method="median")

    IstInLiers = remove_outliers(X_median, y)
    X_median = X_median[IstInLiers, :]
    X_nan = X_nan[IstInLiers, :]
    y = y[IstInLiers]
    # The median fill was only needed for outlier detection; restore the gaps
    # so the KNN imputer below is the one that fills them.
    X_median[X_nan] = np.nan

    # Column statistics come from the training rows only.
    train_ave = np.nanmean(X_median, axis=0, keepdims=True)
    train_std = np.nanstd(X_median, axis=0, keepdims=True)
    # A zero std would give 0/0 = NaN on train but possibly +/-inf on test.
    # Marking it NaN makes such a column all-NaN in both sets, so the imputer
    # discards it consistently.
    train_std = np.where(train_std == 0, np.nan, train_std)

    X_median = (X_median - train_ave) / train_std
    X_test = (X_test - train_ave) / train_std

    # Same settings as fill_missing_values(method="KNN"), but fitted once on
    # train and reused for test.
    imputer = KNNImputer(missing_values=np.nan, n_neighbors=75, weights='distance')
    X_KNN = imputer.fit_transform(X_median)
    X_test_KNN = imputer.transform(X_test)

    print("Traing data shape after impute and outlier remove: {}".format(X_KNN.shape))
    print("Testing data shape after impute and outlier remove: {}".format(X_test_KNN.shape))

    return X_KNN, X_test_KNN, y

def select_features(X, y, X_test, feature_num=50):
    """Keep the columns a random forest ranks as most important.

    A ``RandomForestRegressor`` is fitted on ``(X, y)``; the ``feature_num``
    columns with the largest ``feature_importances_`` are selected, ordered
    from most to least important, and the same columns are taken from
    ``X_test``.

    Args:
        X: Training features.
        y: Training targets.
        X_test: Test features with the same column layout as ``X``.
        feature_num: Number of columns to keep.

    Returns:
        ``(X, X_test)`` reduced to the selected columns.
    """
    forest = RandomForestRegressor(n_jobs=-1, n_estimators=80, random_state=1)
    forest.fit(X, y)

    importances = np.asarray(list(forest.feature_importances_))
    ascending_rank = importances.argsort()
    # Tail of the ascending order = most important; flip so the best comes first.
    top_columns = ascending_rank[-feature_num:][::-1]

    X_selected = np.take(X, top_columns, axis=1)
    X_test_selected = np.take(X_test, top_columns, axis=1)
    return X_selected, X_test_selected

In [3]:
# Read the three input CSVs (same read order as before).
X_train_data, y_train_data, X_test_data = (
    pd.read_csv(csv_name) for csv_name in ('X_train.csv', 'y_train.csv', 'X_test.csv')
)

# Training arrays: column 0 of each file is skipped (presumably a row id,
# TODO confirm against the CSV headers).
X_train = np.array(X_train_data)[:, 1:]
y_train = np.array(y_train_data)[:, 1]

# Test arrays: column 0 is kept separately so it can be written out next to
# the predictions later.
X_test = np.array(X_test_data)[:, 1:]
indices_test = np.array(X_test_data)[:, 0]

## 1. Imputation of Missing Values
* [Reference](https://scikit-learn.org/stable/modules/impute.html)
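* The column median (rather than the mean) is used as a temporary fill before outlier detection; the final imputation is done with KNN inside `find_missing_value_and_move_outliers`.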

In [4]:
# X_train_missing_indices = X_train[X_train==np]
# X_train = fill_missing_values(X_train, n_neighbors=75)
# X_test = fill_missing_values(X_test)
# print(X_train.shape)
# print(X_test.shape)

## 2. Outlier Detection
* [reference_sklearn](https://scikit-learn.org/stable/modules/outlier_detection.html)
* [reference_in_detail](https://practicaldatascience.co.uk/machine-learning/how-to-use-the-isolation-forest-model-for-outlier-detection)

In [5]:
# Clean the data in one pass: outlier rows are dropped from the training set,
# and both sets are normalized and imputed.
# NOTE: this rebinds X_train / X_test / y_train, so re-running this cell
# without re-running the loading cell feeds already-processed arrays back in.
X_train, X_test, y_train = find_missing_value_and_move_outliers(X_train, y_train, X_test)

Traing data shape before impute and outlier remove: (1212, 832)
Testing data shape before impute and outlier remove: (776, 832)
IsolationForest-Traing data shape before removed: (1212, 832)
Traing data shape after impute and outlier remove: (1198, 828)
Testing data shape after impute and outlier remove: (776, 828)


## 3. Feature Selection

In [6]:
# X, X_test = feature_reduction(X, X_test,750)
# Keep the 50 columns that select_features' random forest ranks highest.
# NOTE: rebinds X_train / X_test, so this cell is not idempotent on re-run.
X_train, X_test = select_features(X_train, y_train, X_test,feature_num = 50)
print("Traing data shape after selection: {}".format(X_train.shape))
print("Testing data shape after selection: {}".format(X_test.shape))

Traing data shape after selection: (1198, 50)
Testing data shape after selection: (776, 50)


## 4. LightGBM Regression with K-Fold Validation

In [7]:
def custom_r2(prediction, train_data):
    """R^2 evaluation function in the tuple form LightGBM's ``feval`` expects.

    Args:
        prediction: Predicted values for the dataset being evaluated.
        train_data: Dataset object exposing ``get_label()`` for the true values.

    Returns:
        ``('r2', score, True)``; the trailing ``True`` marks the metric as
        higher-is-better.
    """
    score = r2_score(train_data.get_label(), prediction)
    return 'r2', score, True

def fit_model_and_pred(X_train, y_train, X_val, y_val, X_test):
    """Train a LightGBM regressor with early stopping and predict ``X_test``.

    Args:
        X_train: Features used to fit the booster.
        y_train: Targets for ``X_train``.
        X_val: Validation features (monitored for early stopping and scored).
        y_val: Targets for ``X_val``.
        X_test: Features to predict with the fitted booster.

    Returns:
        ``(score, y_pred)``: the R^2 of the booster on ``(X_val, y_val)`` and
        its predictions for ``X_test``.
    """
    # Every setting appears once, under its primary LightGBM name:
    #   * 'subsample' duplicated 'bagging_fraction' (both 0.8).
    #   * 'colsample_bytree' (0.8) is an alias of 'feature_fraction' (0.5);
    #     with both present the alias is ignored, so 0.5 is kept.
    #   * 'n_estimators' (1000) and 'num_iterations' (600) are aliases of the
    #     num_boost_round argument; giving all three left the effective round
    #     count to alias resolution. The value given under the primary name
    #     (600) is now passed once via num_boost_round.
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'num_leaves': 1700,
        'learning_rate': 0.025,
        'max_depth': 11,
        'min_child_weight': 1,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
    }

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=600,
                    feval=custom_r2,
                    # A list, not a set: the API expects an ordered list of
                    # Datasets, and a set's iteration order is arbitrary.
                    valid_sets=[lgb_train, lgb_eval],
                    early_stopping_rounds=100,
                    verbose_eval=False
                   )

    y_val_pred = gbm.predict(X_val)
    score = r2_score(y_val, y_val_pred)
    y_pred = gbm.predict(X_test)

    return score, y_pred

def train_k_fold(X, y, fold_num=10):
    """Print the per-fold and mean validation R^2 of an unshuffled K-fold run.

    Args:
        X: Feature array, indexable by integer index arrays.
        y: Target array aligned with ``X``.
        fold_num: Number of folds.

    Returns:
        None; the scores are only printed.
    """
    splitter = KFold(n_splits=fold_num, random_state=None, shuffle=False)
    score_total = 0.0

    for fit_idx, val_idx in splitter.split(X):
        fold_X, held_out_X = X[fit_idx], X[val_idx]
        fold_y, held_out_y = y[fit_idx], y[val_idx]

        # The held-out fold doubles as the "test" input; its predictions are
        # not used here.
        fold_score, _unused_pred = fit_model_and_pred(fold_X, fold_y, held_out_X, held_out_y, held_out_X)

        print('The obtained validation r2 score is : ',fold_score)
        score_total += fold_score

    print("Validation score: %f"%(score_total/fold_num))
    
# def train_k_fold_predict(X, y,X_test, fold_num=10):
#     kf = KFold(n_splits=fold_num, random_state=None, shuffle=False)
#     kf.get_n_splits(X)
#     y_test_predict = np.zeros(X_test.shape[0])
#     for train_index, test_index in kf.split(X):
#         X_train, X_val = X[train_index], X[test_index]
#         y_train, y_val = y[train_index], y[test_index]

#         y_pred = fit_model_and_pred(1, X_train, y_train, X_val, y_val, X_test)
#         y_test_predict += y_pred

#     return y_test_predict/fold_num

def train_k_fold_predict(X, y, X_test, fold_num=10, score_threshold=0.6):
    """Average the test predictions of the K-fold models that validate well.

    One model is trained per fold. Only models whose validation R^2 exceeds
    ``score_threshold`` contribute to the averaged test prediction. If no
    fold clears the threshold, the average over all folds is returned instead
    of dividing by zero (which previously produced all-NaN predictions).

    Args:
        X: Training features, indexable by integer index arrays.
        y: Training targets aligned with ``X``.
        X_test: Test features to predict.
        fold_num: Number of folds.
        score_threshold: Minimum validation R^2 (exclusive) a fold model needs
            for its predictions to be included.

    Returns:
        ``(mean_val_score, y_test_predict)``: the validation R^2 averaged over
        all folds, and the averaged test predictions.
    """
    kf = KFold(n_splits=fold_num)
    selected_sum = np.zeros(X_test.shape[0])
    all_sum = np.zeros(X_test.shape[0])
    cnt = 0
    val_score = 0.0

    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        score, y_pred = fit_model_and_pred(X_train, y_train, X_val, y_val, X_test)
        val_score += score
        # Tracked separately so there is a fallback if no fold is selected.
        all_sum += y_pred
        if score > score_threshold:
            selected_sum += y_pred
            cnt += 1

    if cnt == 0:
        print("No fold exceeded the score threshold {}; averaging all {} folds instead".format(
            score_threshold, fold_num))
        return val_score/fold_num, all_sum/fold_num
    return val_score/fold_num, selected_sum/cnt

In [8]:
# Report the per-fold and mean validation R^2; the scores are printed only.
train_k_fold(X_train,y_train, fold_num=10) 

The obtained validation r2 score is :  0.7364113349465762
The obtained validation r2 score is :  0.6799522152047222
The obtained validation r2 score is :  0.5289969640863835
The obtained validation r2 score is :  0.6593595925358319
The obtained validation r2 score is :  0.5650036656245616
The obtained validation r2 score is :  0.5866930512916614
The obtained validation r2 score is :  0.5552872917271834
The obtained validation r2 score is :  0.6467042655098647
The obtained validation r2 score is :  0.6664434781128953
The obtained validation r2 score is :  0.6211767391023032
Validation score: 0.624603


In [9]:
# Train the fold models and average their test predictions; the mean
# validation score (first return value) is discarded here.
_, y_test_pred = train_k_fold_predict(X_train, y_train, X_test)
# Two-column array: column 0 holds indices_test, column 1 the predictions.
final_res = np.vstack((indices_test, y_test_pred)).T



In [10]:
# Write the (id, prediction) pairs to our_result.csv with an "id,y" header
# and no pandas index column.
df_res = pd.DataFrame(final_res)
df_res.to_csv("our_result.csv", header = ["id", "y"], index=False)