In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import (SimpleImputer,KNNImputer)
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import lightgbm as lgb

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
def fill_missing_values(X, n_neighbors = 75, method="KNN"): 
    
    # normalization
    X_std = np.nanstd(X,axis=0,keepdims=True)
    X_ave = np.nanmean(X,axis=0,keepdims=True)
    X_norma = (X-X_ave)/X_std
    
    # use KNNImputer
    imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights = 'distance') if method=="KNN"\
        else SimpleImputer(missing_values=np.nan, strategy='median')
    
    X_norma_fixed = imputer.fit_transform(X_norma)
    
    return X_norma_fixed

def remove_outliers(X, y):
    print("IsolationForest-Traing data shape before removed: {}".format(X.shape))
    Z = np.c_[X, y]
    iforest = IsolationForest(max_samples=200, random_state=1, contamination=0.005)
    iforest.fit(Z)
    iforest_outlier_pred = iforest.predict(Z)
    
    Z = np.c_[X, y]
    local = LocalOutlierFactor(n_neighbors=150, contamination=0.005)
    local.fit(Z)
    local_outlier_pred = local.fit_predict(Z)

    mask = np.logical_and((iforest_outlier_pred!=-1), (local_outlier_pred!=-1))
    X , y = X[mask, :], y[mask]
    print("IsolationForest-Traing data shape after removed: {}".format(X.shape))
    return X, y

def select_features(X, y, X_test, feature_num=50):
    rf = RandomForestRegressor(n_jobs=-1, n_estimators=80, random_state=1)
    rf.fit(X, y)
    indices = np.asarray(list(rf.feature_importances_)).argsort()[-feature_num:][::-1]
    
    X = np.take(X, indices, axis = 1)
    X_test = np.take(X_test, indices, axis = 1)
    return X, X_test

In [3]:

X_train_data = pd.read_csv('X_train.csv')
y_train_data = pd.read_csv('y_train.csv')
X_test_data = pd.read_csv('X_test.csv')

indices_test = np.array(X_test_data)[:,0]
X_test = np.array(X_test_data)[:,1:]
y_train = np.array(y_train_data)[:,1]
X_train = np.array(X_train_data)[:,1:]

## 1. Imputation of Missing Values
* [Reference](https://scikit-learn.org/stable/modules/impute.html)
* We use median of column instead of mean

In [4]:
X_train_missing_indices = X_train[X_train==np]
X_train = fill_missing_values(X_train, n_neighbors=75)
X_test_processed = fill_missing_values(X_test)
print(X_train.shape)
print(X_test_processed.shape)

  X_norma = (X-X_ave)/X_std
  X_norma = (X-X_ave)/X_std


(1212, 828)
(776, 828)


## 2. Outlier Detection
* [reference_sklearn](https://scikit-learn.org/stable/modules/outlier_detection.html)
* [reference_in_detail](https://practicaldatascience.co.uk/machine-learning/how-to-use-the-isolation-forest-model-for-outlier-detection)

In [5]:
X_processed,y_processed = remove_outliers(X_train,y_train)

IsolationForest-Traing data shape before removed: (1212, 828)
IsolationForest-Traing data shape after removed: (1200, 828)


## 3. Feature Selection

In [6]:
# pearsonr feature selection
from scipy.stats import pearsonr
from sklearn.decomposition import PCA

def pearsonr_feature_selection(X, y, X_test,degree,weights):
    feature_idx =  np.array([i for i in range(0,X.shape[1])])
    correlations = {}
#     print("feature_idx: ", feature_idx.T)
    for idx in feature_idx:
        correlation = 0
        for degree_idx in range(0,len(weights)):
            correlation += abs(weights[degree_idx]*pearsonr(X[:,idx],y**degree[degree_idx])[0])
        correlations[idx] = correlation
    data_correlations = pd.DataFrame(correlations, index=['correlation']).T
    indices_desc = data_correlations['correlation'].abs().sort_values(ascending=False).index
#     print(indices_desc)
    return data_correlations,indices_desc

def pearsonr_feature_selection_correlation(X, X_test,threshold):
    feature_idx =  np.array([i for i in range(0,X.shape[1])])
    correlation_select = np.array([True for i in range(0,X.shape[1])])
    for f_idx in feature_idx:
        if(correlation_select[f_idx]):
            feature_co,indices_desc = pearsonr_feature_selection(X[:,(f_idx+1):], X[:,f_idx], X_test_processed,np.array([1]),np.array([1]))
            reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]
            remove_index = reomove_features.index+(f_idx+1)
            correlation_select[remove_index] = False
    
    return X[:,correlation_select],X_test[:,correlation_select]

In [7]:
data_relation,indices_desc = pearsonr_feature_selection(X_processed, y_processed, X_test_processed,np.array([1,-3,3]),np.array([0.95,0.025,0.025]))

In [8]:
selected_features = data_relation.loc[indices_desc][abs(data_relation['correlation']) >= 0.001]
X_selection = X_processed[:,selected_features.index]
X_test_selection = X_test_processed[:,selected_features.index]
print(X_selection.shape)
print(X_test_selection.shape)

(1200, 818)
(776, 818)


  selected_features = data_relation.loc[indices_desc][abs(data_relation['correlation']) >= 0.001]


In [9]:
# X, X_test = feature_reduction(X, X_test,750)
X_selection_tree, X_test_selection_tree = select_features(X_selection, y_processed, X_test_selection,feature_num = 54)
print("Traing data shape after selection: {}".format(X_selection_tree.shape))
print("Testing data shape after selection: {}".format(X_test_selection_tree.shape))

Traing data shape after selection: (1200, 54)
Testing data shape after selection: (776, 54)


In [10]:
X_selection_tree_cor, X_test_selection_tree_cor = pearsonr_feature_selection_correlation(X_selection_tree, X_test_selection_tree,0.96)
# X_selection_tree_cor, X_test_selection_tree_cor = X_selection_tree, X_test_selection_tree
print(X_selection_tree_cor.shape)
print(X_test_selection_tree_cor.shape)

  reomove_features = feature_co.loc[indices_desc][abs(feature_co['correlation']) >= threshold]


(1200, 50)
(776, 50)


## 4. Gaussian Process

In [18]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C, DotProduct, WhiteKernel, ExpSineSquared,RationalQuadratic, Matern

def fit_model_and_pred(X_train, y_train, X_val, y_val, X_test):
#     kernel = RBF(0.5, (1e-4, 10))
    kernel1 = Matern(0.5, (1e-4, 10), 2.5)
    kernel2 = RationalQuadratic(length_scale=0.5, alpha=0.5, length_scale_bounds=(1e-4, 10000))
    kernel3 = ExpSineSquared(length_scale=0.5, length_scale_bounds=(1e-4, 10.0) )
    kernel = kernel2
    gpr = GaussianProcessRegressor(kernel=kernel, random_state=0, n_restarts_optimizer=1, alpha=0.07, normalize_y=True)
    gpr.fit(X_train, y_train)
    score = gpr.score(X_val, y_val)   
    y_pred = gpr.predict(X_test) 

    return score, y_pred

def train_k_fold(X, y, fold_num=10):
    kf = KFold(n_splits=fold_num, shuffle=False)
    kf.get_n_splits(X)
    test_score = 0.0
    train_score = 0.0
    i = 0
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        score, y_pred = fit_model_and_pred(X_train, y_train, X_val, y_val, X_val)
        test_score += score
        print(i+1, 'th training. the obtained validation r2 score is : ',score)
        
        score, _ = fit_model_and_pred(X_train, y_train, X_train, y_train, X_val)
        train_score += score
        print(i+1, 'th training. the obtained training r2 score is : ',score)
        print(' ')
        i += 1
       
    print("Validation score: %f"%(test_score/fold_num))
    print("Training score: %f"%(train_score/fold_num))
    
def train_k_fold_predict(X, y, X_test, fold_num=10):
    kf = KFold(n_splits=fold_num)
    kf.get_n_splits(X)
    y_test_predict = np.zeros(X_test.shape[0])
    cnt = 0
    val_score = 0.0
    train_score = 0.0
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        score, y_pred = fit_model_and_pred(X_train, y_train, X_val, y_val, X_test) 
        val_score += score
        y_test_predict += y_pred
        
        score, _ = fit_model_and_pred(X_train, y_train, X_train, y_train, X_test)
        train_score += score
    return train_score/fold_num, val_score/fold_num, y_test_predict/fold_num

In [19]:
train_score, val_score, y_pred = train_k_fold_predict(X_selection_tree_cor,y_processed,X_test_selection_tree_cor, fold_num=10)
print("Validation score: %f"%(val_score))
print("Training score: %f"%(train_score))

Validation score: 0.684845
Training score: 0.984179


In [13]:
final_res = np.vstack((indices_test, y_pred)).T
df_res = pd.DataFrame(final_res)
df_res.to_csv("our_result.csv", header = ["id", "y"], index=False)