In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import (SimpleImputer,KNNImputer)
from missingpy import MissForest
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures, normalize
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline



In [2]:
def fill_missing_values(X, n_neighbors = 75, method="KNN"):
    """Standardize every column of ``X`` and impute its missing entries.

    Each column is centred on its NaN-ignoring mean and divided by its
    NaN-ignoring standard deviation; the remaining NaNs are then filled by
    the selected imputer.  The statistics are computed from ``X`` itself, so
    every call standardizes its input with that input's own mean and std.

    Parameters
    ----------
    X : array-like, 2-D
        Numeric matrix in which missing values are encoded as ``np.nan``.
    n_neighbors : int, default 75
        Neighbour count handed to ``KNNImputer`` (only used when
        ``method == "KNN"``).
    method : str, default "KNN"
        ``"KNN"`` selects a distance-weighted ``KNNImputer``; any other value
        selects a ``SimpleImputer`` with the median strategy.

    Returns
    -------
    numpy.ndarray
        The standardized matrix with its missing values imputed.
    """
    X = np.asarray(X, dtype=float)

    # Column-wise standardization that ignores the missing entries.
    X_std = np.nanstd(X, axis=0, keepdims=True)
    X_ave = np.nanmean(X, axis=0, keepdims=True)

    # A constant column has std == 0; dividing by it yields 0/0 = NaN for the
    # whole column (and a RuntimeWarning).  Dividing by 1 instead leaves such
    # a column at 0 after centring.
    X_std = np.where(X_std == 0, 1.0, X_std)
    X_norma = (X - X_ave) / X_std

    # Choose the imputer requested by the caller.
    if method == "KNN":
        imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights='distance')
    else:
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')

    X_norma_fixed = imputer.fit_transform(X_norma)

    return X_norma_fixed

# def fill_missing_values(X, n_neighbors = 75, method="KNN"): 
    
#     # normalization
#     X_std = np.nanstd(X,axis=0,keepdims=True)
#     X_ave = np.nanmean(X,axis=0,keepdims=True)
#     X_norma = (X-X_ave)/X_std
    
#     imputer = MissForest(max_iter=10, missing_values=np.nan, n_estimators=2, max_depth=4, min_samples_split=2)
#     X_imputed = imputer.fit_transform(X)
    
#     return X_imputed

def remove_outliers(X, y):
    """Drop the samples that either outlier detector flags.

    Features and target are stacked into one matrix so that both detectors
    see the target as an extra column.  A row is kept only when
    ``IsolationForest`` and ``LocalOutlierFactor`` both label it as an inlier
    (i.e. neither returns ``-1`` for it).

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Targets aligned with the rows of ``X``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` and ``y`` restricted to the rows kept.
    """
    print("IsolationForest-Traing data shape before removed: {}".format(X.shape))

    # Joint feature/target matrix, built once and shared by both detectors.
    Z = np.c_[X, y]

    iforest = IsolationForest(max_samples=200, random_state=1, contamination=0.005)
    iforest.fit(Z)
    iforest_outlier_pred = iforest.predict(Z)

    # fit_predict fits the estimator itself, so a separate fit() call
    # beforehand would only repeat the neighbour search.
    local = LocalOutlierFactor(n_neighbors=150, contamination=0.005)
    local_outlier_pred = local.fit_predict(Z)

    # Keep a sample only if neither detector marked it as an outlier (-1).
    mask = np.logical_and((iforest_outlier_pred != -1), (local_outlier_pred != -1))
    X, y = X[mask, :], y[mask]
    print("IsolationForest-Traing data shape after removed: {}".format(X.shape))
    return X, y

def select_features(X, y, X_test, feature_num=50):
    """Keep the ``feature_num`` columns a random forest ranks as most important.

    A ``RandomForestRegressor`` is fitted on ``(X, y)``; the columns are
    ordered by descending ``feature_importances_`` and the same leading
    columns are taken from both ``X`` and ``X_test``.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Training feature matrix.
    y : numpy.ndarray
        Training targets.
    X_test : numpy.ndarray, 2-D
        Test feature matrix with the same column layout as ``X``.
    feature_num : int, default 50
        Number of columns to keep.  ``0`` keeps no column; a value larger
        than the number of columns keeps all of them.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` and ``X_test`` reduced to the selected columns, most important
        column first.
    """
    rf = RandomForestRegressor(n_jobs=-1, n_estimators=80, random_state=1)
    rf.fit(X, y)

    # Reverse first, then slice: slicing the ascending order with
    # [-feature_num:] turns into [-0:] for feature_num == 0 and would
    # silently keep every column.
    ranked = np.argsort(rf.feature_importances_)[::-1]
    indices = ranked[:feature_num]

    X = np.take(X, indices, axis=1)
    X_test = np.take(X_test, indices, axis=1)

    return X, X_test

In [3]:
# Read the three raw tables from the working directory.
X_train_data, y_train_data, X_test_data = (
    pd.read_csv(csv_name) for csv_name in ('X_train.csv', 'y_train.csv', 'X_test.csv')
)

# In every table the first column is split off from the remaining ones.
test_table = np.array(X_test_data)
indices_test, X_test = test_table[:, 0], test_table[:, 1:]
X_train = np.array(X_train_data)[:, 1:]
y_train = np.array(y_train_data)[:, 1]

## 1. Imputation of Missing Values
* [Reference](https://scikit-learn.org/stable/modules/impute.html)
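* Missing values are filled by `fill_missing_values`: KNN imputation by default, with the column median (rather than the mean) as the alternative strategy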

In [4]:
# (row, column) positions of the missing entries in the raw training matrix,
# recorded before imputation overwrites X_train.  Comparing the data with the
# numpy module itself (`X_train == np`) never matches a missing value.
X_train_missing_indices = np.argwhere(pd.isnull(X_train))

# NOTE: each call standardizes and imputes with the statistics of its own
# input, so the train and test matrices are scaled independently.
X_train = fill_missing_values(X_train, n_neighbors=75)
X_test = fill_missing_values(X_test)
print(X_train.shape)
print(X_test.shape)

  X_norma = (X-X_ave)/X_std


Iteration: 0
Iteration: 1
Iteration: 2


  X_norma = (X-X_ave)/X_std


Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
(1212, 832)
(776, 832)


## 2. Outlier Detection
* [reference_sklearn](https://scikit-learn.org/stable/modules/outlier_detection.html)
* [reference_in_detail](https://practicaldatascience.co.uk/machine-learning/how-to-use-the-isolation-forest-model-for-outlier-detection)

In [5]:
# Drop the training rows that remove_outliers flags with either detector.
# NOTE: X_train / y_train are overwritten here, so re-running this cell filters
# the already-filtered data a second time.
X_train,y_train = remove_outliers(X_train,y_train)

IsolationForest-Traing data shape before removed: (1212, 832)
IsolationForest-Traing data shape after removed: (1198, 832)


## 3. Feature Selection

In [6]:
# X, X_test = feature_reduction(X, X_test,750)
# Reduce both matrices to the 50 columns chosen by select_features on the
# training data; the same column indices are applied to the test matrix.
# NOTE: X_train / X_test are overwritten here, so this cell is not idempotent.
X_train, X_test = select_features(X_train, y_train, X_test,feature_num = 50)
print("Traing data shape after selection: {}".format(X_train.shape))
print("Testing data shape after selection: {}".format(X_test.shape))

Traing data shape after selection: (1198, 50)
Testing data shape after selection: (776, 50)


## 4. Extra-Trees Regression

In [7]:
def fit_model_and_pred(X_train, y_train, X_val, y_val, X_test):
    """Fit one Extra-Trees regressor, score it on the validation split and predict.

    Parameters
    ----------
    X_train, y_train
        Data the regressor is fitted on.
    X_val, y_val
        Held-out data used to compute the R^2 score.
    X_test
        Samples to predict with the fitted regressor.

    Returns
    -------
    (float, numpy.ndarray)
        The validation R^2 score and the predictions for ``X_test``.
    """
    regressor = ExtraTreesRegressor(
        n_estimators=190,
        max_depth=None,
        max_features=None,
        min_samples_split=3,
        random_state=0,
        n_jobs=1,
    )
    regressor.fit(X_train, y_train)

    # Score on the held-out split first, then predict the requested samples.
    validation_r2 = r2_score(y_val, regressor.predict(X_val))
    test_prediction = regressor.predict(X_test)

    return validation_r2, test_prediction

def train_k_fold(X, y, fold_num=10):
    """Print the per-fold and the mean validation R^2 of the model.

    The data is split into ``fold_num`` consecutive (unshuffled) folds; for
    every fold the model is fitted on the remaining rows and scored on the
    held-out rows.  Nothing is returned — the scores are only printed.
    """
    splitter = KFold(n_splits=fold_num, shuffle=False)
    fold_scores = []

    for fit_rows, held_out_rows in splitter.split(X):
        X_val, y_val = X[held_out_rows], y[held_out_rows]

        # The held-out fold doubles as the "test" input; only the score is used.
        fold_r2, _ = fit_model_and_pred(X[fit_rows], y[fit_rows], X_val, y_val, X_val)

        print('The obtained validation r2 score is : ',fold_r2)
        fold_scores.append(fold_r2)

    print("Validation score: %f"%(sum(fold_scores, 0.0)/fold_num))
    
def train_k_fold_predict(X, y, X_test, fold_num=10, score_threshold=0.7):
    """Average the test predictions of the well-scoring K-fold models.

    One model is fitted per fold.  The test predictions of the folds whose
    validation R^2 exceeds ``score_threshold`` are averaged.  When no fold
    exceeds the threshold, the average over all folds is returned instead of
    the NaN vector that a division by zero would produce.

    Parameters
    ----------
    X, y : numpy.ndarray
        Training features and targets.
    X_test : numpy.ndarray
        Samples to predict.
    fold_num : int, default 10
        Number of consecutive (unshuffled) folds.
    score_threshold : float, default 0.7
        A fold contributes to the averaged prediction only when its
        validation R^2 is strictly greater than this value.

    Returns
    -------
    (float, numpy.ndarray)
        The mean validation R^2 over all folds and the averaged prediction
        for ``X_test``.
    """
    kf = KFold(n_splits=fold_num)
    selected_sum = np.zeros(X_test.shape[0])   # predictions of folds above the threshold
    all_sum = np.zeros(X_test.shape[0])        # predictions of every fold (fallback)
    cnt = 0
    val_score = 0.0

    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        score, y_pred = fit_model_and_pred(X_train, y_train, X_val, y_val, X_test)
        val_score += score
        all_sum += y_pred
        if score > score_threshold:
            selected_sum += y_pred
            cnt += 1

    if cnt == 0:
        # No fold passed the threshold: use every fold rather than dividing 0 by 0.
        y_test_predict = all_sum / fold_num
    else:
        y_test_predict = selected_sum / cnt

    return val_score / fold_num, y_test_predict

def train_random_iterations(X, y, X_test, iterations=100):
    """Repeat the K-fold prediction and keep the best-scoring result.

    ``train_k_fold_predict`` is called ``iterations`` times; the prediction
    belonging to the highest mean validation score is returned.

    Parameters
    ----------
    X, y : numpy.ndarray
        Training features and targets.
    X_test : numpy.ndarray
        Samples to predict.
    iterations : int, default 100
        Number of repetitions.

    Returns
    -------
    numpy.ndarray
        The prediction of the best iteration (a zero vector when
        ``iterations`` is 0).
    """
    # NOTE(review): the splitter in train_k_fold_predict is not shuffled and the
    # regressor uses a fixed random_state, so the repetitions look identical —
    # confirm whether randomness was intended here.
    y_pred_best = np.zeros(X_test.shape[0])
    cnt = 0                      # how many iterations replaced the current best
    best_score = -np.inf         # any real score beats the initial value

    # Honour the `iterations` argument instead of a hard-coded 100.
    for _ in tqdm(range(iterations)):
        score, y_pred = train_k_fold_predict(X, y, X_test)
        if score > best_score:
            best_score = score
            y_pred_best = y_pred
            cnt += 1

    print("Total number of prediction used: {}".format(cnt))
    return y_pred_best
    

In [8]:
# Cross-validate the model on the selected training features (fold_num defaults to 10).
# NOTE(review): the remark below says "feature num = 100" while the feature-selection
# cell passes feature_num = 50 — confirm which setting produced the recorded scores.
train_k_fold(X_train,y_train) #Knn with std and feature num = 100

The obtained validation r2 score is :  0.7200893762939793
The obtained validation r2 score is :  0.6568493326629549
The obtained validation r2 score is :  0.5174091632311921
The obtained validation r2 score is :  0.6617141313707703
The obtained validation r2 score is :  0.587016487193833
The obtained validation r2 score is :  0.5015825993369158
The obtained validation r2 score is :  0.5890074807011265
The obtained validation r2 score is :  0.5905130905447111
The obtained validation r2 score is :  0.7091314953017072
The obtained validation r2 score is :  0.6097824145947575
Validation score: 0.614310


In [9]:
# y_test_pred = train_random_iterations(X_train, y_train, X_test)
# final_res = np.vstack((indices_test, y_test_pred)).T

In [10]:
# pd.DataFrame(final_res).to_csv("our_result.csv", header = ["id", "y"], index=False)