In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import (SimpleImputer,KNNImputer)
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures, normalize
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

In [2]:
def fill_missing_values(X, n_neighbors = 75, method="KNN"): 
    
    # normalization
    X_std = np.nanstd(X,axis=0,keepdims=True)
    X_ave = np.nanmean(X,axis=0,keepdims=True)
    X_norma = (X-X_ave)/X_std
    
    # use KNNImputer
    imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights = 'distance') if method=="KNN"\
        else SimpleImputer(missing_values=np.nan, strategy='median')
    
    X_norma_fixed = imputer.fit_transform(X_norma)
    
    return X_norma_fixed

# def remove_outliers(X, y):
#     print("IsolationForest-Traing data shape before removed: {}".format(X.shape))
#     iforest = IsolationForest(max_samples=200, random_state=1, contamination='auto')
#     iforest.fit(X)
#     iforest_outlier_pred = iforest.predict(X)

#     local = LocalOutlierFactor(n_neighbors=50, contamination=0.08)
#     local.fit(X)
#     local_outlier_pred = local.predict(X)

#     mask = np.logic_and((iforest_outlier_pred!=-1), (local_outlier_pred!=-1))
#     X , y = X[mask, :], y[mask]
#     print("IsolationForest-Traing data shape after removed: {}".format(X.shape))
#     return X, y

def remove_outliers(X, y):
    print("IsolationForest-Traing data shape before removed: {}".format(X.shape))
    Z = np.c_[X, y]
    iforest = IsolationForest(max_samples=200, random_state=1, contamination=0.005)
    iforest.fit(Z)
    iforest_outlier_pred = iforest.predict(Z)
    
    Z = np.c_[X, y]
    local = LocalOutlierFactor(n_neighbors=150, contamination=0.005)
    local.fit(Z)
    local_outlier_pred = local.fit_predict(Z)

    mask = np.logical_and((iforest_outlier_pred!=-1), (local_outlier_pred!=-1))
    X , y = X[mask, :], y[mask]
    print("IsolationForest-Traing data shape after removed: {}".format(X.shape))
    return X, y

def select_features(X, y, X_test, feature_num=50):
    rf = RandomForestRegressor(n_jobs=-1, n_estimators=80, random_state=1)
    rf.fit(X, y)
    indices = np.asarray(list(rf.feature_importances_)).argsort()[-feature_num:][::-1]
    
    X = np.take(X, indices, axis = 1)
    X_test = np.take(X_test, indices, axis = 1)
    
    return X, X_test

In [3]:
X_train_data = pd.read_csv('X_train.csv')
y_train_data = pd.read_csv('y_train.csv')
X_test_data = pd.read_csv('X_test.csv')

indices_test = np.array(X_test_data)[:,0]
X_test = np.array(X_test_data)[:,1:]
y_train = np.array(y_train_data)[:,1]
X_train = np.array(X_train_data)[:,1:]

## 1. Imputation of Missing Values
* [Reference](https://scikit-learn.org/stable/modules/impute.html)
* We use median of column instead of mean

In [4]:
X_train_missing_indices = X_train[X_train==np]
X_train = fill_missing_values(X_train, n_neighbors=75)
X_test = fill_missing_values(X_test)
print(X_train.shape)
print(X_test.shape)

  X_norma = (X-X_ave)/X_std
  X_norma = (X-X_ave)/X_std


(1212, 828)
(776, 828)


## 2. Outlier Detection
* [reference_sklearn](https://scikit-learn.org/stable/modules/outlier_detection.html)
* [reference_in_detail](https://practicaldatascience.co.uk/machine-learning/how-to-use-the-isolation-forest-model-for-outlier-detection)

In [5]:
X_train,y_train = remove_outliers(X_train,y_train)

IsolationForest-Traing data shape before removed: (1212, 828)
IsolationForest-Traing data shape after removed: (1199, 828)


## 3. Feature Selection

In [6]:
# X, X_test = feature_reduction(X, X_test,750)
X_train, X_test = select_features(X_train, y_train, X_test,feature_num = 50)
print("Traing data shape after selection: {}".format(X_train.shape))
print("Testing data shape after selection: {}".format(X_test.shape))

Traing data shape after selection: (1199, 50)
Testing data shape after selection: (776, 50)


## 4. Extra Tree

In [7]:
def fit_model_and_pred(X_train, y_train, X_val, y_val, X_test, random_state=0):
    model = ExtraTreesRegressor(n_jobs=1, max_depth=None, n_estimators=190, random_state=random_state, min_samples_split=3, max_features=None)
    model.fit(X_train, y_train)
    
    y_val_pred = model.predict(X_val)
    score = r2_score(y_val, y_val_pred)
    
    y_pred = model.predict(X_test) 
    
    return score, y_pred

def train_k_fold(X, y, fold_num=10):
    kf = KFold(n_splits=fold_num, shuffle=False)
    kf.get_n_splits(X)
    test_score = 0.0
    
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        score, y_pred = fit_model_and_pred(X_train, y_train, X_val, y_val, X_val, random_state=0)

        print('The obtained validation r2 score is : ',score)
        test_score += score
    print("Validation score: %f"%(test_score/fold_num))
    
def train_k_fold_predict(X, y, X_test, fold_num=10, random_state=0):
    kf = KFold(n_splits=fold_num)
    kf.get_n_splits(X)
    y_test_predict = np.zeros(X_test.shape[0])
    cnt = 0
    val_score = 0.0
        
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        score, y_pred = fit_model_and_pred(X_train, y_train, X_val, y_val, X_test, random_state=random_state)
        val_score += score
        if(score > 0.68):
            y_test_predict += y_pred
            cnt += 1
    return val_score/fold_num, y_test_predict/cnt

def train_random_iterations(X, y, X_test, iterations=100):
    y_test_predict = np.zeros(X_test.shape[0])
    y_pred_best = np.zeros(X_test.shape[0])
    cnt = 0
    best_score = -10000
    
    for i in tqdm(range(iterations)):
        score, y_pred = train_k_fold_predict(X, y, X_test, random_state=i)
        if(score > 0.635):
            print(score)
            y_pred_best += y_pred
            cnt += 1
            
    return y_pred_best / cnt
    

In [8]:
train_k_fold(X_train,y_train) #Knn with std and feature num = 100

The obtained validation r2 score is :  0.7331676300461234
The obtained validation r2 score is :  0.7437835030819562
The obtained validation r2 score is :  0.5616359281180663
The obtained validation r2 score is :  0.6554973894083775
The obtained validation r2 score is :  0.6341653039748976
The obtained validation r2 score is :  0.6376705004459049
The obtained validation r2 score is :  0.5337553808762132
The obtained validation r2 score is :  0.5169019299029172
The obtained validation r2 score is :  0.7128983915024439
The obtained validation r2 score is :  0.578659799264144
Validation score: 0.630814


In [None]:
y_test_pred = train_random_iterations(X_train, y_train, X_test, iterations=1000)
final_res = np.vstack((indices_test, y_test_pred)).T

  1%|▎                                       | 9/1000 [03:11<5:57:01, 21.62s/it]

0.6355574316595599


  2%|▌                                      | 16/1000 [05:11<4:40:24, 17.10s/it]

0.6374541210319323


  3%|█                                      | 26/1000 [08:24<4:50:37, 17.90s/it]

0.6355996883787901


  3%|█▏                                     | 32/1000 [10:03<4:29:07, 16.68s/it]

0.6359464666770664


  4%|█▌                                     | 41/1000 [12:29<4:09:53, 15.63s/it]

0.6379866497875307


  5%|█▊                                     | 48/1000 [14:20<4:08:35, 15.67s/it]

0.6372897931521061


  5%|██                                     | 52/1000 [15:23<4:11:36, 15.92s/it]

0.6380312201172955


  5%|██                                     | 53/1000 [15:40<4:14:15, 16.11s/it]

0.6350013544233244


  6%|██▎                                    | 58/1000 [17:23<5:11:55, 19.87s/it]

0.6350264551633141


  7%|██▋                                    | 68/1000 [20:49<5:23:41, 20.84s/it]

0.6355587322252753


  7%|██▊                                    | 73/1000 [22:33<5:19:47, 20.70s/it]

0.6357928885761372


  8%|███                                    | 79/1000 [24:38<5:20:30, 20.88s/it]

0.636703429571938


 10%|███▊                                   | 97/1000 [31:57<5:36:27, 22.36s/it]

0.6357702033136765


 10%|███▉                                  | 102/1000 [33:31<4:31:15, 18.12s/it]

0.6358057730833767


 11%|████▏                                 | 109/1000 [35:23<4:06:48, 16.62s/it]

0.635913559653505


 11%|████▎                                 | 113/1000 [36:25<3:51:40, 15.67s/it]

0.635927650290453


 13%|████▊                                 | 128/1000 [40:19<3:41:39, 15.25s/it]

0.635237455760204


 13%|████▉                                 | 129/1000 [40:34<3:41:10, 15.24s/it]

0.635433790150396


 13%|████▉                                 | 130/1000 [40:50<3:42:30, 15.35s/it]

0.635011269256178


 14%|█████▏                                | 137/1000 [42:41<3:47:07, 15.79s/it]

0.6354503712724613


 14%|█████▍                                | 143/1000 [44:14<3:41:19, 15.50s/it]

0.6357983698541954


 15%|█████▊                                | 152/1000 [46:43<3:55:41, 16.68s/it]

0.6351881015328213


 15%|█████▊                                | 154/1000 [47:13<3:44:32, 15.92s/it]

0.6373630981250489


 16%|█████▉                                | 155/1000 [47:28<3:40:43, 15.67s/it]

0.6353568022916265


 16%|██████                                | 160/1000 [48:45<3:36:09, 15.44s/it]

0.6366228075391067


 16%|██████                                | 161/1000 [49:04<3:53:15, 16.68s/it]

0.6354659318206601


 16%|██████▏                               | 163/1000 [49:43<4:10:53, 17.98s/it]

0.6384530495879053


 17%|██████▎                               | 167/1000 [50:54<4:06:50, 17.78s/it]

0.6377522997014614


 17%|██████▌                               | 172/1000 [52:38<4:24:12, 19.15s/it]

0.6372653080282678


 18%|██████▊                               | 180/1000 [55:11<4:09:30, 18.26s/it]

0.6358264819563859


 18%|██████▉                               | 183/1000 [56:05<4:06:49, 18.13s/it]

0.6361816925566943


 19%|███████▏                              | 190/1000 [58:22<4:15:45, 18.94s/it]

0.6363728545333583


 20%|███████                             | 197/1000 [1:00:18<3:44:06, 16.75s/it]

0.6388397953786088


 20%|███████▏                            | 200/1000 [1:01:07<3:42:46, 16.71s/it]

In [None]:
pd.DataFrame(final_res).to_csv("our_result.csv", header = ["id", "y"], index=False)