In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import (f_regression, SelectFromModel)
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.model_selection import KFold

from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures, normalize

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def fill_missing_values(X, X_test):
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    imp.fit(X)
    X = imp.transform(X)
    X_test = imp.transform(X_test)
    return X, X_test

def remove_outliers(X, y):
    iforest = IsolationForest(max_samples=100, random_state=1, contamination='auto')
    iforest.fit(X)
    print("Traing data shape before removed: {}".format(X.shape))
    outlier_pred = iforest.predict(X)
    X, y = X[(outlier_pred != -1), :], y[(outlier_pred != -1)]
    print("Traing data shape after removed: {}".format(X.shape))
    return X, y

# def select_features(X, y, X_test, feature_num1=150, feature_num2=100):
#     scaler = StandardScaler().fit(X, y)
#     X = scaler.transform(X)
#     X_test = scaler.transform(X_test)
    
#     features_scores = f_regression(X, y)[0]
#     indices_fr = np.asarray(list(features_scores)).argsort()[-feature_num1:][::-1]
    
#     rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, random_state=1)
#     rf.fit(X, y)
#     indices_rf = np.asarray(list(rf.feature_importances_)).argsort()[-feature_num2:][::-1]
    
#     indices = list(np.union1d(indices_rf, indices_fr))
#     X = np.take(X, indices, axis = 1)
#     X_test = np.take(X_test, indices, axis = 1)
    
#     return X, X_test

def select_features(X, y, X_test, feature_num1=200, feature_num2=100, feature_num3=832):
    scaler = StandardScaler().fit(X, y)
    X = scaler.transform(X)
    X_test = scaler.transform(X_test)
    
    rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, random_state=1)
    rf.fit(X, y)
    indices = np.asarray(list(rf.feature_importances_)).argsort()[-feature_num2:][::-1]
    
    X = np.take(X, indices, axis = 1)
    X_test = np.take(X_test, indices, axis = 1)
    
    return X, X_test

In [3]:
X_train_data = pd.read_csv('X_train.csv')
y_train_data = pd.read_csv('y_train.csv')
X_test_data = pd.read_csv('X_test.csv')

## 1. Imputation of Missing Values
* [Reference](https://scikit-learn.org/stable/modules/impute.html)
* We use median of column instead of mean

In [4]:
X_train_data, X_test_data = fill_missing_values(X_train_data, X_test_data)

In [5]:
indices_test = np.array(X_test_data)[:,0]
X_test = np.array(X_test_data)[:,1:]
y = np.array(y_train_data)[:,1]
X = np.array(X_train_data)[:,1:]

## 2. Outlier Detection
* [reference_sklearn](https://scikit-learn.org/stable/modules/outlier_detection.html)
* [reference_in_detail](https://practicaldatascience.co.uk/machine-learning/how-to-use-the-isolation-forest-model-for-outlier-detection)

In [6]:
X, y = remove_outliers(X, y)

Traing data shape before removed: (1212, 832)
Traing data shape after removed: (1194, 832)


## 3. Feature Selection

In [7]:
X, X_test = select_features(X, y, X_test)
print("Traing data shape after selection: {}".format(X.shape))
print("Testing data shape after selection: {}".format(X_test.shape))

Traing data shape after selection: (1194, 100)
Testing data shape after selection: (776, 100)


## 4. Extra Tree

In [8]:
def fit_model_and_pred(degree, X_train, y_train, X_test):
    model = ExtraTreesRegressor(n_jobs=1, max_depth=None, n_estimators=180, random_state=0, min_samples_split=3, max_features=None)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test) 
    return y_pred

def train_k_fold(X, y, fold_num=10):
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=False)
    kf.get_n_splits(X)
    train_score = 0.0
    test_score = 0.0
    
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        y_pred = fit_model_and_pred(1, X_train, y_train, X_val)
        score = r2_score(y_val, y_pred)

        print('The obtained r2 score is : ',score)
        train_score += score
        test_score += score

    print("Training score: %f"%(train_score/fold_num))
    print("Validation score: %f"%(test_score/fold_num))

In [9]:
train_k_fold(X,y)

The obtained r2 score is :  0.7064154108411922
The obtained r2 score is :  0.6736340505784636
The obtained r2 score is :  0.5507036664247489
The obtained r2 score is :  0.5863040503954196
The obtained r2 score is :  0.5779202202900058
The obtained r2 score is :  0.5748806517815278
The obtained r2 score is :  0.5311029984553148
The obtained r2 score is :  0.5317221152947954
The obtained r2 score is :  0.6782772169109788
The obtained r2 score is :  0.5653634235041445
Training score: 0.597632
Validation score: 0.597632


In [10]:
Y_test_pred = fit_model_and_pred(1, X, y, X_test)
final_res = np.vstack((indices_test, Y_test_pred)).T

In [11]:
df_res = pd.DataFrame(final_res)
df_res.to_csv("our_result.csv", header = ["id", "y"], index=False)