In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import (f_regression, SelectFromModel)
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import KNNImputer

from sklearn.model_selection import KFold

from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures, normalize

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt


import tensorflow as tf
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU

from keras.layers import Dropout
from keras import regularizers
from keras import initializers
%matplotlib inline

In [2]:
def fill_missing_values(X, X_test):
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    imp.fit(X)
    X = imp.transform(X)
    X_test = imp.transform(X_test)
    return X, X_test
#     imputer = KNNImputer(missing_values=np.nan, n_neighbors=100)
#     X = imputer.fit_transform(X)
#     X_test = imputer.fit_transform(X_test)
#     return X, X_test

def remove_outliers(X, y):
    iforest = IsolationForest(max_samples=100, random_state=1, contamination='auto')
    iforest.fit(X)
    print("Traing data shape before removed: {}".format(X.shape))
    outlier_pred = iforest.predict(X)
    X, y = X[(outlier_pred != -1), :], y[(outlier_pred != -1)]
    print("Traing data shape after removed: {}".format(X.shape))
    return X, y

# def select_features(X, y, X_test, feature_num1=150, feature_num2=100):
#     scaler = StandardScaler().fit(X, y)
#     X = scaler.transform(X)
#     X_test = scaler.transform(X_test)
    
#     features_scores = f_regression(X, y)[0]
#     indices_fr = np.asarray(list(features_scores)).argsort()[-feature_num1:][::-1]
    
#     rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, random_state=1)
#     rf.fit(X, y)
#     indices_rf = np.asarray(list(rf.feature_importances_)).argsort()[-feature_num2:][::-1]
    
#     indices = list(np.union1d(indices_rf, indices_fr))
#     X = np.take(X, indices, axis = 1)
#     X_test = np.take(X_test, indices, axis = 1)
    
#     return X, X_test

def select_features(X, y, X_test, feature_num1=200, feature_num2=100, feature_num3=832):
    scaler = StandardScaler().fit(X, y)
    X = scaler.transform(X)
    X_test = scaler.transform(X_test)
    
    features_scores = f_regression(X, y)[0]
    indices_fregression = np.asarray(list(features_scores)).argsort()[-feature_num1:][::-1]
    
    rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, random_state=1)
    rf.fit(X, y)
    indices_randomforest = np.asarray(list(rf.feature_importances_)).argsort()[-feature_num2:][::-1]
    
#     ls = Lasso(alpha=0.0001, normalize=True)
#     polynomial_features = PolynomialFeatures(1)
#     X = polynomial_features.fit_transform(X)
#     ls.fit(X, y)
#     indices_lasso = np.asarray(list(np.abs(ls.coef_))).argsort()[-feature_num3:][::-1]

#     indices = list(np.intersect1d(indices_lasso, np.union1d(indices_fregression, indices_randomforest))) 

    indices = list(np.union1d(indices_fregression, indices_randomforest))
    X = np.take(X, indices, axis = 1)
    X_test = np.take(X_test, indices, axis = 1)
    return X, X_test

def coeff_determination(y_true, y_pred):
    from keras import backend as K
    SS_res =  K.sum(K.square( y_true-y_pred ))
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) )
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

In [3]:
X_train_data = pd.read_csv('X_train.csv')
y_train_data = pd.read_csv('y_train.csv')
X_test_data = pd.read_csv('X_test.csv')

## 1. Imputation of Missing Values
* [Reference](https://scikit-learn.org/stable/modules/impute.html)
* We use median of column instead of mean

In [4]:
X_train_data, X_test_data = fill_missing_values(X_train_data, X_test_data)

In [5]:
indices_test = np.array(X_test_data)[:,0]
X_test = np.array(X_test_data)[:,1:]
y = np.array(y_train_data)[:,1]
X = np.array(X_train_data)[:,1:]

## 2. Outlier Detection
* [reference_sklearn](https://scikit-learn.org/stable/modules/outlier_detection.html)
* [reference_in_detail](https://practicaldatascience.co.uk/machine-learning/how-to-use-the-isolation-forest-model-for-outlier-detection)

In [6]:
X, y = remove_outliers(X, y)

Traing data shape before removed: (1212, 832)
Traing data shape after removed: (1194, 832)


## 3. Feature Selection

In [7]:
X, X_test = select_features(X, y, X_test)
print("Traing data shape after selection: {}".format(X.shape))
print("Testing data shape after selection: {}".format(X_test.shape))

  correlation_coefficient /= X_norms


Traing data shape after selection: (1194, 213)
Testing data shape after selection: (776, 213)


## 4. Extra Tree

In [10]:
def fit_model_and_pred(degree, X_train, y_train, X_test):
    Y = y_train
    dropout = 0.1
    model = Sequential()
    model.add(Dense(30, input_dim = 213, kernel_regularizer = regularizers.l2(1), kernel_initializer='RandomUniform'))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(rate = dropout))
    model.add(Dense(30, input_dim=72, kernel_regularizer = regularizers.l2(1), kernel_initializer='RandomUniform'))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(rate = dropout))
    model.add(Dense(30, input_dim=72, kernel_regularizer = regularizers.l2(1), kernel_initializer='RandomUniform'))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(rate = dropout))
    model.add(Dense(30, input_dim=72, kernel_regularizer = regularizers.l2(1), kernel_initializer='RandomUniform'))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(rate = dropout))
    model.add(Dense(30, input_dim=72, kernel_regularizer = regularizers.l2(1), kernel_initializer='RandomUniform'))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(rate = dropout))
    model.add(Dense(30, input_dim=72, kernel_regularizer = regularizers.l2(1), kernel_initializer='RandomUniform'))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(rate = dropout))
    model.add(Dense(30, input_dim=72, kernel_regularizer = regularizers.l2(1), kernel_initializer='RandomUniform'))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(rate = dropout))
    model.add(Dense(30, input_dim=72, kernel_regularizer = regularizers.l2(1), kernel_initializer='RandomUniform'))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(rate = dropout))

    model.add(Dense(1, kernel_initializer='RandomUniform'))
    # Compile model
    optimizer = tf.keras.optimizers.Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[coeff_determination])
    # Fit the model
    print("Start fitting ...")
    model.fit(x=X_train, y=Y, epochs=80, verbose=0, validation_split=0.1, shuffle=True, steps_per_epoch=50, initial_epoch=0, validation_steps=5)
    # calculate predictions
    print("calculate predictions")
    predictions = model.predict(X_test)
    return predictions
    
#     model = Lasso(alpha=0.0001, normalize=True)
#     polynomial_features = PolynomialFeatures(degree)
#     X_train = polynomial_features.fit_transform(X_train)
#     X_test = polynomial_features.fit_transform(X_test)
#     model.fit(X_train, y_train/100)
#     y_pred = model.predict(X_test)

#     model = ExtraTreesRegressor(n_jobs=1, max_depth=None, n_estimators=180, random_state=0, min_samples_split=3, max_features=None)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test) 
#     return y_pred



def train_k_fold(X, y, fold_num=10):
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=False)
    kf.get_n_splits(X)
    train_score = 0.0
    test_score = 0.0
    
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        y_pred = fit_model_and_pred(1, X_train, y_train, X_val)
        score = r2_score(y_val, y_pred)

        print('The obtained r2 score is : ',score)
        train_score += score
        test_score += score

    print("Training score: %f"%(train_score/fold_num))
    print("Validation score: %f"%(test_score/fold_num))

In [11]:
train_k_fold(X,y)

Start fitting ...
calculate predictions
The obtained r2 score is :  0.50176112381747
Start fitting ...


  super(Adam, self).__init__(name, **kwargs)


calculate predictions
The obtained r2 score is :  0.48153841384958074


  super(Adam, self).__init__(name, **kwargs)


Start fitting ...
calculate predictions
The obtained r2 score is :  0.3661374284021933


  super(Adam, self).__init__(name, **kwargs)


Start fitting ...
calculate predictions
The obtained r2 score is :  0.4813365379073593
Start fitting ...


  super(Adam, self).__init__(name, **kwargs)


calculate predictions
The obtained r2 score is :  0.4242904700087017
Start fitting ...


  super(Adam, self).__init__(name, **kwargs)


calculate predictions
The obtained r2 score is :  0.4403258318997276


  super(Adam, self).__init__(name, **kwargs)


Start fitting ...
calculate predictions
The obtained r2 score is :  0.387380275548058


  super(Adam, self).__init__(name, **kwargs)


Start fitting ...
calculate predictions
The obtained r2 score is :  0.37012384117562225


  super(Adam, self).__init__(name, **kwargs)


Start fitting ...
calculate predictions
The obtained r2 score is :  0.5423794516542533
Start fitting ...


  super(Adam, self).__init__(name, **kwargs)


calculate predictions
The obtained r2 score is :  0.3429586251571065
Training score: 0.433823
Validation score: 0.433823


In [12]:
Y_test_pred = fit_model_and_pred(1, X, y, X_test)

final_res = np.vstack((indices_test, Y_test_pred[:,0])).T

Start fitting ...


  super(Adam, self).__init__(name, **kwargs)


calculate predictions


In [13]:
df_res = pd.DataFrame(final_res)
df_res.to_csv("our_result.csv", header = ["id", "y"], index=False)