In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import (SimpleImputer,KNNImputer)
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import (f_regression, SelectFromModel)
from sklearn.decomposition import PCA

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures, normalize
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

In [2]:
# hyperparameters: KNN number of neighbors, max_samples for outlier, number of features for selection

def fill_missing_values(X, X_test, n_neighbors = 50): 
    # normalization
    X_std = np.nanstd(X,axis=0,keepdims=True)
    X_ave = np.nanmean(X,axis=0,keepdims=True)
    X_test_std = np.nanstd(X_test,axis=0,keepdims=True)
    X_test_ave = np.nanmean(X_test,axis=0,keepdims=True)
    X_norma = (X-X_ave)/X_std
    X_test_norma = (X_test-X_test_ave)/X_test_std
    
    # use KNNImputer
    imputer = KNNImputer(missing_values=np.nan, n_neighbors=n_neighbors, weights = 'distance')
    X_norma_fixed = imputer.fit_transform(X_norma)
    X_test_norma_fixed = imputer.fit_transform(X_test_norma)
    
    return X_norma_fixed, X_test_norma_fixed

def remove_outliers(X, y, max_samples=100):
    iforest = IsolationForest(max_samples=max_samples, random_state=1, contamination='auto')
    iforest.fit(X)

    outlier_pred = iforest.predict(X)
    X, y = X[(outlier_pred != -1), :], y[(outlier_pred != -1)]

    return X, y

def select_features(X, y, X_test, feature_num=100, n_estimators = 50):
    scaler = StandardScaler().fit(X, y)
    X = scaler.transform(X)
    X_test = scaler.transform(X_test)
    
    rf = RandomForestRegressor(n_jobs=-1, n_estimators=n_estimators, random_state=1)
    rf.fit(X, y)
    indices = np.asarray(list(rf.feature_importances_)).argsort()[-feature_num:][::-1]
    
    X = np.take(X, indices, axis = 1)
    X_test = np.take(X_test, indices, axis = 1)
    
    return X, X_test

def feature_reduction(X_train, X_test,n_component):
    pca = PCA(n_components=n_component)
    X_train_reduced = pca.fit_transform(X_train)
    singulars = pca.singular_values_
    print("chosen singular values, max: ", np.max(singulars)," and min:",np.min(singulars))
    X_test_reduced = pca.fit_transform(X_test)
    singulars = pca.singular_values_
    print("chosen singular values, max: ", np.max(singulars)," and min:",np.min(singulars))
    return X_train_reduced,X_test_reduced

def fit_model_and_pred(degree, X_train, y_train, X_test, n_estimators=180):
    model = ExtraTreesRegressor(n_jobs=1, max_depth=None, n_estimators=n_estimators, random_state=0, min_samples_split=3, max_features=None)

    model.fit(X_train, y_train)
    
    y_training_pred = model.predict(X_train)
    score = r2_score(y_train, y_training_pred)
    y_pred = model.predict(X_test) 
    return y_pred

def train_k_fold(X, y, fold_num=10, n_estimators=180):
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=False)
    kf.get_n_splits(X)
    test_score = 0.0
    
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        y_pred = fit_model_and_pred(1, X_train, y_train, X_val, n_estimators=n_estimators)
        score = r2_score(y_val, y_pred)
        test_score += score
        
    return test_score/fold_num
    
def train_k_fold_predict(X, y,X_test, fold_num=10):
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=False)
    kf.get_n_splits(X)
    y_test_predict = np.zeros(X_test.shape[0])
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        y_pred = fit_model_and_pred(1, X_train, y_train, X_test)
        y_test_predict += y_pred

    return y_test_predict/fold_num

In [3]:
X_train_data = pd.read_csv('X_train.csv')
y_train_data = pd.read_csv('y_train.csv')
X_test_data = pd.read_csv('X_test.csv')

indices_test = np.array(X_test_data)[:,0]
X_t = np.array(X_test_data)[:,1:]
y = np.array(y_train_data)[:,1]
X = np.array(X_train_data)[:,1:]

In [4]:
n_estimators_feat = 80
n_estimators_pred = 190

n_neighbors_list = [65, 70, 75, 80, 85]
max_samples_list = [175, 195, 200, 205, 210]
feature_num_list = [40, 45, 50, 55, 60]

max_score = -10000
best_n_neighbors = 75
best_max_samples = 200
best_feature_num = 50

for n_neighbors in tqdm(n_neighbors_list):
    for max_samples in max_samples_list:
        for feature_num in feature_num_list:
            X_train, X_test = fill_missing_values(X, X_t, n_neighbors=n_neighbors)
            
            X_train, y_train = remove_outliers(X_train,y, max_samples=max_samples)

            X_train, X_test = select_features(X_train, y_train, X_test,feature_num = feature_num, n_estimators=n_estimators_feat)

            score = train_k_fold(X_train,y_train, n_estimators=n_estimators_pred)
            
            if(score > 0.623):
                print("score {}: feature estimator {}, predict estimator {}"\
                      .format(score, n_estimators_feat, n_estimators_pred))
            
            if(score > max_score):
                max_score = score
                best_n_neighbors = n_neighbors
                best_max_samples = max_samples
                best_feature_num = feature_num

  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = 

  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = 

  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = 

score 0.6233348403830655: feature estimator 80, predict estimator 190


  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = 

  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = 

  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std
  X_norma = 

In [5]:
print(best_n_neighbors)
print(best_max_samples)
print(best_feature_num)

75
200
50
