In [1]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import (f_regression, SelectFromModel)
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import KNNImputer
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures, normalize
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt

import pandas as pd
from autofeat import FeatureSelector

%matplotlib inline

In [2]:
def fill_missing_values(X, X_test):
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    imp.fit(X)
    X = imp.transform(X)
    X_test = imp.transform(X_test)
    return X, X_test

def fill_missing_values_KNN(X, X_test): 
    #normalization
#     X_idx = np.array([[i for i in range(0,X.shape[1])]])
#     X_test_idx = np.array([[i for i in range(0,X_test.shape[1])]])
    
    X_std = np.nanstd(X,axis=0,keepdims=True)
    X_ave = np.nanmean(X,axis=0,keepdims=True)
    X_test_std = np.nanstd(X_test,axis=0,keepdims=True)
    X_test_ave = np.nanmean(X_test,axis=0,keepdims=True)
    
    X_norma = (X-X_ave)/X_std
    X_test_norma = (X_test-X_test_ave)/X_test_std

    imputer = KNNImputer(missing_values=np.nan, n_neighbors=75, weights = 'distance')
    X_norma_fixed = imputer.fit_transform(X_norma)
    X_test_norma_fixed = imputer.fit_transform(X_test_norma)
#     X_norma_fixed = X_norma_fixed[1:,:]
#     X_test_norma_fixed = X_test_norma_fixed[1:,:]
    
#     X_fixed = (X_norma_fixed+X_ave)*(X_max-X_min)
#     X_test_fixed = (X_test_norma+X_test_ave)*(X_test_max-X_test_min)
    return X_norma_fixed, X_test_norma_fixed

def remove_outliers(X, y):
    iforest = IsolationForest(max_samples=200,  random_state=1, contamination='auto',)
    iforest.fit(X)
    print("IsolationForest-Traing data shape before removed: {}".format(X.shape))
    outlier_pred = iforest.predict(X)
    X, y = X[(outlier_pred != -1), :], y[(outlier_pred != -1)]
    print("IsolationForest-Traing data shape after removed: {}".format(X.shape))
    return X, y



# def select_features(X, y, X_test, feature_num1=150, feature_num2=100):
#     scaler = StandardScaler().fit(X, y)
#     X = scaler.transform(X)
#     X_test = scaler.transform(X_test)
    
#     features_scores = f_regression(X, y)[0]
#     indices_fr = np.asarray(list(features_scores)).argsort()[-feature_num1:][::-1]
    
#     rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, random_state=1)
#     rf.fit(X, y)
#     indices_rf = np.asarray(list(rf.feature_importances_)).argsort()[-feature_num2:][::-1]
    
#     indices = list(np.union1d(indices_rf, indices_fr))
#     X = np.take(X, indices, axis = 1)
#     X_test = np.take(X_test, indices, axis = 1)
    
#     return X, X_test

def select_features(X, y, X_test, feature_num=100):
    scaler = StandardScaler().fit(X, y)
    X = scaler.transform(X)
    X_test = scaler.transform(X_test)
    
    rf = RandomForestRegressor(n_jobs=-1, n_estimators=75, random_state=1)
#     rf = ExtraTreesRegressor(n_jobs=-1, max_depth=None, n_estimators=180, random_state=1, min_samples_split=3, max_features=None)
    rf.fit(X, y)
    indices = np.asarray(list(rf.feature_importances_)).argsort()[-feature_num:][::-1]
    
    X = np.take(X, indices, axis = 1)
    X_test = np.take(X_test, indices, axis = 1)
    
    return X, X_test

def feature_reduction(X_train, X_test,n_component):
    pca = PCA(n_components=n_component)
    X_train_reduced = pca.fit_transform(X_train)
    singulars = pca.singular_values_
    print("chosen singular values, max: ", np.max(singulars)," and min:",np.min(singulars))
    X_test_reduced = pca.fit_transform(X_test)
    singulars = pca.singular_values_
    print("chosen singular values, max: ", np.max(singulars)," and min:",np.min(singulars))
    return X_train_reduced,X_test_reduced


#https://github.com/KonstantinosBarmpas/Advanced-Machine-Learning-Projects/blob/master/Task%201/Task_1_AML.ipynb
def auto_feature_extraction(X, y, X_test):
    fsel = FeatureSelector(verbose=1)
    
    feature_names = [str(i) for i in range(0,X.shape[1])]
    X_pd = pd.DataFrame(X, columns=feature_names)
    new_X = fsel.fit_transform(X_pd, y)
    print("len of columns:", len(new_X.columns))
    print(new_X.columns)
    index_chosen = [int(new_X.columns[i]) for i in range(0,len(new_X.columns))]
    return X[:,index_chosen],X_test[:,index_chosen]

In [3]:
X_train_data = pd.read_csv('X_train.csv')
y_train_data = pd.read_csv('y_train.csv')
X_test_data = pd.read_csv('X_test.csv')

## 1. Imputation of Missing Values
* [Reference](https://scikit-learn.org/stable/modules/impute.html)
* We use median of column instead of mean

In [4]:
indices_test = np.array(X_test_data)[:,0]
X_test = np.array(X_test_data)[:,1:]
y_train = np.array(y_train_data)[:,1]
X_train = np.array(X_train_data)[:,1:]

In [5]:
#mixed way to impute missing data
# alpha = 0.5
# X_train_data1, X_test_data1 = fill_missing_values(X_train_data, X_test_data)
# X_train_data2, X_test_data2 = fill_missing_values_KNN(X_train_data, X_test_data)
# X_train_data = alpha*X_train_data1+(1-alpha)*X_train_data2
# X_test_data = alpha*X_test_data1+(1-alpha)*X_test_data2

X_train, X_test = fill_missing_values_KNN(X_train, X_test)
print("shape of X_train: ",X_train.shape)
print("shape of X_test: ",X_test.shape)

  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_test_ave)/X_test_std


shape of X_train:  (1212, 828)
shape of X_test:  (776, 828)


## 2. Outlier Detection
* [reference_sklearn](https://scikit-learn.org/stable/modules/outlier_detection.html)
* [reference_in_detail](https://practicaldatascience.co.uk/machine-learning/how-to-use-the-isolation-forest-model-for-outlier-detection)

In [6]:
X, y = X_train,y_train
X, y = remove_outliers(X, y)

IsolationForest-Traing data shape before removed: (1212, 828)
IsolationForest-Traing data shape after removed: (1206, 828)


## 3. Feature Selection

In [7]:
X_feature_from_batch1, X_test_feature_from_batch1 = X, X_test

In [8]:
#splited feature selection\
split_num = 20
split_size = int(X_test_feature_from_batch1.shape[1]/split_num)
first_flag = 1
for i in range(0,split_num):
    split_index = [j for j in range(i*split_size,min((i+1)*split_size,X_test_feature_from_batch1.shape[1]))]
    X_feature_subset = X_feature_from_batch1[:,split_index]
    X_test__feature_subset = X_test_feature_from_batch1[:,split_index]
    X_feature_subset, X_test__feature_subset = auto_feature_extraction(X_feature_subset, y, X_test__feature_subset)

    if(first_flag):
        first_flag = 0
        X_feature_from_batch2 = X_feature_subset
        X_test_feature_from_batch2 = X_test__feature_subset
    else:

        X_feature_from_batch2 = np.hstack((X_feature_from_batch2,X_feature_subset))
        X_test_feature_from_batch2 = np.hstack((X_test_feature_from_batch2,X_test__feature_subset))
print("shape of X_feature_from_batch2:",X_test_feature_from_batch2.shape)
print("shape of X_test_feature_from_batch2:",X_test_feature_from_batch2.shape)



[featsel] Scaling data...

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
Internal error at <numba.typeinfer.CallConstraint object at 0x7fe637cbf3a0>.
Failed in nopython mode pipeline (step: nopython frontend)
Unsupported constraint encountered: raise $24load_global.0.175

File "../../../../../anaconda3/envs/aml_project/lib/python3.8/site-packages/autofeat/nb_utils.py", line 12:
def nb_apply_along_axis(func1d, axis, arr):
    <source elided>
    assert arr.ndim == 2
    assert axis in [0, 1]
    ^

[1] During: resolving callee type: type(CPUDispatcher(<function nb_nanmean at 0x7fe6508514c0>))
[2] During: typing of call at /Users/zdh/anaconda3/envs/aml_project/lib/python3.8/site-packages/autofeat/nb_utils.py (36)

Enable logging at debug level for details.

File "../../../../../anaconda3/envs/aml_project/lib/python3.8/site-packages/autofeat/nb_utils.py", line 36:
def nb_standard_scale(array):
    return (array - nb_nanmean(array, 0)) / nb_nanstd(array, 0)
    ^


In [None]:
X_feature_from_batch3, X_test_feature_from_batch3 = select_features(X_feature_from_batch2, y, X_test_feature_from_batch2,feature_num = 53)

## 4. Extra Tree

In [None]:
def fit_model_and_pred(degree, X_train, y_train, X_test):
    model = ExtraTreesRegressor(n_jobs=1, max_depth=None, n_estimators=195, random_state=0, min_samples_split=2, max_features=None)

    model.fit(X_train, y_train)
    
    y_training_pred = model.predict(X_train)
    score = r2_score(y_train, y_training_pred)
    print('The obtained training r2 score is : ',score)
    y_pred = model.predict(X_test) 
    return y_pred

def train_k_fold(X, y, fold_num=10):
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=False)
    kf.get_n_splits(X)
    train_score = 0.0
    test_score = 0.0
    
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        y_pred = fit_model_and_pred(1, X_train, y_train, X_val)
        score = r2_score(y_val, y_pred)

        print('The obtained validation r2 score is : ',score)
        train_score += score
        test_score += score

    print("Training score: %f"%(train_score/fold_num))
    print("Validation score: %f"%(test_score/fold_num))
    
def train_k_fold_predict(X, y,X_test, fold_num=10):
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=False)
    kf.get_n_splits(X)
    y_test_predict = np.zeros(X_test.shape[0])
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        y_pred = fit_model_and_pred(1, X_train, y_train, X_test)
        y_test_predict += y_pred

    return y_test_predict/fold_num

In [None]:
train_k_fold(X_feature_from_batch3,y,fold_num = 10) #Knn with std and feature num = 100

In [None]:
# Y_test_pred = fit_model_and_pred(1, X, y, X_test)
Y_test_pred = train_k_fold_predict(X_feature_from_batch3, y,X_test_feature_from_batch3)
final_res = np.vstack((indices_test, Y_test_pred)).T

In [None]:
df_res = pd.DataFrame(final_res)
df_res.to_csv("our_result.csv", header = ["id", "y"], index=False)