In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.impute import (SimpleImputer,KNNImputer)
from sklearn.ensemble import (RandomForestRegressor, IsolationForest)
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures, normalize
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

In [10]:
def fill_missing_values_KNN(X, X_test): 
    #normalization
    X_std = np.nanstd(X,axis=0,keepdims=True)
    X_ave = np.nanmean(X,axis=0,keepdims=True)
    
    X_norma = (X-X_ave)/X_std
    X_test_norma = (X_test-X_ave)/X_std

    imputer = KNNImputer(missing_values=np.nan, n_neighbors=75, weights = 'distance')
    X_norma_fixed = imputer.fit_transform(X_norma)
    X_test_norma_fixed = imputer.fit_transform(X_test_norma)

    return X_norma_fixed, X_test_norma_fixed

# def fill_missing_values_mean(X, X_test):
#     imp = SimpleImputer(missing_values=np.nan, strategy='median')
#     imp.fit(X)
#     X = imp.transform(X)
#     imp.fit(X_test)
#     X_test = imp.transform(X_test)
#     return X, X_test

def fill_missing_values_median(X):
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    imp.fit(X)
    X = imp.transform(X)
    return X

def remove_outliers_XY(X, y, y_additional_degree):
#     print("IsolationForest-Traing data shape before removed: {}".format(X.shape))
    
    Y = y
    for degree in y_additional_degree:
        Y = np.c_[Y, y**degree]
        
    Z = np.c_[X, Y]
    iforest = IsolationForest(max_samples=200, random_state=1, contamination=0.005)
    iforest.fit(Z)
    iforest_outlier_pred = iforest.predict(Z)
    
    Z = np.c_[X, Y]
    local = LocalOutlierFactor(n_neighbors=150, contamination=0.005)
    local.fit(Z)
    local_outlier_pred = local.fit_predict(Z)

    mask = np.logical_and((iforest_outlier_pred!=-1), (local_outlier_pred!=-1))
#     X , y = X[mask, :], y[mask]
#     print("IsolationForest-Traing data shape after removed: {}".format(X.shape))
    return mask



def find_missing_value_and_move_outliers(X, y, X_test,y_additional_degree):
    print("Traing data shape before impute and outlier remove: {}".format(X.shape))
    print("Testing data shape before impute and outlier remove: {}".format(X_test.shape))
    X_nan = np.isnan(X)  
    X_median = fill_missing_values_median(X)
    print(X_median.shape)
    
    IstInLiers = remove_outliers_XY(X_median, y, y_additional_degree)
    X_median = X_median[IstInLiers, :]
    X_nan = X_nan[IstInLiers, :]
    y = y[IstInLiers]
    print(X_median.shape)
    print(X_nan.shape)    
    X_median[X_nan] = np.nan
    
    X_KNN,X_test_KNN = fill_missing_values_KNN(X_median, X_test)
    print("Traing data shape after impute and outlier remove: {}".format(X_KNN.shape))
    print("Testing data shape after impute and outlier remove: {}".format(X_test_KNN.shape))
    
    return X_KNN,X_test_KNN,y

In [3]:
def select_features(X, y, X_test, feature_num=100):
    scaler = StandardScaler().fit(X, y)
    X = scaler.transform(X)
    X_test = scaler.transform(X_test)
    
    rf = RandomForestRegressor(n_jobs=-1, n_estimators=75, random_state=1)
#     rf = ExtraTreesRegressor(n_jobs=-1, max_depth=None, n_estimators=180, random_state=1, min_samples_split=3, max_features=None)
    rf.fit(X, y)
    indices = np.asarray(list(rf.feature_importances_)).argsort()[-feature_num:][::-1]
    
    X = np.take(X, indices, axis = 1)
    X_test = np.take(X_test, indices, axis = 1)
    
    return X, X_test
def auto_feature_extraction(X, y, X_test):
    fsel = FeatureSelector(verbose=1)
    
    feature_names = [str(i) for i in range(0,X.shape[1])]
    X_pd = pd.DataFrame(X, columns=feature_names)
    new_X = fsel.fit_transform(X_pd, y)
    print("len of columns:", len(new_X.columns))
    print(new_X.columns)
    index_chosen = [int(new_X.columns[i]) for i in range(0,len(new_X.columns))]
    return X[:,index_chosen],X_test[:,index_chosen]

In [4]:
X_train_data = pd.read_csv('X_train.csv')
y_train_data = pd.read_csv('y_train.csv')
X_test_data = pd.read_csv('X_test.csv')

indices_test = np.array(X_test_data)[:,0]
X_test = np.array(X_test_data)[:,1:]
y_train = np.array(y_train_data)[:,1]
X_train = np.array(X_train_data)[:,1:]

## 1. Imputation of Missing Values
* [Reference](https://scikit-learn.org/stable/modules/impute.html)
* We use median of column instead of mean

In [5]:
# X_train_missing_indices = X_train[X_train==np]
# X_train, X_test = fill_missing_values(X_train, X_test)
# print(X_train.shape)
# print(X_test.shape)

## 2. Outlier Detection
* [reference_sklearn](https://scikit-learn.org/stable/modules/outlier_detection.html)
* [reference_in_detail](https://practicaldatascience.co.uk/machine-learning/how-to-use-the-isolation-forest-model-for-outlier-detection)

In [11]:
# X_train,y_train = remove_outliers(X_train,y_train)
X_train_processed, X_test_processed, y_processed = find_missing_value_and_move_outliers(X_train, y_train, X_test,[0.5,1.5,2,-1])

Traing data shape before impute and outlier remove: (1212, 832)
Testing data shape before impute and outlier remove: (776, 832)
(1212, 832)
(1198, 832)
(1198, 832)


  X_norma = (X-X_ave)/X_std
  X_test_norma = (X_test-X_ave)/X_std


Traing data shape after impute and outlier remove: (1198, 828)
Testing data shape after impute and outlier remove: (776, 828)


## 3. Feature Selection

In [7]:
def auto_feature_extraction(X, y, X_test):
    fsel = FeatureSelector(verbose=1)
    
    feature_names = [str(i) for i in range(0,X.shape[1])]
    X_pd = pd.DataFrame(X, columns=feature_names)
    new_X = fsel.fit_transform(X_pd, y)
    print("len of columns:", len(new_X.columns))
    print(new_X.columns)
    index_chosen = [int(new_X.columns[i]) for i in range(0,len(new_X.columns))]
    return X[:,index_chosen],X_test[:,index_chosen]

In [8]:
X_feature_from_batch1, X_test_feature_from_batch1 = X_train_processed, X_test_processed

In [9]:
from autofeat import FeatureSelector, AutoFeatRegressor
split_num = 20
split_size = int(X_test_feature_from_batch1.shape[1]/split_num)
first_flag = 1
for i in range(0,split_num):
    split_index = [j for j in range(i*split_size,min((i+1)*split_size,X_test_feature_from_batch1.shape[1]))]
    X_feature_subset = X_feature_from_batch1[:,split_index]
    X_test__feature_subset = X_test_feature_from_batch1[:,split_index]
    X_feature_subset, X_test__feature_subset = auto_feature_extraction(X_feature_subset, y_processed, X_test__feature_subset)

    if(first_flag):
        first_flag = 0
        X_feature_from_batch2 = X_feature_subset
        X_test_feature_from_batch2 = X_test__feature_subset
    else:

        X_feature_from_batch2 = np.hstack((X_feature_from_batch2,X_feature_subset))
        X_test_feature_from_batch2 = np.hstack((X_test_feature_from_batch2,X_test__feature_subset))
print("shape of X_feature_from_batch2:",X_test_feature_from_batch2.shape)
print("shape of X_test_feature_from_batch2:",X_test_feature_from_batch2.shape)

[featsel] Scaling data...

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
Internal error at <numba.typeinfer.CallConstraint object at 0x7fdc5ef6fe20>.
Failed in nopython mode pipeline (step: nopython frontend)
Unsupported constraint encountered: raise $24load_global.0.175

File "../../../../../anaconda3/envs/aml_project/lib/python3.8/site-packages/autofeat/nb_utils.py", line 12:
def nb_apply_along_axis(func1d, axis, arr):
    <source elided>
    assert arr.ndim == 2
    assert axis in [0, 1]
    ^

[1] During: resolving callee type: type(CPUDispatcher(<function nb_nanmean at 0x7fdc5e981280>))
[2] During: typing of call at /Users/zdh/anaconda3/envs/aml_project/lib/python3.8/site-packages/autofeat/nb_utils.py (36)

Enable logging at debug level for details.

File "../../../../../anaconda3/envs/aml_project/lib/python3.8/site-packages/autofeat/nb_utils.py", line 36:
def nb_standard_scale(array):
    return (array - nb_nanmean(array, 0)) / nb_nanstd(array, 0)
    ^


In [None]:
X_feature_from_batch3, X_test_feature_from_batch3 = select_features(X_feature_from_batch2, y_processed, X_test_feature_from_batch2,feature_num = 60)
print("shape of X_feature_from_batch3: ", X_feature_from_batch3.shape)
print("shape of X_test_feature_from_batch3: ", X_test_feature_from_batch3.shape)

## 4. Extra Tree

In [None]:



def custom_r2(prediction, train_data):
    """Regular r2 cost function returned as a tuple to be used with lgb"""
    labels = train_data.get_label()
    return 'r2', r2_score(labels, prediction), True

def fit_model_and_pred(degree, X_train, y_train, X_val, y_val, X_test):
   
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'num_leaves': 1700,
        'learning_rate': 0.025,
        'max_depth': 11,
        'n_estimators': 1000,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0,
        'num_iterations':600,
    }
    
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=500,
                    feval=custom_r2,
                    valid_sets={lgb_train, lgb_eval},
                    early_stopping_rounds=100
                   )

    y_pred = gbm.predict(X_test) 
    
    return y_pred

def train_k_fold(X, y, fold_num=10):
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=True)
    kf.get_n_splits(X)
    test_score = 0.0
    
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        y_pred = fit_model_and_pred(1, X_train, y_train, X_val, y_val, X_val)
        score = r2_score(y_val, y_pred)

        print('The obtained validation r2 score is : ',score)
        test_score += score
    print("Validation score: %f"%(test_score/fold_num))
    
def train_k_fold_predict(X, y,X_test, fold_num=10):
    kf = KFold(n_splits=fold_num, random_state=None, shuffle=True)
    kf.get_n_splits(X)
    test_score = 0.0
    y_test_predict = np.zeros(X_test.shape[0])
    for train_index, test_index in kf.split(X):
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]
        score = r2_score(y_val, fit_model_and_pred(1, X_train, y_train, X_val, y_val, X_val))
        test_score += score
        
        y_pred = fit_model_and_pred(1, X_train, y_train, X_val, y_val, X_test)
        y_test_predict += y_pred
        
    print("Validation score: %f"%(test_score/fold_num))
    return y_test_predict/fold_num

# 以下是树训练模型
# def fit_model_and_pred(X_train, y_train, X_val, y_val, X_test):
#     model = ExtraTreesRegressor(n_jobs=1, max_depth=None, n_estimators=195, random_state=0, min_samples_split=2, max_features=None)

#     model.fit(X_train, y_train)
    
#     y_val_pred = model.predict(X_val)
#     score = r2_score(y_val, y_val_pred)
    
#     y_pred = model.predict(X_test) 
    
#     return score, y_pred


# def train_k_fold(X, y, fold_num=10):
#     kf = KFold(n_splits=fold_num, random_state=None, shuffle=False)
#     kf.get_n_splits(X)
#     test_score = 0.0
    
#     for train_index, test_index in kf.split(X):
#         X_train, X_val = X[train_index], X[test_index]
#         y_train, y_val = y[train_index], y[test_index]

#         y_pred = fit_model_and_pred(1, X_train, y_train, X_val, y_val, X_val)
#         score = r2_score(y_val, y_pred)

#         print('The obtained validation r2 score is : ',score)
#         test_score += score
#     print("Validation score: %f"%(test_score/fold_num))
    

# def train_k_fold_predict(X, y, X_test, fold_num=10):
#     kf = KFold(n_splits=fold_num)
#     kf.get_n_splits(X)
#     y_test_predict = np.zeros(X_test.shape[0])
#     cnt = 0
#     val_score = 0.0
    
#     for train_index, test_index in kf.split(X):
#         X_train, X_val = X[train_index], X[test_index]
#         y_train, y_val = y[train_index], y[test_index]

#         score, y_pred = fit_model_and_pred(X_train, y_train, X_val, y_val, X_test)
#         val_score += score
#         if(score > 0.7):
#             y_test_predict += y_pred
#             cnt += 1
#     return val_score/fold_num, y_test_predict/cnt


In [None]:
train_k_fold(X_feature_from_batch3,y_processed) #Knn with std and feature num = 100

In [None]:
Y_test_pred = train_k_fold_predict(X_feature_from_batch3,y_processed,X_test_feature_from_batch3)
final_res = np.vstack((indices_test, Y_test_pred)).T

In [None]:
pd.DataFrame(final_res).to_csv("our_result.csv", header = ["id", "y"], index=False)