In [47]:
import numpy as np
from sklearn import (
    decomposition,
    ensemble,
    feature_selection,
    impute,
    linear_model,
    model_selection,
    pipeline,
    preprocessing,
    svm,
)
import lightgbm as lgb
import catboost as cat
from xgboost import XGBRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic

# from explore import plot_outliers


def main():
    X_train, y_train, X_test = load_data()
    print(X_train.shape, y_train.shape, X_test.shape)

    X_train, y_train = remove_outliers(X_train, y_train, X_test)
    X_train, X_test = preprocess(X_train, X_test)
    X_train, X_test = select_features(X_train, y_train, X_test)
    print(X_train.shape, y_train.shape, X_test.shape)

    model = pipeline.make_pipeline(
        ensemble.StackingRegressor(
            estimators=[
                ("svr", svm.SVR(C=60.0, epsilon=1e-05, kernel='rbf')),
                ("gbm", ensemble.GradientBoostingRegressor(learning_rate=0.095)),
                ("etr", ensemble.ExtraTreesRegressor()),
                ('lgb', lgb.LGBMRegressor())

            ],
            final_estimator=linear_model.Ridge(),
        )
    )
    score = model_selection.cross_val_score(model, X_train, y_train, cv=5, n_jobs=6)
    print(score.mean(), score.std())  # 0.6844646263431688 0.02663668357699777

    create_submission(model, X_train, y_train, X_test)


def load_data():
    X_train = np.genfromtxt("X_train.csv", delimiter=",", skip_header=1)[:, 1:]
    y_train = np.genfromtxt("y_train.csv", delimiter=",", skip_header=1)[:, 1:]
    X_test = np.genfromtxt("X_test.csv", delimiter=",", skip_header=1)[:, 1:]
    y_train = y_train.ravel()
    return X_train, y_train, X_test


def remove_outliers(X_train, y_train, X_test):
    model = pipeline.make_pipeline(
        preprocessing.RobustScaler(),
        impute.SimpleImputer(strategy="median"),
        decomposition.PCA(n_components=2),
        ensemble.IsolationForest(contamination=0.0455),  # type: ignore
    )
    pred = model.fit_predict(X_train)
    # plot_outliers(model[:3].transform(X_train), model[:3].transform(X_test), pred)
    X_train, y_train = X_train[pred > 0], y_train[pred > 0]
    return X_train, y_train


def preprocess(X_train, X_test):
    model = pipeline.make_pipeline(
        preprocessing.StandardScaler(),
        impute.SimpleImputer(strategy="median"),
    )
    X_train = model.fit_transform(X_train)
    X_test = model.transform(X_test)
    return X_train, X_test


def select_features(X_train, y_train, X_test):
    model = pipeline.make_pipeline(
        feature_selection.VarianceThreshold(),
        feature_selection.SelectKBest(score_func=feature_selection.f_regression, k=195),
        feature_selection.SelectFromModel(linear_model.Lasso(0.1)),
    )
    model.fit(X_train, y_train)
    X_train = model.transform(X_train)
    X_test = model.transform(X_test)
    return X_train, X_test


def create_submission(model, X_train, y_train, X_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    pred = np.vstack((np.arange(X_test.shape[0]), pred)).T
    np.savetxt("submission_test.csv", pred, delimiter=",", header="id,y", comments="")

In [48]:
main()

(1212, 832) (1212,) (776, 832)
(1156, 81) (1156,) (776, 81)
0.687761286777815 0.030280854381405153
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002311 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20655
[LightGBM] [Info] Number of data points in the train set: 1156, number of used features: 81
[LightGBM] [Info] Start training from score 69.978374
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002102 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20655
[LightGBM] [Info] Number of data points in the train set: 924, number of used features: 81
[LightGBM] [Info] Start training from score 69.954545
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000955 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20655
[LightGBM] [Info] N