In [45]:
import numpy as np
import pandas as pd
import lightgbm as lgb 
import catboost as cat
from xgboost import XGBRegressor
from sklearn import (
    decomposition,
    ensemble,
    feature_selection,
    impute,
    linear_model,
    model_selection,
    pipeline,
    preprocessing,
    svm,
)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic
from sklearn.metrics import r2_score, mean_squared_error
from pyod.models.ecod import ECOD

In [46]:
def load_data():
    X_train = pd.read_csv('X_train.csv', usecols=lambda column: column != 'id')
    y_train = pd.read_csv('y_train.csv', usecols=lambda column: column != 'id')
    X_test = pd.read_csv('X_test.csv', usecols=lambda column: column != 'id')
    sample = pd.read_csv('sample.csv')
    return X_train, y_train, X_test, sample

def outlier(X_train, y_train):
    model = pipeline.make_pipeline(
        preprocessing.RobustScaler(),
        impute.SimpleImputer(strategy='median'),
        decomposition.PCA(n_components=2),
        ensemble.IsolationForest(contamination=0.0455)
    )
    mask = (model.fit_predict(X_train) > 0).astype(int)==1
    X_train= pd.DataFrame(X_train[mask]).reset_index(drop=True)
    y_train= pd.DataFrame(y_train[mask]).reset_index(drop=True)
    return X_train, y_train

def imput(X_train, X_test):
    model = pipeline.make_pipeline(
        preprocessing.StandardScaler(),
        impute.SimpleImputer(strategy='median'),
    )
    X_train = model.fit_transform(X_train)
    X_test = model.transform(X_test)
    return X_train, X_test              

def select_features(X_train, y_train, X_test):
    model = pipeline.make_pipeline(
        feature_selection.VarianceThreshold(),
        feature_selection.SelectKBest(score_func=feature_selection.f_regression, k=195),
        feature_selection.SelectFromModel(linear_model.Lasso(0.1))
    )              
    model.fit(X_train, y_train)
    X_train = model.transform(X_train)
    X_test = model.transform(X_test)
    return X_train, X_test

def make_submission(model, X_train, y_train, X_test, sample):
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    sample['y'] = y_test_pred
    sample.to_csv("submission.csv", index=False)
    

In [53]:
X_train_original, y_train_original, X_test_original, sample = load_data()
print('The original shapes of X_train, y_train and X_test are: ', 
      X_train_original.shape, y_train_original.shape, X_test_original.shape)

X_train, y_train = outlier(X_train_original, y_train_original)
X_train, X_test = imput(X_train, X_test_original)
X_train, X_test = select_features(X_train, y_train, X_test)
print('The preprocessed shapes of X_train, y_train and X_test are: ', 
      X_train.shape, y_train.shape, X_test.shape)

model = pipeline.make_pipeline(
    ensemble.StackingRegressor(
        estimators=[
            ("svr", svm.SVR(C=60.0, epsilon=1e-05, kernel='rbf')),
            ("etr", ensemble.ExtraTreesRegressor()),
            ('lgb', lgb.LGBMRegressor(verbose=0)),
            ("gbm", ensemble.GradientBoostingRegressor(learning_rate=0.095)),
            ('cat', cat.CatBoostRegressor(verbose=0))

        ],
        final_estimator=linear_model.ElasticNet(),
    )
)
score = model_selection.cross_val_score(model, X_train, y_train, cv=5, n_jobs=6)
print(score.mean(), score.std()) 

make_submission(model, X_train, y_train, X_test, sample)

The original shapes of X_train, y_train and X_test are:  (1212, 832) (1212, 1) (776, 832)


  y = column_or_1d(y, warn=True)


The preprocessed shapes of X_train, y_train and X_test are:  (1156, 81) (1156, 1) (776, 81)
0.6904098400193309 0.0279600465872217


  y = column_or_1d(y, warn=True)


Learning rate set to 0.041892
0:	learn: 9.1658848	total: 141ms	remaining: 2m 21s
1:	learn: 9.0196760	total: 152ms	remaining: 1m 16s
2:	learn: 8.8633310	total: 163ms	remaining: 54.1s
3:	learn: 8.7275487	total: 174ms	remaining: 43.3s
4:	learn: 8.5970570	total: 185ms	remaining: 36.9s
5:	learn: 8.4706758	total: 196ms	remaining: 32.5s
6:	learn: 8.3445886	total: 208ms	remaining: 29.5s
7:	learn: 8.2158960	total: 219ms	remaining: 27.2s
8:	learn: 8.1074071	total: 231ms	remaining: 25.5s
9:	learn: 7.9935525	total: 245ms	remaining: 24.2s
10:	learn: 7.8888924	total: 258ms	remaining: 23.2s
11:	learn: 7.7835745	total: 270ms	remaining: 22.2s
12:	learn: 7.6955582	total: 282ms	remaining: 21.4s
13:	learn: 7.6065121	total: 293ms	remaining: 20.7s
14:	learn: 7.5207623	total: 307ms	remaining: 20.2s
15:	learn: 7.4327622	total: 319ms	remaining: 19.6s
16:	learn: 7.3469717	total: 332ms	remaining: 19.2s
17:	learn: 7.2654706	total: 344ms	remaining: 18.8s
18:	learn: 7.1966112	total: 355ms	remaining: 18.3s
19:	learn

In [43]:
X_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(y_train)

In [50]:
# Initialize KFold
kf = model_selection.KFold(n_splits=20, shuffle=True, random_state=88)

rmse_scores = []
r2_scores = []
best_r2_score = -np.inf  # Initialize best R² to a very low value
best_model = None  # To store the best model
best_scaler = None  # To store the scaler for the best model

# Loop through each fold
for fold_num, (train_index, val_index) in enumerate(kf.split(X_train), start=1):
    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Standardize the features
    scaler = preprocessing.StandardScaler()
    X_train_fold = scaler.fit_transform(X_train_fold)  # Fit and transform on training data
    X_val_fold = scaler.transform(X_val_fold)

    # Create and fit the model

    model.fit(X_train_fold, y_train_fold)
    
    # model = stacked_regressor

    # model.fit(X_train_fold, y_train_fold)

    # Predict on the validation set
    y_pred = model.predict(X_val_fold)

    # Calculate RMSE and R²
    rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred))
    r2 = r2_score(y_val_fold, y_pred)

    # Print metrics for the current fold
    print(f"Fold {fold_num}: RMSE = {rmse:.4f}, R² = {r2:.4f}")

    rmse_scores.append(rmse)
    r2_scores.append(r2)

    # Keep track of the best model based on R² score
    if r2 > best_r2_score:
        best_r2_score = r2
        best_model = model  # Store the model with the best R²
        best_scaler = scaler  # Store the corresponding scaler

# Print the average RMSE and R² across all folds
print(f"\nAverage RMSE: {np.mean(rmse_scores):.4f}")
print(f"Average R²: {np.mean(r2_scores):.4f}")
print(f"\nBest R² Score: {best_r2_score:.4f}")


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 69.912568
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001865 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 69.870159
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000992 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001653 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 69.957195
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002947 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 69.917995
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001708 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 69.998179
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001020 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 70.038724
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001970 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001597 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 69.892532
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003418 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 69.886105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001530 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001682 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 69.960838
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001348 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 69.936219
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002218 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001777 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 69.928962
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 69.965831
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001879 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001482 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 70.018215
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000940 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 70.006834
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003307 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 69.896175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 69.858770
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001636 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 70.031876
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000883 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 69.996583
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 69.875228
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 69.855353
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002360 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001881 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 69.989071
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000794 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 69.966970
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000968 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 70.033698
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000867 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 70.011390
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001971 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 70.020947
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000830 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 69.949886
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001711 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 69.880692
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001436 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 69.849658
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001970 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001539 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 69.999089
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002571 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 69.939636
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000639 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1098, number of used features: 77
[LightGBM] [Info] Start training from score 69.968124
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000718 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start training from score 69.953303
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 878, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001883 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1099, number of used features: 77
[LightGBM] [Info] Start training from score 69.886260
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000736 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 879, number of used features: 77
[LightGBM] [Info] Start training from score 69.797497
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000879 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 879, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1099, number of used features: 77
[LightGBM] [Info] Start training from score 69.922657
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002364 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 879, number of used features: 77
[LightGBM] [Info] Start training from score 69.873720
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002699 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 879, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1099, number of used features: 77
[LightGBM] [Info] Start training from score 70.013649
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000943 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 879, number of used features: 77
[LightGBM] [Info] Start training from score 69.984073
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001674 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 879, number of used features: 77
[LightGBM] [Info] Start tra

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001266 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 1099, number of used features: 77
[LightGBM] [Info] Start training from score 69.949045
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001919 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 879, number of used features: 77
[LightGBM] [Info] Start training from score 69.911263
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000936 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19635
[LightGBM] [Info] Number of data points in the train set: 879, number of used features: 77
[LightGBM] [Info] Start tra

In [52]:
# Use the best model and corresponding scaler to predict on X_test
X_test_standardized = best_scaler.transform(X_test)  # Standardize X_test using the best scaler
y_test_pred = best_model.predict(X_test_standardized)

sample['y'] = y_test_pred
sample.to_csv("submission.csv", index=False)

