In [1]:
import catboost as cat
import lightgbm as lgb
import numpy as np
import pandas as pd
from cubist import Cubist
from pyod.models.ecod import ECOD
from sklearn import (
    base,
    decomposition,
    ensemble,
    feature_selection,
    impute,
    linear_model,
    model_selection,
    pipeline,
    preprocessing,
    svm,
)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor

In [4]:
def load_data():
    """Read the training/test feature tables, the training target and the sample submission.

    All four CSV files are read from the current working directory. The
    ``id`` column is skipped for the two feature tables and the target table;
    the sample submission is read with all of its columns.

    Returns:
        tuple: ``(X_train, y_train, X_test, sample)`` as pandas DataFrames.
    """
    def _read_without_id(csv_name):
        # ``usecols`` accepts a predicate over column names; keep everything but 'id'.
        return pd.read_csv(csv_name, usecols=lambda name: name != 'id')

    features_train = _read_without_id('X_train_rf_20.csv')
    target_train = _read_without_id('y_train.csv')
    features_test = _read_without_id('X_test_rf_20.csv')
    submission_template = pd.read_csv('sample.csv')
    return features_train, target_train, features_test, submission_template

def outlier(X_train, y_train, contamination=0.0455, random_state=None):
    """Drop training rows that an IsolationForest flags as outliers.

    The detector is a pipeline of RobustScaler -> PCA (2 components) ->
    IsolationForest, fitted on ``X_train`` only. Rows whose ``fit_predict``
    label is positive are kept; the same row mask is applied to ``y_train``.

    Args:
        X_train: Training features, indexable by a boolean row mask.
        y_train: Training target with the same number of rows as ``X_train``.
        contamination: Expected outlier fraction passed to IsolationForest.
        random_state: Seed forwarded to both PCA and IsolationForest. Both
            steps can be stochastic, so pass an int to make the set of removed
            rows reproducible. The default ``None`` keeps them unseeded.

    Returns:
        tuple: ``(X_train, y_train)`` as DataFrames with a fresh 0..n-1 index,
        containing only the rows kept by the detector.
    """
    detector = pipeline.make_pipeline(
        preprocessing.RobustScaler(),
        decomposition.PCA(n_components=2, random_state=random_state),
        ensemble.IsolationForest(
            contamination=contamination, random_state=random_state
        ),
    )
    # fit_predict labels rows +1 / -1; keep the positive ones.
    inlier_mask = detector.fit_predict(X_train) > 0
    X_kept = pd.DataFrame(X_train[inlier_mask]).reset_index(drop=True)
    y_kept = pd.DataFrame(y_train[inlier_mask]).reset_index(drop=True)
    return X_kept, y_kept

def imput(X_train, X_test):
    """Standardize the features, then fill missing values with the column median.

    The scaler and the imputer are fitted on ``X_train`` only and then applied
    unchanged to ``X_test``.

    Returns:
        tuple: ``(X_train, X_test)`` after scaling and imputation.
    """
    stages = (
        preprocessing.StandardScaler(),
        impute.SimpleImputer(strategy='median'),
    )
    preprocessor = pipeline.make_pipeline(*stages)
    train_ready = preprocessor.fit_transform(X_train)
    test_ready = preprocessor.transform(X_test)
    return train_ready, test_ready

def select_features(X_train, y_train, X_test, k=195, alpha=0.075):
    """Reduce the feature set with a three-stage selector fitted on the training data.

    Stages, in order: VarianceThreshold (default threshold), SelectKBest with
    ``f_regression`` scores, and SelectFromModel around a Lasso regressor.
    The fitted selector is applied to both ``X_train`` and ``X_test``.

    Args:
        X_train: Training features.
        y_train: Training target. A single-column 2-D target (e.g. a one-column
            DataFrame) is flattened to 1-D before fitting, which avoids the
            column-vector DataConversionWarning raised by scikit-learn.
        X_test: Test features with the same columns as ``X_train``.
        k: Number of features kept by SelectKBest.
        alpha: Regularization strength of the Lasso used by SelectFromModel.

    Returns:
        tuple: ``(X_train, X_test)`` restricted to the selected features.
    """
    target = np.asarray(y_train)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    selector = pipeline.make_pipeline(
        feature_selection.VarianceThreshold(),
        feature_selection.SelectKBest(score_func=feature_selection.f_regression, k=k),
        feature_selection.SelectFromModel(linear_model.Lasso(alpha)),
    )
    selector.fit(X_train, target)
    return selector.transform(X_train), selector.transform(X_test)

def make_submission(model, X_train, y_train, X_test, sample, path="submission.csv"):
    """Fit ``model`` on the training data and write its test predictions to a CSV file.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training target. A single-column 2-D target (e.g. a one-column
            DataFrame) is flattened to 1-D before fitting, so estimators that
            expect a 1-D target do not emit a column-vector warning.
        X_test: Test features to predict on.
        sample: Submission template DataFrame. Its ``'y'`` column is overwritten
            in place with the predictions.
        path: Destination CSV file; defaults to ``"submission.csv"``.
    """
    target = np.asarray(y_train)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    model.fit(X_train, target)
    sample['y'] = model.predict(X_test)
    sample.to_csv(path, index=False)
    

In [6]:
# Load the raw tables and report their shapes.
X_train_original, y_train_original, X_test_original, sample = load_data()
print('The original shapes of X_train, y_train and X_test are: ', 
      X_train_original.shape, y_train_original.shape, X_test_original.shape)

# Preprocess: drop outlier rows from the training set, then select features.
X_train, y_train = outlier(X_train_original, y_train_original)
X_train, X_test = select_features(X_train, y_train, X_test_original)
print('The preprocessed shapes of X_train, y_train and X_test are: ', 
      X_train.shape, y_train.shape, X_test.shape)

# Stacked ensemble: five base regressors blended by an ElasticNet meta-learner.
base_estimators = [
    ("svr", svm.SVR(C=65.0, epsilon=1e-05, kernel='rbf')),
    ("etr", ensemble.ExtraTreesRegressor()),
    ('lgb', lgb.LGBMRegressor(verbose=0)),
    ("gbm", ensemble.GradientBoostingRegressor(learning_rate=0.085)),
    ('cat', cat.CatBoostRegressor(verbose=0)),
]
model = pipeline.make_pipeline(
    ensemble.StackingRegressor(
        estimators=base_estimators,
        final_estimator=linear_model.ElasticNet(),
    )
)

# NOTE(review): the feature selector above was fitted on every training row
# (using y_train) before this cross-validation, so the validation folds have
# already influenced which features are kept; treat the score as optimistic.
# The target is passed as a 1-D array to avoid the column-vector warning;
# y_train itself stays a DataFrame for the cells that follow.
score = model_selection.cross_val_score(
    model, X_train, np.ravel(y_train), cv=5, n_jobs=6
)
print(score.mean(), score.std()) 

make_submission(model, X_train, y_train, X_test, sample)

The original shapes of X_train, y_train and X_test are:  (1212, 832) (1212, 1) (776, 832)


  y = column_or_1d(y, warn=True)
  model = cd_fast.enet_coordinate_descent(


The preprocessed shapes of X_train, y_train and X_test are:  (1156, 133) (1156, 1) (776, 133)
0.677244228067251 0.02507546267455542


  y = column_or_1d(y, warn=True)


In [None]:
# Evaluate the stacked model with a shuffled 20-fold split, keeping the fold
# model (and its scaler) that reaches the highest validation R².
kf = model_selection.KFold(n_splits=20, shuffle=True, random_state=88)

# select_features returns the output of a scikit-learn transform, which is a
# NumPy array by default and has no ``.iloc``. Convert both inputs to arrays so
# positional indexing works whether they are DataFrames or arrays.
X_all = np.asarray(X_train)
y_all = np.asarray(y_train)
if y_all.ndim == 2 and y_all.shape[1] == 1:
    y_all = y_all.ravel()  # 1-D target avoids the column-vector warning

rmse_scores = []
r2_scores = []
best_r2_score = -np.inf  # Initialize best R² to a very low value
best_model = None  # To store the best model
best_scaler = None  # To store the scaler for the best model

# Loop through each fold
for fold_num, (train_index, val_index) in enumerate(kf.split(X_all), start=1):
    # Split the data into training and validation sets for this fold
    X_train_fold, X_val_fold = X_all[train_index], X_all[val_index]
    y_train_fold, y_val_fold = y_all[train_index], y_all[val_index]

    # Standardize the features; the scaler is fitted on the training part only
    scaler = preprocessing.StandardScaler()
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_val_fold = scaler.transform(X_val_fold)

    # Fit a fresh, unfitted copy of the estimator for this fold. Refitting the
    # shared ``model`` object would make every stored "best" reference point at
    # the last fold's fit, out of sync with the stored scaler.
    fold_model = base.clone(model)
    fold_model.fit(X_train_fold, y_train_fold)

    # Predict on the validation set
    y_pred = fold_model.predict(X_val_fold)

    # Calculate RMSE and R²
    rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred))
    r2 = r2_score(y_val_fold, y_pred)

    # Print metrics for the current fold
    print(f"Fold {fold_num}: RMSE = {rmse:.4f}, R² = {r2:.4f}")

    rmse_scores.append(rmse)
    r2_scores.append(r2)

    # Keep track of the best model based on R² score.
    # NOTE(review): the highest single-fold R² may reflect an easy validation
    # fold rather than a better model — confirm this selection rule is intended.
    if r2 > best_r2_score:
        best_r2_score = r2
        best_model = fold_model  # Fold-specific fitted estimator
        best_scaler = scaler  # Scaler fitted on the same fold

# Print the average RMSE and R² across all folds
print(f"\nAverage RMSE: {np.mean(rmse_scores):.4f}")
print(f"Average R²: {np.mean(r2_scores):.4f}")
print(f"\nBest R² Score: {best_r2_score:.4f}")


In [52]:
# Predict the test set with the fold model selected in the K-fold cell.
# `best_scaler` and `best_model` are assigned inside that cell's loop, so it
# must have been run first (both start out as None).
X_test_standardized = best_scaler.transform(X_test)  # Apply the scaling fitted on the selected fold's training part
y_test_pred = best_model.predict(X_test_standardized)

# Overwrite the template's 'y' column in place and write the file; this
# replaces the "submission.csv" written earlier by make_submission.
sample['y'] = y_test_pred
sample.to_csv("submission.csv", index=False)

