In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
from math import sqrt

# Load the training and test sets from CSV files in the working directory and
# replace every missing value, in every column, with 0.
# NOTE(review): a blanket 0-fill assumes 0 is a sensible stand-in for each
# column's missing entries — confirm against the data.
train_data = pd.read_csv('train.csv').fillna(0)
test_data = pd.read_csv('test.csv').fillna(0)

# Feature Engineering: Add a Total Bases feature
def add_total_bases(df):
    """Add a 'TB' (Total Bases) column to ``df`` in place and return ``df``.

    TB is computed as S + 2*2B + 3*3B + 4*HR from the frame's own columns.
    'S' is presumably the singles count — TODO confirm against the dataset
    description.
    """
    doubles = 2 * df['2B']
    triples = 3 * df['3B']
    home_runs = 4 * df['HR']
    df['TB'] = df['S'] + doubles + triples + home_runs
    return df


# The same derived column is needed on both frames so that the training and
# test feature sets stay aligned.
for frame in (train_data, test_data):
    add_total_bases(frame)

# Features and target: 'R' is the value being predicted; 'Id' and 'yearID'
# are excluded from the model inputs along with the target itself.
y = train_data['R']
X = train_data.drop(columns=['Id', 'R', 'yearID'])

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Transformers are fitted on the training split only and then applied
# unchanged to the validation split, so no validation statistics leak into
# the fitted scaling parameters.
scaler = StandardScaler()
poly = PolynomialFeatures(degree=2, include_bias=False)

# Step 1: standardise the features.
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Step 2: degree-2 polynomial expansion of the scaled features (no bias column).
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)

# Base learners: two regularised linear models and two tree ensembles.
# The tree ensembles are seeded so repeated runs give the same fit.
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=1.0)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
gb = GradientBoostingRegressor(n_estimators=100, random_state=42)

base_estimators = [
    ('ridge', ridge),
    ('lasso', lasso),
    ('rf', rf),
    ('gb', gb),
]

# Stacking regressor: a Ridge meta-learner combines the base learners'
# predictions. Its alpha is deliberately left unset here; the value is chosen
# by the grid search over 'final_estimator__alpha' further down.
stacked_model = StackingRegressor(
    estimators=base_estimators,
    final_estimator=Ridge(),
)

# Hyperparameter tuning for the stacking regressor.
# Only the meta-learner's regularisation strength is searched; the base
# learners keep the settings they were constructed with.
param_grid_stacked = {
    'final_estimator__alpha': [1, 10, 100],
}

grid_search_stacked = GridSearchCV(
    estimator=stacked_model,
    param_grid=param_grid_stacked,
    cv=3,
    scoring='neg_mean_squared_error',
    refit=True,  # re-fit the winning configuration on all of X_train_poly
    verbose=2,
)
grid_search_stacked.fit(X_train_poly, y_train)

# Best hyperparameter found for the meta-learner.
best_alpha_stacked = grid_search_stacked.best_params_['final_estimator__alpha']

# With refit=True the search has already trained a clone of `stacked_model`
# with the best alpha on the full training split. Reuse that fitted model
# instead of constructing and fitting an identical StackingRegressor a second
# time, which would repeat the most expensive step for no change in the model.
stacked_model_best = grid_search_stacked.best_estimator_

# Validation performance for the tuned stacking regressor: predict on the
# held-out split and report the root of the mean squared error.
y_pred_val_stacked = stacked_model_best.predict(X_val_poly)
mse_val_stacked = mean_squared_error(y_val, y_pred_val_stacked)
rmse_val_stacked = sqrt(mse_val_stacked)
print(f'Stacking Regressor RMSE on Validation Set: {rmse_val_stacked}')

# Prepare the test data.
# Select exactly the columns the model was trained on, in the training order,
# rather than dropping a fixed list of non-feature columns. The fitted scaler
# and polynomial transformer expect the same features as `X`; an extra or
# reordered column in test.csv would otherwise fail or misalign the inputs.
# A column missing from test.csv raises a KeyError naming it.
X_test = test_data[X.columns]

# Apply the transformers fitted on the training split (transform only, no
# re-fitting on test data).
X_test_scaled = scaler.transform(X_test)
X_test_poly = poly.transform(X_test_scaled)

# Predict the target for the test data.
predictions = stacked_model_best.predict(X_test_poly)

# Create the submission file: one row per test 'Id' with its predicted 'R'.
submission = pd.DataFrame({
    'Id': test_data['Id'],
    'R': predictions,
})
submission.to_csv('submission_advanced.csv', index=False)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ...........................final_estimator__alpha=1; total time= 1.0min
[CV] END ...........................final_estimator__alpha=1; total time=  58.8s
[CV] END ...........................final_estimator__alpha=1; total time=  59.4s
[CV] END ..........................final_estimator__alpha=10; total time=  59.9s
[CV] END ..........................final_estimator__alpha=10; total time=  59.9s
[CV] END ..........................final_estimator__alpha=10; total time= 1.0min
[CV] END .........................final_estimator__alpha=100; total time=  59.8s
[CV] END .........................final_estimator__alpha=100; total time=  59.9s
[CV] END .........................final_estimator__alpha=100; total time=  59.3s
Stacking Regressor RMSE on Validation Set: 28.18742570652472
