In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
dataset = pd.read_csv('DSDataLastThreeMonths.csv')

# Select relevant columns for the model
selected_columns = ['HM_WT', 'AIM_S', 'HM_S', 'HM_C', 'HM_SI', 'HM_TI', 'HM_MN', 'CAC2', 'MG', 'HM_TEMP', 'CAC2_INJ_TIME', 'MG_INJ_TIME', 'DS_S']

dataset = dataset[selected_columns]

# Drop rows with missing values in both X and y
dataset = dataset.dropna()

# Split the dataset into input features (X) and target variable (y)
X = dataset.drop('DS_S', axis=1)
y = dataset['DS_S']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Random Forest Regressor model
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

# Create and train the Support Vector Regressor (SVR) model
svr_model = SVR()
svr_model.fit(X_train, y_train)
svr_y_pred = svr_model.predict(X_test)

# Create and train the XGBoost Regressor model
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)
xgb_y_pred = xgb_model.predict(X_test)

# Create and train the LightGBM Regressor model
lgb_model = lgb.LGBMRegressor()
lgb_model.fit(X_train, y_train)
lgb_y_pred = lgb_model.predict(X_test)

# Evaluate model performance using RMSE and R2 score
def evaluate_model(y_true, y_pred):
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    return rmse, r2

# Evaluate the models
rf_rmse, rf_r2 = evaluate_model(y_test, rf_y_pred)
svr_rmse, svr_r2 = evaluate_model(y_test, svr_y_pred)
xgb_rmse, xgb_r2 = evaluate_model(y_test, xgb_y_pred)
lgb_rmse, lgb_r2 = evaluate_model(y_test, lgb_y_pred)

# Calculate the Model hit rate for each model (% data point with (Pred DS_S – Act DS_S) between +- 0.003%)
tolerance = 0.003
rf_within_tolerance = abs(rf_y_pred - y_test) <= tolerance
rf_hit_rate = (rf_within_tolerance.sum() / len(y_test)) * 100

svr_within_tolerance = abs(svr_y_pred - y_test) <= tolerance
svr_hit_rate = (svr_within_tolerance.sum() / len(y_test)) * 100

xgb_within_tolerance = abs(xgb_y_pred - y_test) <= tolerance
xgb_hit_rate = (xgb_within_tolerance.sum() / len(y_test)) * 100

lgb_within_tolerance = abs(lgb_y_pred - y_test) <= tolerance
lgb_hit_rate = (lgb_within_tolerance.sum() / len(y_test)) * 100

# Share summary of the models' performance
print("Random Forest Regressor:")
print(f"RMSE: {rf_rmse:.4f}, R2: {rf_r2:.4f}, Hit Rate: {rf_hit_rate:.2f}%")

print("\nSupport Vector Regressor (SVR):")
print(f"RMSE: {svr_rmse:.4f}, R2: {svr_r2:.4f}, Hit Rate: {svr_hit_rate:.2f}%")

print("\nXGBoost Regressor:")
print(f"RMSE: {xgb_rmse:.4f}, R2: {xgb_r2:.4f}, Hit Rate: {xgb_hit_rate:.2f}%")

print("\nLightGBM Regressor:")
print(f"RMSE: {lgb_rmse:.4f}, R2: {lgb_r2:.4f}, Hit Rate: {lgb_hit_rate:.2f}%")


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1105
[LightGBM] [Info] Number of data points in the train set: 4874, number of used features: 12
[LightGBM] [Info] Start training from score 0.007541
Random Forest Regressor:
RMSE: 0.0022, R2: 0.3851, Hit Rate: 85.81%

Support Vector Regressor (SVR):
RMSE: 0.0057, R2: -3.2803, Hit Rate: 21.99%

XGBoost Regressor:
RMSE: 0.0022, R2: 0.3871, Hit Rate: 85.73%

LightGBM Regressor:
RMSE: 0.0021, R2: 0.4245, Hit Rate: 86.96%
