In [19]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import cv, DMatrix

In [21]:
# Load your dataset
df = pd.read_csv('SG_Obs_2016_2023_NotScaled.csv')  # Replace with your file path

# Convert 'Datetime' to datetime object and extract time-related features
df['Datetime'] = pd.to_datetime(df['Datetime'])
df['Year'] = df['Datetime'].dt.year
df['Month'] = df['Datetime'].dt.month
df['Day'] = df['Datetime'].dt.day
df['Hour'] = df['Datetime'].dt.hour
# Add more time-related features if needed

# Prepare the data
X = df[['DV', 'Temp', 'VV', 'Year', 'Month', 'Day', 'Hour']]  # Features with time-related features
y = df['MP10']  # Target variable

In [22]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
# Create DMatrix - XGBoost's internal data structure, optimized for both memory efficiency and training speed
dtrain = DMatrix(X_train, label=y_train)

In [24]:
# Define the parameter for the XGBoost model
params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,  # Use the best parameters from your GridSearchCV results
    'n_estimators': 150,
    'learning_rate': 0.1,
    # Add other parameters here
}

In [25]:
# Perform cross-validation
cv_results = cv(dtrain=dtrain, params=params, nfold=5, num_boost_round=50, early_stopping_rounds=10, metrics='rmse', as_pandas=True, seed=42)

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.



In [26]:
# Output the performance
print("CV RMSE score: ", cv_results['test-rmse-mean'].min())

CV RMSE score:  40.2250073277844


In [27]:
# Train the model with the whole training dataset
final_model = xgb.train(params, dtrain, num_boost_round=cv_results.shape[0])

Parameters: { "n_estimators" } are not used.



In [28]:
# Make predictions and evaluate the final model
dtest = DMatrix(X_test)
predictions = final_model.predict(dtest)
rmse = mean_squared_error(y_test, predictions, squared=False)
print("Final model RMSE: %f" % (rmse))

Final model RMSE: 38.042319


In [12]:
# Initialize the XGBoost regressor
xgb_model = xgb.XGBRegressor(objective ='reg:squarederror')

In [13]:
# Initialize the Grid Search model
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

In [14]:
# Fit the grid search model
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.3s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.4s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=   0.4s
[CV] END ...learning_rate=0.01, max_depth=4, n_estimators=50; total time=   0.5s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.6s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.7s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.8s
[CV] END ...learning_rate=0.01, max_depth=4, n_estimators=50; total time=   0.5s
[CV] END ...learning_rate=0.01, max_depth=4, n_estimators=50; total time=   0.5s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=150; total time=   1.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=150; total time=   1.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_e

In [15]:
# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150}


In [16]:
# Use the best model to make predictions
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

In [17]:
# Evaluate the best model
rmse = mean_squared_error(y_test, predictions, squared=False)
print("RMSE: %f" % (rmse))

RMSE: 37.300127
