In [None]:
%cd /app


import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import validation_curve
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import ValidationCurveDisplay

import shap


In [None]:
# Read the CSV using its first column as the index (parse_dates=True asks
# pandas to parse that index as dates), then drop every row that has no value
# in the target column.
data_df = pd.read_csv(
    "/app/data/final.csv",
    index_col=[0],
    parse_dates=True,
).dropna(subset=['wti_cush_spot'])

In [None]:
# Target column and feature columns (every other column in the frame).
y_var = "wti_cush_spot"
x_var = [c for c in data_df.columns if c != y_var]

# Number of trailing rows held out for validation.
# NOTE(review): these are the most recent observations only if data_df is
# sorted chronologically -- confirm.
N_VALID_ROWS = 52

validation_df = data_df.iloc[-N_VALID_ROWS:]
train_df = data_df.iloc[:-N_VALID_ROWS]

y_valid, X_valid = validation_df[y_var], validation_df[x_var]
# The training set must come from train_df, not data_df: slicing data_df here
# would put the hold-out rows into training and leak them into the model,
# making every validation metric optimistic.
y_train, X_train = train_df[y_var], train_df[x_var]

# Guard against the split regressing to overlapping train/validation rows.
assert len(X_train) == len(data_df) - len(X_valid)

In [None]:
# Display the number of observations in the training target (its length,
# not its contents).
len(y_train)


In [None]:
# Hyperparameter grid handed to the grid search: each key is a
# RandomForestRegressor constructor argument, each value the candidates to try.
random_forest_param_grid = dict(
    n_estimators=100 * np.arange(1, 11),             # 100, 200, ..., 1000
    criterion=['squared_error'],
    max_depth=np.power(2, np.arange(4)),             # 1, 2, 4, 8
    min_samples_split=np.power(2, np.arange(1, 4)),  # 2, 4, 8
    max_features=['sqrt', 'log2', 1.0],
)

In [None]:
# Five-fold time-series splitter used as the cross-validation scheme, and the
# base random forest (seeded so repeated runs give the same trees).
tscv = TimeSeriesSplit(n_splits=5)
model_rf = RandomForestRegressor(random_state=42)

# Exhaustive search over random_forest_param_grid. Scores are negated MSE
# (higher is better); the best configuration is refit on the full training
# data, and any failing fit raises instead of being scored.
grid_search_cv = GridSearchCV(
    model_rf,
    random_forest_param_grid,
    scoring='neg_mean_squared_error',
    cv=tscv,
    refit=True,
    n_jobs=-1,
    verbose=2,
    error_score='raise',
)

# fit() hands back the fitted search object itself.
cv_results = grid_search_cv.fit(X_train, y_train)

In [None]:
# Tabulate the per-candidate cross-validation results, one column per
# cv_results_ key.
cv_results_df = pd.DataFrame.from_dict(grid_search_cv.cv_results_)

In [None]:
def plot_cv_results(param_name):
    """Plot the cross-validated mean squared error against one hyperparameter.

    Every other hyperparameter is pinned to its value in
    ``grid_search_cv.best_params_``, so the curve shows how the mean
    cross-validated error changes when only ``param_name`` varies.

    Reads the notebook-level ``grid_search_cv`` and ``cv_results_df``.

    Parameters
    ----------
    param_name : str
        Name of a hyperparameter that was part of the grid search
        (for example ``'max_depth'``).

    Raises
    ------
    KeyError
        If ``param_name`` is not one of the searched hyperparameters.
    """
    best_params = grid_search_cv.best_params_

    # Fail before any figure is created; otherwise a typo leaves an empty
    # figure behind and surfaces as an opaque column-lookup error.
    if param_name not in best_params:
        raise KeyError(
            f"{param_name!r} is not a searched hyperparameter; "
            f"expected one of {sorted(best_params)}"
        )

    # Keep only the rows where all *other* hyperparameters equal their best value.
    filtered_results = cv_results_df
    for param, value in best_params.items():
        if param == param_name:
            continue
        filtered_results = filtered_results[filtered_results[f'param_{param}'] == value]

    # The search is scored with 'neg_mean_squared_error', so mean_test_score
    # holds the negated MSE; flipping the sign gives the (positive) MSE.
    mean_squared_errors = -filtered_results['mean_test_score']

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(filtered_results[f'param_{param_name}'], mean_squared_errors, marker='o')
    ax.set_xlabel(param_name)
    # The plotted quantity is the MSE itself, not its negation.
    ax.set_ylabel('Mean Squared Error')
    ax.set_title(f'Cross-Validation Score as a Function of {param_name}')
    ax.grid(True)
    plt.show()

# Example usage: plot how the cross-validated score varies with max_depth
# while the other hyperparameters stay at their best values.
plot_cv_results('max_depth')

In [None]:
from sklearn.metrics import mean_absolute_error

# With refit=True the search object forwards predict() to its best estimator,
# so this yields the best model's predictions for the validation features.
y_pred = grid_search_cv.predict(X_valid)

# Mean absolute error of those predictions against the validation target.
mae = mean_absolute_error(y_true=y_valid, y_pred=y_pred)
print(f"Mean Absolute Error on the validation set: {mae}")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Reduce actual and predicted values to a binary class: 1 when the value is
# strictly positive, 0 otherwise.
y_valid_sign = (y_valid > 0).astype(int)
y_pred_sign = (y_pred > 0).astype(int)

# Score the predicted signs against the actual signs with each metric.
accuracy, precision, recall, f1 = (
    metric(y_valid_sign, y_pred_sign)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    print(f"{label}: {value}")

In [None]:
# Wrap the best estimator found by the grid search in a SHAP tree explainer,
# then compute SHAP values for the validation features.
explainer = shap.TreeExplainer(grid_search_cv.best_estimator_)
shap_values = explainer(X_valid)

In [None]:
# Bar-plot summary of the SHAP values computed on the validation features.
shap.plots.bar(shap_values)

In [None]:
# Beeswarm plot of the SHAP values computed on the validation features.
shap.plots.beeswarm(shap_values)

In [None]:
# Waterfall plot for a single explained row: position 16 (0-based) of
# shap_values, which were computed on X_valid.
# NOTE(review): why row 16 was chosen is not stated here -- confirm.
shap.plots.waterfall(shap_values[16, ...])

In [None]:
# Correlation of each of the two named features with the validation target.
correlation_ho_m1, correlation_rbob_m1 = (
    X_valid[feature].corr(y_valid) for feature in ('ho_m1', 'rbob_m1')
)

print(f"Correlation coefficient between ho_m1 and y_valid: {correlation_ho_m1}")
print(f"Correlation coefficient between rbob_m1 and y_valid: {correlation_rbob_m1}")