In [None]:
# Switch the kernel's working directory to /app so that relative paths used
# later in the notebook (e.g. "data/final.csv") resolve from there.
%cd /app


import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import validation_curve
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import ValidationCurveDisplay
from sklearn.feature_selection import RFECV, RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

import shap


In [None]:
# Read the prepared dataset; the first CSV column is used as the index and
# pandas is asked to parse it as dates.
df = pd.read_csv("data/final.csv", index_col=0, parse_dates=True)

# Separate the target column from the predictors (every other column).
target_variable = 'wti_cush_spot'
X = df.drop(columns=[target_variable])
y = df[target_variable]

# Map the target labels to integer codes; `le` is kept so the codes can be
# translated back to the original labels later.
le = LabelEncoder()
y = le.fit_transform(y)

# Show the original labels in the order of their integer codes.
le.classes_

In [None]:
# Recursive feature elimination with cross-validation (RFECV): drop one feature
# per step (never going below 5) and score every candidate subset with F1 on
# each of the 5 time-series folds.

# Fixed seed for the random forest. Without it every run of this cell trains
# different forests, so the selected feature set, the CV scores and the
# importances reported further down are not reproducible.
RANDOM_STATE = 42

# NOTE(review): TimeSeriesSplit splits by row position, so this presumes the
# rows of X are in chronological order — confirm against the CSV.
tscv = TimeSeriesSplit(n_splits=5)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
rfecv = RFECV(
    estimator=rf,
    step=1,
    cv=tscv,
    # NOTE(review): the plain "f1" scorer is meant for binary targets —
    # confirm that the encoded target has exactly two classes.
    scoring="f1",
    n_jobs=-1,
    min_features_to_select=5,
)
rfecv.fit(X, y)
print(f"Optimal number of features: {rfecv.n_features_}")

In [None]:
# Put the per-step cross-validation results recorded by RFECV into a table.
cv_results = pd.DataFrame(rfecv.cv_results_)

# Mean test score (with its standard deviation as error bars) against the
# number of features kept at each elimination step.
fig, ax = plt.subplots()
ax.errorbar(
    x=cv_results["n_features"],
    y=cv_results["mean_test_score"],
    yerr=cv_results["std_test_score"],
)
ax.set(
    xlabel="Number of features selected",
    ylabel="Mean F1 score",
    title="Recursive Feature Elimination",
)

# Dashed red line: the highest mean test score reached at any step.
ax.axhline(y=cv_results["mean_test_score"].max(), color="r", linestyle="--")
# Solid black line: fixed reference level at a score of 0.5.
ax.axhline(y=0.5, color="k", linestyle="-")
# Currently disabled: vertical marker at the number of features RFECV selected.
# ax.axvline(rfecv.n_features_, color="r", linestyle="--")
plt.show()

In [None]:
# Column names that RFECV kept (boolean mask `support_` applied to X's columns).
selected_features = X.columns[rfecv.support_]

# Importances exposed by the estimator that RFECV fitted; used below as one
# value per kept feature.
feature_importances = rfecv.estimator_.feature_importances_

# Pair every kept feature with its importance and rank them, largest first.
feature_importance_df = pd.DataFrame(
    {'Feature': selected_features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Show the ranked table.
print(feature_importance_df)