In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Load dataset (replace 'file_path' with your actual file path)
file_path = r"F:\E Drive\VIT\SEM 8\SET\data\FINAL_USO.csv"
data = pd.read_csv(file_path)

# Data preprocessing
# Convert Date to datetime format
data['Date'] = pd.to_datetime(data['Date'])

# Fill missing values (if any) with median values
data.fillna(data.median(), inplace=True)

# Feature and target selection
# Target: Gold Close Price ('Close')
# Features: Exclude 'Date' and 'Close' columns
X = data.drop(columns=['Date', 'Close'])
y = data['Close']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model initialization
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)
svr_model = SVR()

# Train models
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
svr_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)
y_pred_gb = gb_model.predict(X_test)
y_pred_svr = svr_model.predict(X_test)

# Evaluate models
mse_rf = mean_squared_error(y_test, y_pred_rf)
mse_gb = mean_squared_error(y_test, y_pred_gb)
mse_svr = mean_squared_error(y_test, y_pred_svr)

print("Mean Squared Error (Random Forest):", mse_rf)
print("Mean Squared Error (Gradient Boosting):", mse_gb)
print("Mean Squared Error (Support Vector Regression):", mse_svr)

# Feature importance from Random Forest
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nFeature Importance:\n", feature_importance)

# Save feature importance to a CSV file
feature_importance.to_csv("feature_importance.csv", index=False)

  data.fillna(data.median(), inplace=True)


Mean Squared Error (Random Forest): 0.07696078502614162
Mean Squared Error (Gradient Boosting): 0.06488662010408315
Mean Squared Error (Support Vector Regression): 224.4113396754919

Feature Importance:
        Feature    Importance
3    Adj Close  4.426803e-01
1         High  2.068085e-01
0         Open  1.783913e-01
2          Low  1.701096e-01
11     DJ_open  2.219335e-04
..         ...           ...
54   PLT_Trend  3.074565e-07
59   PLD_Trend  3.072528e-07
44    SF_Trend  2.499458e-07
33    OF_Trend  2.439267e-07
66  USDI_Trend  1.560655e-07

[79 rows x 2 columns]
