In [24]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [25]:
# Location of the filtered Exxon dataset. The default is the original local
# path; set the EXXON_DATA_CSV environment variable to point the notebook at
# the file on another machine without editing this cell.
DATA_CSV = os.environ.get(
    'EXXON_DATA_CSV',
    '/Users/edocampione/Desktop/Meng Engineering Science/4YP/data/filtered_exxon_data.csv',
)

# Load the full dataset; later cells read the 'Date' and
# 'Monthly_Percent_Change' columns from this frame.
df = pd.read_csv(DATA_CSV)

In [26]:
# Prepare the model inputs (features) and the regression target.

# Columns holding company fundamentals. They are listed separately so they can
# be excluded from the feature set as a group (see the toggle below).
fundamental_features = [
    'Earnings_Per_Share',
    'Long_Term_Debt',
    'Net_Income',
    'Revenue',
    'Capital_Expenditure',
    'Net_Profit_Margin',
    'Debt_Equity_Ratio',
    'Current_Ratio',
    'Price_To_Book_Ratio',
]

# Columns that are never model inputs: the date stamp and the target itself.
non_feature_columns = ['Date', 'Monthly_Percent_Change']

# Feature-set toggle.
#   True  -> every remaining column is a feature (fundamentals included).
#   False -> fundamentals are dropped as well, leaving only the other columns.
# The default (True) is the selection the model in this notebook is trained on.
USE_FUNDAMENTAL_FEATURES = True

if USE_FUNDAMENTAL_FEATURES:
    features = df.drop(columns=non_feature_columns)
else:
    features = df.drop(columns=non_feature_columns + fundamental_features)

target = df['Monthly_Percent_Change']

# Partition the data into roughly 60% train / 20% validation / 20% test using
# two successive random splits that share one seed.
# NOTE(review): train_test_split shuffles rows by default. The frame carries a
# 'Date' column, so if rows are time-ordered observations this mixes past and
# future across the partitions -- confirm that is intended.
split_seed = 42

# Stage 1: hold out 20% of all rows as the final test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=split_seed,
)

# Stage 2: take 25% of the remaining 80% as validation (0.25 x 0.8 = 0.2).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=split_seed,
)

# Fit a linear regression on the training partition only
# (fit() returns the estimator itself, so construction and fitting are chained).
model = LinearRegression().fit(X_train, y_train)

# Predict on each partition with the fitted model.
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Root-mean-squared error per partition: square root of the mean squared error
# between the true targets and the predictions.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Report the three scores to four decimal places.
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Validation RMSE: {val_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")

# Baseline model: predict the training-set mean of the target for every row.
# A model is only useful if its RMSE is lower than this constant predictor.

# The mean is taken from the training targets only, so no information from the
# validation or test partitions enters the baseline.
mean_train_target = y_train.mean()

# Constant prediction vectors, one entry per validation / test row.
# np.full with an explicit float dtype is used rather than np.full_like:
# full_like copies the dtype of its template array, so an integer-typed target
# would silently truncate the fractional part of the mean.
y_val_baseline = np.full(len(y_val), mean_train_target, dtype=float)
y_test_baseline = np.full(len(y_test), mean_train_target, dtype=float)

# Baseline RMSE on the validation set
val_baseline_rmse = np.sqrt(mean_squared_error(y_val, y_val_baseline))

# Baseline RMSE on the test set
test_baseline_rmse = np.sqrt(mean_squared_error(y_test, y_test_baseline))

print(f"Baseline Validation RMSE: {val_baseline_rmse:.4f}")
print(f"Baseline Test RMSE: {test_baseline_rmse:.4f}")




Train RMSE: 3.7865
Validation RMSE: 4.2196
Test RMSE: 4.1087
Baseline Validation RMSE: 5.5090
Baseline Test RMSE: 5.6246


In [27]:
# Pair each test-set target with the model's prediction for the same row,
# keyed by the original row index, for side-by-side comparison and plotting.
results = pd.DataFrame(
    data={
        'Actual': y_test,
        'Predicted': y_test_pred,
    },
    index=X_test.index,
)

# Show the first five rows as plain text.
print(results.head(n=5))

        Actual  Predicted
1176  1.249863   2.278477
1014  0.980801   4.232043
101  -0.807809   6.427699
439   8.284625   1.944389
58   -5.212535  -3.270654
