In [None]:
# Cell 1: Load Results and Data
# Loads the pickled training artefacts (model, scaler, feature list, metrics)
# and the processed dataset that every later cell reads.
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including any emitted while unpickling the model below. Narrow the filter
# if a warning needs to be seen.
warnings.filterwarnings('ignore')

sns.set_style('whitegrid')

# Load model results
# SECURITY: pickle.load runs arbitrary code stored in the file. Only open
# model_results.pkl when it was produced by this project's own training step.
with open(os.path.join('..', 'models', 'model_results.pkl'), 'rb') as f:
    results = pickle.load(f)

# Fail once, with a readable message, instead of a bare KeyError on whichever
# entry happens to be looked up first.
required_keys = ('model', 'scaler', 'feature_columns', 'metrics')
missing_keys = [key for key in required_keys if key not in results]
if missing_keys:
    raise KeyError(f"model_results.pkl is missing expected keys: {missing_keys}")

model = results['model']
scaler = results['scaler']
feature_columns = results['feature_columns']
metrics = results['metrics']

# Load dataset
dataset = pd.read_csv(os.path.join('..', 'data', 'processed_dataset.csv'))

print("=== Model Results Loaded ===")
print(f"Dataset: {len(dataset)} samples")
print(f"Features: {feature_columns}")
print(f"\nMetrics:")
for k, v in metrics.items():
    # Only numeric values can take the ':.4f' format; anything else stored in
    # the metrics dict is printed as-is rather than raising.
    if isinstance(v, (int, float, np.number)):
        print(f"  {k}: {v:.4f}")
    else:
        print(f"  {k}: {v}")

In [None]:
# Cell 2: Model Performance Summary
# Re-creates a train/test split, scores the loaded model on both parts and
# prints regression metrics plus sign (direction) accuracy.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

X = dataset[feature_columns].copy()
y = dataset['target_return'].copy()

# Keep the dataset's row index on the scaled frame so X_scaled and y share
# the same labels, not just the same positions.
X_scaled = pd.DataFrame(scaler.transform(X), columns=feature_columns, index=X.index)
# NOTE(review): assumes training used the same test_size=0.2 / random_state=42
# split; otherwise "test" rows here may have been seen during fitting - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Direction accuracy: a prediction counts as correct when its sign matches
# the sign of the realised return.
direction_correct = np.sum(np.sign(y_test.values) == np.sign(y_pred_test))
direction_accuracy = direction_correct / len(y_test)

# Majority-direction baseline: always predict whichever direction was more
# common in the training targets. A coin flip (50%) understates the bar the
# model has to clear when returns lean one way.
train_mostly_up = (y_train > 0).mean() >= 0.5
if train_mostly_up:
    majority_baseline = (y_test > 0).mean()
else:
    majority_baseline = (y_test < 0).mean()

print("="*50)
print("    STOCK RETURN PREDICTION - MODEL SUMMARY")
print("="*50)
print(f"\nDataset: {len(dataset)} company-quarter samples")
print(f"Companies: {dataset['ticker'].nunique()}")
print(f"Train/Test split: {len(X_train)}/{len(X_test)}")
print(f"\n--- Regression Metrics ---")
print(f"Train RMSE: {np.sqrt(mean_squared_error(y_train, y_pred_train)):.4f}")
print(f"Test RMSE:  {np.sqrt(mean_squared_error(y_test, y_pred_test)):.4f}")
print(f"Train R2:   {r2_score(y_train, y_pred_train):.4f}")
print(f"Test R2:    {r2_score(y_test, y_pred_test):.4f}")
print(f"Test MAE:   {mean_absolute_error(y_test, y_pred_test):.4f}")
print(f"\n--- Direction Accuracy ---")
print(f"Correct direction: {direction_correct}/{len(y_test)} ({direction_accuracy:.1%})")
print(f"(Baseline random: 50%)")
print(f"(Baseline majority direction from train: {majority_baseline:.1%})")

In [None]:
# Cell 3: Feature Importance Analysis
# Ranks the model's feature importances, prints a text bar chart and saves a
# horizontal bar plot to ../results.
importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("Feature Importance Ranking:")
print("-" * 35)
for row in importance.itertuples(index=False):
    # One '#' per 0.02 of importance.
    bar = '#' * int(row.importance * 50)
    print(f"{row.feature:20s} {row.importance:.3f} {bar}")

fig, ax = plt.subplots(figsize=(10, 6))
# Reverse the order so the most important feature is drawn at the top.
ax.barh(importance['feature'][::-1], importance['importance'][::-1], color='steelblue')
ax.set_xlabel('Importance Score')
ax.set_title('XGBoost Feature Importance for Stock Return Prediction')
plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail here.
results_dir = os.path.join('..', 'results')
os.makedirs(results_dir, exist_ok=True)
plt.savefig(os.path.join(results_dir, 'feature_importance_analysis.png'), dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Cell 4: Prediction Analysis
# Four-panel figure: predicted vs actual, residual histogram, mean return per
# ticker, and each feature's correlation with the target. Saved to ../results.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Predicted vs Actual
axes[0, 0].scatter(y_test, y_pred_test, alpha=0.7, edgecolor='black', s=60)
# Shared range (with a small margin) for the y = x reference line.
lims = [min(y_test.min(), y_pred_test.min()) - 0.05, max(y_test.max(), y_pred_test.max()) + 0.05]
axes[0, 0].plot(lims, lims, 'r--', label='Perfect prediction')
axes[0, 0].set_xlabel('Actual Return')
axes[0, 0].set_ylabel('Predicted Return')
axes[0, 0].set_title('Predicted vs Actual Quarterly Returns')
axes[0, 0].legend()

# 2. Residuals (actual minus predicted)
residuals = y_test.values - y_pred_test
axes[0, 1].hist(residuals, bins=15, edgecolor='black', alpha=0.7, color='steelblue')
axes[0, 1].axvline(x=0, color='red', linestyle='--')
axes[0, 1].set_xlabel('Prediction Error')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Distribution of Prediction Errors')

# 3. Returns by company
company_returns = dataset.groupby('ticker')['target_return'].mean().sort_values()
axes[1, 0].barh(company_returns.index, company_returns.values, color='steelblue')
axes[1, 0].axvline(x=0, color='red', linestyle='--', alpha=0.5)
axes[1, 0].set_xlabel('Average Quarterly Return')
axes[1, 0].set_title('Average Return by Company')

# 4. Feature correlation with target
# list(...) so '+' appends the target column even if feature_columns was
# unpickled as a non-list sequence.
corr_columns = list(feature_columns) + ['target_return']
correlations = dataset[corr_columns].corr()['target_return'].drop('target_return').sort_values()
axes[1, 1].barh(correlations.index, correlations.values, color='steelblue')
axes[1, 1].axvline(x=0, color='red', linestyle='--', alpha=0.5)
axes[1, 1].set_xlabel('Correlation with Target Return')
axes[1, 1].set_title('Feature Correlations')

plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail here.
results_dir = os.path.join('..', 'results')
os.makedirs(results_dir, exist_ok=True)
plt.savefig(os.path.join(results_dir, 'prediction_analysis.png'), dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Cell 5: Correlation Heatmap
# Lower-triangle heatmap of pairwise correlations between the model features
# and the target return. Saved to ../results.
fig, ax = plt.subplots(figsize=(10, 8))

# list(...) so '+' appends the target column even if feature_columns was
# unpickled as a non-list sequence.
corr_columns = list(feature_columns) + ['target_return']
corr_matrix = dataset[corr_columns].corr()
# Hide the diagonal and upper triangle; the matrix is symmetric, so they
# repeat what the lower triangle already shows.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

sns.heatmap(
    corr_matrix, mask=mask, annot=True, fmt='.2f',
    cmap='RdBu_r', center=0, square=True,
    linewidths=0.5, ax=ax
)
ax.set_title('Feature Correlation Heatmap')
plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail here.
results_dir = os.path.join('..', 'results')
os.makedirs(results_dir, exist_ok=True)
plt.savefig(os.path.join(results_dir, 'correlation_heatmap.png'), dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Cell 6: Final Summary
# Prints a plain-text recap of the data, model, headline results and caveats.
# Relies on `direction_accuracy` (performance-summary cell) and `importance`
# (feature-importance cell) having been computed earlier in the notebook.

# Default to NaN, not 0: a missing key must not be reported as a perfect
# RMSE of 0.0000 (or an R2 of exactly 0).
test_rmse = metrics.get('test_rmse', float('nan'))
test_r2 = metrics.get('test_r2', float('nan'))

# NOTE(review): the hyperparameters and the date ranges in the text below are
# hard-coded, not read from `model` / `dataset` - confirm they match the
# training run before publishing.
print("="*50)
print("    PROJECT SUMMARY")
print("="*50)
print(f"""
Stock Return Prediction using XGBoost
--------------------------------------
Objective: Predict quarterly stock returns from
           financial fundamentals

Data:
  - {dataset['ticker'].nunique()} S&P 500 companies
  - {len(dataset)} company-quarter samples
  - Price data: Kaggle S&P 500 dataset (2013-2018)
  - Financial data: Synthetic quarterly statements

Features ({len(feature_columns)}):
  {', '.join(feature_columns)}

Model: XGBoost Regressor
  - max_depth: 3
  - n_estimators: 100
  - learning_rate: 0.1

Results:
  - Test RMSE: {test_rmse:.4f}
  - Test R2: {test_r2:.4f}
  - Direction Accuracy: {direction_accuracy:.1%}

Top Features: {', '.join(importance['feature'].head(3).values)}

Limitations:
  - Small dataset ({len(dataset)} samples)
  - Synthetic financial data
  - Limited to 2016-2018 period

Potential Improvements:
  - Use real financial statement data (SEC EDGAR)
  - Add technical indicators (RSI, MACD, etc.)
  - Include more companies and longer time period
  - Try ensemble methods or LSTM models
""")

print("\nAll results saved to results/ folder.")
print("Project complete!")