In [None]:
# Cell 1: Mount Drive and Load Data
#
# Brings the shared libraries into scope, mounts Google Drive, and loads the
# processed dataset sorted chronologically for the temporal split used later.
import pickle
import warnings

import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

drive.mount('/content/drive')

# Blanket-silences every warning raised from this point on.
warnings.filterwarnings('ignore')

# Read the CSV from Drive, parse the date column, and order rows by date
# with a fresh 0..n-1 index.
dataset = (
    pd.read_csv('/content/drive/MyDrive/stock_prediction_data/processed_dataset.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)

# Use raw returns as target (better direction accuracy than excess returns)
target_col = 'target_return'

# Quick summary of what was loaded.
target_values = dataset[target_col]
first_day = dataset['date'].min().date()
last_day = dataset['date'].max().date()

print(f"Loaded dataset: {dataset.shape}")
print(f"Samples: {len(dataset)}, Companies: {dataset['ticker'].nunique()}")
print(f"Date range: {first_day} to {last_day}")
print(f"Target: {target_col}")
print(f"  Mean: {target_values.mean():.4f}, Std: {target_values.std():.4f}")

In [None]:
# Cell 2: Feature Selection
#
# Every column that is not an identifier/target column is a candidate feature.
# Two unsupervised filters are applied: zero-variance removal, then removal of
# one member of each highly correlated pair (|r| > 0.95).
# NOTE(review): both filters are computed on all rows of `dataset`, including
# rows that a later cell assigns to the test period — confirm this is acceptable.
meta_cols = ['ticker', 'date', 'target_return', 'target_excess_return', 'benchmark_return']
all_feature_cols = [name for name in dataset.columns if name not in meta_cols]

print(f"Starting features: {len(all_feature_cols)}")

X_raw = dataset[all_feature_cols].copy()
y = dataset[target_col].copy()

# 1. Drop features with zero variance
variances = X_raw.var()
zero_var = variances.index[variances == 0].tolist()
if len(zero_var) > 0:
    print(f"Dropping {len(zero_var)} zero-variance features: {zero_var}")
    X_raw = X_raw.drop(columns=zero_var)

# 2. Drop highly correlated features (>0.95)
corr_matrix = X_raw.corr().abs()
# Keep only the strict upper triangle so each pair is inspected once; for a
# given column, the surviving rows are the features that precede it.
strict_upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(strict_upper_mask)

high_corr_pairs = []
to_drop = set()
for col in upper.columns:
    column_corr = upper[col]
    for c in column_corr.index[column_corr > 0.95].tolist():
        if c in to_drop:
            continue
        # The earlier feature `c` is dropped; `col` is recorded as its partner.
        high_corr_pairs.append((col, c, corr_matrix.loc[col, c]))
        to_drop.add(c)

if len(to_drop) > 0:
    print(f"\nDropping {len(to_drop)} highly correlated features (r > 0.95):")
    for col, c, r in high_corr_pairs:
        print(f"  {c} (corr={r:.3f} with {col})")
    X_raw = X_raw.drop(columns=list(to_drop))

feature_columns = X_raw.columns.tolist()
print(f"\nSelected features: {len(feature_columns)}")
print(f"Features: {feature_columns}")

In [None]:
# Cell 3: Temporal Train/Test Split + Time-Series CV
#
# Produces for later cells:
#   X_train / X_test : standardized feature frames (scaler fitted on training rows only)
#   y_train / y_test : target arrays
#   cv_splits        : list of (train_idx, val_idx) positional index arrays into the
#                      training rows, in expanding-window order

X = dataset[feature_columns].copy()
y = dataset[target_col].copy()

# Temporal split: train on earlier data, test on most recent 20% of distinct dates
unique_dates = sorted(dataset['date'].unique())
n_dates = len(unique_dates)
cutoff_idx = int(n_dates * 0.8)
cutoff_date = unique_dates[cutoff_idx]

train_mask = dataset['date'] < cutoff_date
test_mask = dataset['date'] >= cutoff_date

X_train_raw = X[train_mask]
X_test_raw = X[test_mask]
y_train = y[train_mask].values
y_test = y[test_mask].values

train_row_dates = dataset.loc[train_mask, 'date']
test_row_dates = dataset.loc[test_mask, 'date']

print(f"=== Temporal Split ===")
print(f"Cutoff date: {pd.Timestamp(cutoff_date).date()}")
print(f"Training: {len(X_train_raw)} samples ({train_row_dates.min().date()} to {train_row_dates.max().date()})")
print(f"Test:     {len(X_test_raw)} samples ({test_row_dates.min().date()} to {test_row_dates.max().date()})")

# Scale features (statistics come from the training rows only)
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=feature_columns)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=feature_columns)

# Build expanding-window CV splits for training data.
# pd.factorize(sort=True) assigns each training row the rank of its date among
# the sorted distinct training dates. Folds are then selected by comparing
# integer ranks, which avoids membership tests between Timestamp and
# numpy.datetime64 objects (their behaviour depends on the pandas version).
date_rank, unique_train_dates = pd.factorize(train_row_dates, sort=True)
train_dates = list(unique_train_dates)
n_train_dates = len(train_dates)
n_folds = 5
fold_size = n_train_dates // (n_folds + 1)

if fold_size == 0:
    raise ValueError(
        f"Need at least {n_folds + 1} distinct training dates to build "
        f"{n_folds} expanding-window folds; got {n_train_dates}."
    )

cv_splits = []
for i in range(n_folds):
    # Fold i trains on the first (i + 1) date segments and validates on the next
    # one. The previous (i + 2) offset left the final fold with only the 0-5
    # leftover dates, so it was silently dropped or nearly empty.
    train_end = (i + 1) * fold_size
    val_start = train_end
    # The last fold absorbs the remainder so no training date goes unused.
    val_end = n_train_dates if i == n_folds - 1 else train_end + fold_size

    # Positions are relative to the training rows, matching X_train / y_train.
    train_idx = np.flatnonzero(date_rank < train_end)
    val_idx = np.flatnonzero((date_rank >= val_start) & (date_rank < val_end))

    cv_splits.append((train_idx, val_idx))

print(f"\n=== Time-Series CV ({len(cv_splits)} folds) ===")
for i, (tr, va) in enumerate(cv_splits):
    print(f"  Fold {i+1}: train={len(tr)} samples, val={len(va)} samples")

In [None]:
# Cell 4: Hyperparameter Tuning with Time-Series CV
!pip install xgboost -q

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
    'min_child_weight': [1, 3, 5, 7],
    'reg_alpha': [0, 0.01, 0.1, 1.0],
    'reg_lambda': [1.0, 2.0, 5.0, 10.0],
}

base_model = xgb.XGBRegressor(
    random_state=42,
    objective='reg:squarederror',
)

search = RandomizedSearchCV(
    base_model,
    param_distributions=param_distributions,
    n_iter=50,
    cv=cv_splits,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

print("Running hyperparameter search (50 iterations, 5-fold temporal CV)...")
search.fit(X_train.values, y_train)

print(f"\n=== Best Parameters ===")
for k, v in search.best_params_.items():
    print(f"  {k}: {v}")
print(f"\nBest CV RMSE: {-search.best_score_:.4f}")

In [None]:
# Cell 5: Train Final Model and Evaluate
#
# Takes the best estimator found by the search and scores it on both the
# training rows and the held-out test rows.
model = search.best_estimator_

y_pred_train = model.predict(X_train.values)
y_pred_test = model.predict(X_test.values)

# Regression metrics
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
test_mae = mean_absolute_error(y_test, y_pred_test)

# Direction accuracy: share of test rows where prediction and actual have the same sign
same_sign = np.sign(y_test) == np.sign(y_pred_test)
direction_correct = np.sum(same_sign)
direction_accuracy = direction_correct / len(y_test)

# Cross-validation scores from search (scores are negated RMSE, hence the sign flip)
cv_results = search.cv_results_
best_idx = search.best_index_
cv_rmse_mean = -search.best_score_
cv_rmse_std = cv_results['std_test_score'][best_idx]

# Assemble the report first, then emit it in one go.
report_lines = [
    "="*55,
    "    MODEL PERFORMANCE (Temporal Split)",
    "="*55,
    f"\nTarget: {target_col}",
    f"Training: {len(X_train)} samples | Test: {len(X_test)} samples",
    f"\n--- Regression Metrics ---",
    f"Train RMSE: {train_rmse:.4f}",
    f"Test RMSE:  {test_rmse:.4f}",
    f"Train R2:   {train_r2:.4f}",
    f"Test R2:    {test_r2:.4f}",
    f"Test MAE:   {test_mae:.4f}",
    f"Overfit ratio: {train_rmse/test_rmse:.2f} (closer to 1.0 = less overfit)",
    f"\n--- Time-Series Cross-Validation ---",
    f"CV RMSE:    {cv_rmse_mean:.4f} (+/- {abs(cv_rmse_std):.4f})",
    f"\n--- Direction Accuracy ---",
    f"Correct direction: {direction_correct}/{len(y_test)} ({direction_accuracy:.1%})",
    f"(Baseline: 50%)",
]
print("\n".join(report_lines))

In [None]:
# Cell 6: Feature Importance
#
# Plots the fitted model's per-feature importances as a horizontal bar chart,
# saves the figure to Drive, and prints the strongest and weakest features.
import matplotlib.pyplot as plt

importance = pd.DataFrame(
    {'feature': feature_columns, 'importance': model.feature_importances_}
)
# Ascending order puts the most important feature at the top of the barh chart.
importance = importance.sort_values('importance', ascending=True)

# Figure height grows with the number of features, but never below 6.
fig_height = max(6, len(feature_columns) * 0.3)
fig, ax = plt.subplots(figsize=(10, fig_height))
ax.barh(importance['feature'], importance['importance'], color='steelblue')
ax.set_xlabel('Feature Importance')
ax.set_title('XGBoost Feature Importance (After Selection)')
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/stock_prediction_data/feature_importance.png', dpi=150)
plt.show()

# Show top and bottom features
print("Top 10 features:")
top_ten = importance.iloc[::-1].head(10)  # most important first
for _, row in top_ten.iterrows():
    # Text bar: 50 '#' characters would correspond to an importance of 1.0.
    bar = '#' * int(row['importance'] * 50)
    print(f"  {row['feature']:30s} {row['importance']:.3f} {bar}")

# Identify low-importance features
low_imp = importance.loc[importance['importance'] < 0.01]
if not low_imp.empty:
    print(f"\n{len(low_imp)} features with <1% importance (candidates for removal):")
    for _, row in low_imp.iterrows():
        print(f"  {row['feature']}")

In [None]:
# Cell 7: Prediction Analysis
#
# Four diagnostic panels for the test period: predicted-vs-actual scatter,
# residual histogram, predictions over time, and cumulative returns of a simple
# long-or-cash strategy versus buy & hold. The figure is saved to Drive.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Predicted vs Actual
axes[0, 0].scatter(y_test, y_pred_test, alpha=0.5, edgecolor='black', s=40)
lims = [min(y_test.min(), y_pred_test.min()) - 0.05, max(y_test.max(), y_pred_test.max()) + 0.05]
axes[0, 0].plot(lims, lims, 'r--', label='Perfect prediction')
axes[0, 0].set_xlabel(f'Actual {target_col}')
axes[0, 0].set_ylabel(f'Predicted {target_col}')
axes[0, 0].set_title('Predicted vs Actual (Test Set)')
axes[0, 0].legend()

# 2. Residuals
residuals = y_test - y_pred_test
axes[0, 1].hist(residuals, bins=30, edgecolor='black', alpha=0.7, color='steelblue')
axes[0, 1].axvline(x=0, color='red', linestyle='--')
axes[0, 1].set_xlabel('Prediction Error')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Distribution of Prediction Errors')

# 3. Predictions over time
test_dates = dataset[test_mask]['date'].values
axes[1, 0].scatter(test_dates, y_test, alpha=0.4, label='Actual', s=20)
axes[1, 0].scatter(test_dates, y_pred_test, alpha=0.4, label='Predicted', s=20)
axes[1, 0].axhline(y=0, color='gray', linestyle='--', alpha=0.5)
axes[1, 0].set_xlabel('Date')
axes[1, 0].set_ylabel(target_col)
axes[1, 0].set_title('Predictions Over Time (Test Period)')
axes[1, 0].legend()
axes[1, 0].tick_params(axis='x', rotation=45)

# 4. Cumulative returns (if we traded on predictions)
test_returns = dataset[test_mask]['target_return'].values
# Strategy: go long when the model's prediction is positive, else hold cash.
# The rule is the same for either target column, so no branch is needed.
strategy_returns = np.where(y_pred_test > 0, test_returns, 0)

# The test set can hold several tickers per date. Compounding row by row would
# chain same-day positions as if they were consecutive periods, so rows are
# first averaged per date (equal weight; names not held contribute 0) and the
# per-date returns are then compounded in date order.
# NOTE(review): assumes consecutive dates' target returns do not overlap in
# time — confirm against how `target_return` is constructed.
per_date_returns = (
    pd.DataFrame({
        'date': test_dates,
        'buy_hold': test_returns,
        'strategy': strategy_returns,
    })
    .groupby('date')[['buy_hold', 'strategy']]
    .mean()
)
buy_hold = (1 + per_date_returns['buy_hold']).cumprod() - 1
strategy = (1 + per_date_returns['strategy']).cumprod() - 1
axes[1, 1].plot(buy_hold.index, buy_hold.values, label='Buy & Hold', alpha=0.7)
axes[1, 1].plot(strategy.index, strategy.values, label='Model Strategy', alpha=0.7)
axes[1, 1].set_xlabel('Date')
axes[1, 1].set_ylabel('Cumulative Return')
axes[1, 1].set_title('Cumulative Returns: Model vs Buy & Hold')
axes[1, 1].legend()
axes[1, 1].axhline(y=0, color='gray', linestyle='--', alpha=0.5)
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('/content/drive/MyDrive/stock_prediction_data/model_results.png', dpi=150)
plt.show()

In [None]:
# Cell 8: Save Model and Results
#
# Bundles the fitted model, scaler, feature list, metrics, split description and
# raw predictions into one dict and pickles it to Google Drive.
metrics_summary = {
    'train_rmse': train_rmse,
    'test_rmse': test_rmse,
    'train_r2': train_r2,
    'test_r2': test_r2,
    'test_mae': test_mae,
    'direction_accuracy': float(direction_accuracy),
    'cv_rmse_mean': cv_rmse_mean,
    'cv_rmse_std': float(abs(cv_rmse_std)),
    'overfit_ratio': float(train_rmse / test_rmse),
}

split_summary = {
    'method': 'temporal',
    'cutoff_date': str(pd.Timestamp(cutoff_date).date()),
    'train_size': len(X_train),
    'test_size': len(X_test),
    'cv_folds': len(cv_splits),
}

# Save predictions for local analysis (avoids XGBoost version mismatch)
test_rows = dataset[test_mask]
saved_predictions = {
    'y_train': y_train.tolist(),
    'y_test': y_test.tolist(),
    'y_pred_train': y_pred_train.tolist(),
    'y_pred_test': y_pred_test.tolist(),
    'test_tickers': test_rows['ticker'].tolist(),
    'test_dates': test_rows['date'].astype(str).tolist(),
    'test_raw_returns': test_rows['target_return'].tolist(),
}

results = {
    'model': model,
    'scaler': scaler,
    'feature_columns': feature_columns,
    'target_column': target_col,
    'best_params': search.best_params_,
    'metrics': metrics_summary,
    'split_info': split_summary,
    'predictions': saved_predictions,
}

with open('/content/drive/MyDrive/stock_prediction_data/model_results.pkl', 'wb') as out_file:
    pickle.dump(results, out_file)

print("Saved to Google Drive: model_results.pkl")

print(f"\n=== Final Results ===")
for metric_name, metric_value in results['metrics'].items():
    print(f"  {metric_name}: {metric_value:.4f}")

print(f"\n=== Split Info ===")
for field, value in results['split_info'].items():
    print(f"  {field}: {value}")

print(f"\n=== Best Hyperparameters ===")
for param, value in results['best_params'].items():
    print(f"  {param}: {value}")