## Summary

This notebook demonstrates a complete stock forecasting workflow:

1. **Data Loading**: Fetched historical stock data using yfinance
2. **Feature Engineering**: Created 20+ technical indicators (Moving Averages, RSI, MACD, Bollinger Bands, ATR)
3. **Model Training**: Trained Linear Regression and Random Forest models
4. **Evaluation**: Assessed model performance using RMSE, MAE, and R² scores
5. **Forecasting**: Generated 5-day price forecasts using exponential smoothing and linear trend methods
6. **Visualization**: Created comprehensive charts showing prices, indicators, and forecasts
7. **Trend Analysis**: Identified support/resistance levels, volatility, and momentum

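**Key Insights** (general rules of thumb; confirm them against the metrics this notebook prints for the selected ticker):
- Technical indicators can provide useful signals for trend identification
- Ensemble models (Random Forest) often outperform simpler models such as Linear Regression, but compare the test RMSE and R² before relying on that
- Exponential smoothing is a reasonable baseline for short-term forecasts
- Multiple forecast methods should be used to cross-check predictions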

In [None]:
# Trend analysis: report trend direction, support/resistance levels,
# volatility and momentum for the engineered feature frame.
if df is not None:
    print("=== Trend Analysis ===\n")

    analyzer = TrendAnalyzer()

    # The last value of the trend series decides the label:
    # 1 -> uptrend, -1 -> downtrend, anything else -> neutral.
    trend = analyzer.calculate_trend(df_features)
    current_trend = trend.iloc[-1]
    if current_trend == 1:
        trend_label = "UPTREND ↑"
    elif current_trend == -1:
        trend_label = "DOWNTREND ↓"
    else:
        trend_label = "NEUTRAL"
    print(f"Current Trend: {trend_label}")

    # Print the 'support', 'resistance' and 'gap' entries returned by the analyzer.
    levels = analyzer.identify_support_resistance(df_features)
    for caption, key in (
        ("\nSupport Level", "support"),
        ("Resistance Level", "resistance"),
        ("Gap", "gap"),
    ):
        print(f"{caption}: ${levels[key]:.2f}")

    # Volatility is derived from day-over-day percentage changes of the close.
    # NOTE(review): annualization presumably happens inside calculate_volatility — confirm.
    returns = df_features['close'].pct_change()
    volatility = analyzer.calculate_volatility(returns)
    print(f"\nAnnualized Volatility: {volatility:.2%}")

    # Momentum score (the printed label states its range as -1 to 1).
    momentum = analyzer.momentum_score(df_features)
    print(f"Momentum Score: {momentum:.4f} (Range: -1 to 1)")

In [None]:
# Build the combined dashboard figure from the most recent 60 rows.
if df is not None:
    print("Creating trading dashboard...")
    dashboard_window = df_features.tail(60)
    # forecast_exp comes from the forecast cell; that cell must have run first.
    fig = StockVisualizer.create_dashboard(dashboard_window, selected_ticker, forecast=forecast_exp)
    plt.show()
    print("✓ Dashboard created\n")

In [None]:
# Trading-volume chart for the most recent 60 rows.
if df is not None:
    print("Generating visualization: Trading Volume")
    volume_window = df_features.tail(60)
    volume_options = {
        "title": f"{selected_ticker} - Trading Volume (Last 60 days)",
        "figsize": (14, 5),
    }
    fig = StockVisualizer.plot_volume(volume_window, **volume_options)
    plt.show()
    print("✓ Volume chart generated\n")

In [None]:
# RSI chart for the most recent 60 rows.
if df is not None:
    print("Generating visualization: RSI Indicator")
    rsi_window = df_features.tail(60)
    rsi_options = {
        "title": f"{selected_ticker} - RSI(14) (Last 60 days)",
        "figsize": (14, 5),
    }
    fig = StockVisualizer.plot_rsi(rsi_window, **rsi_options)
    plt.show()
    print("✓ RSI chart generated\n")

In [None]:
# Bollinger Bands chart for the most recent 60 rows.
if df is not None:
    print("Generating visualization: Bollinger Bands")
    bands_window = df_features.tail(60)
    bands_options = {
        "title": f"{selected_ticker} - Bollinger Bands (Last 60 days)",
        "figsize": (14, 7),
    }
    fig = StockVisualizer.plot_bollinger_bands(bands_window, **bands_options)
    plt.show()
    print("✓ Bollinger Bands chart generated\n")

In [None]:
# Produce two 5-day forecasts of the closing price (exponential smoothing and
# linear trend), print both, and chart the exponential-smoothing one.
if df is not None:
    print("Generating 5-day price forecast...")

    forecaster = TimeSeriesForecaster()
    close_prices = df_features['close']
    forecast_exp = forecaster.forecast_next_n_days(close_prices, n_days=5, method='exp_smooth')
    forecast_linear = forecaster.forecast_next_n_days(close_prices, n_days=5, method='linear_trend')

    # Each forecast is printed as a list of two-decimal strings.
    exp_formatted = [f'{price:.2f}' for price in forecast_exp]
    print(f"Exponential Smoothing Forecast: {exp_formatted}")
    linear_formatted = [f'{price:.2f}' for price in forecast_linear]
    print(f"Linear Trend Forecast: {linear_formatted}")

    # Plot the exponential-smoothing forecast against the last 30 rows of history.
    history_window = df_features.tail(30)
    forecast_options = {
        "title": f"{selected_ticker} - 5-Day Price Forecast (Exponential Smoothing)",
        "figsize": (14, 7),
    }
    fig = StockVisualizer.plot_forecast(history_window, forecast_exp, **forecast_options)
    plt.show()
    print("✓ Forecast chart generated\n")

In [None]:
# Price chart with moving averages for the most recent 100 rows.
if df is not None:
    print("Generating visualization: Price with Moving Averages")
    price_window = df_features.tail(100)
    price_options = {
        "title": f"{selected_ticker} - Price with Moving Averages (Last 100 days)",
        "figsize": (14, 7),
    }
    fig = StockVisualizer.plot_price_with_ma(price_window, **price_options)
    plt.show()
    print("✓ Chart generated\n")

## Section 6: Generate Trend Prediction Graphs

Create visualizations for price forecasts and technical indicators.

In [None]:
# Summarize test-set predictions and residuals (actual minus predicted)
# for every trained model.
if models:
    print("=== Prediction Analysis ===\n")

    for model_type, split_preds in predictions.items():
        test_pred = split_preds['test']
        errors = y_test.values - test_pred

        print(f"{model_type.upper()} Model Test Predictions:")
        # Mean and standard deviation, first for the predictions and then for
        # the residuals.
        for label, values in (("prediction", test_pred), ("residual", errors)):
            print(f"  Mean {label}: {values.mean():.4f}")
            print(f"  {label.capitalize()} std: {values.std():.4f}")
        print()

In [None]:
# Report RMSE, MAE and R² on the train and test splits for each model.
if models:
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

    print("=== Model Performance Evaluation ===\n")

    for model_type, split_preds in predictions.items():
        print(f"{model_type.upper()} Model:")

        # Compute every metric before printing, so a failure on the test split
        # never leaves a lone train line behind.
        report_lines = []
        for split_label, y_true, y_hat in (
            ("Train", y_train, split_preds['train']),
            ("Test ", y_test, split_preds['test']),
        ):
            rmse = np.sqrt(mean_squared_error(y_true, y_hat))
            mae = mean_absolute_error(y_true, y_hat)
            r2 = r2_score(y_true, y_hat)
            report_lines.append(
                f"  {split_label} RMSE: {rmse:.4f} | MAE: {mae:.4f} | R²: {r2:.4f}"
            )

        for line in report_lines:
            print(line)
        print()

## Section 5: Evaluate Model Performance

Assess model accuracy using multiple metrics.

In [None]:
# Train multiple models
#
# `models` maps model type -> fitted StockPredictor.
# `predictions` maps model type -> {'train': ..., 'test': ...} predictions.
#
# Both containers are bound BEFORE the guard. The evaluation and residual cells
# test `if models:`; if this cell skipped training (for example because the
# data download failed) they would otherwise raise NameError instead of being
# skipped. Rebinding on every run also clears stale models from a previous run.
models = {}
predictions = {}

if df is not None and X is not None:
    print("Training forecasting models...\n")

    for model_type in ['linear', 'rf']:
        print(f"Training {model_type.upper()} model...")

        predictor = StockPredictor()
        predictor.build_model(model_type=model_type)
        predictor.train(X_train, y_train)

        # Keep in-sample and out-of-sample predictions side by side so the
        # evaluation cells can compare the two.
        y_pred_train = predictor.predict(X_train)
        y_pred_test = predictor.predict(X_test)

        models[model_type] = predictor
        predictions[model_type] = {
            'train': y_pred_train,
            'test': y_pred_test
        }

        print("  ✓ Model trained successfully\n")

## Section 4: Train Forecasting Models

Train multiple forecasting models using scikit-learn.

In [None]:
# Build the feature matrix / target and split them into train and test sets.
if df is not None:
    print("Preparing data for machine learning...")

    X, y = prepare_ml_data(df_features, lookahead=5, drop_na=True)

    print(f"Features shape: {X.shape}")
    print(f"Target shape: {y.shape}")
    print(f"Target distribution: {y.value_counts().to_dict()}")

    # Positional (unshuffled) 80/20 split: the first 80% of rows train, the rest test.
    # NOTE(review): this keeps the test set after the train set in time only if
    # the rows are date-ordered — confirm.
    split_idx = int(len(X) * 0.8)
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    print(f"\nTrain set: {X_train.shape}")
    print(f"Test set: {X_test.shape}")
    print(f"\nFeatures used for forecasting:")
    # Only the first ten feature names are listed.
    for position, feature_name in enumerate(X.columns[:10], start=1):
        print(f"  {position}. {feature_name}")

## Section 3: Build Time Series Features

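Build the feature matrix and the 5-day-lookahead target from the engineered indicators, then split them by position into training (first 80% of rows) and test (last 20%) sets.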

In [None]:
# Show the last five rows of a fixed set of indicator columns.
if df is not None:
    print("=== Technical Indicators (Last 5 days) ===\n")

    indicator_cols = ['close', 'sma_20', 'sma_50', 'rsi_14', 'atr', 'daily_return']
    # Keep only the columns the feature frame actually has, so a missing
    # indicator cannot raise KeyError.
    available_cols = []
    for candidate in indicator_cols:
        if candidate in df_features.columns:
            available_cols.append(candidate)

    recent_rows = df_features[available_cols].tail(5)
    display_df = recent_rows.round(4)
    print(display_df)

In [None]:
# Run feature engineering on the selected ticker and report what was added.
if df is not None:
    print(f"Engineering features for {selected_ticker}...")
    df_features = engineer_features(df)

    original_count = len(df.columns)
    engineered_count = len(df_features.columns)
    print(f"Original columns: {original_count}")
    print(f"Engineered columns: {engineered_count}")
    print(f"\nNew features created:")

    # A feature counts as new when its column is absent from the raw frame.
    original_columns = df.columns
    new_features = [name for name in df_features.columns if name not in original_columns]
    for feat in sorted(new_features):
        print(f"  - {feat}")

    print("\nFeature summary:")
    feature_summary = df_features[new_features].describe()
    print(feature_summary.round(4))

## Section 2: Data Preprocessing and Feature Engineering

Engineer technical indicators and create features for forecasting.

In [None]:
# Validate the selected frame and print date, price and volume summaries.
if df is not None:
    is_valid = validate_stock_data(df)
    if is_valid:
        validation_status = '✓ Valid'
    else:
        validation_status = '✗ Invalid'
    print(f"Data validation: {validation_status}")

    print("\n=== Data Summary ===")
    first_day = df['date'].min().date()
    last_day = df['date'].max().date()
    print(f"Date range: {first_day} to {last_day}")
    print(f"Total records: {len(df)}")
    print(f"Missing values: {df.isnull().sum().sum()}")

    print("\n=== Price Statistics ===")
    close = df['close']
    print(f"Current price: ${close.iloc[-1]:.2f}")
    # NOTE(review): the "52-week" figures are the max/min of the closing prices
    # in the loaded frame; they cover 52 weeks only while the fetch period is one year.
    print(f"52-week high: ${close.max():.2f}")
    print(f"52-week low: ${close.min():.2f}")
    print(f"Average price: ${close.mean():.2f}")
    print(f"Price volatility (std): ${close.std():.2f}")

    print("\n=== Volume Statistics ===")
    volume = df['volume']
    print(f"Average daily volume: {volume.mean():.0f}")
    print(f"Total volume: {volume.sum():.0f}")

In [None]:
# Fetch stock data for multiple stocks
#
# Each ticker is downloaded independently, so one failure does not abort the
# rest. A frame is stored in `stock_data` only after it has been checked and
# its summary line has been built: a ticker reported as failed can therefore
# never be picked as the demonstration ticker.
tickers = ['AAPL', 'GOOGL', 'MSFT']
period = '1y'

print(f"Fetching {period} of data for: {', '.join(tickers)}")

stock_data = {}
for ticker in tickers:
    try:
        # Loop-local name: the notebook-wide `df` is bound only once, below.
        ticker_df = fetch_stock_data(ticker, period=period)
        if ticker_df is None or len(ticker_df) == 0:
            raise ValueError("no data returned")
        first_day = ticker_df['date'].min().date()
        last_day = ticker_df['date'].max().date()
        summary = f"✓ {ticker}: {len(ticker_df)} records from {first_day} to {last_day}"
    except Exception as e:
        # Best-effort download: report the failure and move on to the next ticker.
        print(f"✗ Error fetching {ticker}: {str(e)}")
    else:
        stock_data[ticker] = ticker_df
        print(summary)

# Use first available stock for demonstrations.
# Both names are None when nothing could be fetched; every later cell guards
# on `df is not None`.
selected_ticker = next(iter(stock_data), None)
df = stock_data.get(selected_ticker)

if df is not None:
    print(f"\nSelected ticker: {selected_ticker}")
    print(f"Data shape: {df.shape}")
    print(f"\nFirst few rows:")
    print(df.head())

## Section 1: Load and Explore Historical Stock Data

Load one year of historical price data for several tickers through the yfinance API, pick the first ticker that loads successfully, and display basic statistics for it.

In [None]:
# Standard library
import os
import sys
from datetime import datetime, timedelta

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yfinance as yf

# Add project to path.
# The checkout location can be overridden with the STOCK_ANALYSIS_ROOT
# environment variable; the default is the original hard-coded path.
PROJECT_ROOT = os.environ.get('STOCK_ANALYSIS_ROOT', '/a/stock-analysis')
# Keep the project root first on sys.path, without stacking a duplicate entry
# each time this cell is re-run.
if sys.path[:1] != [PROJECT_ROOT]:
    sys.path.insert(0, PROJECT_ROOT)

# Project-local modules (resolved through PROJECT_ROOT)
from src.utils.fetch_data import fetch_stock_data, validate_stock_data
from src.models.feature_engineering import engineer_features, prepare_ml_data, TechnicalIndicators
from src.models.predictor import TimeSeriesForecaster, TrendAnalyzer, StockPredictor
from src.utils.visualization import StockVisualizer

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 7)

print("Libraries loaded successfully!")

# Stock Market Price Forecasting with pandas and scikit-learn

## Objective
Build a data-driven forecasting model to predict stock price movements and generate trend prediction graphs using historical data and machine learning.

**Problem**: Investors need reliable data-driven predictions for stock movements.

**Outcome**: Trend prediction graphs and forecasts for selected stocks.