In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score
import torch
import torch.nn as nn
from transformers import TimeSeriesTransformerModel, TimeSeriesTransformerConfig
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.ensemble import RandomForestRegressor
import optuna
import talib
import plotly.graph_objects as go
from tqdm.notebook import tqdm
import requests
import json
from bs4 import BeautifulSoup
import nltk
from textblob import TextBlob
import warnings
warnings.filterwarnings('ignore')

In [None]:
class DataCollector:
    """Collect market data and attach simulated macro / sentiment series.

    Price history is downloaded through ``yfinance``. The economic indicators
    and sentiment scores are random placeholders drawn from normal
    distributions, not real data.

    Args:
        seed: Optional integer. When given, the simulated series are drawn
            from a private ``RandomState`` so they are reproducible. When
            omitted, NumPy's global generator is used (so ``np.random.seed``
            still controls the output, as before).
    """

    # Default span of the simulated series (the range that used to be hard-coded).
    SIM_START = '2010-01-01'
    SIM_END = '2024-01-01'

    def __init__(self, seed=None):
        self.base_data = {}
        self.sentiment_data = {}
        self.economic_indicators = {}
        # Both the numpy.random module and RandomState expose ``.normal``.
        self._rng = np.random if seed is None else np.random.RandomState(seed)

    def _simulated_dates(self, start_date, end_date):
        """Return the daily DatetimeIndex used for simulated series."""
        return pd.date_range(
            start=self.SIM_START if start_date is None else start_date,
            end=self.SIM_END if end_date is None else end_date,
            freq='D',
        )

    def _simulate(self, spec, dates):
        """Draw one normal series per ``name -> (mean, std)`` entry, in dict order."""
        n_rows = len(dates)
        return pd.DataFrame(
            {name: self._rng.normal(mean, std, n_rows) for name, (mean, std) in spec.items()},
            index=dates,
        )

    def fetch_historical_data(self, symbols, start_date, end_date):
        """Fetch daily price history for each symbol and add basic metrics.

        Adds ``Volume_MA`` (20-row rolling mean of volume), ``Volume_Ratio``
        (volume / ``Volume_MA``), ``Daily_Return`` (percent change of close)
        and ``Volatility`` (20-row rolling std of ``Daily_Return``).

        Args:
            symbols: Iterable of ticker symbols.
            start_date: Start of the requested range, passed to yfinance.
            end_date: End of the requested range, passed to yfinance.

        Returns:
            dict mapping each symbol to its DataFrame; also stored on
            ``self.base_data``.
        """
        print("📊 Fetching historical market data...")

        all_data = {}
        for symbol in tqdm(symbols):
            # Primary market data from Yahoo Finance
            stock = yf.Ticker(symbol)
            df = stock.history(start=start_date, end=end_date, interval='1d')

            # Trading volume analysis
            df['Volume_MA'] = df['Volume'].rolling(window=20).mean()
            df['Volume_Ratio'] = df['Volume'] / df['Volume_MA']

            # Additional price metrics
            df['Daily_Return'] = df['Close'].pct_change()
            df['Volatility'] = df['Daily_Return'].rolling(window=20).std()

            all_data[symbol] = df

        self.base_data = all_data
        return all_data

    def add_economic_indicators(self, start_date=None, end_date=None):
        """Build a frame of simulated macroeconomic indicators.

        Args:
            start_date: Optional first date; defaults to ``SIM_START``.
            end_date: Optional last date; defaults to ``SIM_END``.

        Returns:
            DataFrame indexed by day with columns ``GDP_Growth``,
            ``Inflation_Rate``, ``Unemployment_Rate`` and ``Interest_Rate``;
            also stored on ``self.economic_indicators``.
        """
        print("🌍 Adding economic indicators...")

        # Simulated economic data (in real implementation, would fetch from FRED API)
        dates = self._simulated_dates(start_date, end_date)
        spec = {
            'GDP_Growth': (2.5, 0.5),
            'Inflation_Rate': (2.0, 0.3),
            'Unemployment_Rate': (5.0, 0.4),
            'Interest_Rate': (3.0, 0.2),
        }

        self.economic_indicators = self._simulate(spec, dates)
        return self.economic_indicators

    def add_sentiment_analysis(self, symbols, start_date=None, end_date=None):
        """Build simulated sentiment scores for each symbol.

        Args:
            symbols: Iterable of ticker symbols.
            start_date: Optional first date; defaults to ``SIM_START``.
            end_date: Optional last date; defaults to ``SIM_END``.

        Returns:
            dict mapping each symbol to a DataFrame with columns
            ``News_Sentiment``, ``Social_Media_Score`` and ``Analyst_Rating``;
            also stored on ``self.sentiment_data``.
        """
        print("🗣️ Performing sentiment analysis...")

        # The date range does not depend on the symbol, so build it once.
        dates = self._simulated_dates(start_date, end_date)
        # Simulated sentiment scores (in real implementation, would fetch from news APIs)
        spec = {
            'News_Sentiment': (0.2, 0.3),
            'Social_Media_Score': (0.3, 0.4),
            'Analyst_Rating': (3.5, 0.5),
        }

        sentiment_data = {}
        for symbol in tqdm(symbols):
            sentiment_data[symbol] = self._simulate(spec, dates)

        self.sentiment_data = sentiment_data
        return sentiment_data

In [None]:
class DataCleaner:
    """Advanced data cleaning and preprocessing.

    Attributes:
        scalers: Fitted scalers from ``normalize_features``, keyed by method name.
        anomaly_detectors: Fitted outlier detectors, keyed by method name.
        cleaning_stats: Reserved for cleaning statistics (not written here).
    """

    def __init__(self):
        self.scalers = {}
        self.anomaly_detectors = {}
        self.cleaning_stats = {}

    def remove_outliers(self, df, columns, method='isolation_forest', contamination=0.1):
        """Return the rows of ``df`` that are not flagged as outliers.

        Args:
            df: Input DataFrame (not modified).
            columns: List of column names used to detect outliers.
            method: ``'isolation_forest'``, ``'zscore'`` or ``'iqr'``.
            contamination: Expected outlier share for the isolation forest.

        Returns:
            DataFrame containing only the retained rows.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        print("🧹 Removing outliers...")

        features = df[columns]

        if method == 'isolation_forest':
            # Imported here because the notebook's import cell does not provide it.
            from sklearn.ensemble import IsolationForest

            iso_forest = IsolationForest(contamination=contamination, random_state=42)
            labels = iso_forest.fit_predict(features)
            self.anomaly_detectors[method] = iso_forest
            return df[labels == 1]  # fit_predict marks inliers with 1

        if method == 'zscore':
            # Population z-score (ddof=0). The absolute value is required so
            # that extreme *low* values are dropped as well as extreme highs.
            z_scores = (features - features.mean()) / features.std(ddof=0)
            # NaN scores (missing values, constant columns) compare False and
            # are therefore kept, matching the 'iqr' branch.
            is_outlier = (z_scores.abs() >= 3).any(axis=1)
            return df[~is_outlier]

        if method == 'iqr':
            q1 = features.quantile(0.25)
            q3 = features.quantile(0.75)
            iqr = q3 - q1
            is_outlier = ((features < (q1 - 1.5 * iqr)) | (features > (q3 + 1.5 * iqr))).any(axis=1)
            return df[~is_outlier]

        raise ValueError(
            f"Unknown outlier method {method!r}; expected 'isolation_forest', 'zscore' or 'iqr'"
        )

    def handle_missing_values(self, df, method='interpolate'):
        """Fill missing values and return the filled frame.

        Args:
            df: Input DataFrame (not modified).
            method: ``'interpolate'`` (cubic interpolation), ``'forward_fill'``
                or ``'backward_fill'``.

        Returns:
            A new DataFrame with gaps filled according to ``method``.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        print("🔍 Handling missing values...")

        if method == 'interpolate':
            return df.interpolate(method='cubic')

        # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
        if method == 'forward_fill':
            return df.ffill()

        if method == 'backward_fill':
            return df.bfill()

        raise ValueError(
            f"Unknown fill method {method!r}; expected 'interpolate', 'forward_fill' or 'backward_fill'"
        )

    def normalize_features(self, df, columns, method='robust'):
        """Scale ``columns`` and return the result as a new DataFrame.

        The fitted scaler is kept in ``self.scalers[method]`` so the
        transformation can be reused or inverted later.

        Args:
            df: Input DataFrame; it is copied, not modified in place.
            columns: List of column names to scale.
            method: ``'robust'``, ``'standard'`` or ``'minmax'``.

        Returns:
            A copy of ``df`` with ``columns`` replaced by their scaled values.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        print("📊 Normalizing features...")

        if method == 'robust':
            # Imported here because the notebook's import cell does not provide it.
            from sklearn.preprocessing import RobustScaler

            scaler = RobustScaler()
        elif method == 'standard':
            scaler = StandardScaler()
        elif method == 'minmax':
            scaler = MinMaxScaler()
        else:
            raise ValueError(
                f"Unknown normalization method {method!r}; expected 'robust', 'standard' or 'minmax'"
            )

        normalized = df.copy()
        normalized[columns] = scaler.fit_transform(normalized[columns])
        self.scalers[method] = scaler
        return normalized

    def check_stationarity(self, series):
        """Run ADF and KPSS stationarity tests on a series.

        Missing values are dropped before testing.

        Args:
            series: One-dimensional sequence of observations.

        Returns:
            dict with ``adf_statistic``, ``adf_pvalue``, ``kpss_statistic``
            and ``kpss_pvalue``.
        """
        print("📈 Checking stationarity...")

        # NOTE(review): adfuller/kpss are not imported anywhere in the visible
        # notebook; they are taken from statsmodels here — confirm it is installed.
        from statsmodels.tsa.stattools import adfuller, kpss

        observations = pd.Series(series).dropna()

        # Augmented Dickey-Fuller test
        adf_result = adfuller(observations)

        # KPSS test
        kpss_result = kpss(observations)

        return {
            'adf_statistic': adf_result[0],
            'adf_pvalue': adf_result[1],
            'kpss_statistic': kpss_result[0],
            'kpss_pvalue': kpss_result[1]
        }

In [None]:
class AdvancedFeatureEngineering:
    """Adds TA-Lib technical indicators to an OHLCV price frame."""

    def __init__(self):
        self.scalers = {}

    def generate_features(self, df, symbol):
        """Add technical, momentum, volatility and volume indicators to ``df``.

        The new columns are written onto ``df`` itself, which is also returned.

        Args:
            df: DataFrame with ``High``, ``Low``, ``Close`` and ``Volume`` columns.
            symbol: Ticker symbol, used only in the progress message.

        Returns:
            The same DataFrame with the indicator columns added.
        """
        print(f"⚙️ Engineering features for {symbol}...")

        # TA-Lib only accepts float64 ("double") input. Volume commonly arrives
        # as an integer column, so cast every input once up front.
        high = df['High'].astype('float64')
        low = df['Low'].astype('float64')
        close = df['Close'].astype('float64')
        volume = df['Volume'].astype('float64')

        # Technical Indicators
        df['MA_7'] = talib.MA(close, timeperiod=7)
        df['MA_21'] = talib.MA(close, timeperiod=21)
        df['RSI'] = talib.RSI(close, timeperiod=14)
        # The third MACD output (histogram) is intentionally discarded.
        df['MACD'], df['MACD_Signal'], _ = talib.MACD(close)
        df['BB_Upper'], df['BB_Middle'], df['BB_Lower'] = talib.BBANDS(close)
        df['ADX'] = talib.ADX(high, low, close)
        df['OBV'] = talib.OBV(close, volume)

        # Momentum Indicators
        df['ROC'] = talib.ROC(close, timeperiod=10)
        df['MOM'] = talib.MOM(close, timeperiod=10)
        df['CCI'] = talib.CCI(high, low, close, timeperiod=14)

        # Volatility Indicators
        df['ATR'] = talib.ATR(high, low, close, timeperiod=14)
        df['NATR'] = talib.NATR(high, low, close, timeperiod=14)

        # Volume Indicators
        df['AD'] = talib.AD(high, low, close, volume)
        df['ADOSC'] = talib.ADOSC(high, low, close, volume)

        return df

In [None]:
class ModelTrainer:
    """Advanced model training with ensemble methods and validation.

    Attributes:
        models: Fitted ensemble members, keyed by name.
        performance_metrics: Reserved for performance metrics (not written here).
        validation_results: Output of the most recent ``validate_models`` call.
    """

    def __init__(self):
        self.models = {}
        self.performance_metrics = {}
        self.validation_results = {}

    def create_sequences(self, data, seq_length):
        """Slice ``data`` into sliding windows and their next-step targets.

        Args:
            data: Indexable sequence (for example a NumPy array).
            seq_length: Number of consecutive items in each input window.

        Returns:
            Tuple ``(X, y)`` of NumPy arrays, where ``X[i]`` is
            ``data[i:i + seq_length]`` and ``y[i]`` is ``data[i + seq_length]``.
            Both are empty when ``data`` has no more than ``seq_length`` items.
        """
        n_windows = len(data) - seq_length
        windows = [data[start:start + seq_length] for start in range(n_windows)]
        targets = [data[start + seq_length] for start in range(n_windows)]
        return np.array(windows), np.array(targets)

    def train_lstm_model(self, X_train, y_train, epochs=10, batch_size=32, verbose=0):
        """Build, compile and fit an LSTM model with a self-attention layer.

        The network is assembled with the Keras functional API because the
        ``Attention`` layer takes a ``[query, value]`` list and therefore
        cannot be placed in a ``Sequential`` stack.

        Args:
            X_train: Array of shape (samples, timesteps, features); a 2-D
                (samples, timesteps) array is treated as a single feature.
            y_train: Array of targets, one per sample.
            epochs: Number of training epochs.
            batch_size: Mini-batch size used for fitting.
            verbose: Keras verbosity level passed to ``fit``.

        Returns:
            The fitted Keras model.

        Raises:
            ValueError: If ``X_train`` is neither 2-D nor 3-D.
        """
        X_train = np.asarray(X_train, dtype='float32')
        y_train = np.asarray(y_train, dtype='float32')
        if X_train.ndim == 2:
            # Univariate windows from create_sequences: add the feature axis.
            X_train = X_train[..., np.newaxis]
        if X_train.ndim != 3:
            raise ValueError(
                f"X_train must be 2-D or 3-D, got an array with {X_train.ndim} dimension(s)"
            )

        layers = tf.keras.layers
        inputs = tf.keras.Input(shape=X_train.shape[1:])
        encoded = layers.LSTM(128, return_sequences=True)(inputs)
        # Self-attention: the encoded sequence is both query and value.
        attended = layers.Attention()([encoded, encoded])
        summary = layers.LSTM(64)(attended)
        hidden = layers.Dense(32, activation='relu')(summary)
        outputs = layers.Dense(1)(hidden)

        model = tf.keras.Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer='adam', loss='mse')
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)
        return model

    def train_ensemble(self, X_train, y_train):
        """Fit the tree-based ensemble members and store them on ``self.models``.

        Args:
            X_train: Training feature matrix.
            y_train: Training targets.

        Returns:
            The ``self.models`` dict containing the fitted models.
        """
        models = {
            'lightgbm': lgb.LGBMRegressor(),
            'xgboost': xgb.XGBRegressor(),
            'catboost': cb.CatBoostRegressor(verbose=False),
            'random_forest': RandomForestRegressor()
        }

        for name, model in models.items():
            model.fit(X_train, y_train)
            self.models[name] = model

        return self.models

    def validate_models(self, X_test, y_test):
        """Score every model in ``self.models`` on the held-out data.

        Args:
            X_test: Test feature matrix.
            y_test: True targets for ``X_test``.

        Returns:
            dict mapping model name to a dict with ``mse``, ``r2`` and ``mape``;
            also stored on ``self.validation_results``.
        """
        # Imported here because the notebook's import cell does not provide it.
        from sklearn.metrics import mean_absolute_percentage_error

        metrics = {}

        for name, model in self.models.items():
            predictions = model.predict(X_test)

            metrics[name] = {
                'mse': mean_squared_error(y_test, predictions),
                'r2': r2_score(y_test, predictions),
                'mape': mean_absolute_percentage_error(y_test, predictions)
            }

        self.validation_results = metrics
        return metrics

In [None]:
class MarketPredictor:
    """Builds per-symbol training frames from prices, indicators and sentiment."""

    def __init__(self):
        self.data_collector = DataCollector()
        self.feature_engineer = AdvancedFeatureEngineering()
        self.models = {}

    @staticmethod
    def _to_naive_dates(frame):
        """Return a copy of ``frame`` indexed by timezone-naive midnight dates.

        Price history arrives with a timezone-aware index while the simulated
        series are timezone-naive; joining the two directly either fails or
        matches no rows, so every frame is normalised first.
        """
        frame = frame.copy()
        index = pd.DatetimeIndex(frame.index)
        if index.tz is not None:
            # Drop the timezone but keep the local calendar date of each bar.
            index = index.tz_localize(None)
        frame.index = index.normalize()
        return frame

    def prepare_training_data(self, symbols, start_date, end_date):
        """Prepare comprehensive training dataset.

        For each symbol the price history is enriched with engineered
        features, left-joined with the economic and sentiment series on the
        calendar date, and forward-filled.

        Args:
            symbols: Iterable of ticker symbols.
            start_date: Start of the price history range.
            end_date: End of the price history range.

        Returns:
            dict mapping each symbol to its processed DataFrame, indexed by
            timezone-naive dates.
        """
        # Fetch all required data
        historical_data = self.data_collector.fetch_historical_data(symbols, start_date, end_date)
        economic_data = self._to_naive_dates(self.data_collector.add_economic_indicators())
        sentiment_data = self.data_collector.add_sentiment_analysis(symbols)

        # Process each symbol
        processed_data = {}
        for symbol in symbols:
            df = self._to_naive_dates(historical_data[symbol])

            # Add features
            df = self.feature_engineer.generate_features(df, symbol)

            # Merge with economic and sentiment data
            df = df.join(economic_data, how='left')
            df = df.join(self._to_naive_dates(sentiment_data[symbol]), how='left')

            # Handle missing values (ffill() replaces the deprecated
            # fillna(method='ffill') spelling)
            df = df.ffill()

            processed_data[symbol] = df

        return processed_data

In [None]:
if __name__ == "__main__":
    # Run configuration: tickers and the date window to download.
    symbols = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA']
    start_date = '2010-01-01'
    end_date = '2024-01-01'

    # Build the pipeline entry point.
    predictor = MarketPredictor()

    # Banner, then assemble the per-symbol training frames.
    print("\n🚀 Initializing Advanced Market Prediction Pipeline")
    print("================================================")

    processed_data = predictor.prepare_training_data(symbols, start_date, end_date)

    # For every symbol: print the last rows, then chart close vs. 21-day MA.
    for symbol in symbols:
        symbol_frame = processed_data[symbol]

        print(f"\n📈 Sample of processed data for {symbol}:")
        print(symbol_frame.tail().round(2))

        # Traces are supplied up front, in the same order they were added before.
        fig = go.Figure(data=[
            go.Scatter(
                x=symbol_frame.index,
                y=symbol_frame['Close'],
                name="Price",
                line={'color': 'blue'},
            ),
            go.Scatter(
                x=symbol_frame.index,
                y=symbol_frame['MA_21'],
                name="21-day MA",
                line={'color': 'red', 'dash': 'dash'},
            ),
        ])

        fig.update_layout(
            title=f"{symbol} Price and Moving Average",
            xaxis_title="Date",
            yaxis_title="Price",
            template="plotly_dark",
        )

        fig.show()