In [None]:
# Clone the repository and set up the environment
import os
import sys

# Clone the repository if it doesn't exist
if not os.path.exists('/content/finfo'):
    print("Cloning the finfo repository...")
    !git clone https://github.com/etrigan5500/finfo.git /content/finfo
    print("Repository cloned successfully!")
else:
    print("Repository already exists, pulling latest changes...")
    !cd /content/finfo && git pull
    print("Repository updated!")

# Change to the repository directory
os.chdir('/content/finfo')
print(f"Current working directory: {os.getcwd()}")

# Add the repository to Python path so we can import modules
sys.path.append('/content/finfo')

# Verify the file structure
print("\nRepository structure:")
!ls -la

print("\nTraining directory structure:")
!ls -la training/

print("\nData collection modules:")
!ls -la data_collection/


In [None]:
# Prefer the collectors shipped in the cloned repository; remember whether
# they could be imported so later cells can choose an implementation.
try:
    # News collector from the repository
    from data_collection.news.news_collector import NewsCollector as RepoNewsCollector
    print("✅ Successfully imported NewsCollector from repository")

    # Financial collector from the repository
    from data_collection.financial.financial_collector import FinancialCollector
    print("✅ Successfully imported FinancialCollector from repository")
except ImportError as e:
    # Either import failing switches the notebook to its own classes
    print(f"⚠️ Could not import repository modules: {e}")
    print("📝 Will use the notebook-defined classes instead")
    USE_REPO_MODULES = False
else:
    # Both imports succeeded: use the repository's news collector
    USE_REPO_MODULES = True
    print("🎉 Using modules from the cloned repository!")


In [None]:
# Create necessary directories for the project
import os

# Working folders used by the project, relative to the current directory
directories_to_create = ['models', 'training/models', 'training/results', 'data/processed', 'logs']

# exist_ok=True keeps this cell safe to re-run
for directory in directories_to_create:
    os.makedirs(directory, exist_ok=True)
    print(f"✅ Created/verified directory: {directory}")

print("\n📁 Directory structure ready for training!")


In [None]:
# Install required packages
!pip install -q torch transformers pandas numpy scikit-learn yfinance sentence-transformers google-api-python-client python-dotenv beautifulsoup4 requests


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import yfinance as yf
from datetime import datetime, timedelta
import os
from transformers import AutoModel, AutoTokenizer
import warnings
import random

# Silence library warnings to keep cell output readable
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility.
# The standard-library `random` module is seeded as well because the news
# sampling code later in this notebook draws from it.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

print("Libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")


In [None]:
# Base Model Class
class BaseModel(nn.Module):
    """Shared base for the notebook's models: an nn.Module plus an inference helper."""

    def __init__(self):
        super().__init__()

    def forward(self, *args, **kwargs):
        """Forward pass; subclasses must override this.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

    def predict(self, *args, **kwargs):
        """Run a forward pass in evaluation mode without tracking gradients.

        Accepts whatever positional/keyword arguments the subclass's
        ``forward`` takes, so models with several inputs can use it too.

        Returns:
            Whatever the subclass's ``forward`` returns.

        Note:
            The module is left in eval mode afterwards.
        """
        self.eval()
        with torch.no_grad():
            # Call the module itself (not .forward) so registered hooks run
            return self(*args, **kwargs)

# News Model
class NewsModel(BaseModel):
    """Text classifier: a frozen pretrained encoder followed by a trainable MLP head.

    Args:
        model_name: Hugging Face model identifier passed to
            ``AutoModel`` / ``AutoTokenizer``.
        num_classes: Number of output logits.
    """

    def __init__(self, model_name='distilbert-base-uncased', num_classes=5):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.num_classes = num_classes

        # Freeze the encoder: only the classifier head below is trained
        for param in self.bert.parameters():
            param.requires_grad = False

        # Read the encoder width from its config instead of assuming 768,
        # so other `model_name` values with a different hidden size work.
        hidden_size = self.bert.config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch of tokenized texts.

        Args:
            input_ids: Token ids forwarded to the encoder
                (assumes shape (batch, seq_len) — TODO confirm against caller).
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Output of the classifier head applied to the encoder's hidden
            state at sequence position 0.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # first ([CLS]) token
        return self.classifier(pooled_output)

# Financial Model
class FinancialModel(BaseModel):
    """MLP classifier over a fixed-size vector of financial metrics.

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size=6, num_classes=5):
        super().__init__()
        self.num_classes = num_classes

        # Two Linear -> ReLU -> BatchNorm -> Dropout stages: input_size -> 64 -> 32
        extractor_layers = [
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(0.2),
        ]
        self.feature_extractor = nn.Sequential(*extractor_layers)

        # Classification head: 32 -> 16 -> num_classes
        head_layers = [
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(16, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return class logits for a batch of feature vectors."""
        return self.classifier(self.feature_extractor(x))

# Price Model
class PriceModel(BaseModel):
    """Price-sequence classifier: a 1-D CNN feeding a 2-layer LSTM and an MLP head.

    Args:
        sequence_length: Stored on the instance; not read by ``forward``.
        num_classes: Number of output logits.
    """

    def __init__(self, sequence_length=24, num_classes=5):
        super().__init__()

        self.sequence_length = sequence_length
        self.num_classes = num_classes

        # Three Conv1d -> ReLU -> MaxPool1d(2) stages; channels 1 -> 16 -> 32 -> 64
        conv_stages = []
        for in_channels, out_channels in ((1, 16), (16, 32), (32, 64)):
            conv_stages.append(nn.Conv1d(in_channels, out_channels, kernel_size=3, padding=1))
            conv_stages.append(nn.ReLU())
            conv_stages.append(nn.MaxPool1d(2))
        self.conv_layers = nn.Sequential(*conv_stages)

        # Recurrent layer over the pooled time axis
        self.lstm = nn.LSTM(input_size=64, hidden_size=32, num_layers=2,
                            batch_first=True, dropout=0.2)

        # Classification head: 32 -> 16 -> num_classes
        self.classifier = nn.Sequential(
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(16, num_classes),
        )

    def forward(self, x):
        """Return class logits for a batch of price sequences.

        Args:
            x: Tensor of shape (batch_size, sequence_length).
        """
        # Insert a channel axis for Conv1d: (batch, 1, sequence_length)
        with_channel = x.unsqueeze(1)
        conv_features = self.conv_layers(with_channel)

        # The LSTM is batch_first: reorder to (batch, time, features)
        recurrent_out, _ = self.lstm(conv_features.permute(0, 2, 1))

        # Classify from the output at the final time step
        final_step = recurrent_out[:, -1, :]
        return self.classifier(final_step)

# Ensemble Model
class EnsembleModel(BaseModel):
    """Combines the news, financial and price models into one classifier.

    Each sub-model produces ``num_classes`` logits; the three logit vectors
    are concatenated and passed through a small MLP that outputs the final
    ``num_classes`` logits.

    Args:
        num_classes: Number of output classes for every sub-model and for
            the ensemble head.
    """

    def __init__(self, num_classes=5):
        super().__init__()
        self.num_classes = num_classes

        # Initialize individual models
        self.news_model = NewsModel(num_classes=num_classes)
        self.financial_model = FinancialModel(num_classes=num_classes)
        self.price_model = PriceModel(num_classes=num_classes)

        # Ensemble head over the concatenated sub-model logits
        self.ensemble = nn.Sequential(
            nn.Linear(num_classes * 3, 32),  # three sub-models x num_classes logits
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(16, num_classes)
        )

    def forward(self, news_input_ids, news_attention_mask, financial_data, price_data):
        """Return ensemble logits for one batch.

        Args:
            news_input_ids: Token ids passed to the news model.
            news_attention_mask: Attention mask passed to the news model.
            financial_data: Feature batch passed to the financial model.
            price_data: Price-sequence batch passed to the price model.

        Returns:
            Output of the ensemble head for the concatenated sub-model logits.
        """
        # Call the sub-modules themselves (not .forward) so that any
        # registered forward hooks are honoured.
        news_pred = self.news_model(news_input_ids, news_attention_mask)
        financial_pred = self.financial_model(financial_data)
        price_pred = self.price_model(price_data)

        # Concatenate along the feature axis
        combined = torch.cat([news_pred, financial_pred, price_pred], dim=1)

        # Final prediction
        return self.ensemble(combined)

print("Model classes defined successfully!")


In [None]:
# News Collection Class with intelligent clustering
# Use repository version if available, otherwise use notebook version
if USE_REPO_MODULES:
    print("🔄 Using NewsCollector from repository")
    NewsCollector = RepoNewsCollector
else:
    print("📝 Using NewsCollector defined in notebook")

    # The class body has to be indented beneath the `class` statement;
    # otherwise this cell does not parse at all.
    class NewsCollector:
        """Supplies short news texts for a company.

        When both an API key and a search engine ID are given, articles are
        fetched through the Google Custom Search API and reduced to a few
        representative ones by clustering. Otherwise templated synthetic
        articles are generated.
        """

        def __init__(self, api_key=None, search_engine_id=None):
            """Store credentials and decide whether real news can be used.

            Args:
                api_key: Google Custom Search API key (optional).
                search_engine_id: Google Custom Search Engine ID (optional).
            """
            self.api_key = api_key
            self.search_engine_id = search_engine_id
            self.use_real_news = api_key is not None and search_engine_id is not None

        def get_company_news(self, company_name, ticker, num_articles=5):
            """Get clustered news articles for a company.

            Args:
                company_name: Name used in search queries and templates.
                ticker: Ticker symbol, used in log messages.
                num_articles: Maximum number of articles wanted.

            Returns:
                A list of article strings (real or synthetic).
            """
            if self.use_real_news and self.api_key:
                return self._get_real_news_clustered(company_name, ticker, num_articles)
            return self._get_synthetic_news(company_name, ticker, num_articles)

        def _get_real_news_clustered(self, company_name, ticker, num_articles):
            """Get real news using Google Custom Search API with clustering.

            Any exception raised while setting up or running the search makes
            the method fall back to synthetic news.
            """
            try:
                from googleapiclient.discovery import build

                # Two focused queries: financial news and prediction news
                search_queries = [
                    f"{company_name} financial news",
                    f"{company_name} stock prediction news"
                ]

                all_articles = []
                print(f"Fetching news for {ticker} with {len(search_queries)} queries...")

                # Initialize Google Custom Search API service
                service = build("customsearch", "v1", developerKey=self.api_key)

                for query in search_queries:
                    try:
                        result = service.cse().list(
                            q=query,
                            cx=self.search_engine_id,
                            num=10  # 10 results per query
                        ).execute()

                        for item in result.get("items", []):
                            title = item.get('title', '')
                            snippet = item.get('snippet', '')
                            # Keep only results that have both a title and a snippet
                            if title and snippet:
                                all_articles.append(f"{title}. {snippet}")
                    except Exception as e:
                        # A failed query is logged and skipped; the others still run
                        print(f"Error in search query '{query}': {str(e)}")
                        continue

                print(f"Collected {len(all_articles)} articles for {ticker}")

                # Need at least 6 articles for 3 clusters
                if len(all_articles) >= 6:
                    # At least one article per cluster, even for small num_articles
                    per_cluster = max(1, num_articles // 3)
                    clustered = self._cluster_articles(
                        all_articles, num_clusters=3, articles_per_cluster=per_cluster
                    )
                    return clustered[:num_articles]

                # Return what we have if not enough for clustering
                return all_articles[:num_articles]

            except Exception as e:
                print(f"Error fetching real news for {ticker}: {str(e)}")
                return self._get_synthetic_news(company_name, ticker, num_articles)

        def _cluster_articles(self, articles, num_clusters=3, articles_per_cluster=2):
            """Cluster articles by topic and return representative articles from each cluster.

            Args:
                articles: List of article strings.
                num_clusters: Requested number of clusters (capped at the
                    number of articles).
                articles_per_cluster: Maximum articles taken from each cluster.

            Returns:
                The articles closest to each cluster centre. If clustering
                fails for any reason, a random sample of at most
                ``num_clusters * articles_per_cluster`` articles.
            """
            try:
                # Imported here because this is the method that uses them;
                # they are not imported at module level in this notebook.
                from sklearn.cluster import KMeans
                from sentence_transformers import SentenceTransformer

                # Sentence embeddings for semantic clustering
                model = SentenceTransformer('all-MiniLM-L6-v2')
                embeddings = model.encode(articles)

                # Never ask for more clusters than there are articles
                n_clusters = min(num_clusters, len(articles))
                kmeans = KMeans(n_clusters=n_clusters, random_state=42)
                cluster_labels = kmeans.fit_predict(embeddings)

                clustered_articles = []
                for cluster_id in range(n_clusters):
                    cluster_indices = np.where(cluster_labels == cluster_id)[0]
                    if len(cluster_indices) == 0:
                        continue

                    # Rank the cluster's members by distance to its centre
                    cluster_center = kmeans.cluster_centers_[cluster_id]
                    distances = np.linalg.norm(embeddings[cluster_indices] - cluster_center, axis=1)
                    sorted_indices = cluster_indices[np.argsort(distances)]

                    # Take up to articles_per_cluster from this cluster
                    for idx in sorted_indices[:articles_per_cluster]:
                        clustered_articles.append(articles[idx])

                print(f"Clustered into {n_clusters} topics, returning {len(clustered_articles)} representative articles")
                return clustered_articles

            except Exception as e:
                print(f"Error clustering articles: {str(e)}. Returning random sample.")
                # Fallback to random sampling if clustering fails
                import random
                return random.sample(articles, min(len(articles), num_clusters * articles_per_cluster))

        def _get_synthetic_news(self, company_name, ticker, num_articles):
            """Generate diverse synthetic news for training when no API key is provided.

            Returns:
                Up to ``num_articles`` templated sentences (at most one per
                template), each with a randomly chosen sentiment suffix.
            """
            import random

            news_templates = [
                f"{company_name} reports strong quarterly earnings with revenue beating expectations.",
                f"{company_name} announces new product innovations and market expansion plans.",
                f"{company_name} faces regulatory challenges but maintains growth trajectory.",
                f"{company_name} stock sees volatility amid broader market uncertainty.",
                f"{company_name} CEO discusses future strategy and investment priorities.",
                f"{company_name} partnerships drive growth in key technology sectors.",
                f"{company_name} navigates supply chain challenges while increasing market share.",
                f"{company_name} reports mixed quarterly results with strong guidance ahead.",
                f"{company_name} invests heavily in AI and cloud infrastructure development.",
                f"{company_name} stock analysts upgrade ratings citing strong fundamentals."
            ]

            # Pick distinct templates at random
            selected_templates = random.sample(news_templates, min(num_articles, len(news_templates)))

            sentiment_modifiers = [
                "optimistic", "cautious", "bullish", "bearish", "neutral",
                "positive", "mixed", "concerned", "confident", "uncertain"
            ]

            articles = []
            for template in selected_templates:
                sentiment = random.choice(sentiment_modifiers)
                articles.append(
                    f"{template} Market sentiment appears {sentiment} regarding recent developments."
                )

            return articles

def classify_return(return_pct):
    """Map a fractional return to one of five class indices.

    0: below -2%            (Strong Decrease)
    1: -2% up to below -0.5% (Moderate Decrease)
    2: -0.5% through +0.5%   (Stable)
    3: above +0.5% through +2% (Moderate Increase)
    4: everything else        (Strong Increase)
    """
    # The two negative bands exclude their upper edge
    if return_pct < -0.02:
        return 0
    if return_pct < -0.005:
        return 1

    # The next two bands include their upper edge
    for label, upper_edge in ((2, 0.005), (3, 0.02)):
        if return_pct <= upper_edge:
            return label

    return 4



def collect_training_data_efficient(ticker_list, data_period='1y', interval='1wk', 
                                  resample_period='15D', api_key=None, search_engine_id=None,
                                  num_samples_per_ticker=50, sequence_length=12):
    """
    Efficiently collect training data with flexible periods and intervals.

    For every ticker the price history is downloaded with yfinance,
    optionally resampled, and turned into samples. Each sample holds a
    window of closing prices, six financial metrics, one news text and
    the class of the next-period return.

    Note: five of the six financial metrics are simulated with
    ``np.random.normal``; only the P/E entry comes from ``stock.info``.

    Args:
        ticker_list: List of stock tickers
        data_period: Period to fetch ('6mo', '1y', '2y', etc.)
        interval: Data interval ('1d', '1wk', '1mo')
        resample_period: Period to resample to ('15D', '1M', etc.);
            'no_resample' keeps the downloaded interval
        api_key: Google Custom Search API key for real news (optional)
        search_engine_id: Google Custom Search Engine ID (optional)
        num_samples_per_ticker: Maximum number of samples per ticker
        sequence_length: Number of closing prices in each price window

    Returns:
        pandas.DataFrame with columns ticker, company_name, date, price_data,
        financial_metrics, news_text, target and return_pct. The columns are
        present even when no sample could be collected. Tickers that raise
        an error are reported and skipped.
    """
    # Function-scope import: `random` is not imported at file level
    import random

    columns = ['ticker', 'company_name', 'date', 'price_data',
               'financial_metrics', 'news_text', 'target', 'return_pct']
    data = []

    # Initialize news collector (same constructor call for either implementation)
    news_collector = NewsCollector(api_key, search_engine_id)
    if USE_REPO_MODULES:
        print("🔄 Using repository NewsCollector")
    else:
        print("📝 Using notebook NewsCollector")

    # Company name mapping for better news search
    company_names = {
        'AAPL': 'Apple Inc',
        'MSFT': 'Microsoft Corporation', 
        'GOOGL': 'Alphabet Google',
        'AMZN': 'Amazon',
        'TSLA': 'Tesla',
        'META': 'Meta Facebook',
        'NVDA': 'NVIDIA Corporation',
        'NFLX': 'Netflix',
        'ADBE': 'Adobe',
        'CRM': 'Salesforce',
        'ORCL': 'Oracle Corporation',
        'IBM': 'IBM',
        'INTC': 'Intel Corporation',
        'AMD': 'Advanced Micro Devices',
        'CSCO': 'Cisco Systems',
        'PYPL': 'PayPal',
        'UBER': 'Uber Technologies',
        'LYFT': 'Lyft',
        'SNAP': 'Snap Inc',
        'TWTR': 'Twitter'
    }

    for ticker in ticker_list:
        try:
            print(f"Processing {ticker}...")

            stock = yf.Ticker(ticker)

            # One download per ticker at the requested period and interval
            hist = stock.history(period=data_period, interval=interval)

            if len(hist) < 10:  # Need at least 10 data points
                print(f"Insufficient data for {ticker}, skipping...")
                continue

            # Only resample if the interval doesn't match our target
            if interval != resample_period and resample_period != 'no_resample':
                hist_resampled = hist.resample(resample_period).agg({
                    'Open': 'first',
                    'High': 'max',
                    'Low': 'min',
                    'Close': 'last',
                    'Volume': 'sum'
                }).dropna()
            else:
                hist_resampled = hist.copy()

            if len(hist_resampled) < sequence_length + 1:
                print(f"Insufficient resampled data for {ticker} ({len(hist_resampled)} points), skipping...")
                continue

            # Return from each period to the next; the last row is NaN
            hist_resampled['Future_Return'] = hist_resampled['Close'].shift(-1) / hist_resampled['Close'] - 1
            closes = hist_resampled['Close']
            future_returns = hist_resampled['Future_Return']

            # Company info is fetched once per ticker; failure is not fatal
            try:
                info = stock.info
            except Exception as info_error:
                print(f"Could not fetch company info for {ticker}: {info_error}")
                info = {}
            if not isinstance(info, dict):
                info = {}

            # Normalized P/E, with 0.15 as the fallback when the value is
            # missing, zero, non-numeric or not finite.
            trailing_pe = info.get('trailingPE')
            if isinstance(trailing_pe, (int, float)) and np.isfinite(trailing_pe) and trailing_pe:
                normalized_pe = trailing_pe / 100
            else:
                normalized_pe = 0.15

            # News is fetched once per ticker - 9 articles for 3 clusters of 3 each
            company_name = company_names.get(ticker, ticker)
            news_articles = news_collector.get_company_news(company_name, ticker, num_articles=9)

            sample_count = 0
            available_samples = len(hist_resampled) - sequence_length - 1

            # Candidate row positions: each needs `sequence_length` rows of
            # history and a following row for the future return.
            if available_samples > num_samples_per_ticker:
                sample_indices = np.random.choice(available_samples, num_samples_per_ticker, replace=False)
                sample_indices = sorted(sample_indices + sequence_length)  # Adjust for sequence length
            else:
                sample_indices = range(sequence_length, len(hist_resampled) - 1)

            for i in sample_indices:
                if sample_count >= num_samples_per_ticker:
                    break

                future_return = future_returns.iloc[i]
                if np.isnan(future_return):
                    continue

                # Price data (last sequence_length points ending at row i)
                start_idx = max(0, i - sequence_length + 1)
                price_data = closes.iloc[start_idx:i+1].tolist()

                # Pad if necessary (shouldn't happen with our logic above, but safety check)
                if len(price_data) < sequence_length:
                    price_data = [price_data[0]] * (sequence_length - len(price_data)) + price_data

                financial_metrics = [
                    np.random.normal(0, 0.1),  # Revenue growth (simulated)
                    np.random.normal(0, 0.1),  # Net income growth (simulated)
                    np.random.normal(0, 0.1),  # EPS growth (simulated)
                    normalized_pe,             # Normalized P/E
                    np.random.normal(0, 0.1),  # Debt-to-equity (simulated)
                    np.random.normal(0, 0.1)   # ROE (simulated)
                ]

                # Select a random news article for this sample
                selected_news = random.choice(news_articles) if news_articles else f"{company_name} financial update."

                data.append({
                    'ticker': ticker,
                    'company_name': company_name,
                    'date': hist_resampled.index[i],
                    'price_data': price_data,
                    'financial_metrics': financial_metrics,
                    'news_text': selected_news,
                    'target': classify_return(future_return),  # 5-class label
                    'return_pct': future_return
                })
                sample_count += 1

        except Exception as e:
            # A failing ticker is reported and skipped
            print(f"Error processing {ticker}: {str(e)}")

    return pd.DataFrame(data, columns=columns)

# Configuration options for different training scenarios.
# NOTE(review): a sample needs `sequence_length` rows of history plus one
# following row. With 15-day resampling a 6-month window has roughly 12 rows
# and a 1-year window roughly 24, so those presets presumably leave few or
# no samples per ticker — confirm against real data.
TRAINING_CONFIGS = {
    # Option 1: 6 months, weekly data, resampled to 15-day (most efficient)
    'efficient_6mo': {
        'data_period': '6mo',
        'interval': '1wk',  # Get weekly data from API (efficient)
        'resample_period': '15D',  # Resample to 15-day intervals
        'sequence_length': 12,
        'description': '6 months of weekly data, resampled to 15-day intervals'
    },

    # Option 2: 1 year, weekly data, resampled to 15-day (balanced)
    'balanced_1y': {
        'data_period': '1y',
        'interval': '1wk',
        'resample_period': '15D',
        'sequence_length': 24,
        'description': '1 year of weekly data, resampled to 15-day intervals'
    },

    # Option 3: 6 months, daily data, resampled to 15-day (more granular but less efficient)
    'detailed_6mo': {
        'data_period': '6mo',
        'interval': '1d',
        'resample_period': '15D',
        'sequence_length': 12,
        'description': '6 months of daily data, resampled to 15-day intervals'
    },

    # Option 4: Original approach (2 years of data)
    'original_2y': {
        'data_period': '2y',
        'interval': '1d',
        'resample_period': '15D',
        'sequence_length': 24,
        'description': '2 years of daily data, resampled to 15-day intervals (original)'
    }
}

# Tech companies list (20 companies)
# NOTE(review): TWTR no longer trades; presumably it is skipped by the
# insufficient-data check during collection — confirm.
tech_companies = [
    'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA', 'META', 'NVDA', 'NFLX', 
    'ADBE', 'CRM', 'ORCL', 'IBM', 'INTC', 'AMD', 'CSCO', 'PYPL', 
    'UBER', 'LYFT', 'SNAP', 'TWTR'
]

# Choose configuration (any key of TRAINING_CONFIGS)
SELECTED_CONFIG = 'balanced_1y'  # Change this to 'efficient_6mo' if you prefer 6 months
config = TRAINING_CONFIGS[SELECTED_CONFIG]

# Google Custom Search credentials for real news data. They are read from the
# environment so that secrets are not stored in the notebook; when a variable
# is unset or empty the value is None and synthetic news is used.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY') or None
SEARCH_ENGINE_ID = os.environ.get('SEARCH_ENGINE_ID') or None

print(f"Using configuration: {SELECTED_CONFIG}")
print(f"Description: {config['description']}")
print(f"Data period: {config['data_period']}")
print(f"API interval: {config['interval']}")
print(f"Resample period: {config['resample_period']}")
print(f"Sequence length: {config['sequence_length']}")
print(f"Using {'real' if GOOGLE_API_KEY and SEARCH_ENGINE_ID else 'synthetic'} news data")
print()

# Collect training data with selected configuration
training_data = collect_training_data_efficient(
    ticker_list=tech_companies,
    data_period=config['data_period'],
    interval=config['interval'],
    resample_period=config['resample_period'],
    api_key=GOOGLE_API_KEY,
    search_engine_id=SEARCH_ENGINE_ID,
    num_samples_per_ticker=50,  # Increased from 30
    sequence_length=config['sequence_length']
)

# Stop here with a clear message rather than failing later on a missing
# column or an empty train/validation split.
if training_data.empty:
    raise RuntimeError(
        f"No training samples were collected with configuration '{SELECTED_CONFIG}'. "
        f"Each sample needs {config['sequence_length']} resampled rows of history plus one "
        "following row; use a longer data_period or a shorter sequence_length."
    )

print(f"Collected {len(training_data)} samples from {training_data['ticker'].nunique()} companies")
print(f"Expected sequence length in price data: {config['sequence_length']}")


In [None]:
# Let's examine the diversity of our collected data
print("Sample of collected training data:")
print("=" * 60)

# Display names of the five target classes, indexed by class id
class_names = ['📉 Strong Decrease', '🔻 Moderate Decrease', '➡️ Stable', '🔺 Moderate Increase', '📈 Strong Increase']

# Show unique companies
print(f"Companies: {sorted(training_data['ticker'].unique())}")
print(f"Total companies: {training_data['ticker'].nunique()}")
print(f"Total samples: {len(training_data)}")
print(f"Samples per company: {training_data['ticker'].value_counts().describe()}")

print("\nSample news articles (first 5):")
print("-" * 40)
for position, (_, row) in enumerate(training_data.head().iterrows(), start=1):
    print(f"{position}. {row['ticker']} - {row['company_name']}")
    print(f"   News: {row['news_text']}")
    # Targets are class ids 0-4, so show the matching class name
    print(f"   Target: {class_names[int(row['target'])]}")
    print()

# Check news diversity
unique_news = training_data['news_text'].nunique()
total_samples = len(training_data)
diversity_ratio = unique_news / total_samples if total_samples > 0 else 0.0
print(f"News diversity: {unique_news} unique articles out of {total_samples} samples")
print(f"Diversity ratio: {diversity_ratio:.2%}")

# Target distribution
target_dist = training_data['target'].value_counts().sort_index()

print(f"\nTarget distribution (5-class system):")
for i in range(5):
    count = target_dist.get(i, 0)
    percentage = count/len(training_data)*100 if len(training_data) > 0 else 0
    print(f"{class_names[i]} ({i}): {count} samples ({percentage:.1f}%)")

# Show return ranges for each class (classes without samples are skipped)
print(f"\nReturn ranges by class:")
for i in range(5):
    class_data = training_data[training_data['target'] == i]
    if len(class_data) > 0:
        min_ret = class_data['return_pct'].min() * 100
        max_ret = class_data['return_pct'].max() * 100
        avg_ret = class_data['return_pct'].mean() * 100
        print(f"{class_names[i]}: {min_ret:.2f}% to {max_ret:.2f}% (avg: {avg_ret:.2f}%)")


In [None]:
class StockDataset(Dataset):
    """Torch dataset that turns rows of the training DataFrame into model inputs.

    Args:
        data: DataFrame with the columns 'price_data', 'news_text',
            'financial_metrics' and 'target'.
        tokenizer: Callable tokenizer; called with ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors='pt'`` and
            expected to return 'input_ids' and 'attention_mask' tensors.
        sequence_length: Length every price window is padded or cut to.
        max_length: Token length each news text is padded/truncated to.
            Defaults to 512; a smaller value reduces the work done per
            sample for short texts.
    """

    def __init__(self, data, tokenizer, sequence_length=24, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.sequence_length = sequence_length
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        Keys: 'price_data', 'news_input_ids', 'news_attention_mask',
        'financial_data' and 'target' (dtype long, for CrossEntropyLoss).
        """
        item = self.data.iloc[idx]

        price_data = torch.tensor(item['price_data'], dtype=torch.float32)

        # Pad or truncate to match expected sequence length
        missing = self.sequence_length - len(price_data)
        if missing > 0:
            # Left-pad with the first value
            padding = torch.full((missing,), price_data[0].item())
            price_data = torch.cat([padding, price_data])
        elif missing < 0:
            # Keep the most recent sequence_length values
            price_data = price_data[-self.sequence_length:]

        # Standardize each window on its own; epsilon avoids division by zero
        price_data = (price_data - price_data.mean()) / (price_data.std() + 1e-8)

        news_inputs = self.tokenizer(
            item['news_text'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # NOTE(review): the six metrics are standardized across each other
        # within one sample, not per feature across the dataset — confirm
        # this is intended.
        financial_data = torch.tensor(item['financial_metrics'], dtype=torch.float32)
        financial_data = (financial_data - financial_data.mean()) / (financial_data.std() + 1e-8)

        return {
            'price_data': price_data,
            # Drop only the leading batch axis added by return_tensors='pt'
            'news_input_ids': news_inputs['input_ids'].squeeze(0),
            'news_attention_mask': news_inputs['attention_mask'].squeeze(0),
            'financial_data': financial_data,
            'target': torch.tensor(item['target'], dtype=torch.long)
        }

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Split data (random 80/20 split with a fixed seed)
train_data, val_data = train_test_split(training_data, test_size=0.2, random_state=42)

# Create datasets with sequence length from config
train_dataset = StockDataset(train_data, tokenizer, sequence_length=config['sequence_length'])
val_dataset = StockDataset(val_data, tokenizer, sequence_length=config['sequence_length'])

# Create dataloaders.
# BatchNorm1d raises in training mode when a batch holds a single sample, so
# the trailing training batch is dropped only when it would have size 1.
BATCH_SIZE = 16
drop_single_sample_tail = len(train_dataset) > 1 and len(train_dataset) % BATCH_SIZE == 1
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          drop_last=drop_single_sample_tail)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Using sequence length: {config['sequence_length']}")


In [None]:
def train_ensemble_model(train_loader, val_loader, sequence_length=24, num_epochs=5, learning_rate=0.001):
    """Train a 5-class EnsembleModel and return it.

    After every epoch the model is evaluated on ``val_loader``; whenever the
    summed validation loss improves, a checkpoint is written to
    'best_ensemble_model.pth' in the current working directory.

    Args:
        train_loader: Iterable of batches (dicts with the keys 'price_data',
            'news_input_ids', 'news_attention_mask', 'financial_data' and
            'target') used for optimisation.
        val_loader: Iterable of batches with the same keys, used for evaluation.
        sequence_length: Recorded on the price sub-model.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The model as it stands after the last epoch (not necessarily the best
        checkpoint).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Training on device: {device}")

    # Initialize ensemble model for 5-class classification
    model = EnsembleModel(num_classes=5).to(device)
    # Recorded for reference only; PriceModel.forward does not read it
    model.price_model.sequence_length = sequence_length

    # Initialize optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    best_val_loss = float('inf')

    def run_batch(batch):
        """Move one batch to the device and return (logits, targets)."""
        targets = batch['target'].to(device)
        outputs = model(
            batch['news_input_ids'].to(device),
            batch['news_attention_mask'].to(device),
            batch['financial_data'].to(device),
            batch['price_data'].to(device),
        )
        return outputs, targets

    for epoch in range(num_epochs):
        # Training
        model.train()
        # The text encoder is frozen, so keep it in eval mode: any dropout
        # inside it stays off and its features match those seen in validation.
        model.news_model.bert.eval()

        train_loss = 0
        train_correct = 0
        train_total = 0

        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()
            outputs, targets = run_batch(batch)
            loss = criterion(outputs, targets)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            # Predicted class = index of the largest logit
            predicted = outputs.argmax(dim=1)
            train_total += targets.size(0)
            train_correct += (predicted == targets).sum().item()

            if batch_idx % 10 == 0:
                print(f'Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f}')

        # Validation
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for batch in val_loader:
                outputs, targets = run_batch(batch)
                val_loss += criterion(outputs, targets).item()

                predicted = outputs.argmax(dim=1)
                val_total += targets.size(0)
                val_correct += (predicted == targets).sum().item()

        # Epoch metrics; max(..., 1) avoids ZeroDivisionError for empty loaders
        train_acc = 100 * train_correct / max(train_total, 1)
        val_acc = 100 * val_correct / max(val_total, 1)
        avg_train_loss = train_loss / max(len(train_loader), 1)
        avg_val_loss = val_loss / max(len(val_loader), 1)

        print(f'\nEpoch {epoch+1}/{num_epochs}:')
        print(f'Training Loss: {avg_train_loss:.4f}, Accuracy: {train_acc:.2f}%')
        print(f'Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_acc:.2f}%')

        # Save best model (compared on the summed validation loss)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,
                'epoch': epoch
            }, 'best_ensemble_model.pth')
            print(f'New best model saved with validation loss: {val_loss:.4f}')

        print('-' * 50)

    return model

# Run ensemble training with the sequence length of the selected configuration
print("Starting training...")
trained_model = train_ensemble_model(
    train_loader,
    val_loader,
    sequence_length=config['sequence_length'],
    num_epochs=5,
)
print("Training completed!")


In [None]:
# Make sure the output folder exists.
# NOTE(review): '../models' is resolved against the current working
# directory, which an earlier cell sets to the repository root — confirm the
# files are meant to land one level above the repository.
os.makedirs('../models', exist_ok=True)

# Full ensemble weights together with the settings needed to rebuild the model
ensemble_checkpoint = {
    'model_state_dict': trained_model.state_dict(),
    'model_config': {
        'news_model': 'distilbert-base-uncased',
        'financial_input_size': 6,
        'price_sequence_length': config['sequence_length'],
        'num_classes': 5,
        'training_config': SELECTED_CONFIG
    }
}
torch.save(ensemble_checkpoint, '../models/ensemble_model.pth')

# Weights of each sub-model, saved separately
component_targets = [
    (trained_model.news_model, '../models/news_model.pth'),
    (trained_model.financial_model, '../models/financial_model.pth'),
    (trained_model.price_model, '../models/price_model.pth'),
]
for submodule, destination in component_targets:
    torch.save(submodule.state_dict(), destination)

# Summary of everything written by this notebook
summary_lines = (
    "All models saved successfully!",
    "Files saved:",
    "- ../models/ensemble_model.pth",
    "- ../models/news_model.pth",
    "- ../models/financial_model.pth",
    "- ../models/price_model.pth",
    "- best_ensemble_model.pth (best checkpoint)",
)
for summary_line in summary_lines:
    print(summary_line)
