In [1]:
%pip install pandas_ta_classic
%pip install --upgrade pandas_ta_classic

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import site
import os

# Find the path to the pandas_ta_classic library and patch it
pandas_ta_classic_path = None
for sp in site.getsitepackages():
    pandas_ta_classic_path = os.path.join(sp, 'pandas_ta_classic')
    if os.path.exists(pandas_ta_classic_path):
        break

if pandas_ta_classic_path:
    squeeze_pro_path = os.path.join(pandas_ta_classic_path, 'momentum', 'squeeze_pro.py')
    if os.path.exists(squeeze_pro_path):
        try:
            with open(squeeze_pro_path, 'r') as f:
                lines = f.readlines()

            new_lines = []
            fixed = False
            for line in lines:
                if "from numpy import NaN as npNaN" in line:
                    new_lines.append(line.replace("from numpy import NaN as npNaN", "# from numpy import NaN as npNaN\nimport numpy as np\n"))
                    fixed = True
                    print("Modified import statement in squeeze_pro.py")
                else:
                    new_lines.append(line)

            if fixed:
                with open(squeeze_pro_path, 'w') as f:
                    f.writelines(new_lines)
                print("Successfully patched pandas_ta_classic/momentum/squeeze_pro.py")
            else:
                print("Could not find the problematic import line in squeeze_pro.py")

        except Exception as e:
            print(f"Error modifying squeeze_pro.py: {e}")
    else:
        print(f"Could not find squeeze_pro.py at {squeeze_pro_path}")
else:
    print("Could not find the pandas_ta_classic library installation path.")

import pandas_ta_classic as ta

Could not find the problematic import line in squeeze_pro.py


In [3]:
%pip install torch
%pip install scikit-learn
%pip install torchvision

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    precision_score, recall_score, f1_score, matthews_corrcoef,
    mean_squared_error, mean_absolute_error, r2_score
)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.utils.class_weight import compute_class_weight

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from contextlib import nullcontext
import random

In [5]:
# Utility: Determinism
from unicodedata import bidirectional


def set_global_seeds(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Optional: stricter determinism (may slow down training)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_global_seeds(42)

# Datasets
class SequenceDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Models
class RNNHead(nn.Module):
    """
    Shared head:
      - RNN stack (LSTM/GRU, uni/bi)
      - BatchNorm + Dense(32, ReLU) + Dropout(0.3)
      - Output layer (1 unit): linear (regression) or logits (classification)
    """
    def __init__(self, input_size, rnn_type='LSTM', bidirectional=False, problem_type='regression'):
        super().__init__()
        hidden1 = 128
        hidden2 = 64
        self.problem_type = problem_type
        self.bidirectional = bidirectional
        self.rnn_type = rnn_type.upper()

        rnn_cls = {'LSTM': nn.LSTM, 'GRU': nn.GRU}[('GRU' if 'GRU' in self.rnn_type else 'LSTM')]

        self.rnn1 = rnn_cls(
            input_size=input_size, hidden_size=hidden1, num_layers=1,
            batch_first=True, dropout=0.0, bidirectional=bidirectional
        )
        
        self.inter_rnn_drop = nn.Dropout(0.1)
        
        self.rnn2 = rnn_cls(
            input_size=hidden1*(2 if bidirectional else 1), hidden_size=hidden2, num_layers=1,
            batch_first=True, dropout=0.0, bidirectional=bidirectional
        )

        feat_dim = hidden2*(2 if bidirectional else 1)

        self.bn = nn.BatchNorm1d(feat_dim)
        self.fc = nn.Linear(feat_dim, 32)
        self.drop = nn.Dropout(0.3)
        self.out = nn.Linear(32, 1)

    def forward(self, x):
        # x: [B, T, F]
        out, _ = self.rnn1(x)
        out = self.inter_rnn_drop(out)   # inter-layer dropout (sequence-wise)
        out, _ = self.rnn2(out)
        # take last timestep: [B, T, H] -> [B, H]
        out = out[:, -1, :]
        out = self.bn(out)
        out = F.relu(self.fc(out))
        out = self.drop(out)
        out = self.out(out)  # shape [B,1]
        return out  # regression: raw; classification: logits

def build_model(input_shape, model_type='LSTM', problem_type='regression'):
    seq_len, n_features = input_shape
    model_type = model_type.upper()
    if model_type == 'LSTM':
        return RNNHead(n_features, rnn_type='LSTM', bidirectional=False, problem_type=problem_type)
    elif model_type == 'BILSTM':
        return RNNHead(n_features, rnn_type='LSTM', bidirectional=True, problem_type=problem_type)
    elif model_type == 'GRU':
        return RNNHead(n_features, rnn_type='GRU', bidirectional=False, problem_type=problem_type)
    elif model_type == 'BIGRU':
        return RNNHead(n_features, rnn_type='GRU', bidirectional=True, problem_type=problem_type)
    else:
        raise ValueError("Model type must be one of: ['LSTM','BiLSTM','GRU','BiGRU']")

# Early Stopping (PyTorch)
class EarlyStopper:
    def __init__(self, patience=15, min_delta=0.0, restore_best=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best = restore_best
        self.best_loss = float('inf')
        self.counter = 0
        self.best_state = None

    def step(self, val_loss, model):
        improved = (self.best_loss - val_loss) > self.min_delta
        if improved:
            self.best_loss = val_loss
            self.counter = 0
            if self.restore_best:
                # Deep copy state dict
                self.best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        else:
            self.counter += 1
        return self.counter >= self.patience

    def restore(self, model):
        if self.restore_best and self.best_state is not None:
            model.load_state_dict(self.best_state)

In [None]:
class StockPredictionPipeline:
    def __init__(self, df, feature_columns, model_type='LSTM', sequence_length=30, problem_type='regression'):
        self.df = df.copy()
        self.feature_columns = feature_columns
        self.model_type = model_type
        self.sequence_length = sequence_length
        self.problem_type = problem_type
        self.results = []

        # Validate
        self._validate_inputs()

        # Device & precision
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.mixed_precision = torch.cuda.is_available()

        print(f"Pipeline initialized for a '{self.problem_type}' problem. Device: {self.device}")

    def _validate_inputs(self):
        missing_cols = [col for col in self.feature_columns if col not in self.df.columns]
        if missing_cols:
            raise ValueError(f"Missing feature columns: {missing_cols}")

        if 'close' not in self.df.columns and 'close_price' not in self.df.columns:
            raise ValueError("No 'close' or 'close_price' column found in data")

        valid_models = ['LSTM', 'BiLSTM', 'GRU', 'BiGRU']
        if self.model_type not in valid_models:
            raise ValueError(f"Model type must be one of: {valid_models}")

        if self.problem_type not in ['regression', 'classification']:
            raise ValueError("Problem type must be 'regression' or 'classification'")

    def create_target_variable(self, company_data):
        company_data = company_data.copy()
        price_col = 'close' if 'close' in company_data.columns else 'close_price'
        if 'date' in company_data.columns:
            company_data = company_data.sort_values('date')

        company_data['target_regression'] = (
            np.log(company_data[price_col].shift(-1)) - np.log(company_data[price_col])
        )
        company_data['target_direction'] = (company_data['target_regression'] > 0).astype(int)
        company_data = company_data.dropna()
        return company_data

    def create_sequences(self, features, *targets):
        X = []
        y_sequences = [[] for _ in targets]
        for i in range(self.sequence_length, len(features)):
            X.append(features[i-self.sequence_length:i])
            for j, target in enumerate(targets):
                y_sequences[j].append(target[i])
        return (np.array(X),) + tuple(np.array(y) for y in y_sequences)

    def _train_one_epoch(self, model, loader, optimizer, loss_fn, scaler):
        model.train()
        total_loss = 0.0
        for xb, yb in loader:
            xb = xb.to(self.device)
            yb = yb.to(self.device).view(-1, 1)

            optimizer.zero_grad(set_to_none=True)

            ctx = torch.amp.autocast('cuda') if self.mixed_precision else nullcontext()
            with ctx:
                logits = model(xb)
                loss = loss_fn(logits, yb)

            if self.mixed_precision:
                scaler.scale(loss).backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

            total_loss += loss.item() * xb.size(0)

        return total_loss / len(loader.dataset)

    @torch.no_grad()
    def _eval_one_epoch(self, model, loader, loss_fn):
        model.eval()
        total_loss = 0.0
        for xb, yb in loader:
            xb = xb.to(self.device)
            yb = yb.to(self.device).view(-1, 1)
            logits = model(xb)
            loss = loss_fn(logits, yb)
            total_loss += loss.item() * xb.size(0)
        return total_loss / len(loader.dataset)

    @torch.no_grad()
    def _predict(self, model, loader):
        model.eval()
        outs = []
        for xb, _ in loader:
            xb = xb.to(self.device)
            logits = model(xb).squeeze(1).detach().cpu().numpy()
            outs.append(logits)
        return np.concatenate(outs, axis=0)

    def build_model(self, input_shape):
        model = build_model(input_shape, model_type=self.model_type, problem_type=self.problem_type)
        return model.to(self.device)

    def process_company(self, company_name, company_data, sector):
        print(f"\nProcessing {company_name} ({sector})...")
        try:
            company_data = self.create_target_variable(company_data)

            # Min samples requirement (same heuristic)
            min_samples = self.sequence_length + 150
            if len(company_data) < min_samples:
                print(f"Insufficient data for {company_name} ({len(company_data)} < {min_samples}). Skipping...")
                return None

            if company_data[self.feature_columns].isnull().any().any():
                print(f"Missing values in features for {company_name}. Skipping...")
                return None

            features = company_data[self.feature_columns].values
            target_reg = company_data['target_regression'].values
            target_dir = company_data['target_direction'].values

            # Scale features
            scaler = StandardScaler()
            features_scaled = scaler.fit_transform(features)

            # Create sequences
            X, y_reg, y_dir = self.create_sequences(features_scaled, target_reg, target_dir)

            # TimeSeriesSplit
            n_splits = min(5, len(X) // 50)
            if n_splits < 3:
                print(f"Insufficient data for proper time series validation for {company_name}. Skipping...")
                return None

            tscv = TimeSeriesSplit(n_splits=n_splits)
            splits = list(tscv.split(X))
            train_idx, test_idx = splits[-1]

            # Train/Val split (last 20% of train for val)
            val_size = int(0.2 * len(train_idx))
            final_train_idx = train_idx[:-val_size]
            val_idx = train_idx[-val_size:]

            X_train, X_val, X_test = X[final_train_idx], X[val_idx], X[test_idx]

            if self.problem_type == 'regression':
                y_train, y_val, y_test = y_reg[final_train_idx], y_reg[val_idx], y_reg[test_idx]
                target_scaler = StandardScaler()
                y_train_scaled = target_scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
                y_val_scaled   = target_scaler.transform(y_val.reshape(-1, 1)).flatten()
                train_target, val_target = y_train_scaled, y_val_scaled
            else:
                y_train, y_val, y_test = y_dir[final_train_idx], y_dir[val_idx], y_dir[test_idx]
                train_target, val_target = y_train, y_val
                target_scaler = None

            # Class balance note
            if self.problem_type == 'classification':
                class_ratio = np.mean(y_train)
                if class_ratio < 0.1 or class_ratio > 0.9:
                    print(f"Severe class imbalance for {company_name} ({class_ratio:.3f}). Consider using class weights.")

            # Datasets & loaders
            train_ds = SequenceDataset(X_train, train_target)
            val_ds   = SequenceDataset(X_val,   val_target)
            test_ds  = SequenceDataset(X_test,  y_test)

            train_loader = DataLoader(train_ds, batch_size=32, shuffle=False,  drop_last=False, num_workers=0)
            val_loader   = DataLoader(val_ds,   batch_size=32, shuffle=False, drop_last=False, num_workers=0)
            test_loader  = DataLoader(test_ds,  batch_size=32, shuffle=False, drop_last=False, num_workers=0)

            # Build model
            model = self.build_model((self.sequence_length, len(self.feature_columns)))

            # Loss functions
            if self.problem_type == 'regression':
                loss_fn = nn.HuberLoss(delta=1.0)
            else:
                # Use BCEWithLogitsLoss for numerical stability (logits input)
                loss_fn = nn.BCEWithLogitsLoss()

            # Optimizer & scheduler
            optimizer = Adam(model.parameters(), lr=1e-3, eps=1e-7)
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=7, min_lr=1e-7)
            early_stopper = EarlyStopper(patience=15, min_delta=0.0, restore_best=True)
            scaler = torch.amp.GradScaler('cuda', enabled=self.mixed_precision)

            # Training loop
            max_epochs = 100
            best_val = float('inf')
            epochs_trained = 0

            for epoch in range(1, max_epochs + 1):
                train_loss = self._train_one_epoch(model, train_loader, optimizer, loss_fn, scaler)
                val_loss = self._eval_one_epoch(model, val_loader, loss_fn)
                scheduler.step(val_loss)
                stop = early_stopper.step(val_loss, model)
                epochs_trained = epoch

                if epoch % 10 == 0 or stop:
                    print(f"  Epoch {epoch:03d} - train {train_loss:.5f} | val {val_loss:.5f}")

                if stop:
                    break

            # Restore best model weights (like Keras restore_best_weights=True)
            early_stopper.restore(model)

            # Predictions
            y_pred_raw = self._predict(model, test_loader)  # raw/regression or logits

            if self.problem_type == 'regression':
                y_pred_unscaled = target_scaler.inverse_transform(y_pred_raw.reshape(-1,1)).flatten() if target_scaler is not None else y_pred_raw
                mse = mean_squared_error(y_test, y_pred_unscaled)
                mae = mean_absolute_error(y_test, y_pred_unscaled)
                r2  = r2_score(y_test, y_pred_unscaled)

                # Directional metrics (derived)
                y_test_dir = (y_reg[test_idx] > 0).astype(int)
                y_pred_dir = (y_pred_unscaled > 0).astype(int)
            else:
                # logits -> probs via sigmoid -> threshold 0.5
                probs = 1.0 / (1.0 + np.exp(-y_pred_raw))
                y_pred_dir = (probs > 0.5).astype(int)
                y_test_dir = y_test
                mse = mae = r2 = np.nan

            precision = precision_score(y_test_dir, y_pred_dir, zero_division=0)
            recall    = recall_score(y_test_dir, y_pred_dir, zero_division=0)
            f1        = f1_score(y_test_dir, y_pred_dir, zero_division=0)
            mcc       = matthews_corrcoef(y_test_dir, y_pred_dir)
            directional_accuracy = np.mean(y_test_dir == y_pred_dir)

            result = {
                'company': company_name,
                'sector': sector,
                'model_type': self.model_type,
                'problem_type': self.problem_type,
                'mse': mse,
                'mae': mae,
                'r2': r2,
                'mcc': mcc,
                'f1': f1,
                'precision': precision,
                'recall': recall,
                'directional_accuracy': directional_accuracy,
                'n_samples': len(X),
                'train_samples': len(X_train),
                'val_samples': len(X_val),
                'test_samples': len(X_test),
                'epochs_trained': epochs_trained
            }

            if self.problem_type == 'regression':
                print(f"  Regression -> MSE: {mse:.6f}, MAE: {mae:.6f}, R²: {r2:.4f}")
            print(f"  Directional -> Accuracy: {directional_accuracy:.4f}, MCC: {mcc:.4f}, F1: {f1:.4f}")

            # Explicit cleanup (PyTorch handles this, but keeps parity with your Enrique2025 code)
            del model
            torch.cuda.empty_cache()

            return result

        except Exception as e:
            print(f"Error processing {company_name}: {str(e)}")
            torch.cuda.empty_cache()
            return None

    def run_pipeline(self):
        company_col = None
        for col_name in ['ticker', 'company', 'symbol']:
            if col_name in self.df.columns:
                company_col = col_name
                break
        if company_col is None:
            company_col = self.df.columns[0]
            print(f"Warning: Using '{company_col}' as company identifier column")

        companies = self.df[company_col].unique()
        print(f"Processing {len(companies)} companies with {self.model_type} model...")
        print(f"Problem type: {self.problem_type}")
        print(f"Sequence length: {self.sequence_length}")
        print(f"Features: {self.feature_columns}")

        successful_companies = 0
        for i, company in enumerate(companies, 1):
            print(f"\n[{i}/{len(companies)}] Processing {company}...")
            company_data = self.df[self.df[company_col] == company].copy()
            sector = company_data['sector'].iloc[0] if 'sector' in company_data.columns else 'Unknown'
            result = self.process_company(company, company_data, sector)
            if result:
                self.results.append(result)
                successful_companies += 1

        print(f"\n{'='*80}")
        print(f"Pipeline completed: {successful_companies}/{len(companies)} companies processed successfully")
        print(f"{'='*80}")

        if self.results:
            self.results_df = pd.DataFrame(self.results)
            return self.results_df
        else:
            print("No companies were processed successfully!")
            return pd.DataFrame()

    def analyze_results(self):
        if not hasattr(self, 'results_df') or self.results_df.empty:
            print("No results to analyze!")
            return None

        df = self.results_df
        analysis = {}

        print("\n" + "="*80)
        print("STOCK PREDICTION PIPELINE RESULTS")
        print("="*80)
        print(f"Model: {self.model_type} | Problem: {self.problem_type}")
        print(f"Companies analyzed: {len(df)}")
        print(f"Average samples per company: {df['n_samples'].mean():.0f}")

        print("\n" + "="*50)
        print("OVERALL PERFORMANCE")
        print("="*50)
        if self.problem_type == 'regression':
            print(f"Mean Squared Error:     {df['mse'].mean():.6f} (±{df['mse'].std():.6f})")
            print(f"Mean Absolute Error:    {df['mae'].mean():.6f} (±{df['mae'].std():.6f})")
            print(f"R² Score:              {df['r2'].mean():.4f} (±{df['r2'].std():.4f})")

        print(f"Directional Accuracy:   {df['directional_accuracy'].mean():.4f} (±{df['directional_accuracy'].std():.4f})")
        print(f"Matthews Correlation:   {df['mcc'].mean():.4f} (±{df['mcc'].std():.4f})")
        print(f"F1 Score:              {df['f1'].mean():.4f} (±{df['f1'].std():.4f})")
        print(f"Precision:             {df['precision'].mean():.4f} (±{df['precision'].std():.4f})")
        print(f"Recall:                {df['recall'].mean():.4f} (±{df['recall'].std():.4f})")

        if 'sector' in df.columns and df['sector'].nunique() > 1:
            print("\n" + "="*50)
            print("PERFORMANCE BY SECTOR")
            print("="*50)
            sector_stats = df.groupby('sector').agg({
                'directional_accuracy': ['mean', 'std', 'count'],
                'mcc': ['mean', 'std'],
                'r2': 'mean' if self.problem_type == 'regression' else lambda x: np.nan,
                'mae': 'mean' if self.problem_type == 'regression' else lambda x: np.nan
            }).round(4)
            sector_stats.columns = ['_'.join(col).strip() if col[1] else col[0] for col in sector_stats.columns]
            sector_stats = sector_stats.sort_values('directional_accuracy_mean', ascending=False)
            for sector, row in sector_stats.iterrows():
                print(f"{sector:<20} | Acc: {row['directional_accuracy_mean']:.3f}±{row['directional_accuracy_std']:.3f} | "
                      f"MCC: {row['mcc_mean']:.3f} | Companies: {int(row['directional_accuracy_count'])}")

        print("\n" + "="*50)
        print("TOP 10 PERFORMERS (by Directional Accuracy)")
        print("="*50)
        top_performers = df.nlargest(10, 'directional_accuracy')
        for _, row in top_performers.iterrows():
            print(f"{row['company']:<20} | {row['sector']:<15} | "
                  f"Acc: {row['directional_accuracy']:.3f} | MCC: {row['mcc']:.3f}")

        return analysis

    def save_results(self, output_path='results/benchmarking/stock_prediction_results.csv'):
        if hasattr(self, 'results_df') and not self.results_df.empty:
            timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{self.model_type}_{self.problem_type}_{timestamp}_results.csv"
            if output_path == 'results/benchmarking/stock_prediction_results.csv':
                output_path = filename
            self.results_df.to_csv(output_path, index=False)
            print(f"\nResults saved to {output_path}")
            print(f"Columns saved: {list(self.results_df.columns)}")
        else:
            print("No results to save. Run the pipeline first!")

    def get_feature_importance_analysis(self):
        print("Feature importance analysis not implemented yet.")
        print("Consider implementing SHAP values or permutation importance for better insights.")
        return None

In [7]:
%pip install pyarrow

Note: you may need to restart the kernel to use updated packages.


In [8]:
companies = pd.read_parquet('stocknet-dataset/stock_table.parquet')
tweets = pd.read_parquet('stocknet-dataset/stock_tweets_withsentiment_withemotion_withstance_nomerge.parquet')
stocks = pd.read_parquet('stocknet-dataset/stock_prices.parquet')

companies = companies.rename(columns={'symbol': 'ticker'})

companies.columns = [x.lower() for x in companies.columns]
tweets.columns = [x.lower() for x in tweets.columns]
stocks.columns = [x.lower() for x in stocks.columns]

tweets['stance_positive'] = (tweets['stance_label'] == 'Positive').astype(int)
tweets['stance_negative'] = (tweets['stance_label'] == 'Negative').astype(int)

tweets_merged = tweets.groupby(['date', 'ticker'], as_index=False).agg({
    'text': lambda x: ' '.join(x),
    'sentiment': lambda x: x.mean(),
    'emotion_anger': 'sum',
    'emotion_disgust': 'sum',
    'emotion_fear': 'sum',
    'emotion_joy': 'sum',
    'emotion_neutral': 'sum',
    'emotion_sadness': 'sum',
    'emotion_surprize': 'sum',
    'stance_positive': 'sum',
    'stance_negative': 'sum'
})




tweets_merged['date'] = pd.to_datetime(tweets_merged['date'])
stocks['date'] = pd.to_datetime(stocks['date'])



master_df = pd.merge(
    stocks,
    tweets_merged,
    on=["date", "ticker"],
    how='left'
)

# Fill missing tweet features with 0
tweet_feature_cols = ['sentiment', 'emotion_anger', 'emotion_disgust', 'emotion_fear', 'emotion_joy', 'emotion_neutral', 'emotion_sadness', 'emotion_surprize', 'stance_positive', 'stance_negative']
for col in tweet_feature_cols:
    if col in master_df.columns:
        master_df[col].fillna(0, inplace=True)



companies = companies.rename(columns={'symbol': 'ticker'})

master_df = pd.merge(master_df, companies[['ticker', 'sector', 'company']], on='ticker', how='left')


feature_cols = ['open','high','low','volume']

master_df = master_df.rename(columns={'close': 'close_price', 'company': 'company_name'})


print(f"Shape of master_df before dropping NaNs: {master_df.shape}")
print(f"Shape of master_df after dropping NaNs: {master_df.shape}")

master_df.rename(columns={'close_price': 'close'}, inplace=True)





master_df.sort_values(by=['ticker', 'date'], inplace=True)


def apply_ta_indicators(df_group):
    df_group.set_index(pd.DatetimeIndex(df_group['date']), inplace=True)
    df_group.ta.ema(length=12, append=True)
    df_group.ta.ema(length=26, append=True)
    df_group.ta.ema(length=50, append=True)

    df_group.ta.macd(fast=12, slow=26, signal=9, append=True)


    df_group.ta.rsi(length=14, append=True)
    df_group.ta.stochrsi(length=14, append=True)

    df_group.ta.atr(length=14, append=True)

    bb = ta.bbands(df_group['close'], length=20, std=2)
    df_group['BB_upper'] = bb['BBU_20_2.0']
    df_group['BB_middle'] = bb['BBM_20_2.0']
    df_group['BB_lower'] = bb['BBL_20_2.0']

    df_group.ta.obv(append=True)
    return df_group.reset_index(drop=True)

master_df = master_df.groupby('ticker').apply(apply_ta_indicators)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  master_df[col].fillna(0, inplace=True)


Shape of master_df before dropping NaNs: (108592, 21)
Shape of master_df after dropping NaNs: (108592, 21)


  master_df = master_df.groupby('ticker').apply(apply_ta_indicators)


In [9]:
columns_to_check = ['EMA_12', 'EMA_26','EMA_50','MACD_12_26_9','MACDh_12_26_9','MACDs_12_26_9','RSI_14','ATRr_14','STOCHRSIk_14_14_3_3','STOCHRSId_14_14_3_3','ATRr_14','BB_upper','BB_middle','BB_lower','OBV']
master_df = master_df.dropna(subset=columns_to_check)


master_df.reset_index(drop=True, inplace=True)

display(master_df)

Unnamed: 0,date,open,high,low,close,adj close,volume,ticker,text,sentiment,...,MACDh_12_26_9,MACDs_12_26_9,RSI_14,STOCHRSIk_14_14_3_3,STOCHRSId_14_14_3_3,ATRr_14,BB_upper,BB_middle,BB_lower,OBV
0,2012-11-14,77.928574,78.207146,76.597145,76.697144,69.613815,119292600.0,AAPL,,0.0,...,-0.538077,-3.703588,25.771012,21.054629,19.582354,2.377852,94.648550,84.401357,74.154164,-1.014356e+09
1,2012-11-15,76.790001,77.071426,74.660004,75.088570,68.153778,197477700.0,AAPL,,0.0,...,-0.537725,-3.838019,23.573491,15.792949,19.993462,2.380310,93.761634,83.514428,73.267223,-1.211834e+09
2,2012-11-16,75.028572,75.714287,72.250000,75.382858,68.420891,316723400.0,AAPL,,0.0,...,-0.455544,-3.951905,24.836267,12.243346,16.363641,2.459547,92.716200,82.679214,72.642228,-8.951103e+08
3,2012-11-19,77.244286,81.071426,77.125717,80.818573,73.354591,205829400.0,AAPL,,0.0,...,0.002769,-3.951213,43.429047,39.692073,22.576123,2.695187,91.617665,82.201285,72.784906,-6.892809e+08
4,2012-11-20,81.701431,81.707146,79.225716,80.129997,72.729614,160688500.0,AAPL,,0.0,...,0.281964,-3.880722,42.011353,69.373201,40.436207,2.679612,91.027780,81.851785,72.675790,-8.499694e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104215,2017-08-28,76.900002,76.940002,76.260002,76.470001,76.470001,8229700.0,XOM,,0.0,...,-0.107548,-0.972858,31.975492,35.117121,31.775404,0.786087,81.525829,78.243500,74.961171,-2.688251e+08
104216,2017-08-29,76.209999,76.489998,76.080002,76.449997,76.449997,7060400.0,XOM,,0.0,...,-0.069077,-0.990127,31.851847,48.597552,38.712818,0.759224,81.303475,78.057500,74.811525,-2.758855e+08
104217,2017-08-30,76.239998,76.449997,76.059998,76.099998,76.099998,8218000.0,XOM,,0.0,...,-0.054652,-1.003790,29.688704,55.025431,46.246701,0.732850,80.964170,77.832500,74.700830,-2.841035e+08
104218,2017-08-31,76.269997,76.489998,76.050003,76.330002,76.330002,15641700.0,XOM,,0.0,...,-0.018917,-1.008519,32.913052,73.940933,59.187972,0.711932,80.569554,77.624500,74.679446,-2.684618e+08


In [10]:
print(master_df.columns)

Index(['date', 'open', 'high', 'low', 'close', 'adj close', 'volume', 'ticker',
       'text', 'sentiment', 'emotion_anger', 'emotion_disgust', 'emotion_fear',
       'emotion_joy', 'emotion_neutral', 'emotion_sadness', 'emotion_surprize',
       'stance_positive', 'stance_negative', 'sector', 'company_name',
       'EMA_12', 'EMA_26', 'EMA_50', 'MACD_12_26_9', 'MACDh_12_26_9',
       'MACDs_12_26_9', 'RSI_14', 'STOCHRSIk_14_14_3_3', 'STOCHRSId_14_14_3_3',
       'ATRr_14', 'BB_upper', 'BB_middle', 'BB_lower', 'OBV'],
      dtype='object')


In [11]:
feature_columns = [
    'open', 'high', 'low', 'close', 'volume'
    # 'stance_positive', 'stance_negative',
    # 'sentiment'
]

new_indicator_columns = [
    'EMA_12', 'EMA_26', 'EMA_50', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9',
    'RSI_14', 'ATRr_14', 'STOCHRSIk_14_14_3_3', 'STOCHRSId_14_14_3_3',
    'BB_upper', 'BB_middle', 'BB_lower', 'OBV'
]
feature_columns.extend(new_indicator_columns)

sequence_length=12



all_pipelines = {}
all_results_dfs = {}
all_analyses = {}

In [None]:
import os


print(f"\n{'='*25}\n  RUNNING PIPELINE FOR: BiGRU\n{'='*25}\n")

pipeline_BiGRU = StockPredictionPipeline(
    df=master_df,
    feature_columns=feature_columns,
    model_type='BiGRU',
    sequence_length=sequence_length,
    problem_type='regression'
)


results_BiGRU = pipeline_BiGRU.run_pipeline()

if results_BiGRU is not None and not results_BiGRU.empty:
    analysis_BiGRU = pipeline_BiGRU.analyze_results()
    model_name = pipeline_BiGRU.model_type
    if pipeline_BiGRU.problem_type == 'regression':
        out_dir = 'results/regression'
    else:
        out_dir = 'results/classification'

    os.makedirs(out_dir, exist_ok=True)
    output_path = os.path.join(out_dir, f"{model_name}.csv")
    pipeline_BiGRU.save_results(output_path)

    all_pipelines["BiGRU"] = pipeline_BiGRU
    all_results_dfs["BiGRU"] = results_BiGRU
    all_analyses["BiGRU"] = analysis_BiGRU

    print("\nDisplaying first 5 rows of BiGRU results:")
    display(results_BiGRU.head())
else:
    print(f"\n[FAILED] Pipeline for BiGRU did not produce any results.")

del pipeline_BiGRU


  RUNNING PIPELINE FOR: BiGRU

Pipeline initialized for a 'regression' problem. Device: cpu
Processing 88 companies with BiGRU model...
Problem type: regression
Sequence length: 12
Features: ['open', 'high', 'low', 'close', 'volume', 'EMA_12', 'EMA_26', 'EMA_50', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9', 'RSI_14', 'ATRr_14', 'STOCHRSIk_14_14_3_3', 'STOCHRSId_14_14_3_3', 'BB_upper', 'BB_middle', 'BB_lower', 'OBV']

[1/88] Processing AAPL...

Processing AAPL (Consumer Goods)...
  Epoch 010 - train 0.31426 | val 0.76905
  Epoch 016 - train 0.31714 | val 0.64255
  Regression -> MSE: 0.000246, MAE: 0.012470, R²: -0.0017
  Directional -> Accuracy: 0.4872, MCC: 0.0576, F1: 0.6000

[2/88] Processing ABB...

Processing ABB (Industrial Goods)...
Insufficient data for ABB (62 < 162). Skipping...

[3/88] Processing ABBV...

Processing ABBV (Healthcare)...
  Epoch 010 - train 0.32846 | val 0.35215
  Epoch 019 - train 0.29298 | val 0.36072
  Regression -> MSE: 0.000688, MAE: 0.019372, R²: 0

Unnamed: 0,company,sector,model_type,problem_type,mse,mae,r2,mcc,f1,precision,recall,directional_accuracy,n_samples,train_samples,val_samples,test_samples,epochs_trained
0,AAPL,Consumer Goods,BiGRU,regression,0.000246,0.01247,-0.00172,0.057639,0.6,0.461538,0.857143,0.487179,468,312,78,78,16
1,ABBV,Healthcare,BiGRU,regression,0.000688,0.019372,0.005072,0.2719,0.666667,0.68,0.653846,0.638298,283,189,47,47,19
2,AMGN,Healthcare,BiGRU,regression,0.000463,0.016906,-0.341294,0.191805,0.142857,1.0,0.076923,0.5,293,196,49,48,23
3,AMZN,Services,BiGRU,regression,0.000408,0.015737,-0.183802,0.0,0.0,0.0,0.0,0.460526,458,306,76,76,16
4,BA,Industrial Goods,BiGRU,regression,0.000191,0.009179,0.048403,0.338926,0.721311,0.666667,0.785714,0.673077,315,211,52,52,21
