In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import spacy
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F
from tqdm.auto import tqdm
import os
import shutil
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from captum.attr import LayerIntegratedGradients
import logging
import gc

# Logging setup: INFO-level records are sent to both the `training.log`
# file and the console stream, all using one shared record format.
_log_handlers = [
    logging.FileHandler('training.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=_log_handlers,
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# spaCy pipeline, loaded with the ner, parser and lemmatizer components disabled
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'lemmatizer'])

# Registry of the pretrained encoders used for embedding extraction.
# Each entry records the checkpoint name, the tokenizer/model classes used to
# load it, and the width of its embedding (768 for all three).
# Insertion order matters: features are concatenated in this order.
model_configs = {
    key: {
        'name': checkpoint,
        'tokenizer_class': tokenizer_cls,
        'model_class': model_cls,
        'embedding_size': 768,
    }
    for key, checkpoint, tokenizer_cls, model_cls in (
        ('sentiment', 'nlptown/bert-base-multilingual-uncased-sentiment', BertTokenizer, BertModel),
        ('base', 'bert-base-uncased', BertTokenizer, BertModel),
        ('roberta', 'roberta-base', RobertaTokenizer, RobertaModel),
    )
}

def clear_gpu_memory():
    """Release unreferenced Python objects and cached CUDA memory.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are freed *before* the CUDA caching allocator is asked
    to hand its unused blocks back; otherwise that memory would stay cached
    until the next call. On machines without CUDA only the garbage
    collection step runs.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

# Initialize tokenizers and models
tokenizers = {}
bert_models = {}
cache_dir = "./my_cache_dir"
os.makedirs(cache_dir, exist_ok=True)

# Re-downloading every checkpoint on each run defeats `cache_dir`, so it is
# opt-in: set FORCE_MODEL_DOWNLOAD=1 to refresh a corrupted or stale cache.
force_download = os.environ.get("FORCE_MODEL_DOWNLOAD", "0") == "1"

logger.info("Initializing tokenizers and models...")
for key, config in tqdm(model_configs.items(), desc="Loading models", position=0):
    try:
        tokenizers[key] = config['tokenizer_class'].from_pretrained(
            config['name'],
            cache_dir=cache_dir,
            force_download=force_download
        )
        encoder = config['model_class'].from_pretrained(
            config['name'],
            cache_dir=cache_dir,
            force_download=force_download
        )
        # The encoders are only used for feature extraction, so keep them in
        # eval mode (dropout off) to make the embeddings deterministic.
        bert_models[key] = encoder.to(device).eval()
    except Exception as e:
        # logger.exception records the traceback as well as the message
        logger.exception(f"Error initializing tokenizer or model for {config['name']}: {e}")
        raise

class MultiEmbeddingBERT(nn.Module):
    """Feed-forward classification head over a pre-computed feature vector.

    The input is first mapped by ``projection``
    (Linear -> LayerNorm -> ReLU -> Dropout -> Linear) and then by
    ``classifier`` (LayerNorm -> ReLU -> Dropout -> Linear) to one logit per
    label.

    Args:
        num_labels: Number of output logits.
        feature_dim: Width of the input feature vector.
        hyperparams: Mapping providing "Dropout", "Hidden Size" and
            "Projection Size".
    """

    def __init__(self, num_labels, feature_dim, hyperparams):
        super().__init__()
        drop_p = hyperparams["Dropout"]
        hidden_width = hyperparams["Hidden Size"]
        projected_width = hyperparams["Projection Size"]

        self.dropout = nn.Dropout(drop_p)
        self.feature_dim = feature_dim

        # Input features -> hidden layer -> projection space
        projection_layers = [
            nn.Linear(feature_dim, hidden_width),
            nn.LayerNorm(hidden_width),
            nn.ReLU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_width, projected_width),
        ]
        self.projection = nn.Sequential(*projection_layers)

        # Projection space -> label logits
        classifier_layers = [
            nn.LayerNorm(projected_width),
            nn.ReLU(),
            nn.Dropout(drop_p),
            nn.Linear(projected_width, num_labels),
        ]
        self.classifier = nn.Sequential(*classifier_layers)

        # Plain Python lists kept on the module; they start out empty
        self.all_preds, self.all_labels, self.all_texts = [], [], []

    def forward(self, features):
        """Return the logits produced by projecting then classifying ``features``."""
        return self.classifier(self.projection(features))

class GPUOptimizedTrainer:
    """Train a ``MultiEmbeddingBERT`` head on pre-computed text features.

    Construction does all preparation up front: it validates the
    hyperparameters, builds the classifier, extracts (or loads from
    ``embedding_dir``) the first-token embedding of every encoder in
    ``model_configs`` plus four spaCy-derived features per text, splits and
    standardizes the data, and creates the optimizer, LR scheduler and early
    stopping helper. Call :meth:`train` afterwards.

    Args:
        df: DataFrame holding the texts.
        text_column: Name of the text column in ``df``.
        labels: Sequence of class indices, positionally aligned with ``df``.
        hyperparams: Mapping with every key listed in ``required_params``.
        embedding_dir: Directory used to cache extracted features as ``.npy``.

    Raises:
        ValueError: If a required hyperparameter is missing or no usable text
            is found.
    """

    def __init__(self, df, text_column, labels, hyperparams, embedding_dir="embeddings"):
        required_params = [
            "Epochs", "Batch Size", "Learning Rate", "Dropout", "Weight Decay",
            "Label Smoothing", "Early Stopping Patience", "Gradient Accumulation Steps",
            "Embedding Batch Size", "Embedding Sub Batch Size", "Feature Chunk Size",
            "Hidden Size", "Projection Size"
        ]
        for param in required_params:
            if param not in hyperparams:
                raise ValueError(f"Missing required hyperparameter: {param}")

        self.device = device
        self.hyperparams = hyperparams
        self.batch_size = hyperparams["Batch Size"]
        self.embedding_dir = embedding_dir
        self.text_column = text_column

        torch.backends.cudnn.benchmark = True
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

        # Calculate total embedding dimension
        total_bert_dim = sum(config['embedding_size'] for config in model_configs.values())
        feature_dim = total_bert_dim + 4  # Add 4 for syntactic features

        self.model = MultiEmbeddingBERT(
            num_labels=5,
            feature_dim=feature_dim,
            hyperparams=hyperparams
        ).to(self.device)

        # Gradient scaler for mixed-precision training (not the feature scaler)
        self.scaler = GradScaler()

        self.train_metrics = {
            'loss': [],
            'accuracy': [],
            'grad_norm': [],
            'layer_stats': []
        }
        self.val_metrics = {
            'loss': [],
            'accuracy': []
        }

        self.prepare_data(df, labels)
        self.setup_training()

    @staticmethod
    def _clean_text(text):
        """Return ``text`` as a stripped string; missing values (None/NaN) become ''."""
        # The missing-value test must happen before str(): str(nan) is "nan",
        # which would otherwise pass for a real text.
        return str(text).strip() if pd.notna(text) else ""

    @staticmethod
    def _optimizer_steps_per_epoch(num_batches, accumulation_steps):
        """Return how many optimizer steps one epoch of ``num_batches`` performs.

        Rounds up because a trailing partial accumulation group still
        triggers an optimizer step.

        Raises:
            ValueError: If ``accumulation_steps`` is smaller than 1.
        """
        if accumulation_steps < 1:
            raise ValueError("Gradient Accumulation Steps must be >= 1")
        return (num_batches + accumulation_steps - 1) // accumulation_steps

    def extract_model_embeddings(self, texts, model_key):
        """Return one first-token embedding row per entry of ``texts``.

        Texts are processed in chunks of "Embedding Batch Size", each split
        into sub-batches of "Embedding Sub Batch Size" that are tokenized
        (truncated to 512 tokens) and run through ``bert_models[model_key]``
        without gradients.

        Every input produces exactly one output row, including empty or
        missing texts (encoded as the empty string), so the result stays
        row-aligned with ``texts``.

        Returns:
            ``np.ndarray`` of shape ``(len(texts), hidden_size)``.
        """
        chunk_size = self.hyperparams["Embedding Batch Size"]
        sub_batch_size = self.hyperparams["Embedding Sub Batch Size"]

        # Pre-process all texts to clean format
        processed_texts = [self._clean_text(text) for text in texts]
        if not processed_texts:
            # Nothing to encode; np.concatenate would fail on an empty list
            return np.empty((0, model_configs[model_key]['embedding_size']), dtype=np.float32)

        total_chunks = (len(processed_texts) + chunk_size - 1) // chunk_size
        tokenizer = tokenizers[model_key]
        encoder = bert_models[model_key]
        encoder.eval()  # dropout off so the embeddings are deterministic

        embeddings_list = []
        pbar = tqdm(range(0, len(processed_texts), chunk_size),
                   desc=f"Extracting {model_key} embeddings ({len(processed_texts)} texts)",
                   position=1, leave=False)

        for i in pbar:
            batch_texts = processed_texts[i:i + chunk_size]
            current_chunk = (i // chunk_size) + 1

            pbar.set_postfix({
                'Chunk': f"{current_chunk}/{total_chunks}",
                'Batch Size': len(batch_texts)
            })

            # NOTE: chunks consisting only of empty strings are encoded too;
            # skipping them would drop rows and misalign features and labels.
            for j in range(0, len(batch_texts), sub_batch_size):
                sub_batch = batch_texts[j:j + sub_batch_size]

                # Tokenize with padding to max length in batch
                inputs = tokenizer(
                    sub_batch,
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors="pt"
                )
                inputs = {k: v.to(device, non_blocking=True) for k, v in inputs.items()}

                # Extract embeddings with mixed precision
                with torch.no_grad(), autocast():
                    outputs = encoder(**inputs)
                    embeddings_list.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

                # Clear GPU cache after each sub-batch
                del inputs, outputs
                torch.cuda.empty_cache()

            # Major cleanup every few chunks
            if current_chunk % 2 == 0:
                clear_gpu_memory()

        all_embeddings = np.concatenate(embeddings_list, axis=0)
        del embeddings_list
        clear_gpu_memory()

        return all_embeddings

    def extract_syntactic_features(self, texts):
        """Return four spaCy-derived features per text.

        Columns: token count, distinct-POS-tag ratio, NOUN ratio, VERB ratio.
        Empty or missing texts yield an all-zero row.

        Returns:
            ``np.ndarray`` of shape ``(len(texts), 4)``.
        """
        features_list = []
        chunk_size = self.hyperparams["Feature Chunk Size"]

        for i in tqdm(range(0, len(texts), chunk_size), desc="Extracting syntactic features", position=1, leave=False):
            chunk_texts = [self._clean_text(text) for text in texts[i:i + chunk_size]]
            chunk_features = []

            # nlp.pipe batches the texts through the pipeline; an empty string
            # yields an empty Doc, which produces an all-zero feature row.
            for doc in nlp.pipe(chunk_texts):
                pos_tags = [token.pos_ for token in doc]
                tag_count = max(len(pos_tags), 1)
                chunk_features.append([
                    len(doc),
                    len(set(pos_tags)) / tag_count,
                    pos_tags.count('NOUN') / tag_count,
                    pos_tags.count('VERB') / tag_count,
                ])

            features_list.append(np.array(chunk_features, dtype=np.float64))

        if not features_list:
            return np.empty((0, 4), dtype=np.float64)
        return np.concatenate(features_list, axis=0)

    def _load_cached(self, cache_file, expected_rows):
        """Return the memory-mapped array at ``cache_file`` if it is usable, else None.

        A cache is usable only when the file exists and has ``expected_rows``
        rows. The check cannot detect a different dataset of the same length;
        delete ``embedding_dir`` when the data changes.
        """
        if not os.path.exists(cache_file):
            return None
        cached = np.load(cache_file, mmap_mode='r')  # Memory-mapped reading
        if cached.shape[0] != expected_rows:
            logger.warning(
                "Ignoring stale cache %s: %d rows cached but %d texts given",
                cache_file, cached.shape[0], expected_rows
            )
            return None
        return cached

    def extract_features(self, texts):
        """Build the combined float32 feature matrix for ``texts``.

        For every encoder in ``model_configs`` (in registry order) the
        embeddings are loaded from ``embedding_dir`` when a row-aligned cache
        exists, otherwise computed and saved. The syntactic features are
        handled the same way and appended as the last four columns.

        Returns:
            ``np.ndarray`` of shape ``(len(texts), total_width)``.
        """
        logger.info("Extracting features...")
        os.makedirs(self.embedding_dir, exist_ok=True)
        n_texts = len(texts)

        all_embeddings = {}
        total_models = len(model_configs)

        for idx, key in enumerate(model_configs, 1):
            embedding_file = os.path.join(self.embedding_dir, f"{key}_embeddings.npy")

            cached = self._load_cached(embedding_file, n_texts)
            if cached is not None:
                logger.info(f"Loading {key} embeddings from disk ({idx}/{total_models})")
                all_embeddings[key] = cached
            else:
                logger.info(f"Calculating {key} embeddings ({idx}/{total_models})")
                embeddings = self.extract_model_embeddings(texts, key)
                np.save(embedding_file, embeddings)
                all_embeddings[key] = embeddings

            # Clear GPU memory after each model
            clear_gpu_memory()

        # Extract syntactic features (CPU operation)
        syntactic_file = os.path.join(self.embedding_dir, "syntactic_features.npy")
        syntactic_features = self._load_cached(syntactic_file, n_texts)
        if syntactic_features is not None:
            logger.info("Loading syntactic features from disk")
        else:
            logger.info("Calculating syntactic features")
            syntactic_features = self.extract_syntactic_features(texts)
            np.save(syntactic_file, syntactic_features)

        logger.info("Combining features...")
        feature_parts = [all_embeddings[key] for key in model_configs]
        feature_parts.append(syntactic_features)

        # Fill one pre-allocated matrix instead of concatenating copies
        total_width = sum(part.shape[1] for part in feature_parts)
        combined_features = np.empty((n_texts, total_width), dtype=np.float32)

        current_col = 0
        for part in feature_parts:
            width = part.shape[1]
            combined_features[:, current_col:current_col + width] = part
            current_col += width

        del all_embeddings, syntactic_features, feature_parts
        clear_gpu_memory()

        return combined_features

    def prepare_data(self, df, labels, val_split=0.2):
        """Filter texts, extract features, split, standardize and build loaders.

        Rows whose text is missing or blank are dropped together with their
        label. The remaining rows are randomly split into train/validation
        (``val_split`` is the validation fraction); a ``StandardScaler`` is
        fitted on the training part only and kept as ``self.feature_scaler``.

        Raises:
            ValueError: If no row has a non-empty text.
        """
        logger.info("Preparing data...")

        texts = df[self.text_column].tolist()
        valid_labels = []
        valid_texts = []

        for i, text in enumerate(tqdm(texts, desc="Validating texts", position=0)):
            cleaned = self._clean_text(text)
            if cleaned:
                valid_texts.append(cleaned)
                valid_labels.append(labels[i])

        if not valid_texts:
            raise ValueError(f"No non-empty texts found in column '{self.text_column}'")

        features = self.extract_features(valid_texts)

        # Split data
        split_idx = int(len(valid_texts) * (1 - val_split))
        indices = np.random.permutation(len(valid_texts))
        train_idx = indices[:split_idx]
        val_idx = indices[split_idx:]

        # Scale features; fit on the training rows only to avoid leakage
        scaler = StandardScaler()
        train_features = scaler.fit_transform(features[train_idx])
        val_features = scaler.transform(features[val_idx])
        # Kept so new data can be standardized the same way at inference time
        self.feature_scaler = scaler

        # cross_entropy expects integer (long) class indices
        label_tensor = torch.as_tensor(np.asarray(valid_labels), dtype=torch.long)

        self.train_features = torch.tensor(train_features, dtype=torch.float32)
        self.train_labels = label_tensor[train_idx]
        self.train_texts = [valid_texts[i] for i in train_idx]

        self.val_features = torch.tensor(val_features, dtype=torch.float32)
        self.val_labels = label_tensor[val_idx]
        self.val_texts = [valid_texts[i] for i in val_idx]

        clear_gpu_memory()
        self.create_dataloaders()

    def create_dataloaders(self):
        """Wrap the prepared tensors in a shuffled train loader and a validation loader."""
        logger.info("Creating dataloaders...")
        train_dataset = TensorDataset(self.train_features, self.train_labels)
        val_dataset = TensorDataset(self.val_features, self.val_labels)

        # Pinned host memory only helps (and only works) with a CUDA device
        pin_memory = self.device.type == 'cuda'

        self.train_loader = DataLoader(
            train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            pin_memory=pin_memory,
            num_workers=4,
            prefetch_factor=3,
            persistent_workers=True
        )

        # DataLoader rejects batch_size=0, so never go below 1
        val_batch_size = max(1, min(self.batch_size * 2, len(val_dataset)))
        self.val_loader = DataLoader(
            val_dataset,
            batch_size=val_batch_size,
            pin_memory=pin_memory,
            num_workers=4,
            persistent_workers=True
        )

    def setup_training(self):
        """Create the AdamW optimizer, the OneCycleLR schedule and early stopping.

        The schedule is sized in optimizer steps (batches divided by the
        accumulation steps, rounded up) because :meth:`train_epoch` advances
        it once per optimizer step.
        """
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=self.hyperparams["Learning Rate"],
            weight_decay=self.hyperparams["Weight Decay"]
        )

        steps_per_epoch = self._optimizer_steps_per_epoch(
            len(self.train_loader), self.hyperparams["Gradient Accumulation Steps"]
        )

        self.scheduler = torch.optim.lr_scheduler.OneCycleLR(
            self.optimizer,
            max_lr=self.hyperparams["Learning Rate"] * 10,
            steps_per_epoch=steps_per_epoch,
            epochs=self.hyperparams["Epochs"],
            pct_start=0.1
        )

        self.early_stopping = EarlyStopping(patience=self.hyperparams["Early Stopping Patience"])

    def _optimizer_step(self):
        """Apply the accumulated gradients and advance the LR schedule."""
        scale_before = self.scaler.get_scale()
        self.scaler.step(self.optimizer)
        self.scaler.update()
        self.optimizer.zero_grad(set_to_none=True)
        # GradScaler lowers its scale when it skipped the step because of
        # inf/NaN gradients; only advance the schedule for steps actually taken.
        if self.scaler.get_scale() >= scale_before:
            self.scheduler.step()

    def train_epoch(self, accumulation_steps):
        """Run one training epoch with gradient accumulation.

        Gradients are accumulated over ``accumulation_steps`` batches; a
        trailing partial group is applied at the end of the epoch rather than
        leaking into the next one.

        Returns:
            Dict with the sample-weighted mean ``'loss'`` and the ``'acc'``
            (accuracy) over the epoch.
        """
        clear_gpu_memory()
        self.model.train()
        total_loss = 0.0
        correct = 0
        total = 0
        num_batches = len(self.train_loader)

        # Start from clean gradients
        self.optimizer.zero_grad(set_to_none=True)

        batch_pbar = tqdm(self.train_loader, desc="Training batches", position=1, leave=False)
        for batch_idx, (features, target) in enumerate(batch_pbar):
            features = features.to(self.device, non_blocking=True)
            target = target.to(self.device, non_blocking=True)

            with autocast():
                output = self.model(features)
                loss = F.cross_entropy(output, target, label_smoothing=self.hyperparams["Label Smoothing"])

            # Divide so the accumulated gradient is an average over the group
            self.scaler.scale(loss / accumulation_steps).backward()

            is_last_batch = batch_idx + 1 == num_batches
            if (batch_idx + 1) % accumulation_steps == 0 or is_last_batch:
                self._optimizer_step()

            batch_loss = loss.item()
            batch_count = target.size(0)
            total_loss += batch_loss * batch_count
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
            total += batch_count

            batch_pbar.set_postfix({
                'loss': f"{batch_loss:.4f}",
                'acc': f"{correct/total:.4f}"
            })
            # No per-batch clear_gpu_memory(): a gc pass plus CUDA sync per
            # batch dominates the runtime of this small model.

        return {'loss': total_loss / max(total, 1), 'acc': correct / max(total, 1)}

    def validate(self):
        """Evaluate on the validation loader.

        Also stores the predictions and labels of this pass on
        ``self.model.all_preds`` / ``self.model.all_labels``.

        Returns:
            Dict with the sample-weighted mean ``'loss'`` and the ``'acc'``.
        """
        clear_gpu_memory()
        self.model.eval()
        total_loss = 0.0
        correct = 0
        total = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            val_pbar = tqdm(self.val_loader, desc="Validating", position=1, leave=False)
            for features, target in val_pbar:
                features = features.to(self.device, non_blocking=True)
                target = target.to(self.device, non_blocking=True)

                with autocast():
                    output = self.model(features)
                    loss = F.cross_entropy(output, target)

                batch_loss = loss.item()
                batch_count = target.size(0)
                # Weight by batch size so a short final batch is not over-counted
                total_loss += batch_loss * batch_count
                pred = output.argmax(dim=1)
                correct += pred.eq(target).sum().item()
                total += batch_count

                all_preds.extend(pred.cpu().numpy())
                all_labels.extend(target.cpu().numpy())

                val_pbar.set_postfix({
                    'loss': f"{batch_loss:.4f}",
                    'acc': f"{correct/total:.4f}"
                })

        self.model.all_preds = all_preds
        self.model.all_labels = all_labels

        return {'loss': total_loss / max(total, 1), 'acc': correct / max(total, 1)}

    def train(self):
        """Train for up to "Epochs" epochs with early stopping.

        After every epoch the model is validated; the weights with the lowest
        validation loss so far are written to ``best_model.pt``. Per-epoch
        loss/accuracy are also appended to ``self.train_metrics`` and
        ``self.val_metrics``.

        Returns:
            Dict of per-epoch lists: ``'train_loss'``, ``'val_loss'``,
            ``'train_acc'``, ``'val_acc'``.
        """
        epochs = self.hyperparams["Epochs"]
        accumulation_steps = self.hyperparams["Gradient Accumulation Steps"]

        best_val_loss = float('inf')
        train_losses, val_losses = [], []
        train_accs, val_accs = [], []

        epoch_pbar = tqdm(range(epochs), desc="Training epochs", position=0)
        for _ in epoch_pbar:
            train_metrics = self.train_epoch(accumulation_steps)
            val_metrics = self.validate()

            train_losses.append(train_metrics['loss'])
            val_losses.append(val_metrics['loss'])
            train_accs.append(train_metrics['acc'])
            val_accs.append(val_metrics['acc'])

            self.train_metrics['loss'].append(train_metrics['loss'])
            self.train_metrics['accuracy'].append(train_metrics['acc'])
            self.val_metrics['loss'].append(val_metrics['loss'])
            self.val_metrics['accuracy'].append(val_metrics['acc'])

            epoch_pbar.set_postfix({
                'train_loss': f"{train_metrics['loss']:.4f}",
                'train_acc': f"{train_metrics['acc']:.4f}",
                'val_loss': f"{val_metrics['loss']:.4f}",
                'val_acc': f"{val_metrics['acc']:.4f}"
            })

            if val_metrics['loss'] < best_val_loss:
                best_val_loss = val_metrics['loss']
                torch.save(self.model.state_dict(), 'best_model.pt')

            if self.early_stopping(val_metrics['loss']):
                logger.info("Early stopping triggered")
                break

            clear_gpu_memory()

        return {
            'train_loss': train_losses,
            'val_loss': val_losses,
            'train_acc': train_accs,
            'val_acc': val_accs
        }

class EarlyStopping:
    """Signal when the validation loss has stopped improving.

    A loss counts as an improvement when it is at most
    ``best_loss - min_delta``. Each non-improving call increments a counter;
    an improvement resets it. NaN losses are always treated as
    non-improving, so a diverged run is stopped instead of silently
    resetting the counter.

    Args:
        patience: Number of consecutive non-improving calls that triggers a stop.
        min_delta: Minimum decrease of the loss that counts as an improvement.

    Attributes:
        counter: Current number of consecutive non-improving calls.
        best_loss: Lowest loss seen so far (``None`` before the first valid loss).
        early_stop: Result of the most recent call.
    """

    def __init__(self, patience=3, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        """Record ``val_loss`` and return True when training should stop."""
        is_nan = val_loss != val_loss  # NaN is the only value unequal to itself

        if not is_nan and self.best_loss is None:
            # First valid observation: just remember it
            self.best_loss = val_loss
            self.early_stop = False
            return False

        if not is_nan and val_loss <= self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            self.early_stop = False
            return False

        self.counter += 1
        self.early_stop = self.counter >= self.patience
        return self.early_stop

# Script entry point: load the data, train, and report the final accuracies
if __name__ == "__main__":
    df = pd.read_csv("sample.csv")
    df['rating'] = df['rating'] - 1  # Convert to 0-4 scale
    text_column = 'text'

    hyperparams = {
        # Training hyperparameters
        "Epochs": 10,
        "Batch Size": 50,
        "Learning Rate": 1e-5,
        "Dropout": 0.3,
        "Weight Decay": 0.1,
        "Label Smoothing": 0.1,
        "Early Stopping Patience": 3,
        "Gradient Accumulation Steps": 4,  # Increased for better batch efficiency

        # Embedding extraction hyperparameters
        "Embedding Batch Size": 512,  # Batch size for embedding extraction
        "Embedding Sub Batch Size": 128,  # Sub-batch size for GPU memory management
        "Feature Chunk Size": 1000,  # Chunk size for feature processing

        # Model architecture
        "Hidden Size": 1024,  # Size of intermediate layers
        "Projection Size": 512,  # Size of final projection

        # Optimizer and scheduler (descriptive labels only)
        "Optimizer": "AdamW",
        "Scheduler": "OneCycleLR",
        "Model": "Multi-BERT with Multiple Embeddings"
    }

    try:
        trainer = GPUOptimizedTrainer(df, text_column, df['rating'].values, hyperparams)
        metrics = trainer.train()
        print("\nTraining completed!")
        print(f"Final Training Accuracy: {metrics['train_acc'][-1]:.4f}")
        print(f"Final Validation Accuracy: {metrics['val_acc'][-1]:.4f}")
    except Exception as e:
        # logger.exception writes the full traceback to training.log, which is
        # what the message printed below points the user to.
        logger.exception(f"An error occurred during training: {e}")
        print("Training failed due to an error. See training.log for details.")




Loading models:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = GradScaler()


Validating texts:   0%|          | 0/15215 [00:00<?, ?it/s]

Training epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Training batches:   0%|          | 0/244 [00:00<?, ?it/s]

  with autocast():


Validating:   0%|          | 0/31 [00:00<?, ?it/s]

  with autocast():


Training batches:   0%|          | 0/244 [00:00<?, ?it/s]

Validating:   0%|          | 0/31 [00:00<?, ?it/s]

Training batches:   0%|          | 0/244 [00:00<?, ?it/s]

Validating:   0%|          | 0/31 [00:00<?, ?it/s]

Training batches:   0%|          | 0/244 [00:00<?, ?it/s]

Validating:   0%|          | 0/31 [00:00<?, ?it/s]

Training batches:   0%|          | 0/244 [00:00<?, ?it/s]

Validating:   0%|          | 0/31 [00:00<?, ?it/s]

Training batches:   0%|          | 0/244 [00:00<?, ?it/s]

Validating:   0%|          | 0/31 [00:00<?, ?it/s]

Training batches:   0%|          | 0/244 [00:00<?, ?it/s]

Validating:   0%|          | 0/31 [00:00<?, ?it/s]

Training batches:   0%|          | 0/244 [00:00<?, ?it/s]

Validating:   0%|          | 0/31 [00:00<?, ?it/s]

Training batches:   0%|          | 0/244 [00:00<?, ?it/s]

Validating:   0%|          | 0/31 [00:00<?, ?it/s]

Training batches:   0%|          | 0/244 [00:00<?, ?it/s]

Validating:   0%|          | 0/31 [00:00<?, ?it/s]


Training completed!
Final Training Accuracy: 0.6059
Final Validation Accuracy: 0.5965
