<a href="https://colab.research.google.com/github/col-a-guo/guo_chen_jang_ms_project/blob/main/BERTearnings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch transformers datasets torchmetrics scikit-learn numpy huggingface-hub pandas imblearn pytorch_metric_learning

Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting pytorch_metric_learning
  Downloading pytorch_metric_learning-2.9.0-py3-none-any.whl.metadata (18 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading pytorch_metric_learning-2.9.0-py3-none-any.whl (127 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.8/127.8 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch_m

In [2]:
# Mount Google Drive at /content/drive (forcing a remount) so the CSV files
# read by the later cells are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import zipfile
import os

# Directory on the mounted Drive that holds the train/test CSV files.
FILE_PATH = '/content/drive/MyDrive/BERTearningsdata/'
# List the directory through the constant so the path is spelled out only once.
print(os.listdir(FILE_PATH))

['train_sept22_combined.csv', 'test_sept22_combined.csv', 'sept22_combined.csv', 'sept1_combined.csv', 'test_sept1_combined.csv', 'train_sept1_combined.csv', 'sept22_combined.gsheet', 'train_nov25_combined.csv', 'test_nov25_combined.csv', 'nov25_combined.csv', 'test_dec1_combined.csv', 'train_dec1_combined.csv', 'dec1_combined.csv']


In [6]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset, Subset
from transformers import get_scheduler
from datasets import load_dataset
from collections import Counter
import torchmetrics
from pytorch_metric_learning import losses
from sklearn.metrics import confusion_matrix, classification_report
import random
import numpy as np
from huggingface_hub import PyTorchModelHubMixin
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from torch.optim.lr_scheduler import LambdaLR, ExponentialLR
import os
import matplotlib.pyplot as plt
import seaborn as sns

if __name__ == "__main__":

    # Seed every RNG used below (Python `random`, NumPy, PyTorch CPU and CUDA)
    # with the same value so sampling and weight initialisation are repeatable.
    seed_value = 1
    random.seed(seed_value)
    np.random.seed(seed_value)
    # Run on the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    # Model versions to train; each entry must be a key understood by
    # BertClassifier and load_tokenizer.
    version_list = ["bottleneckBERT"]
    # Default hyperparameters
    # NOTE(review): target_lr, default_eps and warmup_proportion are not
    # referenced anywhere else in the visible code — confirm whether they are
    # leftovers or should be wired into the optimizer / scheduler.
    default_lr = 2e-5
    target_lr = 9e-6
    default_eps = 6.748313060587885e-08
    default_batch_size = 32
    num_epochs = 200
    patience = 3
    warmup_proportion = 0.2
    # Weight and temperature of the supervised-contrastive loss term.
    supcon_weight = 0.1
    supcon_temperature = 0.1

    # Feature names for importance analysis
    # (same columns, in the same order, that CustomDataset packs into `features`).
    FEATURE_NAMES = ['year', 'word_count', 'scarcity', 'nonuniform_progress',
                     'performance_constraints', 'user_heterogeneity', 'cognitive',
                     'external', 'internal', 'coordination', 'transactional',
                     'technical', 'demand']

    def compute_feature_importance(model, dataloader, tokenizer, num_features=13, num_Bottid_categories=29, max_samples=100):
        """
        Compute gradient-based importance scores for the model inputs.

        For every analysed sample the gradient of the predicted-class logit is
        taken with respect to the numerical features, the one-hot Bottid vector
        and (for the first batches only) the token embeddings. Absolute
        gradients are averaged over samples for the tabular inputs; for text the
        per-token L2 norm of the embedding gradient is stored.

        Args:
            model: Trained BertClassifier-like model exposing `bert`, `pooling`,
                `cls_head`, `linear_features`, `linear_Bottid`,
                `linear_combined_layer` and `final_classifier`.
            dataloader: DataLoader yielding
                (input_ids, attention_mask, features, Bottid_encoded, labels).
            tokenizer: Tokenizer used to turn input ids back into tokens.
            num_features: Number of numerical features.
            num_Bottid_categories: Number of Bottid categories.
            max_samples: Sample budget for the token-level analysis; it is
                converted into a whole number of batches.

        Returns:
            Dictionary with keys 'feature_importance', 'bottid_importance'
            (NumPy arrays), 'token_importance_scores', 'sample_texts' and
            'sample_predictions' (lists with one entry per text-analysed sample).
        """
        model.eval()

        # Running sums of |gradient| for the tabular inputs.
        feature_gradients = torch.zeros(num_features).to(device)
        bottid_gradients = torch.zeros(num_Bottid_categories).to(device)
        num_samples = 0

        # Token-level results (only filled for the first `text_batches` batches).
        token_importance_scores = []
        sample_texts = []
        sample_predictions = []

        # Prefer the loader's real batch size; fall back to the notebook default
        # when the loader does not expose one (e.g. a custom batch sampler).
        batch_size = getattr(dataloader, "batch_size", None) or default_batch_size
        text_batches = max_samples // batch_size

        for batch_idx, batch in enumerate(dataloader):
            input_ids, attention_mask, features, Bottid_encoded, labels = [t.to(device) for t in batch]
            analyse_text = batch_idx < text_batches

            # The tabular inputs are leaves, so they can be differentiated directly.
            features.requires_grad_(True)
            Bottid_encoded.requires_grad_(True)

            if analyse_text:
                # Feed *word* embeddings only: BertModel adds position/type
                # embeddings and LayerNorm to `inputs_embeds` itself, so passing
                # the full embedding-layer output would apply them twice and the
                # logits would no longer match the model's normal forward pass.
                embeddings = model.bert.get_input_embeddings()(input_ids).detach().requires_grad_(True)

                # Re-implement the model head on top of the embedded input.
                outputs = model.bert(inputs_embeds=embeddings, attention_mask=attention_mask)
                last_hidden_state = outputs.last_hidden_state
                pooled_output = model.pooling(last_hidden_state.permute(0, 2, 1)).squeeze(-1)
                bert_output = model.cls_head(pooled_output)
                linear_features_output = model.linear_features(features)
                Bottid_output = model.linear_Bottid(Bottid_encoded)
                combined_output = torch.cat((bert_output, linear_features_output, Bottid_output), dim=1)
                linear_layer_output = model.linear_combined_layer(combined_output)
                logits = model.final_classifier(linear_layer_output)
            else:
                # Standard forward pass: only the tabular gradients are needed.
                embeddings = None
                logits, _ = model(input_ids, attention_mask, features, Bottid_encoded)

            preds = torch.argmax(logits, dim=1)

            grad_inputs = [features, Bottid_encoded]
            if analyse_text:
                grad_inputs.append(embeddings)

            batch_len = len(labels)
            for i in range(batch_len):
                # autograd.grad returns the input gradients directly, without
                # accumulating into (and having to reset) every model parameter.
                # The graph is only retained while more samples of this batch follow.
                grads = torch.autograd.grad(
                    logits[i, preds[i]],
                    grad_inputs,
                    retain_graph=(i < batch_len - 1),
                    allow_unused=True,
                )

                if grads[0] is not None:
                    feature_gradients += torch.abs(grads[0][i])
                if grads[1] is not None:
                    bottid_gradients += torch.abs(grads[1][i])

                if analyse_text and grads[2] is not None:
                    # Token importance = L2 norm of the gradient over the hidden dimension.
                    token_importance = torch.norm(grads[2][i], dim=1).cpu().numpy()
                    tokens = tokenizer.convert_ids_to_tokens(input_ids[i].tolist())

                    token_importance_scores.append(token_importance)
                    sample_texts.append(tokens)
                    sample_predictions.append(preds[i].item())

            num_samples += batch_len

            if batch_idx >= text_batches:
                break  # Stop early for efficiency

        # Average the accumulated gradients; an empty loader leaves them at zero.
        if num_samples > 0:
            feature_gradients = feature_gradients / num_samples
            bottid_gradients = bottid_gradients / num_samples

        return {
            'feature_importance': feature_gradients.cpu().numpy(),
            'bottid_importance': bottid_gradients.cpu().numpy(),
            'token_importance_scores': token_importance_scores,
            'sample_texts': sample_texts,
            'sample_predictions': sample_predictions
        }

    def plot_feature_importance(importance_dict, feature_names, save_path, version):
        """
        Draw the importance bar charts, save them, and dump the scores to disk.

        Two horizontal bar charts are produced in one figure: all numerical
        features, and the (at most 20) highest-scoring Bottid categories. The
        figure is written as a PNG and the scores as a text/CSV file, both
        inside `save_path`.

        Args:
            importance_dict: Mapping with 'feature_importance' and
                'bottid_importance' arrays.
            feature_names: Names of the numerical features, in array order.
            save_path: Directory the PNG and CSV files are written to.
            version: Model version string used in titles and file names.

        Returns:
            Tuple (df_features, df_bottid) of DataFrames sorted by descending
            importance.
        """
        fig, axes = plt.subplots(2, 1, figsize=(12, 10))
        numeric_ax, bottid_ax = axes[0], axes[1]

        def _ranked_barh(ax, positions, widths, tick_labels, title):
            # Shared drawing routine for both panels.
            ax.barh(positions, widths)
            ax.set_yticks(positions)
            ax.set_yticklabels(tick_labels)
            ax.set_xlabel('Importance Score')
            ax.set_title(title)
            ax.grid(axis='x', alpha=0.3)

        # Panel 1: every numerical feature, highest score first.
        feature_imp = importance_dict['feature_importance']
        numeric_order = np.argsort(feature_imp)[::-1]
        _ranked_barh(
            numeric_ax,
            range(len(feature_names)),
            feature_imp[numeric_order],
            [feature_names[pos] for pos in numeric_order],
            f'Feature Importance - Numerical Features ({version})',
        )

        # Panel 2: the strongest Bottid categories only (capped at 20).
        bottid_imp = importance_dict['bottid_importance']
        top_n = min(20, len(bottid_imp))
        bottid_order = np.argsort(bottid_imp)[::-1][:top_n]
        _ranked_barh(
            bottid_ax,
            range(top_n),
            bottid_imp[bottid_order],
            [f'Bottid_{pos}' for pos in bottid_order],
            f'Feature Importance - Top {top_n} Bottid Categories ({version})',
        )

        plt.tight_layout()

        # Persist the figure, then release it.
        plot_filename = os.path.join(save_path, f'feature_importance_inc_token_{version}.png')
        plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
        print(f"Feature importance plot saved to {plot_filename}")
        plt.close()

        # Tabular versions of both score vectors, ranked by importance.
        df_features = pd.DataFrame(
            {'Feature': feature_names, 'Importance': feature_imp}
        ).sort_values('Importance', ascending=False)

        bottid_labels = [f'Bottid_{pos}' for pos in range(len(bottid_imp))]
        df_bottid = pd.DataFrame(
            {'Feature': bottid_labels, 'Importance': bottid_imp}
        ).sort_values('Importance', ascending=False)

        # One file holding a title line followed by both tables.
        csv_filename = os.path.join(save_path, f'feature_importance_inc_token_{version}.csv')
        with open(csv_filename, 'w') as out:
            out.write(f"Feature Importance Analysis - {version}\n\n")
            out.write("Numerical Features:\n")
            df_features.to_csv(out, index=False)
            out.write("\n\nBottid Categories:\n")
            df_bottid.to_csv(out, index=False)

        print(f"Feature importance scores saved to {csv_filename}")

        return df_features, df_bottid

    # function to generate classification report for multi-class
    def generate_classification_report(model, dataloader, num_classes, epoch=None, version=None, split_name="Test"):
        """
        Run the model over `dataloader` and report multi-class metrics.

        The text report and a confusion matrix are printed and appended to
        "classification_report.txt" in the working directory.

        Args:
            model: Model returning (logits, embeddings) from its forward pass.
            dataloader: DataLoader yielding
                (input_ids, attention_mask, features, Bottid_encoded, labels).
            num_classes: Number of classes; classes are labelled 0..num_classes-1.
            epoch: Epoch number shown in the header, or None for 'Final'.
            version: Model version string shown in the header.
            split_name: Name of the evaluated split shown in the header.

        Returns:
            Tuple (weighted_f1, all_preds, all_labels) where the last two are
            1-D NumPy arrays.
        """
        model.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in dataloader:
                input_ids, attention_mask, features, Bottid_encoded, labels = [t.to(device) for t in batch]
                logits, _ = model(input_ids, attention_mask, features, Bottid_encoded)
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        all_preds = np.array(all_preds)
        all_labels = np.array(all_labels)

        class_ids = list(range(num_classes))
        class_names = [str(c) for c in class_ids]

        # `labels=` keeps target_names aligned even when a class is missing from
        # both predictions and ground truth; `zero_division=0` yields the same
        # numbers as the default but without one warning per undefined metric.
        report = classification_report(all_labels, all_preds, labels=class_ids, target_names=class_names, digits=4, zero_division=0)
        report_dict = classification_report(all_labels, all_preds, labels=class_ids, target_names=class_names, output_dict=True, zero_division=0)

        cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
        cm_report = "\nConfusion Matrix:\n"
        cm_report += "            Predicted\n"
        cm_report += "           " + "    ".join(map(str, range(num_classes))) + "\n"
        cm_report += "Actual\n"
        for i, row in enumerate(cm):
            cm_report += f"      {i}   " + "    ".join(map(str, row)) + "\n"

        final_report = f"""
    Classification Report ({split_name}, Version: {version}, Epoch {epoch if epoch is not None else 'Final'}):\n
    {report}\n
    {cm_report}
    """

        print(final_report)
        with open("classification_report.txt", "a") as f:
            f.write(final_report + "\n")

        f1 = report_dict['weighted avg']['f1-score']

        return f1, all_preds, all_labels

    def create_test_sets(test_dataset, num_sets=10, subset_size=0.9, labels=(0, 1, 2)):
        """
        Build `num_sets` random, label-stratified subsets of the test set for
        Monte Carlo cross validation (MCCV).

        Args:
            test_dataset: Indexable dataset whose items end with the label
                (item[-1]).
            num_sets: Number of subsets to draw.
            subset_size: Proportion of each label's samples kept in every subset.
            labels: Label values to stratify on; items with any other label are
                left out of every subset.

        Returns:
            List of `num_sets` torch `Subset` objects over `test_dataset`.
        """
        # Single pass over the dataset: every item is materialised once instead
        # of once per label.
        indices_by_label = {label: [] for label in labels}
        for idx, item in enumerate(test_dataset):
            target = item[-1]
            for label in labels:
                if target == label:
                    indices_by_label[label].append(idx)
                    break

        samples_per_label = {
            label: int(len(indices) * subset_size)
            for label, indices in indices_by_label.items()
        }

        test_sets = []
        for _ in range(num_sets):
            # Draw per label in the given order, then mix the labels together.
            subset_indices = []
            for label in labels:
                subset_indices += random.sample(indices_by_label[label], samples_per_label[label])
            random.shuffle(subset_indices)

            test_sets.append(Subset(test_dataset, subset_indices))

        return test_sets

    def evaluate_on_multiple_test_sets(model, test_sets, num_classes=3, version=None):
        """
        Evaluate the model on several test subsets and aggregate the metrics.

        Each subset is scored with generate_classification_report; per-class,
        macro-average and weighted-average precision/recall/F1/support are then
        averaged across subsets. The summary is printed and appended to
        "classification_report.txt".

        Args:
            model: Model to evaluate.
            test_sets: Iterable of datasets (e.g. the output of create_test_sets).
            num_classes: Number of classes; classes are labelled 0..num_classes-1.
            version: Model version string forwarded to the per-set report.

        Returns:
            Dict mapping "<metric>_avg" and "<metric>_std" to the mean and
            standard deviation of that metric over all test sets.
        """
        class_ids = list(range(num_classes))
        class_names = [str(c) for c in class_ids]

        all_reports = []
        for set_idx, test_set in enumerate(test_sets):
            dataloader = DataLoader(test_set, batch_size=default_batch_size)
            _, preds, labels = generate_classification_report(model, dataloader, num_classes, version=version, split_name=f"Test Set {set_idx+1}")
            # `labels=` keeps the per-class keys present even if a subset lacks a class.
            all_reports.append(classification_report(labels, preds, labels=class_ids, target_names=class_names, output_dict=True, zero_division=0))

        stat_names = ('precision', 'recall', 'f1-score', 'support')

        # metric name -> list with one value per test set
        metrics = {}
        for class_str in class_names:
            for stat in stat_names:
                metrics[f'{stat}_{class_str}'] = [report[class_str][stat] for report in all_reports]

        for report_key, prefix in (('macro avg', 'macro_avg'), ('weighted avg', 'weighted_avg')):
            for stat in stat_names:
                metrics[f'{prefix}_{stat}'] = [report[report_key][stat] for report in all_reports]

        results = {}
        for metric_name, values in metrics.items():
            results[metric_name + "_avg"] = np.mean(values)
            results[metric_name + "_std"] = np.std(values)

        # Build the summary from the metric names themselves. Deriving the std
        # key by string replacement corrupts names that already contain "_avg"
        # (macro_avg_*, weighted_avg_*), which silently dropped those rows.
        final_report = "Averaged performance across all test sets:\n"
        for metric_name in metrics:
            avg_key = metric_name + "_avg"
            std_key = metric_name + "_std"
            final_report += f"{avg_key}: {results[avg_key]:.4f} +/- {results[std_key]:.4f}\n"

        print(final_report)
        with open("classification_report.txt", "a") as f:
            f.write(final_report + "\n")

        return results

    class BertClassifier(nn.Module, PyTorchModelHubMixin):
        """
        BERT encoder combined with two small tabular branches.

        The mean-pooled encoder output, the 13 numerical features and the
        one-hot Bottid vector are each projected, concatenated, passed through a
        shared hidden layer and finally classified into `num_labels` classes.
        """

        def __init__(self, version, num_labels=3, freeze_bert=False, num_Bottid_categories=29):
            """
            Args:
                version: One of "bert-uncased", "businessBERT", "bottleneckBERT".
                num_labels: Number of output classes.
                freeze_bert: When True, the encoder weights are excluded from training.
                num_Bottid_categories: Width of the one-hot Bottid input.

            Raises:
                ValueError: If `version` is not one of the supported names.
            """
            super().__init__()

            known_checkpoints = (
                ("bert-uncased", 'google-bert/bert-base-uncased'),
                ("businessBERT", 'pborchert/BusinessBERT'),
                ("bottleneckBERT", 'colaguo/bottleneckBERTsmall'),
            )
            checkpoint = None
            for known_version, hub_name in known_checkpoints:
                if version == known_version:
                    checkpoint = hub_name
                    break
            if checkpoint is None:
                raise ValueError(f"Invalid model version: {version}")
            self.bert = AutoModel.from_pretrained(checkpoint)

            self.version = version

            # Layers are created in a fixed order so seeded initialisation is stable.
            self.linear_features = nn.Sequential(nn.Linear(13, 32), nn.ReLU())
            self.linear_Bottid = nn.Sequential(nn.Linear(num_Bottid_categories, 8), nn.ReLU())
            self.cls_head = nn.Sequential(nn.Linear(self.bert.config.hidden_size, 256), nn.ReLU())
            self.linear_combined_layer = nn.Sequential(nn.Linear(256 + 32 + 8, 32), nn.ReLU())
            self.final_classifier = nn.Linear(32, num_labels)

            # Averages over the whole sequence axis; the attention mask is not
            # consulted here, so every position contributes to the mean.
            self.pooling = nn.AdaptiveAvgPool1d(1)

            if freeze_bert:
                self.bert.requires_grad_(False)

        def forward(self, input_ids, attention_mask, features, Bottid_encoded):
            """
            Returns:
                Tuple (logits, hidden) where `hidden` is the 32-dimensional output
                of the combined layer that feeds the final classifier.
            """
            encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            token_states = encoder_out.last_hidden_state

            # (batch, seq, hidden) -> (batch, hidden, seq) -> mean over seq -> (batch, hidden)
            sentence_vector = self.pooling(token_states.transpose(1, 2)).squeeze(-1)

            text_branch = self.cls_head(sentence_vector)
            numeric_branch = self.linear_features(features)
            bottid_branch = self.linear_Bottid(Bottid_encoded)

            fused = torch.cat((text_branch, numeric_branch, bottid_branch), dim=1)
            hidden = self.linear_combined_layer(fused)

            return self.final_classifier(hidden), hidden

    def load_tokenizer(version):
        """
        Return the pretrained tokenizer that belongs to a model version.

        Args:
            version: One of "bert-uncased", "businessBERT", "bottleneckBERT".

        Raises:
            ValueError: If `version` is not one of the supported names.
        """
        known_checkpoints = (
            ("bert-uncased", 'google-bert/bert-base-uncased'),
            ("businessBERT", 'pborchert/BusinessBERT'),
            ("bottleneckBERT", 'colaguo/bottleneckBERTsmall'),
        )
        for known_version, hub_name in known_checkpoints:
            if version == known_version:
                return AutoTokenizer.from_pretrained(hub_name)
        raise ValueError(f"Invalid model version: {version}")

    # Base file name of the dataset snapshot; the "train_" / "test_" prefixed
    # variants in the Drive folder are the files actually read.
    ogpath = "sept22_combined.csv"
    dataset = load_dataset('csv', data_files={'train': "/content/drive/MyDrive/BERTearningsdata/" + "train_" + ogpath, 'test': "/content/drive/MyDrive/BERTearningsdata/" +"test_" + ogpath})

    # The same two CSVs are also read with pandas so the Bottid column can be
    # one-hot encoded before the splits are rebuilt from the DataFrames below.
    train_df = pd.read_csv("/content/drive/MyDrive/BERTearningsdata/" + "train_" + ogpath)
    test_df = pd.read_csv("/content/drive/MyDrive/BERTearningsdata/" + "test_" + ogpath)
    # Collapse every test label above 2 into class 2.
    # NOTE(review): the train frame is not clipped here — confirm its labels are already within 0..2.
    test_df.loc[test_df['label'] > 2, 'label'] = 2

    # handle_unknown='ignore': Bottid values that only occur in the test frame do
    # not raise at transform time (scikit-learn documents them as encoded all-zero).
    encoder = OneHotEncoder(handle_unknown='ignore')

    # Fit on the training frame only, then apply the same encoding to both frames.
    encoder.fit(train_df[['Bottid']])
    train_encoded = encoder.transform(train_df[['Bottid']]).toarray()
    test_encoded = encoder.transform(test_df[['Bottid']]).toarray()

    # One column name per encoded category: Bottid_0 .. Bottid_{n-1}.
    feature_names = [f"Bottid_{i}" for i in range(train_encoded.shape[1])]

    train_encoded_df = pd.DataFrame(train_encoded, columns=feature_names)
    test_encoded_df = pd.DataFrame(test_encoded, columns=feature_names)

    # Append the one-hot columns side by side with the original columns.
    train_df = pd.concat([train_df, train_encoded_df], axis=1)
    test_df = pd.concat([test_df, test_encoded_df], axis=1)

    # The raw categorical column is no longer needed once it has been encoded.
    train_df = train_df.drop('Bottid', axis=1)
    test_df = test_df.drop('Bottid', axis=1)

    # Replace both splits with datasets built from the augmented DataFrames.
    dataset['train'] = dataset['train'].from_pandas(train_df)
    dataset['test'] = dataset['test'].from_pandas(test_df)

    def truncate_dataset(dataset):
        """
        Return a random selection holding 99% (rounded) of the rows of `dataset`.

        Row indices are drawn without replacement via `random.sample`, so the
        returned rows are in sampled (shuffled) order.
        """
        total_rows = len(dataset)
        rows_to_keep = round(total_rows * 0.99)
        return dataset.select(random.sample(range(total_rows), rows_to_keep))

    # Apply truncate_dataset to every split; from here on `dataset` is a plain
    # dict of split name -> truncated dataset.
    dataset = {k: truncate_dataset(v) for k, v in dataset.items()}

    def tokenize_function(examples, tokenizer):
        """
        Tokenize the "paragraph" field of a batch of examples.

        Every sequence is padded to, and truncated at, 512 tokens.
        """
        paragraphs = examples["paragraph"]
        encode_options = {"padding": "max_length", "truncation": True, "max_length": 512}
        return tokenizer(paragraphs, **encode_options)

    class CustomDataset(Dataset):
        """
        Torch dataset wrapper that turns one tokenized row into model tensors.

        Each item is the tuple
        (input_ids, attention_mask, features, Bottid_encoded, label).
        """

        # Numerical columns packed, in this order, into the `features` tensor.
        _NUMERIC_COLUMNS = (
            'year', 'word_count', 'scarcity', 'nonuniform_progress',
            'performance_constraints', 'user_heterogeneity', 'cognitive',
            'external', 'internal', 'coordination', 'transactional',
            'technical', 'demand',
        )

        def __init__(self, dataset, Bottid_categories=29):
            """
            Args:
                dataset: Indexable collection of row dicts.
                Bottid_categories: Number of Bottid_<i> one-hot columns per row.
            """
            self.dataset = dataset
            self.Bottid_categories = Bottid_categories

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, idx):
            row = self.dataset[idx]

            ids_tensor = torch.tensor(row['input_ids'])
            mask_tensor = torch.tensor(row['attention_mask'])
            label_tensor = torch.tensor(row['label'], dtype=torch.long)

            numeric_values = [row[column] for column in self._NUMERIC_COLUMNS]
            features_tensor = torch.tensor(numeric_values, dtype=torch.float)

            one_hot_values = [row[f"Bottid_{k}"] for k in range(self.Bottid_categories)]
            bottid_tensor = torch.tensor(one_hot_values, dtype=torch.float)

            return ids_tensor, mask_tensor, features_tensor, bottid_tensor, label_tensor

    def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, epochs, loss_fn, supcon_loss_fn, supcon_weight, patience=4, num_classes=3, version=None, test_sets=None):
        """
        Train with cross-entropy plus a weighted supervised-contrastive loss and
        early stopping on the validation weighted F1.

        After training, the weights of the best epoch are restored, saved to
        "model_output/model_version_<version>.pth" and, if `test_sets` is given,
        evaluated on those sets.

        Args:
            model: Model returning (logits, embeddings) from its forward pass.
            train_dataloader: DataLoader for training batches.
            val_dataloader: DataLoader used for validation loss and F1.
            optimizer: Optimizer stepping the model parameters.
            epochs: Maximum number of epochs.
            loss_fn: Classification loss applied to the logits.
            supcon_loss_fn: Contrastive loss applied to L2-normalised embeddings.
            supcon_weight: Multiplier of the contrastive term in the total loss.
            patience: Number of consecutive non-improving epochs before stopping.
            num_classes: Number of classes.
            version: Model version string used in reports and the file name.
            test_sets: Optional list of datasets for the final evaluation.

        Returns:
            Best validation weighted F1 reached during training.
        """
        model.to(device)
        best_f1 = 0.0
        patience_counter = 0
        best_epoch = 0
        output_dir = "model_output"
        best_model_state = None

        os.makedirs(output_dir, exist_ok=True)

        for epoch in range(epochs):
            model.train()
            total_loss = 0
            total_ce_loss = 0
            total_supcon_loss = 0

            for batch in train_dataloader:
                input_ids, attention_mask, features, Bottid_encoded, labels = [t.to(device) for t in batch]
                model.zero_grad()

                logits, embeddings = model(input_ids, attention_mask, features, Bottid_encoded)

                ce_loss = loss_fn(logits, labels)

                normalized_embeddings = nn.functional.normalize(embeddings, p=2, dim=1)
                supcon_loss = supcon_loss_fn(normalized_embeddings, labels)

                loss = ce_loss + supcon_weight * supcon_loss

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

                total_loss += loss.item()
                total_ce_loss += ce_loss.item()
                total_supcon_loss += supcon_loss.item()

            avg_train_loss = total_loss / len(train_dataloader)
            avg_ce_loss = total_ce_loss / len(train_dataloader)
            avg_supcon_loss = total_supcon_loss / len(train_dataloader)

            # Validation loss uses the classification term only.
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    input_ids, attention_mask, features, Bottid_encoded, labels = [t.to(device) for t in batch]
                    logits, _ = model(input_ids, attention_mask, features, Bottid_encoded)
                    val_loss += loss_fn(logits, labels).item()

            avg_val_loss = val_loss / len(val_dataloader)
            print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f} (CE: {avg_ce_loss:.4f}, SupCon: {avg_supcon_loss:.4f}), Validation Loss: {avg_val_loss:.4f}")

            f1_score, _, _ = generate_classification_report(model, val_dataloader, num_classes, epoch=epoch+1, version=version, split_name="Val")

            if f1_score > best_f1:
                best_f1 = f1_score
                best_epoch = epoch + 1
                patience_counter = 0
                # state_dict() returns references to the live parameter tensors,
                # which later optimizer steps overwrite in place. Snapshot a
                # detached copy (on CPU, to spare GPU memory) so the best epoch
                # can really be restored afterwards.
                best_model_state = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in model.state_dict().items()
                }
                print(f"New best F1 score: {best_f1:.4f} at epoch {epoch+1}.")

            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break

        if best_model_state is not None:
            model.load_state_dict(best_model_state)
            model_filename = f"{output_dir}/model_version_{version}.pth"
            torch.save(model.state_dict(), model_filename)
            print(f"Best model (version {version}) saved to {model_filename} with F1 {best_f1:.4f}")

            if test_sets is not None:
                print("Evaluating on multiple test sets...")
                evaluate_on_multiple_test_sets(model, test_sets, num_classes=num_classes, version=version)
                print("Evaluation on multiple test sets complete.")

        print(f"Training completed. Best F1 score: {best_f1:.4f} achieved at epoch {best_epoch}.")
        return best_f1

    # Main loop
    for version in version_list:
        print(f"\n----- Running with {version} -----")

        # Tokenize both splits with the tokenizer that matches this model version.
        tokenizer = load_tokenizer(version)
        tokenized_datasets = {split: data.map(lambda examples: tokenize_function(examples, tokenizer), batched=True) for split, data in dataset.items()}
        train_dataset = tokenized_datasets["train"]
        test_dataset = tokenized_datasets["test"]

        num_Bottid_categories = train_encoded.shape[1]
        train_data = CustomDataset(train_dataset, Bottid_categories=num_Bottid_categories)
        test_data = CustomDataset(test_dataset, Bottid_categories=num_Bottid_categories)

        # Stratified random subsets of the test split for the MCCV evaluation.
        test_sets = create_test_sets(test_data)

        # Reading the label column directly avoids decoding every tokenized row.
        label_counts = Counter(train_dataset["label"])
        print("Original label distribution:", label_counts)

        train_dataloader = DataLoader(train_data, batch_size=default_batch_size, shuffle=True)
        # NOTE(review): this loader wraps the *test* split, so early stopping and
        # best-epoch selection see the same data that is reported on below.
        # Consider carving a validation split out of the training data.
        val_dataloader = DataLoader(test_data, batch_size=default_batch_size)

        # Per-class weights for the cross-entropy loss (classes 0, 1, 2).
        normalized_weights = torch.tensor([1.0, 2, 1.3])
        loss_fn = nn.CrossEntropyLoss(weight=normalized_weights.to(device), label_smoothing=0.1)

        supcon_loss_fn = losses.SupConLoss(temperature=supcon_temperature)

        model = BertClassifier(version, num_labels=3, num_Bottid_categories=num_Bottid_categories).to(device)

        # Standard Adam optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=default_lr)

        # Pass the configured `patience` explicitly; otherwise the function's
        # own default is used and the hyperparameter above has no effect.
        # `test_sets` is not forwarded because the evaluation is run once, below.
        train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, epochs=num_epochs, loss_fn=loss_fn, supcon_loss_fn=supcon_loss_fn, supcon_weight=supcon_weight, patience=patience, num_classes=3, version=version)

        evaluate_on_multiple_test_sets(model, test_sets, num_classes=3, version=version)
        generate_classification_report(model, val_dataloader, num_classes=3, version=version)

        # COMPUTE AND SAVE FEATURE IMPORTANCE
        print("\n----- Computing Feature Importance -----")
        # `tokenizer` is a required positional argument of compute_feature_importance.
        importance_dict = compute_feature_importance(model, val_dataloader, tokenizer, num_features=len(FEATURE_NAMES), num_Bottid_categories=num_Bottid_categories)

        # Save to Google Drive
        save_path = "/content/drive/MyDrive/BERTearningsdata/"
        df_features, df_bottid = plot_feature_importance(importance_dict, FEATURE_NAMES, save_path, version)

        print("\nTop 5 Most Important Numerical Features:")
        print(df_features.head())
        print("\nTop 5 Most Important Bottid Categories:")
        print(df_bottid.head())

Using device: cuda


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]


----- Running with bottleneckBERT -----


Map:   0%|          | 0/3902 [00:00<?, ? examples/s]

Map:   0%|          | 0/1672 [00:00<?, ? examples/s]

Original label distribution: Counter({0.0: 2107, 1.0: 1148, 2.0: 647})


Some weights of BertModel were not initialized from the model checkpoint at colaguo/bottleneckBERTsmall and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/200, Training Loss: 1.5019 (CE: 1.1585, SupCon: 3.4335), Validation Loss: 1.1442


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



    Classification Report (Val, Version: bottleneckBERT, Epoch 1):

                  precision    recall  f1-score   support

           0     0.5359    1.0000    0.6978       896
           1     0.0000    0.0000    0.0000       481
           2     0.0000    0.0000    0.0000       295

    accuracy                         0.5359      1672
   macro avg     0.1786    0.3333    0.2326      1672
weighted avg     0.2872    0.5359    0.3740      1672


    
Confusion Matrix:
            Predicted
           0    1    2
Actual
      0   896    0    0
      1   481    0    0
      2   295    0    0

    
New best F1 score: 0.3740 at epoch 1.
Epoch 2/200, Training Loss: 1.4914 (CE: 1.1480, SupCon: 3.4335), Validation Loss: 1.1304


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



    Classification Report (Val, Version: bottleneckBERT, Epoch 2):

                  precision    recall  f1-score   support

           0     0.5359    1.0000    0.6978       896
           1     0.0000    0.0000    0.0000       481
           2     0.0000    0.0000    0.0000       295

    accuracy                         0.5359      1672
   macro avg     0.1786    0.3333    0.2326      1672
weighted avg     0.2872    0.5359    0.3740      1672


    
Confusion Matrix:
            Predicted
           0    1    2
Actual
      0   896    0    0
      1   481    0    0
      2   295    0    0

    
Epoch 3/200, Training Loss: 1.4717 (CE: 1.1283, SupCon: 3.4337), Validation Loss: 1.1079


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



    Classification Report (Val, Version: bottleneckBERT, Epoch 3):

                  precision    recall  f1-score   support

           0     0.5359    1.0000    0.6978       896
           1     0.0000    0.0000    0.0000       481
           2     0.0000    0.0000    0.0000       295

    accuracy                         0.5359      1672
   macro avg     0.1786    0.3333    0.2326      1672
weighted avg     0.2872    0.5359    0.3740      1672


    
Confusion Matrix:
            Predicted
           0    1    2
Actual
      0   896    0    0
      1   481    0    0
      2   295    0    0

    
Epoch 4/200, Training Loss: 1.4365 (CE: 1.0926, SupCon: 3.4389), Validation Loss: 1.0639


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



    Classification Report (Val, Version: bottleneckBERT, Epoch 4):

                  precision    recall  f1-score   support

           0     0.5370    0.9967    0.6979       896
           1     0.3333    0.0062    0.0122       481
           2     0.0000    0.0000    0.0000       295

    accuracy                         0.5359      1672
   macro avg     0.2901    0.3343    0.2367      1672
weighted avg     0.3837    0.5359    0.3775      1672


    
Confusion Matrix:
            Predicted
           0    1    2
Actual
      0   893    3    0
      1   478    3    0
      2   292    3    0

    
New best F1 score: 0.3775 at epoch 4.
Epoch 5/200, Training Loss: 1.3805 (CE: 1.0359, SupCon: 3.4454), Validation Loss: 1.0175

    Classification Report (Val, Version: bottleneckBERT, Epoch 5):

                  precision    recall  f1-score   support

           0     0.5656    0.9665    0.7136       896
           1     0.3554    0.0894    0.1429       481
           2     0.4000    0.