<a href="https://colab.research.google.com/github/col-a-guo/guo_chen_jang_ms_project/blob/main/BERTearnings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the packages this notebook needs into the current runtime (IPython shell escape).
# NOTE(review): imblearn and pytorch_metric_learning are not imported by any cell visible here — confirm they are still required.
!pip install torch transformers datasets torchmetrics scikit-learn numpy huggingface-hub pandas imblearn pytorch_metric_learning

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive; the data cells below read from that path.
drive.mount('/content/drive', force_remount=True)

In [None]:
import zipfile
import os

# Directory on the mounted Drive that holds the train/test CSV files.
FILE_PATH = '/content/drive/MyDrive/BERTearningsdata/'
# Sanity check: list the directory so a missing mount or folder fails early.
print(os.listdir(FILE_PATH))

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset, Subset
from transformers import get_scheduler
from datasets import load_dataset
from collections import Counter
import torchmetrics
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, classification_report
import random
import numpy as np
from huggingface_hub import PyTorchModelHubMixin
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from torch.optim.lr_scheduler import LambdaLR, ExponentialLR
import os

if __name__ == "__main__":

    # Seed the Python, NumPy and PyTorch (CPU + all CUDA devices) random generators.
    seed_value = 1
    random.seed(seed_value)
    np.random.seed(seed_value)
    # Use the GPU when one is available, otherwise fall back to CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    # Backbone names to train; each must be a version accepted by BertClassifier / load_tokenizer.
    version_list = ["bottleneckBERT"]
    # Default hyperparameters
    default_lr = 2e-5  # learning rate handed to AdamW
    target_lr = 9e-6  # learning rate the exponential decay is computed to reach
    default_eps = 6.748313060587885e-08  # AdamW eps
    default_batch_size = 32  # batch size for every DataLoader in this script
    num_epochs = 200  # upper bound on epochs; early stopping may end training sooner
    patience = 10  # epochs without a validation-F1 improvement before early stopping
    warmup_proportion = 0.2  # fraction of total optimizer steps spent in linear warmup
    focal_weight = 0  # multiplier on the focal term of the loss; 0 leaves only cross-entropy
    focal_alpha = 0.25  # FocalLoss alpha
    focal_gamma = 2.0  # FocalLoss gamma

    # function to generate classification report for binary classification
    def generate_classification_report(model, dataloader, num_classes, epoch=None, version=None, split_name="Test", model_name=""):
        """
        Run `model` over `dataloader`, print and log a classification report, and return the weighted F1.

        Args:
            model: module called as model(input_ids, attention_mask, features, Bottid_encoded)
                and returning (logits, embeddings).
            dataloader: yields 5-tuples (input_ids, attention_mask, features, Bottid_encoded, labels).
            num_classes: number of classes; class ids are taken to be 0..num_classes-1.
            epoch: epoch number shown in the header, or None to show 'Final'.
            version: model version string shown in the header.
            split_name: name of the evaluated split shown in the header.
            model_name: suffix of the log file `classification_report_{model_name}.txt`.

        Returns:
            (weighted_f1, predictions, labels), the last two as NumPy arrays.
        """
        model.eval()
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in dataloader:
                input_ids, attention_mask, features, Bottid_encoded, labels = [t.to(device) for t in batch]
                logits, _ = model(input_ids, attention_mask, features, Bottid_encoded)
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        all_preds = np.array(all_preds)
        all_labels = np.array(all_labels)

        class_ids = list(range(num_classes))
        # Pass `labels` explicitly: without it scikit-learn infers the classes from the
        # data and raises when fewer than `num_classes` of them show up (target_names
        # length mismatch). zero_division=0 keeps empty classes at 0 without warnings.
        report_kwargs = dict(labels=class_ids, target_names=[str(i) for i in class_ids], zero_division=0)

        report = classification_report(all_labels, all_preds, digits=4, **report_kwargs)

        cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
        cm_report = "\nConfusion Matrix:\n"
        cm_report += "            Predicted\n"
        cm_report += "           " + "    ".join(map(str, class_ids)) + "\n"
        cm_report += "Actual\n"
        for i, row in enumerate(cm):
            cm_report += f"      {i}   " + "    ".join(map(str, row)) + "\n"

        final_report = f"""
    Classification Report ({split_name}, Version: {version}, Epoch {epoch if epoch is not None else 'Final'}):\n
    {report}\n
    {cm_report}
    """

        print(final_report)
        # Append so that reports from successive epochs accumulate in a single file.
        with open(f"classification_report_{model_name}.txt", "a") as f:
            f.write(final_report + "\n")

        f1 = classification_report(all_labels, all_preds, output_dict=True, **report_kwargs)['weighted avg']['f1-score']

        return f1, all_preds, all_labels

    def create_test_sets(test_dataset, num_sets=10, subset_size=0.9, num_classes=2):
        """
        Draw `num_sets` random, per-class subsamples of `test_dataset`.

        Args:
            test_dataset: indexable dataset whose items carry the label as their last element.
            num_sets: number of subsets to build.
            subset_size: fraction of each class's samples kept in every subset.
            num_classes: labels 0..num_classes-1 are sampled; other labels are ignored.

        Returns:
            List of `torch.utils.data.Subset` objects, one per subset.
        """
        # Bucket sample indices by label in a single pass over the dataset. Each item
        # access may be expensive, so the dataset is not re-scanned once per class.
        label_indices = {label_idx: [] for label_idx in range(num_classes)}
        for i, item in enumerate(test_dataset):
            label = item[-1]
            for label_idx in range(num_classes):
                if label == label_idx:
                    label_indices[label_idx].append(i)
                    break

        # Calculate the number of samples to select for each label in each subset
        num_samples_per_label = {label_idx: int(len(indices) * subset_size)
                                 for label_idx, indices in label_indices.items()}

        test_sets = []
        for _ in range(num_sets):
            subset_indices = []
            for label_idx in range(num_classes):
                subset_indices.extend(
                    random.sample(label_indices[label_idx], num_samples_per_label[label_idx])
                )

            # Shuffle so the subset is not ordered class by class.
            random.shuffle(subset_indices)
            test_sets.append(Subset(test_dataset, subset_indices))

        return test_sets

    def evaluate_on_multiple_test_sets(model, test_sets, num_classes=2, version=None, model_name=""):
        """
        Evaluate `model` on every dataset in `test_sets` and summarise the metrics.

        Each test set gets its own report via `generate_classification_report`. The mean
        and standard deviation of every per-class, macro-average and weighted-average
        metric are then printed and appended to `classification_report_{model_name}.txt`.

        Args:
            model: model to evaluate.
            test_sets: iterable of datasets accepted by `DataLoader`.
            num_classes: number of classes; class ids are 0..num_classes-1.
            version: model version string forwarded to the per-set report.
            model_name: suffix of the log file name.

        Returns:
            Dict mapping `<metric>_avg` and `<metric>_std` to floats.
        """
        class_ids = list(range(num_classes))
        class_names = [str(class_idx) for class_idx in class_ids]

        all_reports = []
        for set_idx, test_set in enumerate(test_sets):
            dataloader = DataLoader(test_set, batch_size=default_batch_size)
            _, preds, labels = generate_classification_report(model, dataloader, num_classes, version=version, split_name=f"Test Set {set_idx+1}", model_name=model_name)
            # `labels=` keeps the report well-formed even if a class is missing from a subset.
            all_reports.append(classification_report(labels, preds, labels=class_ids, target_names=class_names, output_dict=True, zero_division=0))

        stat_names = ('precision', 'recall', 'f1-score', 'support')
        metrics = {}
        for class_name in class_names:
            for stat in stat_names:
                metrics[f'{stat}_{class_name}'] = [report[class_name][stat] for report in all_reports]
        for report_key, prefix in (('macro avg', 'macro_avg'), ('weighted avg', 'weighted_avg')):
            for stat in stat_names:
                metrics[f'{prefix}_{stat}'] = [report[report_key][stat] for report in all_reports]

        results = {}
        for metric_name, values in metrics.items():
            results[metric_name + "_avg"] = np.mean(values)
            results[metric_name + "_std"] = np.std(values)

        # Build the summary from the base metric names. Matching on the substring "_avg"
        # and rewriting it with str.replace also hits the "_avg" inside "macro_avg_*" and
        # "weighted_avg_*", which silently dropped those lines from the report.
        final_report = "Averaged performance across all test sets:\n"
        for metric_name in metrics:
            avg_name = metric_name + "_avg"
            std_name = metric_name + "_std"
            final_report += f"{avg_name}: {results[avg_name]:.4f} +/- {results[std_name]:.4f}\n"

        print(final_report)
        with open(f"classification_report_{model_name}.txt", "a") as f:
            f.write(final_report + "\n")

        return results

    # Main model architecture - modified for binary classification
    class BertClassifier(nn.Module, PyTorchModelHubMixin):
        """
        Classifier that fuses a pretrained BERT-style encoder with two side inputs.

        The mean-pooled encoder output, a 13-value numeric feature vector and a one-hot
        Bottid vector are each projected by a small Linear+ReLU branch. The three
        projections are concatenated, passed through another Linear+ReLU layer and
        finally mapped to `num_labels` logits.

        Args:
            version: "bert-uncased", "businessBERT" or "bottleneckBERT".
            num_labels: number of output classes.
            freeze_bert: if True, the encoder's parameters are excluded from training.
            num_Bottid_categories: width of the one-hot Bottid input.

        Raises:
            ValueError: if `version` is not one of the supported names.
        """

        def __init__(self, version, num_labels=2, freeze_bert=False, num_Bottid_categories=30):
            super().__init__()

            # Map the version name onto its Hugging Face checkpoint id.
            checkpoint = None
            for known_version, repo_id in (
                ("bert-uncased", 'google-bert/bert-base-uncased'),
                ("businessBERT", 'pborchert/BusinessBERT'),
                ("bottleneckBERT", 'colaguo/bottleneckBERTsmall'),
            ):
                if version == known_version:
                    checkpoint = repo_id
                    break
            if checkpoint is None:
                raise ValueError(f"Invalid model version: {version}")

            self.bert = AutoModel.from_pretrained(checkpoint)
            self.version = version

            # Branch widths; the fusion layer consumes their concatenation.
            text_dim, feature_dim, bottid_dim, fused_dim = 128, 16, 8, 32

            # The layers are created in a fixed order so that seeded initialisation
            # and parameter ordering stay stable.
            self.linear_features = nn.Sequential(nn.Linear(13, feature_dim), nn.ReLU())
            self.linear_Bottid = nn.Sequential(nn.Linear(num_Bottid_categories, bottid_dim), nn.ReLU())
            self.cls_head = nn.Sequential(nn.Linear(self.bert.config.hidden_size, text_dim), nn.ReLU())
            self.linear_combined_layer = nn.Sequential(
                nn.Linear(text_dim + feature_dim + bottid_dim, fused_dim),
                nn.ReLU(),
            )
            self.final_classifier = nn.Linear(fused_dim, num_labels)
            self.pooling = nn.AdaptiveAvgPool1d(1)

            if freeze_bert:
                self.bert.requires_grad_(False)

        def forward(self, input_ids, attention_mask, features, Bottid_encoded):
            """Return `(logits, fused_embedding)` for a batch of inputs."""
            encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)

            # Swap the last two axes so the 1-D pool averages across sequence positions.
            # NOTE(review): the average covers every position; attention_mask is not
            # applied to it, so padded positions contribute as well — confirm intended.
            token_states = encoder_out.last_hidden_state.permute(0, 2, 1)
            sentence_vector = self.pooling(token_states).squeeze(-1)

            text_branch = self.cls_head(sentence_vector)
            feature_branch = self.linear_features(features)
            bottid_branch = self.linear_Bottid(Bottid_encoded)

            fused_input = torch.cat((text_branch, feature_branch, bottid_branch), dim=1)
            fused_embedding = self.linear_combined_layer(fused_input)

            return self.final_classifier(fused_embedding), fused_embedding

    def load_tokenizer(version):
        """
        Return the pretrained tokenizer that belongs to the model `version`.

        Raises:
            ValueError: if `version` is not "bert-uncased", "businessBERT" or "bottleneckBERT".
        """
        known_checkpoints = (
            ("bert-uncased", 'google-bert/bert-base-uncased'),
            ("businessBERT", 'pborchert/BusinessBERT'),
            ("bottleneckBERT", 'colaguo/bottleneckBERTsmall'),
        )
        for known_version, repo_id in known_checkpoints:
            if version == known_version:
                return AutoTokenizer.from_pretrained(repo_id)
        raise ValueError(f"Invalid model version: {version}")

    # Load original dataset
    # The train and test splits share a file-name suffix and differ only in their prefix.
    ogpath = "nov25_combined.csv"
    data_dir = "/content/drive/MyDrive/BERTearningsdata/"
    train_df_original = pd.read_csv(f"{data_dir}train_{ogpath}")
    test_df_original = pd.read_csv(f"{data_dir}test_{ogpath}")

    def tokenize_function(examples, tokenizer):
        """Tokenize the "paragraph" field of `examples`, padding/truncating to 512 tokens."""
        paragraphs = examples["paragraph"]
        encode_options = {"padding": "max_length", "truncation": True, "max_length": 512}
        return tokenizer(paragraphs, **encode_options)

    class CustomDataset(Dataset):
        """
        Torch dataset that turns one row of a tokenized table into model-ready tensors.

        Each item is the tuple
        `(input_ids, attention_mask, features, Bottid_encoded, label)`, where `features`
        holds 13 numeric columns as floats and `Bottid_encoded` holds the columns
        `Bottid_0 .. Bottid_{Bottid_categories-1}` as floats.
        """

        def __init__(self, dataset, Bottid_categories=30):
            self.dataset = dataset
            self.Bottid_categories = Bottid_categories

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, idx):
            row = self.dataset[idx]

            input_ids = torch.tensor(row['input_ids'])
            attention_mask = torch.tensor(row['attention_mask'])
            label = torch.tensor(row['label'], dtype=torch.long)

            # Column order defines the layout of the feature vector fed to the model.
            feature_columns = (
                'year', 'word_count', 'scarcity', 'nonuniform_progress',
                'performance_constraints', 'user_heterogeneity', 'cognitive',
                'external', 'internal', 'coordination', 'transactional',
                'technical', 'demand',
            )
            features = torch.tensor([row[column] for column in feature_columns], dtype=torch.float)

            bottid_values = [row[f"Bottid_{i}"] for i in range(self.Bottid_categories)]
            Bottid_encoded = torch.tensor(bottid_values, dtype=torch.float)

            return input_ids, attention_mask, features, Bottid_encoded, label

    class FocalLoss(nn.Module):
        """
        Focal loss built on top of cross-entropy.

        Per sample the loss is `alpha * (1 - p_t) ** gamma * CE`, where
        `p_t = exp(-CE)`. `reduction` selects 'mean', 'sum', or (for any other value)
        the unreduced per-sample losses.
        """

        def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
            super().__init__()
            self.alpha = alpha
            self.gamma = gamma
            self.reduction = reduction

        def forward(self, inputs, targets):
            """
            Compute the focal loss.

            Args:
                inputs: logits from model (batch_size, num_classes)
                targets: ground truth labels (batch_size)
            """
            per_sample_ce = F.cross_entropy(inputs, targets, reduction='none')
            # exp(-CE) recovers the probability assigned to the true class.
            prob_true = torch.exp(-per_sample_ce)
            per_sample_focal = self.alpha * (1 - prob_true) ** self.gamma * per_sample_ce

            if self.reduction == 'mean':
                return per_sample_focal.mean()
            if self.reduction == 'sum':
                return per_sample_focal.sum()
            return per_sample_focal

    def get_exponential_warmup_schedule(optimizer, warmup_steps, initial_lr, target_lr, num_epochs, total_steps):
        """
        Build a linear-warmup scheduler and an exponential-decay scheduler for `optimizer`.

        Args:
            optimizer: optimizer both schedulers are attached to.
            warmup_steps: number of steps over which the warmup factor rises from 0 to 1.
            initial_lr: learning rate the decay starts from.
            target_lr: learning rate the decay should reach after the remaining steps.
            num_epochs: accepted for interface compatibility; not used here.
            total_steps: total number of optimizer steps planned for training.

        Returns:
            (warmup_scheduler, decay_scheduler)
        """
        def warmup_lr_lambda(current_step):
            if current_step < warmup_steps:
                return float(current_step) / float(max(1, warmup_steps))
            return 1.0

        warmup_scheduler = LambdaLR(optimizer, lr_lambda=warmup_lr_lambda)

        # Per-step factor such that initial_lr * gamma ** decay_steps == target_lr.
        # With no steps left after warmup there is nothing to decay over: use gamma = 1
        # rather than dividing by zero (or growing the LR when warmup exceeds the total).
        decay_steps = total_steps - warmup_steps
        if decay_steps > 0:
            decay_rate = (target_lr / initial_lr) ** (1 / decay_steps)
        else:
            decay_rate = 1.0
        decay_scheduler = ExponentialLR(optimizer, gamma=decay_rate)

        return warmup_scheduler, decay_scheduler

    def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, warmup_scheduler, decay_scheduler,
                          epochs, loss_fn, focal_loss_fn, focal_weight, patience, num_classes,
                          version, test_sets, model_name):
        """
        Train `model` with early stopping on validation F1, then restore and save the best weights.

        Args:
            model: model returning (logits, embeddings).
            train_dataloader, val_dataloader: loaders yielding
                (input_ids, attention_mask, features, Bottid_encoded, labels).
            optimizer: optimizer stepped once per batch.
            warmup_scheduler, decay_scheduler: learning-rate schedulers; warmup is stepped
                only during the first `warmup_steps` batches, decay after every batch.
            epochs: maximum number of epochs.
            loss_fn: primary loss (also used as the validation loss).
            focal_loss_fn: secondary loss added as `focal_weight * focal_loss`.
            focal_weight: multiplier of the focal term.
            patience: epochs without F1 improvement tolerated before stopping.
            num_classes: number of classes, forwarded to the reports.
            version: model version string used in logs and in the checkpoint file name.
            test_sets: optional list of datasets evaluated with the best weights; None to skip.
            model_name: used for the output directory and report file names.

        Returns:
            (best_f1, model) with the best-scoring weights loaded (if any epoch improved F1).

        Note:
            `warmup_steps` is not a parameter; it is read from the enclosing module scope
            and must be defined before this function is called.
        """
        model.to(device)
        best_f1 = 0.0
        patience_counter = 0
        current_step = 0
        best_epoch = 0
        output_dir = f"model_output_{model_name}"
        best_model_state = None

        os.makedirs(output_dir, exist_ok=True)

        for epoch in range(epochs):
            model.train()
            total_loss = 0
            total_ce_loss = 0
            total_focal_loss = 0

            for batch in train_dataloader:
                input_ids, attention_mask, features, Bottid_encoded, labels = [t.to(device) for t in batch]
                model.zero_grad()

                logits, embeddings = model(input_ids, attention_mask, features, Bottid_encoded)

                # Cross-entropy loss
                ce_loss = loss_fn(logits, labels)

                # Focal loss
                focal_loss = focal_loss_fn(logits, labels)

                # Combined loss
                loss = ce_loss + focal_weight * focal_loss

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

                if current_step < warmup_steps:
                    warmup_scheduler.step()
                decay_scheduler.step()

                current_step += 1
                total_loss += loss.item()
                total_ce_loss += ce_loss.item()
                total_focal_loss += focal_loss.item()

            avg_train_loss = total_loss / len(train_dataloader)
            avg_ce_loss = total_ce_loss / len(train_dataloader)
            avg_focal_loss = total_focal_loss / len(train_dataloader)

            model.eval()
            val_loss = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    input_ids, attention_mask, features, Bottid_encoded, labels = [t.to(device) for t in batch]
                    logits, _ = model(input_ids, attention_mask, features, Bottid_encoded)
                    val_loss += loss_fn(logits, labels).item()

            avg_val_loss = val_loss / len(val_dataloader)
            print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f} (CE: {avg_ce_loss:.4f}, Focal: {avg_focal_loss:.4f}), Validation Loss: {avg_val_loss:.4f}")

            f1_score, _, _ = generate_classification_report(model, val_dataloader, num_classes, epoch=epoch+1, version=version, split_name="Val", model_name=model_name)

            if f1_score > best_f1:
                best_f1 = f1_score
                best_epoch = epoch + 1
                patience_counter = 0
                # state_dict() returns references to the live parameter tensors, which
                # later optimizer steps overwrite in place. Snapshot copies (on CPU, to
                # spare accelerator memory) so the "best" weights really are this epoch's.
                best_model_state = {
                    name: tensor.detach().to("cpu", copy=True)
                    for name, tensor in model.state_dict().items()
                }
                print(f"New best F1 score: {best_f1:.4f} at epoch {epoch+1}.")
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break

        if best_model_state is not None:
            model.load_state_dict(best_model_state)
            model_filename = f"{output_dir}/model_version_{version}.pth"
            torch.save(model.state_dict(), model_filename)
            print(f"Best model (version {version}) saved to {model_filename} with F1 {best_f1:.4f}")

            if test_sets is not None:
                print("Evaluating on multiple test sets...")
                evaluate_on_multiple_test_sets(model, test_sets, num_classes=num_classes, version=version, model_name=model_name)
                print("Evaluation on multiple test sets complete.")

        print(f"Training completed. Best F1 score: {best_f1:.4f} achieved at epoch {best_epoch}.")
        return best_f1, model

    def prepare_dataset(train_df, test_df, binary_model):
        """
        Relabel the data for one binary sub-task and one-hot encode `Bottid`.

        "model1" maps label 0 -> 0 and every other label -> 1. "model2" keeps only rows
        labelled 1 or 2 and maps 1 -> 0, 2 -> 1. Any other `binary_model` value leaves
        the labels untouched. `Bottid` is replaced by columns `Bottid_0 .. Bottid_29`
        (categories 1..30; unknown values encode as all zeros).

        Returns:
            (train_df, test_df, 30) — new frames plus the number of Bottid columns.
        """
        # Work on copies so the caller's frames are never modified.
        frames = [train_df.copy(), test_df.copy()]

        if binary_model == "model1":
            # Model 1: Binary classification - label 0 vs label 1/2
            for frame in frames:
                frame['label'] = frame['label'].apply(lambda x: 0 if x == 0 else 1)
        elif binary_model == "model2":
            # Model 2: Binary classification - label 1 vs label 2
            frames = [frame[frame['label'].isin([1, 2])].copy() for frame in frames]
            for frame in frames:
                frame['label'] = frame['label'].apply(lambda x: 0 if x == 1 else 1)

        # Fixed category list keeps the encoded width at 30 whatever values occur.
        encoder = OneHotEncoder(categories=[list(range(1, 31))], handle_unknown='ignore')
        encoder.fit(frames[0][['Bottid']])
        feature_names = [f"Bottid_{i}" for i in range(30)]

        prepared = []
        for frame in frames:
            onehot = pd.DataFrame(encoder.transform(frame[['Bottid']]).toarray(), columns=feature_names)
            # Reset the index so the row-wise concat lines up after any filtering.
            merged = pd.concat([frame.reset_index(drop=True), onehot], axis=1)
            prepared.append(merged.drop('Bottid', axis=1))

        prepared_train, prepared_test = prepared
        return prepared_train, prepared_test, 30  # Always return 30 categories

    def truncate_dataset(dataset):
        """Return a random 99% sample (without replacement) of `dataset` via `dataset.select`."""
        total = len(dataset)
        keep = round(total * 0.99)
        chosen = random.sample(range(total), keep)
        return dataset.select(chosen)

    # Dictionary to store trained models
    # Maps "model1" / "model2" to the trained model plus the tokenizer and metadata
    # needed to reuse it for the hierarchical evaluation.
    trained_models = {}

    # Train both models
    for binary_model in ["model1", "model2"]:
        print(f"\n{'='*80}")
        if binary_model == "model1":
            print("TRAINING MODEL 1: Label 0 vs Label 1/2")
        else:
            print("TRAINING MODEL 2: Label 1 vs Label 2")
        print(f"{'='*80}\n")

        # Prepare dataset
        train_df, test_df, num_Bottid_categories = prepare_dataset(train_df_original, test_df_original, binary_model)

        # Convert to HuggingFace datasets
        from datasets import Dataset as HFDataset
        dataset = {
            'train': HFDataset.from_pandas(train_df),
            'test': HFDataset.from_pandas(test_df)
        }
        # truncate_dataset keeps a random 99% of each split (train and test alike).
        dataset = {k: truncate_dataset(v) for k, v in dataset.items()}

        for version in version_list:
            print(f"\n----- Running {binary_model} with {version} -----")

            tokenizer = load_tokenizer(version)
            tokenized_datasets = {split: data.map(lambda examples: tokenize_function(examples, tokenizer), batched=True)
                                 for split, data in dataset.items()}
            train_dataset = tokenized_datasets["train"]
            test_dataset = tokenized_datasets["test"]

            train_data = CustomDataset(train_dataset, Bottid_categories=num_Bottid_categories)
            test_data = CustomDataset(test_dataset, Bottid_categories=num_Bottid_categories)

            # Random per-class subsamples of the test split, used for the mean/std report.
            test_sets = create_test_sets(test_data, num_classes=2)

            train_labels = [item['label'] for item in train_dataset]
            label_counts = Counter(train_labels)
            print("Binary label distribution:", label_counts)

            train_data_loader = DataLoader(train_data, batch_size=default_batch_size, shuffle=True)

            # Binary class weights
            # NOTE(review): both branches currently assign the same weights.
            if binary_model == "model1":
                normalized_weights = torch.tensor([1.0, 1.3])
            else:
                normalized_weights = torch.tensor([1.0, 1.3])

            loss_fn = nn.CrossEntropyLoss(weight=normalized_weights.to(device), label_smoothing=0.1)
            focal_loss_fn = FocalLoss(alpha=focal_alpha, gamma=focal_gamma)

            model = BertClassifier(version, num_labels=2, num_Bottid_categories=num_Bottid_categories).to(device)

            train_dataloader = train_data_loader
            # NOTE(review): the validation loader is built from the test split, so early
            # stopping and best-model selection see the same data the final test sets are
            # sampled from — confirm this is intended or carve out a separate validation split.
            val_dataloader = DataLoader(test_data, batch_size=default_batch_size)

            optimizer = torch.optim.AdamW(model.parameters(), lr=default_lr, eps=default_eps)

            total_steps = len(train_dataloader) * num_epochs
            # train_and_evaluate reads `warmup_steps` from this scope rather than as an
            # argument, so the name must stay defined here.
            warmup_steps = int(warmup_proportion * total_steps)

            warmup_scheduler, decay_scheduler = get_exponential_warmup_schedule(
                optimizer, warmup_steps, default_lr, target_lr, num_epochs, total_steps
            )

            best_f1, trained_model = train_and_evaluate(
                model, train_dataloader, val_dataloader, optimizer, warmup_scheduler,
                decay_scheduler, epochs=num_epochs, loss_fn=loss_fn, focal_loss_fn=focal_loss_fn,
                focal_weight=focal_weight, num_classes=2, patience=patience, version=version, test_sets=test_sets,
                model_name=binary_model
            )

            # Store the trained model and its dataset info
            # The key is the sub-task only, so with several entries in version_list the
            # last trained version replaces earlier ones.
            trained_models[binary_model] = {
                'model': trained_model,
                'version': version,
                'num_Bottid_categories': num_Bottid_categories,
                'tokenizer': tokenizer
            }

    # HIERARCHICAL CLASSIFICATION ON ORIGINAL TEST SET
    print(f"\n{'='*80}")
    print("HIERARCHICAL CLASSIFICATION: Combining Both Models on Original Test Set")
    print(f"{'='*80}\n")

    # Prepare original test set (with original 3-class labels)
    test_df_hier = test_df_original.copy()

    # Store original labels
    original_labels = test_df_hier['label'].values

    # One-hot encode for hierarchical prediction with fixed categories
    encoder = OneHotEncoder(categories=[list(range(1, 31))], handle_unknown='ignore')
    encoder.fit(test_df_hier[['Bottid']])
    test_encoded = encoder.transform(test_df_hier[['Bottid']]).toarray()

    # Always create 30 Bottid columns
    feature_names = [f"Bottid_{i}" for i in range(30)]
    test_encoded_df = pd.DataFrame(test_encoded, columns=feature_names)
    test_df_hier = pd.concat([test_df_hier.reset_index(drop=True), test_encoded_df], axis=1)
    test_df_hier = test_df_hier.drop('Bottid', axis=1)

    # Create dataset
    from datasets import Dataset as HFDataset
    test_dataset_hier = HFDataset.from_pandas(test_df_hier)

    # Get model 1 info
    model1_info = trained_models['model1']
    tokenizer = model1_info['tokenizer']

    # Tokenize
    test_dataset_hier = test_dataset_hier.map(lambda examples: tokenize_function(examples, tokenizer), batched=True)
    test_data_hier = CustomDataset(test_dataset_hier, Bottid_categories=30)

    # Hierarchical prediction
    model1 = trained_models['model1']['model']
    model2 = trained_models['model2']['model']

    model1.eval()
    model2.eval()

    hierarchical_predictions = []

    print("Making hierarchical predictions...")
    with torch.no_grad():
        for idx in range(len(test_data_hier)):
            input_ids, attention_mask, features, Bottid_encoded, _ = test_data_hier[idx]

            # Add batch dimension
            input_ids = input_ids.unsqueeze(0).to(device)
            attention_mask = attention_mask.unsqueeze(0).to(device)
            features = features.unsqueeze(0).to(device)
            Bottid_encoded = Bottid_encoded.unsqueeze(0).to(device)

            # First model: 0 vs 1/2
            logits1, _ = model1(input_ids, attention_mask, features, Bottid_encoded)
            pred1 = torch.argmax(logits1, dim=1).item()

            if pred1 == 0:
                # Predicted class 0
                final_pred = 0
            else:
                # Predicted 1 or 2, use model2 to distinguish
                logits2, _ = model2(input_ids, attention_mask, features, Bottid_encoded)
                pred2 = torch.argmax(logits2, dim=1).item()
                # pred2: 0 means original label 1, 1 means original label 2
                final_pred = 1 if pred2 == 0 else 2

            hierarchical_predictions.append(final_pred)

    hierarchical_predictions = np.array(hierarchical_predictions)

    # Generate comprehensive report
    report = classification_report(original_labels, hierarchical_predictions,
                                   target_names=['0', '1', '2'], digits=4)

    cm = confusion_matrix(original_labels, hierarchical_predictions, labels=[0, 1, 2])
    cm_report = "\nConfusion Matrix:\n"
    cm_report += "            Predicted\n"
    cm_report += "           " + "    ".join(map(str, [0, 1, 2])) + "\n"
    cm_report += "Actual\n"
    for i, row in enumerate(cm):
        cm_report += f"      {i}   " + "    ".join(map(str, row)) + "\n"

    final_report = f"""
{'='*80}
HIERARCHICAL CLASSIFICATION REPORT (3-Class on Original Test Set)
Model 1 (0 vs 1/2) -> Model 2 (1 vs 2)
{'='*80}

{report}

{cm_report}
"""

    print(final_report)
    with open("classification_report_hierarchical.txt", "w") as f:
        f.write(final_report + "\n")

    print("\nHierarchical classification complete!")
    print("Results saved to: classification_report_hierarchical.txt")