In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%%capture
import os, re

if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.55.4

!pip install evaluate seaborn

In [3]:
# Install required packages if needed
import os
import time
import random
from datetime import datetime
from typing import List, Dict, Any, Optional
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from sklearn.metrics import classification_report, confusion_matrix
import evaluate
from transformers import pipeline
import torch

print("All imports successful!")


All imports successful!


In [4]:
torch.cuda.memory_summary

In [5]:

# Configuration
MODEL_PATH = "cike-dev/Distilbert_toxic"  # Your fine-tuned DistilBERT model
DATA_PATHS = [
    "cike-dev/gab",
    "cike-dev/olid",
    "cike-dev/hatexplain",
    "cike-dev/stormfront",
    "cike-dev/tdavidson",
]
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/Cyberbullying/tests/distilBERT/"
TEXT_COLUMN = "text"
LABEL_COLUMN = "label"

print(f"Model Path: {MODEL_PATH}")
print(f"Data Paths: {DATA_PATHS}")
print(f"Output Directory: {OUTPUT_DIR}")



Model Path: cike-dev/Distilbert_toxic
Data Paths: ['cike-dev/gab', 'cike-dev/olid', 'cike-dev/hatexplain', 'cike-dev/stormfront', 'cike-dev/tdavidson']
Output Directory: /content/drive/MyDrive/Colab Notebooks/Cyberbullying/tests/distilBERT/


In [6]:
class ModelEvaluator:
    """A class for evaluating fine-tuned DistilBERT models on classification tasks."""

    def __init__(self, model_path: str, device: Optional[str] = None):
        """Initialize the evaluator with a DistilBERT model."""
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        self.model_path = model_path
        self.pipe = self._load_model(model_path)

    def _load_model(self, model_path: str):
        """Load the DistilBERT model and tokenizer using transformers pipeline."""
        print(f"Loading model from {model_path}...")
        pipe = pipeline(
            task="text-classification",
            model=model_path,
            tokenizer=model_path,
            device=0 if self.device == "cuda" else -1
        )
        print("Model loaded successfully!")
        return pipe

    def _generate_prediction(self, input_text: str) -> str:
        """Generate prediction for a single input using the pipeline."""
        with torch.no_grad():
            result = self.pipe(input_text, top_k=1)[0]
            return result['label'].lower()


In [6]:

def evaluate_multiple_datasets(
    evaluator,
    dataset_paths: List[str],
    output_dir: str = "./distilbert_multi_dataset_evaluation/",
    split: str = "test",
    sample_size: Optional[int] = None,
    text_column: str = "text",
    label_column: str = "label",
    system_message: Optional[str] = None,
    save_individual_cms: bool = True,
    save_summary_report: bool = True
) -> Dict[str, Dict[str, Any]]:
    """
    Evaluate a DistilBERT model on multiple datasets and generate comprehensive reports.

    Args:
        evaluator: ModelEvaluator instance
        dataset_paths: List of dataset paths
        output_dir: Directory to save all results
        split: Dataset split to use for evaluation
        text_column: Name of the text column in datasets
        label_column: Name of the label column in datasets
        system_message: Not used for DistilBERT pipeline
        save_individual_cms: Whether to save individual confusion matrices
        save_summary_report: Whether to save summary text report

    Returns:
        Dictionary with results for each dataset
    """

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Store results for all datasets
    all_results = {}
    summary_lines = []

    # Add header to summary
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    summary_lines.extend([
        "="*80,
        "MULTI-DATASET EVALUATION REPORT",
        "="*80,
        f"Evaluation Date: {timestamp}",
        f"Model Path: {evaluator.model_path}",
        f"Number of Datasets: {len(dataset_paths)}",
        f"Split Used: {split}",
        "="*80,
        ""
    ])

    print(f"\n🚀 Starting evaluation on {len(dataset_paths)} datasets...")
    print(f"📁 Results will be saved to: {output_dir}")

    # Process each dataset
    for idx, dataset_path in enumerate(dataset_paths, 1):
        print(f"\n{'='*60}")
        print(f"📊 DATASET {idx}/{len(dataset_paths)}: {dataset_path}")
        print(f"{'='*60}")

        try:
            # Load and prepare dataset
            start_time = time.time()
            dataset_results = _evaluate_single_dataset(
                evaluator=evaluator,
                dataset_path=dataset_path,
                split=split,
                text_column=text_column,
                label_column=label_column,
                output_dir=output_dir,
                save_cm=save_individual_cms,
                sample_size=sample_size,
            )
            end_time = time.time()

            # Store results
            dataset_results['evaluation_time_seconds'] = round(end_time - start_time, 2)
            all_results[dataset_path] = dataset_results

            # Add to summary
            summary_lines.extend(_format_dataset_summary(dataset_path, dataset_results))

            print(f"✅ Completed {dataset_path} in {dataset_results['evaluation_time_seconds']:.1f}s")

            torch.cuda.memory_summary(device=  )

        except Exception as e:
            error_msg = f"❌ Error processing {dataset_path}: {str(e)}"
            print(error_msg)
            summary_lines.extend([
                f"Dataset: {dataset_path}",
                f"Status: FAILED",
                f"Error: {str(e)}",
                "-" * 40,
                ""
            ])
            all_results[dataset_path] = {"status": "failed", "error": str(e)}

    # Generate comparison visualization
    if len([r for r in all_results.values() if r.get('status') != 'failed']) > 1:
        _create_comparison_chart(all_results, output_dir)

    # Save summary report
    if save_summary_report:
        _save_summary_report(summary_lines, all_results, output_dir)

    print(f"\n🎉 Evaluation complete! Results saved to: {output_dir}")
    return all_results

def _evaluate_single_dataset(
    evaluator, dataset_path: str, split: str, text_column: str,
    label_column: str, output_dir: str, save_cm: bool, sample_size: Optional[int] = None
) -> Dict[str, Any]:
    """Evaluate model on a single dataset."""

    # Load dataset
    print(f"📥 Loading {dataset_path}...")
    try:
        dataset = load_dataset(dataset_path)
        if sample_size is not None:
            eval_dataset = dataset[split].select(range(min(sample_size, len(dataset[split]))))
        else:
            eval_dataset = dataset[split]
        print(f"✅ Loaded {len(eval_dataset)} samples")
    except Exception as e:
        raise Exception(f"Failed to load dataset: {e}")

    # Check if columns exist
    if text_column not in eval_dataset.column_names:
        possible_text_cols = ['text', 'cleaned_text', 'content', 'message', 'tweet']
        found_col = None
        for col in possible_text_cols:
            if col in eval_dataset.column_names:
                found_col = col
                break
        if found_col:
            print(f"⚠️  '{text_column}' not found, using '{found_col}' instead")
            text_column = found_col
        else:
            raise Exception(f"Text column not found. Available: {eval_dataset.column_names}")

    if label_column not in eval_dataset.column_names:
        possible_label_cols = ['label', 'labels', 'target', 'class', 'category']
        found_col = None
        for col in possible_label_cols:
            if col in eval_dataset.column_names:
                found_col = col
                break
        if found_col:
            print(f"⚠️  '{label_column}' not found, using '{found_col}' instead")
            label_column = found_col
        else:
            raise Exception(f"Label column not found. Available: {eval_dataset.column_names}")

    # Prepare data
    print("🔄 Preparing data...")
    def convert_to_binary(example):
        raw_label = example.get(label_column)
        if raw_label is None:
            label = "normal"
        else:
            if isinstance(raw_label, (int, float)):
                label = "toxic" if int(raw_label) == 1 else "normal"
            else:
                normalized = str(raw_label).lower().strip()
                if normalized in {'1', 'toxic', 'bully', 'cyberbullying', 'hate'} or 'toxic' in normalized:
                    label = "toxic"
                else:
                    label = "normal"
        return {"text": example.get(text_column, example.get("text", "")), "label": label}

    eval_dataset = eval_dataset.map(convert_to_binary)

    # Run evaluation
    print("🧠 Running model inference...")
    predictions = []
    references = []

    # Prepare log file
    log_file = os.path.join(output_dir, "invalid_predictions.log")
    with open(log_file, "a", encoding="utf-8") as f:
        f.write("=" * 80 + "\n")
        f.write(f"Invalid Predictions Log - Dataset: {dataset_path}\n")
        f.write("=" * 80 + "\n\n")

    def log_invalid(sample, true_label, pred_text):
        """Append invalid predictions to a text file."""
        with open(log_file, "a", encoding="utf-8") as f:
            f.write("#" + "-" * 76 + "#\n")
            f.write(f"Text: {sample['text']}\n")
            f.write(f"True label: {true_label}\n")
            f.write(f"Predicted label: {pred_text}\n")
            f.write("#" + "-" * 76 + "#\n")

    exs = ""
    for i, sample in enumerate(tqdm(eval_dataset, desc="Evaluating", unit="sample", total=len(eval_dataset))):
        try:
            input_text = sample["text"]
            true_label = sample["label"]
            pred_text = evaluator._generate_prediction(input_text)

            # Debug: print raw outputs for the first few toxic samples
            if (i < 5) and (true_label == "toxic"):
                exs += f"Input: {input_text}\nPred label: {pred_text}\nTrue label: {true_label}\n"

            # Classify
            if pred_text not in ["toxic", "normal"]:
                log_invalid(sample, true_label, pred_text)
                continue

            pred_label = 1 if pred_text == "toxic" else 0
            true_label_binary = 1 if true_label == "toxic" else 0

            predictions.append(pred_label)
            references.append(true_label_binary)

        except Exception as e:
            with open(log_file, "a", encoding="utf-8") as f:
                f.write("-----------------------\n")
                f.write(f"Error processing sample: {str(e)}\n")
                f.write("-----------------------\n\n")
            continue

    print("Sample input and pred:\n", exs)

    # Calculate metrics
    print("📊 Calculating metrics...")
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    if not predictions:
        raise ValueError("Predictions list is empty")
    if not references:
        raise ValueError("References list is empty")
    if len(predictions) != len(references):
        raise ValueError(
            f"Length mismatch: predictions={len(predictions)}, references={len(references)}"
        )

    from collections import Counter
    pred_count = Counter(predictions)
    print(f"\nLabel distribution in predictions: {pred_count}\n")

    accuracy = accuracy_metric.compute(predictions=predictions, references=references)["accuracy"]
    f1 = f1_metric.compute(predictions=predictions, references=references, average="weighted")["f1"]

    report = classification_report(
        references, predictions,
        target_names=['normal', 'toxic'],
        digits=4,
        output_dict=True
    )

    cm_path = None
    if save_cm:
        dataset_name = dataset_path.split('/')[-1]
        cm_path = _save_confusion_matrix(references, predictions, output_dir, dataset_name)

    return {
        "status": "success",
        "dataset_size": len(eval_dataset),
        "accuracy": accuracy,
        "f1_score": f1,
        "classification_report": report,
        "confusion_matrix_path": cm_path,
        "text_column_used": text_column,
        "label_column_used": label_column,
        "pred_count": pred_count
    }

def _save_confusion_matrix(references: List[int], predictions: List[int],
                          output_dir: str, dataset_name: str
) -> str:
    """Save confusion matrix for a dataset."""

    cm = confusion_matrix(references, predictions)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=['normal', 'toxic'],
        yticklabels=['normal', 'toxic'],
        cbar=False,
        square=True,
        annot_kws={"fontsize": 14}
    )
    plt.xlabel("Predicted Label", fontsize=12)
    plt.ylabel("True Label", fontsize=12)
    plt.title(f"DistilBERT CM - {dataset_name}", fontsize=14, pad=20)
    plt.tight_layout()

    safe_name = dataset_name.replace('/', '_').replace(' ', '_')
    cm_path = os.path.join(output_dir, f"distilbert_cm_{safe_name}.png")
    plt.savefig(cm_path, dpi=300, bbox_inches='tight')
    plt.close()

    return cm_path

def _format_dataset_summary(dataset_path: str, results: Dict[str, Any]) -> List[str]:
    """Format summary for a single dataset."""

    if results.get('status') == 'failed':
        return [
            f"Dataset: {dataset_path}",
            f"Status: FAILED",
            f"Error: {results.get('error', 'Unknown error')}",
            "-" * 40,
            ""
        ]

    lines = [
        f"Dataset: {dataset_path}",
        f"Samples: {results['dataset_size']:,}",
        f"Accuracy: {results['accuracy']:.4f}",
        f"Weighted F1 Score: {results['f1_score']:.4f}",
        f"Evaluation Time: {results['evaluation_time_seconds']:.1f}s",
    ]

    report = results['classification_report']
    lines.extend([
        "",
        "Detailed Metrics:",
        f"  Normal - Precision: {report['normal']['precision']:.4f}, Recall: {report['normal']['recall']:.4f}, F1: {report['normal']['f1-score']:.4f}",
        f"  Toxic  - Precision: {report['toxic']['precision']:.4f}, Recall: {report['toxic']['recall']:.4f}, F1: {report['toxic']['f1-score']:.4f}",
        f"  Macro Avg - Precision: {report['macro avg']['precision']:.4f}, Recall: {report['macro avg']['recall']:.4f}, F1: {report['macro avg']['f1-score']:.4f}",
        ""
    ])

    if results.get('confusion_matrix_path'):
        lines.append(f"Confusion Matrix: {results['confusion_matrix_path']}")

    lines.extend(["-" * 40, ""])
    return lines

def _create_comparison_chart(all_results: Dict[str, Dict], output_dir: str):
    """Create comparison chart across datasets."""

    successful_results = {k: v for k, v in all_results.items() if v.get('status') == 'success'}

    if len(successful_results) < 2:
        return

    dataset_names = [path.split('/')[-1] for path in successful_results.keys()]
    accuracies = [results['accuracy'] for results in successful_results.values()]
    f1_scores = [results['f1_score'] for results in successful_results.values()]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    bars1 = ax1.bar(dataset_names, accuracies, color='skyblue', alpha=0.8)
    ax1.set_title('distilBERT - Accuracy Comparison Across Datasets', fontsize=14)
    ax1.set_ylabel('Accuracy', fontsize=12)
    ax1.set_ylim(0, 1)
    ax1.tick_params(axis='x', rotation=45)

    for bar, acc in zip(bars1, accuracies):
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{acc:.3f}', ha='center', va='bottom')

    bars2 = ax2.bar(dataset_names, f1_scores, color='lightcoral', alpha=0.8)
    ax2.set_title('F1 Score Comparison Across Datasets', fontsize=14)
    ax2.set_ylabel('F1 Score', fontsize=12)
    ax2.set_ylim(0, 1)
    ax2.tick_params(axis='x', rotation=45)

    for bar, f1 in zip(bars2, f1_scores):
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{f1:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    comparison_path = os.path.join(output_dir, "distilbert_performance_comparison.png")
    plt.savefig(comparison_path, dpi=300, bbox_inches='tight')
    plt.close()

    print(f"📊 Comparison chart saved to: {comparison_path}")

def _save_summary_report(summary_lines: List[str], all_results: Dict[str, Dict], output_dir: str):
    """Save comprehensive summary report to text file."""

    successful_results = [r for r in all_results.values() if r.get('status') == 'success']
    failed_count = len(all_results) - len(successful_results)

    if successful_results:
        avg_accuracy = sum(r['accuracy'] for r in successful_results) / len(successful_results)
        avg_f1 = sum(r['f1_score'] for r in successful_results) / len(successful_results)
        total_samples = sum(r['dataset_size'] for r in successful_results)

        summary_lines.extend([
            "",
            "OVERALL SUMMARY:",
            f"✅ Successful evaluations: {len(successful_results)}",
            f"❌ Failed evaluations: {failed_count}",
            f"📊 Average Accuracy: {avg_accuracy:.4f}",
            f"📊 Average F1 Score: {avg_f1:.4f}",
            f"📈 Total samples evaluated: {total_samples:,}",
            "",
            "="*80,
            "DETAILED RESULTS:",
            "="*80,
            ""
        ])

    report_path = os.path.join(output_dir, "distilbert_evaluation_summary.txt")
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(summary_lines))

    print(f"📄 Summary report saved to: {report_path}")

def run_multi_dataset_evaluation_example():
    """Example of how to use the multi-dataset evaluation function."""

    evaluator = ModelEvaluator(MODEL_PATH)

    results = evaluate_multiple_datasets(
        evaluator=evaluator,
        dataset_paths=DATA_PATHS,
        output_dir=OUTPUT_DIR,
        split="test",
        sample_size=2000,  # For testing, consistent with your example
        text_column=TEXT_COLUMN,
        label_column=LABEL_COLUMN
    )

    return results



In [7]:
# Run the evaluation
results = run_multi_dataset_evaluation_example()

Loading model from cike-dev/Distilbert_toxic...


config.json:   0%|          | 0.00/673 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


Model loaded successfully!

🚀 Starting evaluation on 5 datasets...
📁 Results will be saved to: /content/drive/MyDrive/Colab Notebooks/Cyberbullying/tests/distilBERT/

📊 DATASET 1/5: cike-dev/gab
📥 Loading cike-dev/gab...


README.md:   0%|          | 0.00/310 [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/386k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4000 [00:00<?, ? examples/s]

✅ Loaded 2000 samples
⚠️  'text' not found, using 'cleaned_text' instead
🔄 Preparing data...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

🧠 Running model inference...


Evaluating:   0%|          | 4/2000 [00:00<04:31,  7.36sample/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating: 100%|██████████| 2000/2000 [00:15<00:00, 126.49sample/s]


Sample input and pred:
 Input: two bits of feedback. first, that catholic church shill that calls in is getting annoying as fuck. second, when people ask you to let jews talk when they call in, tell them that they can turn on any msm if they are so desperate to listen to jews. your show is an oasis of jew free content, please dont let them speak there.
Pred label: toxic
True label: toxic
Input: international womens day today eh, hope the ladies are doing their bit to level up the 95 of workplace deaths and 66 suicide rate. i worry for boys growing up now, told they are toxic, brought up with few male role modelsteachers and assumed to be potential rapists.  never mind, we are disposable.
Pred label: toxic
True label: toxic

📊 Calculating metrics...


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]


Label distribution in predictions: Counter({0: 1228, 1: 772})

✅ Completed cike-dev/gab in 26.5s

📊 DATASET 2/5: cike-dev/olid
📥 Loading cike-dev/olid...


README.md:   0%|          | 0.00/310 [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/313k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4000 [00:00<?, ? examples/s]

✅ Loaded 2000 samples
⚠️  'text' not found, using 'cleaned_text' instead
🔄 Preparing data...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

🧠 Running model inference...


Evaluating: 100%|██████████| 2000/2000 [00:08<00:00, 224.67sample/s]


Sample input and pred:
 Input: it makes my skin crawl just looking at this woman  theresa may is a liability to our nation and  and must be removed asap
Pred label: toxic
True label: toxic
Input: how? you cant just say wrong with no evidence for why it is. you said you agree with sargon im pretty much every subject but gun control and healthcare. thats not what a centrist believes.
Pred label: toxic
True label: toxic

📊 Calculating metrics...

Label distribution in predictions: Counter({0: 1180, 1: 820})

✅ Completed cike-dev/olid in 17.2s

📊 DATASET 3/5: cike-dev/hatexplain
📥 Loading cike-dev/hatexplain...


README.md:   0%|          | 0.00/310 [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/345k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4000 [00:00<?, ? examples/s]

✅ Loaded 2000 samples
⚠️  'text' not found, using 'cleaned_text' instead
🔄 Preparing data...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

🧠 Running model inference...


Evaluating: 100%|██████████| 2000/2000 [00:09<00:00, 201.79sample/s]


Sample input and pred:
 Input: low iq mad maxine living up to her dumb nigger name
Pred label: toxic
True label: toxic
Input: another brain fart from morbious stone no actually you just displayed you are some king of retard boomer who can not read rotfl stop sucking alex jewns dick must have just graduated from kinder garden lowest iq list lmao
Pred label: toxic
True label: toxic

📊 Calculating metrics...

Label distribution in predictions: Counter({1: 1219, 0: 781})

✅ Completed cike-dev/hatexplain in 18.3s

📊 DATASET 4/5: cike-dev/stormfront
📥 Loading cike-dev/stormfront...


README.md:   0%|          | 0.00/310 [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2382 [00:00<?, ? examples/s]

✅ Loaded 2000 samples
⚠️  'text' not found, using 'cleaned_text' instead
🔄 Preparing data...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

🧠 Running model inference...


Evaluating: 100%|██████████| 2000/2000 [00:09<00:00, 208.32sample/s]


Sample input and pred:
 Input: the negro is generally terrified of whites , the second you stand up to them they usally back away , thats unless they re carrying a gun at which point the savages are rarely reluctant to use .
Pred label: toxic
True label: toxic

📊 Calculating metrics...

Label distribution in predictions: Counter({0: 1291, 1: 709})

✅ Completed cike-dev/stormfront in 17.9s

📊 DATASET 5/5: cike-dev/tdavidson
📥 Loading cike-dev/tdavidson...


README.md:   0%|          | 0.00/310 [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/209k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4000 [00:00<?, ? examples/s]

✅ Loaded 2000 samples
⚠️  'text' not found, using 'cleaned_text' instead
🔄 Preparing data...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

🧠 Running model inference...


Evaluating: 100%|██████████| 2000/2000 [00:17<00:00, 111.47sample/s]


Sample input and pred:
 Input: im happy for the bung holes and negativity desperate guys spew on here ;a good example of what not to be and what i don;
Pred label: normal
True label: toxic
Input: smh u niggers and ur music knowledge
Pred label: toxic
True label: toxic

📊 Calculating metrics...

Label distribution in predictions: Counter({1: 1062, 0: 938})

✅ Completed cike-dev/tdavidson in 26.2s
📊 Comparison chart saved to: /content/drive/MyDrive/Colab Notebooks/Cyberbullying/tests/distilBERT/distilbert_performance_comparison.png
📄 Summary report saved to: /content/drive/MyDrive/Colab Notebooks/Cyberbullying/tests/distilBERT/distilbert_evaluation_summary.txt

🎉 Evaluation complete! Results saved to: /content/drive/MyDrive/Colab Notebooks/Cyberbullying/tests/distilBERT/
