<center><br><font size=6>Final Project</font><br>
<font size=5>Advanced Topics in Deep Learning</font><br>
<b><font size=4>Part B</font></b>
<br><font size=4>Load Final Models</font><br><br>
Authors: Ido Rappaport & Eran Tascesme
</center>

**Submission Details:**
<font size=2>
<br>Ido Rappaport, ID: 322891623
<br>Eran Tascesme , ID: 205708720 </font>


**Import libraries**

❗Note the package versions: they are listed in `requirements.txt`.❗

In [None]:
# Standard libraries
import os
import re
import string
import random
import warnings
import time

# Data handling and visualization
import pandas as pd
import numpy as np

# Machine learning and deep learning
import torch
import torch.nn.utils.prune as prune
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    classification_report,
    confusion_matrix,
    matthews_corrcoef,
    normalized_mutual_info_score,
    ConfusionMatrixDisplay
)

# Hugging Face Transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed,
    TrainerCallback,
    TrainerState,
    TrainerControl,
    DataCollatorWithPadding,
    RobertaForSequenceClassification,
    MarianMTModel,
    MarianTokenizer
)
from datasets import Dataset, DatasetDict, load_dataset
from transformers.modeling_outputs import SequenceClassifierOutput
import evaluate

# Other libraries
from tqdm import tqdm

# Filter warnings
warnings.filterwarnings('ignore')


**Load CSV Files**


In [None]:
# Root directory that holds the saved model artifacts loaded below.
path_dir = "/final_models/"

# Test set used to score every model (read with the ISO-8859-1 encoding).
test_data = pd.read_csv(
    "/data/test_clean.csv",
    encoding="ISO-8859-1",
)

**Load Models**

Load the models and create a list containing all of them. Each model type has its own specific loading function.

In [None]:
# Shared registry of loaded models; each entry is a (name, model, tokenizer) tuple.
model_list = list()

In [None]:
def load_base_model(model_name, base_weights_path):
    """Build an uncompressed 5-label classifier and restore its saved weights.

    Args:
        model_name: Hugging Face model identifier, used for both the
            architecture and the tokenizer.
        base_weights_path: Path to a file that ``torch.load`` turns into the
            state dict passed to ``load_state_dict``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is left in whatever mode
        ``from_pretrained`` returns it in; ``eval()`` is not called here.
    """
    # The classifier is instantiated with 5 labels; ignore_mismatched_sizes
    # tolerates a checkpoint whose head has a different size.
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=5,
        ignore_mismatched_sizes=True,
    )

    # Overwrite the freshly built model with the stored weights.
    stored_weights = torch.load(base_weights_path, map_location="cpu")
    classifier.load_state_dict(stored_weights)

    return classifier, AutoTokenizer.from_pretrained(model_name)


def load_compressed_model(save_model_path, device="cpu"):
    """Load a model that was saved as a whole object, plus its tokenizer.

    Args:
        save_model_path: Directory containing ``model.pt`` and the tokenizer
            files.
        device: Target passed to ``torch.load`` as ``map_location``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is already in eval mode.
    """
    # NOTE: weights_only=False unpickles arbitrary Python objects, so this
    # must only be pointed at trusted files.
    restored = torch.load(
        os.path.join(save_model_path, "model.pt"),
        map_location=device,
        weights_only=False,
    )
    restored.eval()

    return restored, AutoTokenizer.from_pretrained(save_model_path)


def load_quantized_model(model_name, quantized_model_path):
    """Rebuild a dynamically quantized classifier and load its saved state.

    Args:
        model_name: Hugging Face model identifier for the architecture and
            the tokenizer.
        quantized_model_path: Directory containing the quantized state dict
            as ``model.pt``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is in eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Start from the float architecture with a 5-label head.
    float_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=5,
        ignore_mismatched_sizes=True,
    )

    # Convert nn.Linear layers to their int8 dynamic counterparts first, so the
    # module structure matches the keys of the stored quantized state dict.
    quantized_model = torch.quantization.quantize_dynamic(
        float_model, {nn.Linear}, dtype=torch.qint8
    )

    quantized_state = torch.load(
        os.path.join(quantized_model_path, "model.pt"), map_location="cpu"
    )
    quantized_model.load_state_dict(quantized_state)
    quantized_model.eval()

    return quantized_model, tokenizer

**Base Models**

The models that were trained in exc4 and exc5, without any compression (kivutz).

In [None]:
# One entry per architecture: (Hugging Face model name, [(registry name, weights
# file inside path_dir), ...]). Keeping this as data replaces four copy-pasted
# load/append stanzas and routes every path through path_dir, like the other
# loading cells.
base_model_specs = [
    (
        "cardiffnlp/twitter-roberta-base-sentiment-latest",  # roberta
        [
            ("roberta_sentiment_exc4", "roberta_sentiment_exc4_weights.pt"),
            ("roberta_sentiment_exc5", "roberta_sentiment_weights.pt"),
        ],
    ),
    (
        "distilbert/distilbert-base-uncased-finetuned-sst-2-english",  # distilbert
        [
            ("distilbert_exc4", "distilbert_exc4_weights.pt"),
            ("distilbert_exc5", "distilbert_weights.pt"),
        ],
    ),
]

for model_name, checkpoints in base_model_specs:
    for registry_name, weights_file in checkpoints:
        base_weights_path = os.path.join(path_dir, weights_file)
        model, tokenizer = load_base_model(model_name, base_weights_path)
        model_list.append((registry_name, model, tokenizer))

**Pruned & Distilled Models**

In [None]:
# Sub-directories of path_dir, one per pruned or distilled model.
models_pathes = [
    "roberta_exc4_pruned",
    "roberta_pruned",
    "distilbert_exc4_pruned",
    "distilbert_pruned",
    "distilroberta_exc4-base",
    "distilroberta-base",
    "tinybert_exc4",
    "tinybert",
]

# The directory name doubles as the model's name in the registry.
for model_path in models_pathes:
    save_model_path = os.path.join(path_dir, model_path)
    model, tokenizer = load_compressed_model(save_model_path, device="cpu")
    model_list.append((model_path, model, tokenizer))

**Quantized Models**

In [None]:
# (base architecture, quantized sub-directories of path_dir) for each model family.
quantized_groups = (
    (
        "cardiffnlp/twitter-roberta-base-sentiment-latest",  # roberta
        ["roberta_sentiment_exc4_quantized", "roberta_sentiment_quantized"],
    ),
    (
        "distilbert/distilbert-base-uncased-finetuned-sst-2-english",  # distilbert
        ["distilbert_exc4_quantized", "distilbert_quantized"],
    ),
)

# The directory name doubles as the model's name in the registry.
for model_name, models_pathes in quantized_groups:
    for model_path in models_pathes:
        quantized_model_path = os.path.join(path_dir, model_path)
        model, tokenizer = load_quantized_model(model_name, quantized_model_path)
        model_list.append((model_path, model, tokenizer))

**Evaluation**

Evaluate all models and calculate a final score for each.

In [None]:
def evaluate_model_metrics(model, tokenizer, test_data, batch_size=32, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Compute quality, runtime and size metrics for a single model on test data.

    Args:
        model: Classification module whose output either exposes ``.logits``
            or carries the logits as its first element.
        tokenizer: Callable that turns a list of strings into model inputs
            (called with ``padding=True, truncation=True, return_tensors='pt'``).
        test_data: DataFrame with a 'text' and a 'label' column.
        batch_size: Number of texts per forward pass.
        device: Device to run on. Overridden with "cpu" when the model
            contains quantized Linear layers.

    Returns:
        dict with the keys 'accuracy', 'f1' (macro), 'mcc', 'nit' (normalized
        mutual information), 'conf_matrix', 'runtime_sec', 'total_params' and
        'nonzero_params'.
    """
    quantized_layers = [
        module for module in model.modules()
        if isinstance(module, (nn.quantized.dynamic.Linear, nn.quantized.Linear))
    ]
    if quantized_layers:
        # Models with quantized Linear layers are always evaluated on the CPU.
        device = "cpu"

    model.to(device)
    model.eval()

    texts = test_data['text'].tolist()
    # Labels are only consumed by scikit-learn, so they stay on the host
    # instead of being moved to the device and copied back for every metric.
    labels = test_data['label'].to_numpy()

    all_preds = []
    # perf_counter is a monotonic clock intended for measuring durations.
    start_time = time.perf_counter()
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = [str(t) for t in texts[i:i+batch_size]]
            encodings = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt').to(device)
            outputs = model(**encodings)

            logits = outputs.logits if hasattr(outputs, 'logits') else outputs[0]
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().tolist())
    runtime = time.perf_counter() - start_time

    accuracy = accuracy_score(labels, all_preds)
    f1 = f1_score(labels, all_preds, average='macro')

    mcc = matthews_corrcoef(labels, all_preds)
    nit = normalized_mutual_info_score(labels, all_preds)

    conf_matrix = confusion_matrix(labels, all_preds)

    total_params = 0
    nonzero_params = 0
    for param in model.parameters():
        total_params += param.numel()
        nonzero_params += torch.count_nonzero(param).item()

    # Quantized Linear layers keep weight and bias in packed storage that
    # model.parameters() does not yield; without this loop quantized models
    # would report only their non-Linear parameters.
    for layer in quantized_layers:
        for tensor in (layer.weight(), layer.bias()):
            if tensor is None:
                continue
            if tensor.is_quantized:
                # Dequantize so entries stored at the zero point count as zeros.
                tensor = tensor.dequantize()
            total_params += tensor.numel()
            nonzero_params += torch.count_nonzero(tensor).item()

    return {
        'accuracy': accuracy,
        'f1': f1,
        'mcc': mcc,
        'nit': nit,
        'conf_matrix': conf_matrix,
        'runtime_sec': runtime,
        'total_params': total_params,
        'nonzero_params': nonzero_params
    }

def evaluate_and_score_models(model_list, test_data, weights=None, batch_size=32):
    """
    Evaluate multiple HuggingFace models and compute a relative weighted score.

    Args:
        model_list: list of (name, model, tokenizer)
        test_data: pd.DataFrame with 'text' and 'label'
        weights: dict with the keys 'accuracy', 'mcc', 'nit', 'runtime',
            'params' and 'nonzero_params'; defaults are used when None
        batch_size: evaluation batch size

    Returns:
        pd.DataFrame with one row per successfully evaluated model, holding the
        raw metrics, the '*_norm' cost columns and 'final_score', sorted by
        'final_score' in descending order

    Raises:
        ValueError: if no model could be evaluated (empty list, or every
            evaluation failed), because there is nothing to score.
    """
    if weights is None:
        weights = {'accuracy': 0.4, 'mcc': 0.2, 'nit': 0.2,'runtime': 0.1, 'params': 0.05, 'nonzero_params': 0.05}

    all_metrics = {}

    # Step 1: compute metrics for all models. A failing model is reported and
    # skipped so that one broken checkpoint does not abort the whole comparison.
    for name, model, tokenizer in model_list:
        print(f"Evaluating {name}...")
        try:
            all_metrics[name] = evaluate_model_metrics(model, tokenizer, test_data, batch_size=batch_size)
        except Exception as e:
            print(f"Error evaluating {name}: {e}")

    if not all_metrics:
        # Without this guard the scoring below fails with an opaque KeyError.
        raise ValueError("No model was evaluated successfully; nothing to score.")

    df = pd.DataFrame(all_metrics).T
    # The transpose leaves every column as dtype object (the confusion matrix
    # is mixed in with the scalars); restore real numeric dtypes.
    df = df.astype({
        'accuracy': float,
        'f1': float,
        'mcc': float,
        'nit': float,
        'runtime_sec': float,
        'total_params': 'int64',
        'nonzero_params': 'int64',
    })

    # Step 2: rescale the cost metrics so that higher is better.
    # accuracy, mcc and nit are used as they are.
    df_scaled = df.copy()

    # For metrics where lower is better (runtime, total_params, nonzero_params):
    # min-max scale to [0, 1), then map through 1 / (x + 0.5), so the cheapest
    # model gets 2.0 and the most expensive one roughly 0.67.
    for col in ['runtime_sec', 'total_params', 'nonzero_params']:
        col_normalization = col + "_norm"
        df_scaled[col_normalization] = 1 / ((df[col] - df[col].min()) / (df[col].max() - df[col].min() + 1e-8) + 0.5)

    # Step 3: compute final weighted score
    df_scaled['final_score'] = (
        weights['accuracy'] * df_scaled['accuracy'] +
        weights['mcc'] * df_scaled['mcc'] +
        weights['nit'] * df_scaled['nit'] +
        weights['runtime'] * df_scaled['runtime_sec_norm'] +
        weights['params'] * df_scaled['total_params_norm'] +
        weights['nonzero_params'] * df_scaled['nonzero_params_norm']
    )

    # Sort by final score
    df_scaled = df_scaled.sort_values(by='final_score', ascending=False)

    return df_scaled


In [None]:
# Score every registered model on the test set with the default weights,
# then list the columns available in the result.
evaluation_df = evaluate_and_score_models(
    model_list=model_list,
    test_data=test_data,
)
print(evaluation_df.columns)

In [None]:
# Summary table per model. The parameter-count column produced by the evaluation
# is named 'total_params'; selecting 'params' raises a KeyError.
evaluation_df[['accuracy', 'f1', 'mcc', 'nit', 'runtime_sec', 'total_params', 'final_score']]