# 02 ModernBERT

## a. Notebook setup

In [1]:
# Mount Google Drive at /content/drive so the project paths defined below resolve
# (imports google.colab, so this cell assumes a Colab runtime)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Define data paths (all under the Drive mount point /content/drive)
# INPUT_PATH: dataset folder inside the NumHG-main input directory
INPUT_PATH = '/content/drive/MyDrive/266_Final_Project/Input_Data/NumHG-main/Dataset/'
# PROCESSED_PATH: processed HF datasets are loaded from here and prediction CSVs are written here
PROCESSED_PATH = '/content/drive/MyDrive/266_Final_Project/Processed_Data/'
# MODELS_PATH: fine-tuned models are saved here
MODELS_PATH = '/content/drive/MyDrive/266_Final_Project/Models/'

In [3]:
!pip install -q transformers
!pip install -q datasets
!pip install -q evaluate
!pip install -q math-verify
!pip install -q word2number
!pip install -q peft
!pip install -q accelerate

In [4]:
# Import libraries
import numpy as np
import pandas as pd
import torch

import transformers
from peft import get_peft_model, LoraConfig, TaskType

import matplotlib.pyplot as plt
from datasets import Dataset, load_from_disk

from transformers import AutoTokenizer, ModernBertForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

import csv
from math_verify import parse, verify
from word2number import w2n

In [5]:
# Set seeds for both NumPy and PyTorch so torch-side randomness is repeatable as well
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

### a.i. Functions - preprocessing

In [6]:
# Function for embedding inputs
def preprocess_data(data, tokenizer):
    """Tokenize masked text and attach masked-LM training labels.

    Reads ``data['text']`` and ``data['labels']`` and tokenizes both to a fixed
    length of 100 (truncated, padded to max length, returned as PyTorch tensors).
    The label text is tokenized without special tokens.

    Returns the tokenized inputs with an extra ``"labels"`` tensor that has the
    same shape as ``input_ids``: every mask-token position holds the FIRST token
    id of that row's tokenized label, every other position holds -100.
    """
    shared_options = dict(max_length=100,
                          truncation=True,
                          padding='max_length',
                          return_tensors='pt')

    # Tokenize the input text and the label text with the same length settings
    encoded = tokenizer(data['text'], **shared_options)
    label_ids = tokenizer(data['labels'], add_special_tokens=False, **shared_options).input_ids

    # Build the label tensor in one shot: first label token at masks, -100 elsewhere
    input_ids = encoded.input_ids
    at_mask = input_ids == tokenizer.mask_token_id
    first_label_token = label_ids[:, :1].expand_as(input_ids)
    ignored = torch.full_like(input_ids, -100)

    encoded["labels"] = torch.where(at_mask, first_label_token, ignored)
    return encoded

### a.ii. Functions - predictions

In [7]:
# Function to convert words to numbers (as string)
def convert_word_to_number(word):
  """Return `word` as a digit string when w2n.word_to_num can parse it; otherwise return `word` unchanged."""
  try:
    parsed = w2n.word_to_num(word)
  except ValueError:
    # Not parseable as a number word: hand the input back untouched
    return word
  return str(parsed)

# Function to convert predictions to numerics
def convert_predictions_to_numerics(predictions):
  """Run convert_word_to_number over every prediction and return the results as a new list."""
  return list(map(convert_word_to_number, predictions))

In [8]:
# Function for generating predictions
def predict_masked_token(data):
    """Predict the token at every mask position of the given example(s).

    Uses the notebook-level `tokenizer` and `model`. The example is tokenized
    with preprocess_data, run through the model without gradients, and the
    highest-scoring token at each mask position is decoded.

    Returns:
        {"predictions": [...]} - one cleaned string per mask position, in
        row-major order. Cleaning removes commas, converts number words via
        convert_predictions_to_numerics, then strips surrounding whitespace.
    """
    inputs = preprocess_data(data, tokenizer)

    # Follow the model's device instead of assuming CUDA
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    # Keep the row AND column of each mask so logits are read from the matching
    # sequence (the row used to be fixed at 0, which is only right for one-row input)
    mask_rows, mask_cols = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)
    predicted_ids = outputs.logits[mask_rows, mask_cols].argmax(dim=-1)
    predictions = [tokenizer.decode(predicted_id) for predicted_id in predicted_ids]

    predictions = [prediction.replace(',', '') for prediction in predictions]
    predictions = convert_predictions_to_numerics(predictions)
    predictions = [prediction.strip() for prediction in predictions]

    return {"predictions": predictions}

In [9]:
# Check numerical form of data
def is_decimal(string):
    """Return True when `string` parses as a finite float, otherwise False.

    Values that float() cannot parse (including None) give False. So do
    'nan' and 'inf': float() accepts them, but they would turn any mean
    computed from the parsed values into nan/inf.
    """
    import math  # local import keeps this cell self-contained

    try:
        return math.isfinite(float(string))
    except (TypeError, ValueError, OverflowError):
        return False

In [10]:
# Function to calculate accuracy
def calculate_accuracy(predictions, data):
    """Exact-match accuracy of the first prediction of each example.

    Args:
        predictions: one entry per example; each entry is a sequence whose
            first element is compared with the label. A None or empty entry
            counts as a miss.
        data: mapping/dataset whose 'labels' column is aligned with `predictions`.

    Returns:
        Fraction of examples whose first prediction equals the label, or
        nan when `predictions` is empty.
    """
    if len(predictions) == 0:
        return float('nan')

    labels = data['labels']  # read the column once instead of once per row
    matches = 0
    for i, prediction in enumerate(predictions):
        if prediction and labels[i] == prediction[0]:
            matches += 1

    return matches / len(predictions)

In [11]:
# Function for Mean Absolute Percentage Error (MAPE)
def calculate_mean_absolute_percentage_error(predictions, data):
    """Mean Absolute Percentage Error of the first prediction of each example.

    A row is "non-calculable" (counted and printed, but excluded from the
    means) when its prediction is None/empty, when the prediction or the label
    is not numeric according to is_decimal, or when the label is zero.

    Each percentage error is |prediction - label| / |label|. The magnitude of
    the label is used so that negative labels cannot produce negative errors.

    Returns:
        [mape_all, mape_no_outliers] - the mean over all calculable rows and
        the mean over rows whose error is at most 10 (i.e. 1000%). Either
        value is nan when no row contributes to it.
    """
    labels = data['labels']  # read the column once instead of once per row
    pct_err_all = []
    pct_err_no_outliers = []
    non_calculable = []
    for i, prediction in enumerate(predictions):
        label = labels[i]
        calculable = (
            bool(prediction)
            and is_decimal(prediction[0])
            and is_decimal(label)
            and float(label) != 0
        )
        if not calculable:
            non_calculable.append(i)
            continue

        actual = float(label)
        abs_err = abs(float(prediction[0]) - actual)
        pct_err = abs_err / abs(actual)
        if pct_err <= 10: # outlier if wrong by over 1000%
            pct_err_no_outliers.append(pct_err)
        pct_err_all.append(pct_err)

    print("Count non-calculable predictions:", len(non_calculable))
    mape_all = sum(pct_err_all) / len(pct_err_all) if pct_err_all else float('nan')
    mape_no_outliers = (sum(pct_err_no_outliers) / len(pct_err_no_outliers)
                        if pct_err_no_outliers else float('nan'))
    return [mape_all, mape_no_outliers]

In [12]:
# Function for Symmetric Mean Absolute Percentage Error (SMAPE)
def calculate_symm_mean_absolute_percentage_error(predictions, data):
    """Symmetric Mean Absolute Percentage Error of the first prediction of each example.

    Each row contributes |prediction - label| / ((|prediction| + |label|) / 2),
    which is bounded by 2. The denominator uses the magnitudes of both values,
    so pairs of opposite sign stay within that bound.

    Rows are skipped when the prediction is None/empty, when the prediction or
    the label is not numeric according to is_decimal, or when both values are zero.

    Returns:
        The mean of the per-row errors, or nan when no row is usable.
    """
    labels = data['labels']  # read the column once instead of once per row
    pct_err_all = []
    for i, prediction in enumerate(predictions):
        if not prediction:
            continue
        label = labels[i]
        if not (is_decimal(prediction[0]) and is_decimal(label)):
            continue

        forecast = float(prediction[0])
        actual = float(label)
        avg_val = (abs(forecast) + abs(actual)) / 2
        if avg_val == 0:
            continue
        pct_err_all.append(abs(forecast - actual) / avg_val)

    if not pct_err_all:
        return float('nan')
    smape_all = sum(pct_err_all) / len(pct_err_all)
    return smape_all

### a.iii. Functions - fine-tuning

In [13]:
# Training function
def fine_tune_model(model,
                    tokenizer,
                    train_data,
                    dev_data,
                    batch_size = 16,
                    num_epochs = 3):
    """Fine-tune `model` with a Hugging Face Trainer.

    Both splits are tokenized with preprocess_data (batched map). Training runs
    for `num_epochs` epochs with `batch_size` examples per device for both
    training and evaluation; evaluation and checkpoint saving happen once per
    epoch, and output goes to the "modernbert_fine_tuned" directory.

    The model is trained in place; nothing is returned.
    """
    # Same tokenization settings for both splits
    map_options = dict(batched=True, fn_kwargs={'tokenizer': tokenizer})
    train_encoded = train_data.map(preprocess_data, **map_options)
    dev_encoded = dev_data.map(preprocess_data, **map_options)

    run_config = TrainingArguments(
        output_dir="modernbert_fine_tuned",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        eval_strategy="epoch",
        save_strategy="epoch",
        report_to='none',
        prediction_loss_only=True,
        label_names=['labels']
    )

    Trainer(
        model=model,
        args=run_config,
        train_dataset=train_encoded,
        eval_dataset=dev_encoded,
    ).train()

## b. Intake of train/test data

In [14]:
# Read in processed HF data
train_hf = load_from_disk(PROCESSED_PATH + 'train_bert.hf')
test_hf = load_from_disk(PROCESSED_PATH + 'test_bert.hf')

# Read in reasoning-only HF data
train_hf_reasoning_small = load_from_disk(PROCESSED_PATH + 'train_bert_reasoning_small.hf')
train_hf_reasoning = load_from_disk(PROCESSED_PATH + 'train_bert_reasoning.hf')

# Strip commas from labels in train/test, using one shared mapper for all four datasets
def _without_label_commas(example):
    return {'labels': example['labels'].replace(',', '')}

train_hf = train_hf.map(_without_label_commas)
train_hf_reasoning_small = train_hf_reasoning_small.map(_without_label_commas)
train_hf_reasoning = train_hf_reasoning.map(_without_label_commas)
test_hf = test_hf.map(_without_label_commas)

In [15]:
# Preview the first two training examples: masked text, then its label
for i in range(2):
  example = train_hf[i]
  print(example['text'])
  print(example['labels'])
  print()

Mel Gibson Loses Half His $ [MASK] M Fortune. In what's believed to be the largest Hollywood divorce settlement in history, Mel Gibson is having to cough up $425 million—half of his estimated fortune—to ex-wife Robyn Denise Moore. The two, married 31 years and parents to seven children, didn't have a prenuptial agreement, so Moore is entitled to half his wealth under California law. Moore, a former dental nurse, is also entitled to half of any future residuals from films Gibson, 55, worked on while the two were married, reports People. The settlement was finalized by a judge last week. Gibson earned $600 million alone from his The Passion of the Christ. He also made money from real estate, including a Fiji island he bought for $15 million. Moore, 55, filed for divorce from Gibson in 2009 after he began a highly publicized, crazed affair with Oksana Grigorieva. Moore and Gibson met in the late 1970s when they were both tenants in a house in Australia. Of all their children, only a 12-ye

## c. Baseline - Masked predictions on pre-trained ModernBERT

In [16]:
# Import ModernBERT tokenizer and model, and place the model on the GPU
checkpoint = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = ModernBertForMaskedLM.from_pretrained(checkpoint)
_ = model.to("cuda")

In [17]:
# Apply prediction function to the dataset
# (map is called without batched=True; predict_masked_token returns a list of strings per call)
predictions_baseline = test_hf.map(predict_masked_token)["predictions"]

# Read predictions from file
# (disabled alternative to re-running inference: each CSV cell is stripped of the
#  characters [ ] ' and wrapped in a one-element list again)
# predictions_baseline_raw = pd.read_csv(PROCESSED_PATH + "modernbert_baseline_predictions.csv",header=None)
# predictions_baseline_raw = predictions_baseline_raw.values.tolist()
# predictions_baseline = []
# for row in predictions_baseline_raw:
#     for value in row:
#         value_cleaned = value.strip("[]'")
#         predictions_baseline.append([value_cleaned])

# Examine the first ten predictions
print(predictions_baseline[:10])

[['18'], ['[SEP]'], ['100'], ['10'], ['on'], ['3'], ['•'], ['[SEP]'], ['7'], ['9']]


In [18]:
# Save predictions: one CSV row per test example, holding that example's prediction entry in a single cell
with open(PROCESSED_PATH + "modernbert_baseline_predictions.csv", "w", newline="") as f:
    csv.writer(f).writerows([item] for item in predictions_baseline)

In [19]:
# Read the need_reasoning flag of every test row once
need_reasoning_flags = [example["need_reasoning"] for example in test_hf]

# Extract reasoning-only / non-reasoning-only indices
reasoning_indices = [i for i, flag in enumerate(need_reasoning_flags) if flag == 1]
non_reasoning_indices = [i for i, flag in enumerate(need_reasoning_flags) if flag == 0]

# Predictions belonging to each subset, in test-set order
reasoning_predictions = [predictions_baseline[i] for i in reasoning_indices]
non_reasoning_predictions = [predictions_baseline[i] for i in non_reasoning_indices]

# Calculate accuracy on the full test set and on each subset
overall_accuracy = calculate_accuracy(predictions_baseline, test_hf)
reasoning_accuracy = calculate_accuracy(
    reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 1))
non_reasoning_accuracy = calculate_accuracy(
    non_reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 0))

print("Accuracy on all test data:", overall_accuracy)
print("Accuracy on reasoning-only test data:", reasoning_accuracy)
print("Accuracy on non-reasoning-only test data:", non_reasoning_accuracy)

Accuracy on all test data: 0.5223503965392935
Accuracy on reasoning-only test data: 0.48409090909090907
Accuracy on non-reasoning-only test data: 0.5401267159450898


In [20]:
vocab = tokenizer.get_vocab()  # not used by the MAPE calls in this cell

# Predictions belonging to each subset (indices come from the accuracy cell)
reasoning_predictions = [predictions_baseline[i] for i in reasoning_indices]
non_reasoning_predictions = [predictions_baseline[i] for i in non_reasoning_indices]

# Calculate MAPE; each call returns [mean over all rows, mean without outliers]
overall_mape = calculate_mean_absolute_percentage_error(predictions_baseline, test_hf)
reasoning_mape = calculate_mean_absolute_percentage_error(
    reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 1))
non_reasoning_mape = calculate_mean_absolute_percentage_error(
    non_reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 0))

print("MAPE on all test data:", overall_mape)
print("MAPE on reasoning-only test data:", reasoning_mape)
print("MAPE on non-reasoning-only test data:", non_reasoning_mape)

Count non-calculable predictions: 829
Count non-calculable predictions: 285
Count non-calculable predictions: 544
MAPE on all test data: [0.8204259390741431, 0.19381910158349747]
MAPE on reasoning-only test data: [0.674620579259893, 0.24063256025971094]
MAPE on non-reasoning-only test data: [0.8846883013626454, 0.17324214101611676]


In [21]:
# Predictions belonging to each subset (indices come from the accuracy cell)
reasoning_predictions = [predictions_baseline[i] for i in reasoning_indices]
non_reasoning_predictions = [predictions_baseline[i] for i in non_reasoning_indices]

# Calculate SMAPE on the full test set and on each subset
overall_smape = calculate_symm_mean_absolute_percentage_error(predictions_baseline, test_hf)
reasoning_smape = calculate_symm_mean_absolute_percentage_error(
    reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 1))
non_reasoning_smape = calculate_symm_mean_absolute_percentage_error(
    non_reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 0))

print("SMAPE on all test data:", overall_smape)
print("SMAPE on reasoning-only test data:", reasoning_smape)
print("SMAPE on non-reasoning-only test data:", non_reasoning_smape)

SMAPE on all test data: 0.21688618099346157
SMAPE on reasoning-only test data: 0.2731209476589586
SMAPE on non-reasoning-only test data: 0.19210122827792792


## d. Fine-tuned models

### d.i. Fine-tuning with mixed-reasoning data

In [22]:
# LoRA targets: the attention Wqkv and Wo modules of layers 0-21, addressed by explicit path
qkv_targets = [f"layers.{i}.attn.Wqkv" for i in range(22)]
out_targets = [f"layers.{i}.attn.Wo" for i in range(22)]

# Define LoRA configuration
# NOTE(review): task_type is CAUSAL_LM although the wrapped model is a ModernBertForMaskedLM - confirm this is intended
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=qkv_targets + out_targets,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Wrap the model with LoRA (rebinds `model` to the wrapped model)
model = get_peft_model(model, lora_config)

# Print trainable parameters
model.print_trainable_parameters()

trainable params: 405,504 || all params: 150,060,736 || trainable%: 0.2702


In [23]:
# Fine-tuning with mixed-reasoning data
# NOTE(review): test_hf is passed as dev_data, so the per-epoch evaluation inside fine_tune_model runs on the test set
fine_tune_model(model, tokenizer, train_hf, test_hf)

Epoch,Training Loss,Validation Loss
1,1.9578,1.140014
2,1.0722,1.091988
3,1.0062,1.084016


W0413 21:19:10.677000 21740 torch/_inductor/utils.py:1137] [1/1] Not enough SMs to use max_autotune_gemm mode


In [24]:
# Save model under MODELS_PATH
# NOTE(review): `model` is the get_peft_model wrapper at this point - confirm what save_pretrained writes for it
model.save_pretrained(MODELS_PATH + "modernbert_mixed_ft")

# Load model (disabled alternative to re-training)
# NOTE(review): confirm ModernBertForMaskedLM.from_pretrained can read the directory saved above
# model = ModernBertForMaskedLM.from_pretrained(MODELS_PATH + "modernbert_mixed_ft")
# _= model.to("cuda")

In [25]:
# Apply prediction function to the dataset
# (predict_masked_token reads the notebook-level `model`, which is the mixed-data fine-tuned model here)
predictions_mixed_ft = test_hf.map(predict_masked_token)["predictions"]

# Read predictions from file
# (disabled alternative to re-running inference: each CSV cell is stripped of the
#  characters [ ] ' and wrapped in a one-element list again)
# predictions_mixed_ft_raw = pd.read_csv(PROCESSED_PATH + "modernbert_mixed_ft_predictions.csv",header=None)
# predictions_mixed_ft_raw = predictions_mixed_ft_raw.values.tolist()
# predictions_mixed_ft = []
# for row in predictions_mixed_ft_raw:
#     for value in row:
#         value_cleaned = value.strip("[]'")
#         predictions_mixed_ft.append([value_cleaned])

# Examine the first ten predictions
print(predictions_mixed_ft[:10])

Map:   0%|          | 0/2774 [00:00<?, ? examples/s]

[['18'], ['20'], ['100'], ['10'], ['2018'], ['90'], ['800'], ['200'], ['7'], ['9']]


In [26]:
# Save predictions: one CSV row per test example, holding that example's prediction entry in a single cell
with open(PROCESSED_PATH + "modernbert_mixed_ft_predictions.csv", "w", newline="") as f:
    csv.writer(f).writerows([item] for item in predictions_mixed_ft)

In [27]:
# Read the need_reasoning flag of every test row once
need_reasoning_flags = [example["need_reasoning"] for example in test_hf]

# Extract reasoning-only / non-reasoning-only indices
reasoning_indices = [i for i, flag in enumerate(need_reasoning_flags) if flag == 1]
non_reasoning_indices = [i for i, flag in enumerate(need_reasoning_flags) if flag == 0]

# Predictions belonging to each subset, in test-set order
reasoning_predictions = [predictions_mixed_ft[i] for i in reasoning_indices]
non_reasoning_predictions = [predictions_mixed_ft[i] for i in non_reasoning_indices]

# Calculate accuracy on the full test set and on each subset
overall_accuracy = calculate_accuracy(predictions_mixed_ft, test_hf)
reasoning_accuracy = calculate_accuracy(
    reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 1))
non_reasoning_accuracy = calculate_accuracy(
    non_reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 0))

print("Accuracy on all test data:", overall_accuracy)
print("Accuracy on reasoning-only test data:", reasoning_accuracy)
print("Accuracy on non-reasoning-only test data:", non_reasoning_accuracy)

Accuracy on all test data: 0.7256669069935112
Accuracy on reasoning-only test data: 0.7238636363636364
Accuracy on non-reasoning-only test data: 0.7265047518479408


In [28]:
vocab = tokenizer.get_vocab()  # not used by the MAPE calls in this cell

# Predictions belonging to each subset (indices come from the accuracy cell)
reasoning_predictions = [predictions_mixed_ft[i] for i in reasoning_indices]
non_reasoning_predictions = [predictions_mixed_ft[i] for i in non_reasoning_indices]

# Calculate MAPE; each call returns [mean over all rows, mean without outliers]
overall_mape = calculate_mean_absolute_percentage_error(predictions_mixed_ft, test_hf)
reasoning_mape = calculate_mean_absolute_percentage_error(
    reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 1))
non_reasoning_mape = calculate_mean_absolute_percentage_error(
    non_reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 0))

print("MAPE on all test data:", overall_mape)
print("MAPE on reasoning-only test data:", reasoning_mape)
print("MAPE on non-reasoning-only test data:", non_reasoning_mape)

Count non-calculable predictions: 31
Count non-calculable predictions: 1
Count non-calculable predictions: 30
MAPE on all test data: [0.8950128147184828, 0.19005968805935333]
MAPE on reasoning-only test data: [2.1938425297419006, 0.22130277813386703]
MAPE on non-reasoning-only test data: [0.28252820124982037, 0.1754302540632139]


In [29]:
# Predictions belonging to each subset (indices come from the accuracy cell)
reasoning_predictions = [predictions_mixed_ft[i] for i in reasoning_indices]
non_reasoning_predictions = [predictions_mixed_ft[i] for i in non_reasoning_indices]

# Calculate SMAPE on the full test set and on each subset
overall_smape = calculate_symm_mean_absolute_percentage_error(predictions_mixed_ft, test_hf)
reasoning_smape = calculate_symm_mean_absolute_percentage_error(
    reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 1))
non_reasoning_smape = calculate_symm_mean_absolute_percentage_error(
    non_reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 0))

print("SMAPE on all test data:", overall_smape)
print("SMAPE on reasoning-only test data:", reasoning_smape)
print("SMAPE on non-reasoning-only test data:", non_reasoning_smape)

SMAPE on all test data: 0.2285660385852483
SMAPE on reasoning-only test data: 0.23710671857571336
SMAPE on non-reasoning-only test data: 0.22453853981292068


### d.ii. Fine-tuning with reasoning-only data (small)

In [30]:
# Re-import ModernBERT tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
# model = ModernBertForMaskedLM.from_pretrained("answerdotai/ModernBERT-base")
# _= model.to("cuda")

In [31]:
# Define LoRA configuration
# lora_config = LoraConfig(
#     r=4,
#     lora_alpha=16,
#     lora_dropout=0.1,
#     target_modules=[f"layers.{i}.attn.Wqkv" for i in range(22)] + [f"layers.{i}.attn.Wo" for i in range(22)],
#     bias="none",
#     task_type=TaskType.CAUSAL_LM
# )

# Wrap the model with LoRA
# model = get_peft_model(model, lora_config)

# Print trainable parameters
# model.print_trainable_parameters()

In [32]:
# Fine-tuning with reasoning-only data (small)
# fine_tune_model(model, tokenizer, train_hf_reasoning_small, test_hf)

In [33]:
# Save model
# model.save_pretrained(MODELS_PATH + "modernbert_reasoning_ft_small")

# Load model
# model = ModernBertForMaskedLM.from_pretrained(MODELS_PATH + "modernbert_reasoning_ft_small")
# _= model.to("cuda")

In [35]:
# Apply prediction function to the dataset
# predictions_reasoning_ft_small = test_hf.map(predict_masked_token)["predictions"]

# Read predictions from file
# predictions_reasoning_ft_small_raw = pd.read_csv(PROCESSED_PATH + "modernbert_reasoning_ft_small_pred.csv",header=None)
# predictions_reasoning_ft_small_raw = predictions_reasoning_ft_small_raw.values.tolist()
# predictions_reasoning_ft_small = []
# for row in predictions_reasoning_ft_small_raw:
#     for value in row:
#         value_cleaned = value.strip("[]'")
#         predictions_reasoning_ft_small.append([value_cleaned])

# Examine predictions
# print(predictions_reasoning_ft_small[:10])

In [36]:
# Save predictions
# with open(PROCESSED_PATH + "modernbert_reasoning_ft_small_pred.csv", "w", newline="") as f:
#     writer = csv.writer(f)
#     for item in predictions_reasoning_ft_small:
#         writer.writerow([item])

In [37]:
# Extract reasoning-only indices
# reasoning_indices = [i for i, example in enumerate(test_hf) if example["need_reasoning"] == 1]

# Extract non-reasoning-only indices
# non_reasoning_indices = [i for i, example in enumerate(test_hf) if example["need_reasoning"] == 0]

# Calculate accuracy
# overall_accuracy = calculate_accuracy(predictions_reasoning_ft_small, test_hf)
# reasoning_accuracy = calculate_accuracy([predictions_reasoning_ft_small[i] for i in reasoning_indices], test_hf.filter(lambda example: example["need_reasoning"] == 1))
# non_reasoning_accuracy = calculate_accuracy([predictions_reasoning_ft_small[i] for i in non_reasoning_indices], test_hf.filter(lambda example: example["need_reasoning"] == 0))

# print("Accuracy on all test data:", overall_accuracy)
# print("Accuracy on reasoning-only test data:", reasoning_accuracy)
# print("Accuracy on non-reasoning-only test data:", non_reasoning_accuracy)

In [38]:
# vocab = tokenizer.get_vocab()

# Calculate MAPE
# overall_mape = calculate_mean_absolute_percentage_error(predictions_reasoning_ft_small, test_hf)
# reasoning_mape = calculate_mean_absolute_percentage_error([predictions_reasoning_ft_small[i] for i in reasoning_indices], test_hf.filter(lambda example: example["need_reasoning"] == 1))
# non_reasoning_mape = calculate_mean_absolute_percentage_error([predictions_reasoning_ft_small[i] for i in non_reasoning_indices], test_hf.filter(lambda example: example["need_reasoning"] == 0))

# print("MAPE on all test data:", overall_mape)
# print("MAPE on reasoning-only test data:", reasoning_mape)
# print("MAPE on non-reasoning-only test data:", non_reasoning_mape)

In [39]:
# Calculate SMAPE
# overall_smape = calculate_symm_mean_absolute_percentage_error(predictions_reasoning_ft_small, test_hf)
# reasoning_smape = calculate_symm_mean_absolute_percentage_error([predictions_reasoning_ft_small[i] for i in reasoning_indices], test_hf.filter(lambda example: example["need_reasoning"] == 1))
# non_reasoning_smape = calculate_symm_mean_absolute_percentage_error([predictions_reasoning_ft_small[i] for i in non_reasoning_indices], test_hf.filter(lambda example: example["need_reasoning"] == 0))

# print("SMAPE on all test data:", overall_smape)
# print("SMAPE on reasoning-only test data:", reasoning_smape)
# print("SMAPE on non-reasoning-only test data:", non_reasoning_smape)

### d.iii. Fine-tuning with reasoning-only data (large)

In [40]:
# Re-import ModernBERT tokenizer and model so this section starts again from the pre-trained checkpoint
checkpoint = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = ModernBertForMaskedLM.from_pretrained(checkpoint)
_ = model.to("cuda")

In [41]:
# LoRA targets: the attention Wqkv and Wo modules of layers 0-21, addressed by explicit path
qkv_targets = [f"layers.{i}.attn.Wqkv" for i in range(22)]
out_targets = [f"layers.{i}.attn.Wo" for i in range(22)]

# Define LoRA configuration
# NOTE(review): task_type is CAUSAL_LM although the wrapped model is a ModernBertForMaskedLM - confirm this is intended
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=qkv_targets + out_targets,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Wrap the model with LoRA (rebinds `model` to the wrapped model)
model = get_peft_model(model, lora_config)

# Print trainable parameters
model.print_trainable_parameters()

trainable params: 405,504 || all params: 150,060,736 || trainable%: 0.2702


In [42]:
# Fine-tuning with reasoning-only data (large)
# NOTE(review): test_hf is passed as dev_data, so the per-epoch evaluation inside fine_tune_model runs on the test set
fine_tune_model(model, tokenizer, train_hf_reasoning, test_hf)

Map:   0%|          | 0/9711 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,2.1279,1.275203
2,1.136,1.272598
3,1.1068,1.270265


In [43]:
# Save model under MODELS_PATH
# NOTE(review): `model` is the get_peft_model wrapper at this point - confirm what save_pretrained writes for it
model.save_pretrained(MODELS_PATH + "modernbert_reasoning_ft")

# Load model (disabled alternative to re-training)
# NOTE(review): confirm ModernBertForMaskedLM.from_pretrained can read the directory saved above
# model = ModernBertForMaskedLM.from_pretrained(MODELS_PATH + "modernbert_reasoning_ft")
# _= model.to("cuda")

In [44]:
# Apply prediction function to the dataset
# (predict_masked_token reads the notebook-level `model`, which is the reasoning-only fine-tuned model here)
predictions_reasoning_ft = test_hf.map(predict_masked_token)["predictions"]

# Read predictions from file
# (disabled alternative to re-running inference: each CSV cell is stripped of the
#  characters [ ] ' and wrapped in a one-element list again)
# predictions_reasoning_ft_raw = pd.read_csv(PROCESSED_PATH + "modernbert_reasoning_ft_pred.csv",header=None)
# predictions_reasoning_ft_raw = predictions_reasoning_ft_raw.values.tolist()
# predictions_reasoning_ft = []
# for row in predictions_reasoning_ft_raw:
#     for value in row:
#         value_cleaned = value.strip("[]'")
#         predictions_reasoning_ft.append([value_cleaned])

# Examine the first ten predictions
print(predictions_reasoning_ft[:10])

Map:   0%|          | 0/2774 [00:00<?, ? examples/s]

[['18'], ['20'], ['100'], ['10'], ['Top'], ['90'], ['800'], ['200'], ['7'], ['9']]


In [45]:
# Save predictions: one CSV row per test example, holding that example's prediction entry in a single cell
with open(PROCESSED_PATH + "modernbert_reasoning_ft_pred.csv", "w", newline="") as f:
    csv.writer(f).writerows([item] for item in predictions_reasoning_ft)

In [46]:
# Read the need_reasoning flag of every test row once
need_reasoning_flags = [example["need_reasoning"] for example in test_hf]

# Extract reasoning-only / non-reasoning-only indices
reasoning_indices = [i for i, flag in enumerate(need_reasoning_flags) if flag == 1]
non_reasoning_indices = [i for i, flag in enumerate(need_reasoning_flags) if flag == 0]

# Predictions belonging to each subset, in test-set order
reasoning_predictions = [predictions_reasoning_ft[i] for i in reasoning_indices]
non_reasoning_predictions = [predictions_reasoning_ft[i] for i in non_reasoning_indices]

# Calculate accuracy on the full test set and on each subset
overall_accuracy = calculate_accuracy(predictions_reasoning_ft, test_hf)
reasoning_accuracy = calculate_accuracy(
    reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 1))
non_reasoning_accuracy = calculate_accuracy(
    non_reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 0))

print("Accuracy on all test data:", overall_accuracy)
print("Accuracy on reasoning-only test data:", reasoning_accuracy)
print("Accuracy on non-reasoning-only test data:", non_reasoning_accuracy)

Accuracy on all test data: 0.7130497476568133
Accuracy on reasoning-only test data: 0.7511363636363636
Accuracy on non-reasoning-only test data: 0.6953537486800423


In [47]:
vocab = tokenizer.get_vocab()  # not used by the MAPE calls in this cell

# Predictions belonging to each subset (indices come from the accuracy cell)
reasoning_predictions = [predictions_reasoning_ft[i] for i in reasoning_indices]
non_reasoning_predictions = [predictions_reasoning_ft[i] for i in non_reasoning_indices]

# Calculate MAPE; each call returns [mean over all rows, mean without outliers]
overall_mape = calculate_mean_absolute_percentage_error(predictions_reasoning_ft, test_hf)
reasoning_mape = calculate_mean_absolute_percentage_error(
    reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 1))
non_reasoning_mape = calculate_mean_absolute_percentage_error(
    non_reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 0))

print("MAPE on all test data:", overall_mape)
print("MAPE on reasoning-only test data:", reasoning_mape)
print("MAPE on non-reasoning-only test data:", non_reasoning_mape)

Count non-calculable predictions: 36
Count non-calculable predictions: 1
Count non-calculable predictions: 35
MAPE on all test data: [0.42901993466731114, 0.17470014266918535]
MAPE on reasoning-only test data: [0.34187930905754566, 0.15034694780371333]
MAPE on non-reasoning-only test data: [0.4702230599556302, 0.18619369472273875]


In [48]:
# Predictions belonging to each subset (indices come from the accuracy cell)
reasoning_predictions = [predictions_reasoning_ft[i] for i in reasoning_indices]
non_reasoning_predictions = [predictions_reasoning_ft[i] for i in non_reasoning_indices]

# Calculate SMAPE on the full test set and on each subset
overall_smape = calculate_symm_mean_absolute_percentage_error(predictions_reasoning_ft, test_hf)
reasoning_smape = calculate_symm_mean_absolute_percentage_error(
    reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 1))
non_reasoning_smape = calculate_symm_mean_absolute_percentage_error(
    non_reasoning_predictions,
    test_hf.filter(lambda example: example["need_reasoning"] == 0))

print("SMAPE on all test data:", overall_smape)
print("SMAPE on reasoning-only test data:", reasoning_smape)
print("SMAPE on non-reasoning-only test data:", non_reasoning_smape)

SMAPE on all test data: 0.25940663265952063
SMAPE on reasoning-only test data: 0.1979547524460401
SMAPE on non-reasoning-only test data: 0.28846322368031163


## e. Examine metric and model limitations

In [51]:
# Print two mixed-fine-tuned examples whose prediction is far from the label:
# row 348 (prediction much higher than the label), then row 1177 (label much higher than the prediction)
for row in (348, 1177):
    print("Text:", test_hf['text'][row])
    print("Label:", test_hf['labels'][row])
    print("Prediction:", predictions_mixed_ft[row][0])

# The word you want to check
word = "284460000000"

# Check if the word is in the vocabulary
# You can use tokenizer.vocab or tokenizer.get_vocab()
vocab = tokenizer.get_vocab()

# Method 1: Direct check
if word not in vocab:
    print(f"'{word}' is NOT in the vocabulary.")
else:
    print(f"'{word}' is in the vocabulary.")

# Method 2: Check if the tokenizer breaks it into more than one token
tokens = tokenizer.tokenize(word)
kept_whole = len(tokens) == 1 and tokens[0] in vocab
if not kept_whole:
    print(f"'{word}' is NOT effectively in the vocabulary (split into: {tokens}).")
else:
    print(f"'{word}' is effectively in the vocabulary (not split).")

Text: SC Company Laying Off All but [MASK] Workers Over Tariffs. As the China-US tariff volley continues—China on Wednesday announced a 25% tariff on an additional $16 billion worth of US goods in retaliation for our similar move, reports CNBC—the consequences have become real for one South Carolina company. The State reports TV-maker Element Electronics is citing the tariffs as the reason it is essentially closing its doors: It intends to shut down its Winnsboro plant and lay off 126 of its 134 employees, it said in a letter to the state's Department of Employment and Workforce—though it does have a glimmer of hope. The letter explains the tariffs hit television components from China that it relies on, and the layoffs will commence in October unless it can get its parts off the tariff list. CNN Money cites a tweet from the company that suggests optimism: It writes that as the  only USA assembler of televisions, we believe the inclusion of our parts on the list ... is accidental and re