# 04 Flan-T5

## Notebook setup

In [None]:
# Mount drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Define data path
BASE_PATH = '/content/drive/MyDrive/266_Final_Project/'
INPUT_PATH = '/content/drive/MyDrive/266_Final_Project/Input_Data/NumHG-main/Dataset/'
PROCESSED_PATH = '/content/drive/MyDrive/266_Final_Project/Processed_Data/'

In [None]:
!pip install -q transformers
!pip install -q datasets
!pip install -q evaluate
!pip install -q math-verify
!pip install -q word2number

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.2 kB[0m [31m17.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:

# Import libraries
import numpy as np
import pandas as pd
import torch

import transformers
import evaluate

import matplotlib.pyplot as plt
from datasets import Dataset, load_from_disk, load_dataset

from transformers import TrainingArguments, Trainer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

from transformers import pipeline

import csv
from math_verify import parse, verify
from word2number import w2n
import pickle
import regex as re

In [None]:
# Set seed
np.random.seed(42)
pd_random_state = 24

## Functions

In [None]:
max_length = 512

def preprocess_data(batch_triplet, tokenizer):
    batch_triplet["context_and_question"] = ["Context: " + context + "Question: " + question for context, question in zip(batch_triplet["context"], batch_triplet["question"])]
    context_and_question, label = batch_triplet['context_and_question'], batch_triplet['labels']

    orig_encoded = tokenizer.batch_encode_plus(
        context_and_question,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    orig_input_ids = orig_encoded['input_ids']

    target_encoded = tokenizer.batch_encode_plus(
        label,
        max_length=5,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    label_ids = target_encoded['input_ids']

    return {'input_ids': orig_input_ids,
            'labels': label_ids}

In [None]:
def fine_tune(model,
              tokenizer,
              train_data,
              num_train_examples,
              val_data,
              file_path,
              batch_size = 8,
              num_epochs = 3
              ):

  args = Seq2SeqTrainingArguments(
      file_path,
      evaluation_strategy='epoch',
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      max_steps=int(num_epochs * num_train_examples / batch_size),
      report_to='none',
      #label_names=['labels']
  )

  # Define the trainer, passing in the model, training args, and data generators
  trainer = Seq2SeqTrainer(
      model,
      args,
      train_dataset=train_data,
      eval_dataset=val_data
  )

  trainer.train()

In [None]:
def predict_masked_token_flan_t5(pipeline, context, question):
  input_string = "Question: Based on the context, what number should replace the [MASK] token in the headline. Only provide the number as response. Headline: {1}. Context: {0}. ".format(context,question)
  output = pipeline(input_string, max_new_tokens=5)
  return output[0]['generated_text']

def batch_prediction(data, pipeline):
  data["predicted_labels"] = [ predict_masked_token_flan_t5(pipeline, context, question)  for context, question in zip(data["context"], data["question"]) ]
  return data
  #return predict_masked_token_gemma(data["context"],data["question"])

In [None]:
def csv_to_list(file_path):
  data_list = []
  with open(file_path, 'r', newline='') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        data_list.append(row[0])
  return data_list

In [None]:
def calculate_accuracy(predictions, data):
  matches = []
  for i in range(len(predictions)):
    if predictions[i] == data['labels'][i]:
      matches.append(1)
    else:
      matches.append(0)
  return sum(matches)/len(matches)

In [None]:
def is_decimal(string):
    try:
        float(string)
        return True
    except ValueError:
        return False

In [None]:
def calculate_mean_absolute_percentage_error(predictions, data):
    pct_err_all = []
    pct_err_no_outliers = []
    non_numeric = []
    for i in range(len(predictions)):
      if predictions[i] is None:
        non_numeric.append([i,predictions[i],data['labels'][i]])
      else:
        if is_decimal(predictions[i]) and is_decimal(data['labels'][i].replace(",","")) and float(data['labels'][i].replace(",","")) != 0:
          abs_err = abs( float(predictions[i]) - float(data['labels'][i].replace(",","")) )
          pct_err = abs_err / float(data['labels'][i].replace(",",""))  * 100
          if pct_err <= 1000 : # outlier if wrong by over 1000%
            pct_err_no_outliers.append(pct_err)
          pct_err_all.append(pct_err)
        else:
          non_numeric.append([i,predictions[i],data['labels'][i]])
    print("Count for calculating Mean Absolute Percentage Error of all predictions:", len(pct_err_all))
    print("Count for calculating Mean Absolute Percentage Error of non-outlier predictions:", len(pct_err_no_outliers))
    mape_all = sum(pct_err_all)/len(pct_err_all)
    mape_no_outliers = sum(pct_err_no_outliers)/len(pct_err_no_outliers)
    return [mape_all, mape_no_outliers,non_numeric]

    #      if predictions[i] is not None:


In [None]:
# Function for Symmetric Mean Absolute Percentage Error (SMAPE)
def calculate_symm_mean_absolute_percentage_error(predictions, data):
    pct_err_all = []
    non_calculable = []
    for i in range(len(predictions)):
      if predictions[i] is not None:
        if is_decimal(predictions[i]) and is_decimal(data['labels'][i].replace(",","")) and abs( (float(predictions[i]) + float(data['labels'][i].replace(",","")))/2 ) != 0:
          abs_err = abs( float(predictions[i]) - float(data['labels'][i].replace(",","")) )
          avg_val = abs( (float(predictions[i]) + float(data['labels'][i].replace(",","")))/2 )
          pct_err = abs_err / avg_val * 100
          pct_err_all.append(pct_err)
        else:
          non_calculable.append(i)
    print("Count for calculating Symmetric Mean Absolute Percentage Error of all predictions:", len(pct_err_all))
    print("Count non-calculable predictions:", len(non_calculable))
    smape_all = sum(pct_err_all)/len(pct_err_all)
    return smape_all

In [None]:
def convert_word_to_number(word):
  # Extracts number from string. If number not found, tries to convert the string to number using word-to-number (eg. three -> 3 )
  try:
    number = re.findall(r'[-+]?\d{1,3}(?:,\d{3})*(?:\.\d+)?', word)[0]
    return number.replace(",","")
  except IndexError:
    pass # No number found. Proceed to word to number conversion.

  try:
    number = str(w2n.word_to_num(word))
    return number.replace(",","")
  except ValueError:
        return None

In [None]:
def extract_numbers_from_list(strings):
  unconvertible_words = []
  number_list = []
  for string in strings:
    result = convert_word_to_number(string)
    if result is not None:
      number_list.append(result)
    else:
      number_list.append(None)
      unconvertible_words.append(string)
  return number_list, unconvertible_words

## Intake of train/test data

In [None]:
# Read in processed HF data
train_hf = load_from_disk(PROCESSED_PATH + 'train_triplet.hf')
train_hf_reasoning_small = load_from_disk(PROCESSED_PATH + 'train_triplet_reasoning_small.hf')
train_hf_reasoning = load_from_disk(PROCESSED_PATH + 'train_triplet_reasoning.hf')
val_hf = load_from_disk(PROCESSED_PATH + 'test_triplet.hf')

In [None]:
num_train_examples = len(train_hf)        # 9711 for train_triplet
num_train_examples_reasoning_small = len(train_hf_reasoning_small)  # 2695 for train_triplet_reasoning_small
num_train_examples_reasoning = len(train_hf_reasoning)  # 9711 for train_triplet_reasoning
num_val_examples = len(val_hf)            # 2774


In [None]:
# Load parquet file as streaming dataset
train_hf_mixed = load_dataset('parquet', data_files=PROCESSED_PATH + 'train_triplet.parquet', split="train", streaming=True)
train_hf_reasoning_small = load_dataset('parquet', data_files=PROCESSED_PATH + 'train_triplet_reasoning_small.parquet',split="train", streaming=True)
train_hf_reasoning = load_dataset('parquet', data_files=PROCESSED_PATH + 'train_triplet_reasoning.parquet',split="train", streaming=True)
val_hf = load_dataset('parquet', data_files=PROCESSED_PATH + 'test_triplet.parquet', split="train", streaming=True)

In [None]:
# Tokenize train/val data
train_ds = train_hf.map(preprocess_data, batched=True)
train_ds_reasoning_small = train_hf_reasoning_small.map(preprocess_data, batched=True)
train_ds_reasoning = train_hf_reasoning.map(preprocess_data, batched=True)
val_ds = val_hf.map(preprocess_data, batched=True)

In [None]:
# Dropping dataset to save memory
train_hf=None
train_hf_reasoning_small=None
train_hf_reasoning=None
val_hf=None

## Flan-T5 Baseline

In [None]:
model_id = "google/flan-t5-base"

In [None]:
t5_tokenizer = T5Tokenizer.from_pretrained(model_id)
t5_model = T5ForConditionalGeneration.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
pipe = pipeline("text2text-generation", model=t5_model, tokenizer=t5_tokenizer, device="cuda", torch_dtype=torch.bfloat16)

Device set to use cuda


In [None]:
test_hf = load_from_disk(PROCESSED_PATH + 'test_triplet.hf')

In [None]:
test_hf = test_hf.map(batch_prediction, fn_kwargs={"pipeline": pipe} , batched=True )


Map:   0%|          | 0/2774 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Store predictions as a list
predictions = [test_hf['predicted_labels'][i].strip() for i in range(len(test_hf['predicted_labels']))]

In [None]:
# Store predictions as a list
predictions = [test_hf['predicted_labels'][i].strip() for i in range(len(test_hf['predicted_labels']))]

In [None]:
# Save predictions
with open(PROCESSED_PATH + "flant5_baseline_predictions.csv", "w", newline="") as f:
    writer = csv.writer(f)
    for item in predictions:
        writer.writerow([item])

In [None]:
# Drop test_hf and pipe to save memory
test_hf = None
pipe = None

## Fine-tuning on mixed reasoning dataset

In [None]:
model_id = "google/flan-t5-base"

In [None]:
t5_tokenizer = T5Tokenizer.from_pretrained(model_id)
t5_model = T5ForConditionalGeneration.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
fine_tune(t5_model,
          t5_tokenizer,
          train_ds,
          num_train_examples,
          val_ds,
          file_path = BASE_PATH + '/Models/flan-t5-base-finetuned-alltrain',
          batch_size = 8,
          num_epochs = 3
          )

Epoch,Training Loss,Validation Loss
0,No log,0.410634
1,1.777100,0.385536
2,0.313500,0.361622


TrainOutput(global_step=1111, training_loss=0.9704349287773015, metrics={'train_runtime': 420.9806, 'train_samples_per_second': 21.113, 'train_steps_per_second': 2.639, 'total_flos': 6082014804443136.0, 'train_loss': 0.9704349287773015, 'epoch': 2.332133213321332})

In [None]:
pipe = pipeline("text2text-generation", model=t5_model, tokenizer=t5_tokenizer, device="cuda", torch_dtype=torch.bfloat16)

Device set to use cuda


In [None]:
test_hf = load_from_disk(PROCESSED_PATH + 'test_triplet.hf')

In [None]:
test_hf = test_hf.map(batch_prediction, fn_kwargs={"pipeline": pipe} , batched=True )


Map:   0%|          | 0/2774 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Store predictions as a list
predictions = [test_hf['predicted_labels'][i].strip() for i in range(len(test_hf['predicted_labels']))]

In [None]:
# Save predictions
with open(PROCESSED_PATH + "flant5_ft_mixed_predictions.csv", "w", newline="") as f:
    writer = csv.writer(f)
    for item in predictions:
        writer.writerow([item])

In [None]:
# Drop test_hf and pipe to save memory
test_hf = None
pipe = None

## Fine-tuning on small reasoning dataset

In [None]:
model_id = "google/flan-t5-base"

In [None]:
t5_tokenizer = T5Tokenizer.from_pretrained(model_id)
t5_model = T5ForConditionalGeneration.from_pretrained(model_id)

In [None]:
fine_tune(t5_model,
          t5_tokenizer,
          train_ds_reasoning_small,
          num_train_examples_reasoning_small,
          val_ds,
          file_path = BASE_PATH + '/Models/flan-t5-base-finetuned-reasoning-small',
          batch_size = 8,
          num_epochs = 3
          )

Epoch,Training Loss,Validation Loss
0,No log,0.410634
1,1.777100,0.385536
2,0.313500,0.361622


TrainOutput(global_step=1111, training_loss=0.9704349287773015, metrics={'train_runtime': 420.9806, 'train_samples_per_second': 21.113, 'train_steps_per_second': 2.639, 'total_flos': 6082014804443136.0, 'train_loss': 0.9704349287773015, 'epoch': 2.332133213321332})

In [None]:
pipe = pipeline("text2text-generation", model=t5_model, tokenizer=t5_tokenizer, device="cuda", torch_dtype=torch.bfloat16)

In [None]:
test_hf = load_from_disk(PROCESSED_PATH + 'test_triplet.hf')

In [None]:
test_hf = test_hf.map(batch_prediction)

In [None]:
# Store predictions as a list
predictions = [test_hf['predicted_labels'][i].strip() for i in range(len(test_hf['predicted_labels']))]

In [None]:
# Save predictions
with open(PROCESSED_PATH + "flant5_ft_reasoning_small_predictions.csv", "w", newline="") as f:
    writer = csv.writer(f)
    for item in predictions:
        writer.writerow([item])

In [None]:
# Drop test_hf and pipe to save memory
test_hf = None
pipe = None

## Fine-tuning on large reasoning dataset

In [None]:
model_id = "google/flan-t5-base"

In [None]:
t5_tokenizer = T5Tokenizer.from_pretrained(model_id)
t5_model = T5ForConditionalGeneration.from_pretrained(model_id)

In [None]:
fine_tune(t5_model,
          t5_tokenizer,
          train_ds_reasoning,
          num_train_examples_reasoning,
          val_ds,
          file_path = BASE_PATH + '/Models/flan-t5-base-finetuned-reasoning',
          batch_size = 8,
          num_epochs = 3
          )

Epoch,Training Loss,Validation Loss
0,No log,0.410634
1,1.777100,0.385536
2,0.313500,0.361622


TrainOutput(global_step=1111, training_loss=0.9704349287773015, metrics={'train_runtime': 420.9806, 'train_samples_per_second': 21.113, 'train_steps_per_second': 2.639, 'total_flos': 6082014804443136.0, 'train_loss': 0.9704349287773015, 'epoch': 2.332133213321332})

In [None]:
pipe = pipeline("text2text-generation", model=t5_model, tokenizer=t5_tokenizer, device="cuda", torch_dtype=torch.bfloat16)

In [None]:
test_hf = load_from_disk(PROCESSED_PATH + 'test_triplet.hf')

In [None]:
test_hf = test_hf.map(batch_prediction)

In [None]:
# Store predictions as a list
predictions = [test_hf['predicted_labels'][i].strip() for i in range(len(test_hf['predicted_labels']))]

In [None]:
# Save predictions
with open(PROCESSED_PATH + "flant5_ft_reasoning_predictions.csv", "w", newline="") as f:
    writer = csv.writer(f)
    for item in predictions:
        writer.writerow([item])

In [None]:
# Drop test_hf and pipe to save memory
test_hf = None
pipe = None

## Evalution

In [None]:
test_bert_hf = load_from_disk(PROCESSED_PATH + 'test_bert.hf')

In [None]:
# Extract reasoning-only indices
reasoning_indices = [i for i, example in enumerate(test_bert_hf) if example["need_reasoning"] == 1]

# Extract non-reasoning-only indices
non_reasoning_indices = [i for i, example in enumerate(test_bert_hf) if example["need_reasoning"] == 0]

In [None]:
test_bert = None

In [None]:
test_hf = load_from_disk(PROCESSED_PATH + 'test_triplet.hf')

In [None]:
predictions_baseline = csv_to_list(PROCESSED_PATH + "flant5_baseline_predictions.csv")

In [None]:
numeric_pred_baseline, unconvertible_pred_baseline = extract_numbers_from_list(predictions_baseline)

In [None]:
print("Accuracy of FlanT5 Baseline:", calculate_accuracy(predictions_baseline, test_hf) )
print("Mean Absolute Percentage Error of FlanT5 Baseline:", calculate_mean_absolute_percentage_error(numeric_pred_baseline, test_hf)[:2])
print("Symmetric Mean Absolute Percentage Error of FlanT5 Baseline:", calculate_symm_mean_absolute_percentage_error(numeric_pred_baseline, test_hf))

Accuracy of FlanT5 Baseline: 0.38067772170151404
Count for calculating Mean Absolute Percentage Error of all predictions: 2741
Count for calculating Mean Absolute Percentage Error of non-outlier predictions: 2520
Mean Absolute Percentage Error of FlanT5 Baseline: [73162.57219554196, 72.10148208253915]
Count for calculating Symmetric Mean Absolute Percentage Error of all predictions: 2741
Count non-calculable predictions: 27
Symmetric Mean Absolute Percentage Error of FlanT5 Baseline: 81.52391034615978


In [None]:
predictions_baseline_reasoning = [predictions_baseline[i] for i in reasoning_indices]
predictions_baseline_non_reasoning = [predictions_baseline[i] for i in non_reasoning_indices]ca
numeric_pred_baseline_reasoning = [numeric_pred_baseline[i] for i in reasoning_indices]
numeric_pred_baseline_non_reasoning = [numeric_pred_baseline[i] for i in non_reasoning_indices]

In [None]:
print("Accuracy of FlanT5 Baseline on Reasoning questions:", calculate_accuracy(predictions_baseline_reasoning, test_hf.select(reasoning_indices)) )
print("Mean Absolute Percentage Error of FlanT5 Baseline on Reasoning questions:", calculate_mean_absolute_percentage_error(numeric_pred_baseline_reasoning, test_hf.select(reasoning_indices))[:2])
print("Symmetric Mean Absolute Percentage Error of FlanT5 Baseline on Reasoning questions:", calculate_symm_mean_absolute_percentage_error(numeric_pred_baseline_reasoning, test_hf.select(reasoning_indices)))

Accuracy of FlanT5 Baseline on Reasoning questions: 0.3170454545454545
Count for calculating Mean Absolute Percentage Error of all predictions: 868
Count for calculating Mean Absolute Percentage Error of non-outlier predictions: 736
Mean Absolute Percentage Error of FlanT5 Baseline on Reasoning questions: [97335.92088513132, 123.55403174018883]
Count for calculating Symmetric Mean Absolute Percentage Error of all predictions: 868
Count non-calculable predictions: 9
Symmetric Mean Absolute Percentage Error of FlanT5 Baseline on Reasoning questions: 132.9595300165889


In [None]:
print("Accuracy of FlanT5 Baseline on Non-Reasoning questions:", calculate_accuracy(predictions_baseline_non_reasoning, test_hf.select(non_reasoning_indices)) )
print("Mean Absolute Percentage Error of FlanT5 Baseline on Non-Reasoning questions:", calculate_mean_absolute_percentage_error(numeric_pred_baseline_non_reasoning, test_hf.select(non_reasoning_indices))[:2])
print("Symmetric Mean Absolute Percentage Error of FlanT5 Baseline on Non-Reasoning questions:", calculate_symm_mean_absolute_percentage_error(numeric_pred_baseline_non_reasoning, test_hf.select(non_reasoning_indices)))

Accuracy of FlanT5 Baseline on Non-Reasoning questions: 0.4102428722280887
Count for calculating Mean Absolute Percentage Error of all predictions: 1865
Count for calculating Mean Absolute Percentage Error of non-outlier predictions: 1793
Mean Absolute Percentage Error of FlanT5 Baseline on Non-Reasoning questions: [16903.971784995712, 61.72149599172018]
Count for calculating Symmetric Mean Absolute Percentage Error of all predictions: 1865
Count non-calculable predictions: 26
Symmetric Mean Absolute Percentage Error of FlanT5 Baseline on Non-Reasoning questions: 74.76209581634625


In [None]:
prediction_mixed_ft = csv_to_list(PROCESSED_PATH + "flant5_ft_mixed_predictions.csv")

In [None]:
numeric_pred_mixed_ft, unconvertible_pred_mixed_ft = extract_numbers_from_list(prediction_mixed_ft)
print("Number of words converted to numeric:",len(numeric_pred_mixed_ft))
print("Number of words NOT converted to numeric:",len(unconvertible_pred_mixed_ft))

Number of words converted to numeric: 2774
Number of words NOT converted to numeric: 0


In [None]:
print("Accuracy of FlanT5 Mixed Fine-tune:", calculate_accuracy(prediction_mixed_ft, test_hf) )
print("Mean Absolute Percentage Error of FlanT5 Mixed Fine-tune:", calculate_mean_absolute_percentage_error(numeric_pred_mixed_ft, test_hf)[:2])
print("Symmetric Mean Absolute Percentage Error of FlanT5 Mixed Fine-tune:", calculate_symm_mean_absolute_percentage_error(numeric_pred_mixed_ft, test_hf))

Accuracy of FlanT5 Mixed Fine-tune: 0.7880317231434751
Count for calculating Mean Absolute Percentage Error of all predictions: 2747
Count for calculating Mean Absolute Percentage Error of non-outlier predictions: 2712
Mean Absolute Percentage Error of FlanT5 Mixed Fine-tune: [370.10986528773975, 22.748461850220387]
Count for calculating Symmetric Mean Absolute Percentage Error of all predictions: 2748
Count non-calculable predictions: 26
Symmetric Mean Absolute Percentage Error of FlanT5 Mixed Fine-tune: 28.28969568677206


In [None]:
predictions_mixed_ft_reasoning = [prediction_mixed_ft[i] for i in reasoning_indices]
predictions_mixed_ft_non_reasoning = [prediction_mixed_ft[i] for i in non_reasoning_indices]
numeric_pred_mixed_ft_reasoning = [prediction_mixed_ft[i] for i in reasoning_indices]
numeric_pred_mixed_ft_non_reasoning = [prediction_mixed_ft[i] for i in non_reasoning_indices]

In [None]:
print("Accuracy of FlanT5 Mixed Fine-tune on Reasoning questions:", calculate_accuracy(predictions_mixed_ft_reasoning, test_hf.select(reasoning_indices)) )
print("Mean Absolute Percentage Error of FlanT5 Mixed Fine-tune on Reasoning questions:", calculate_mean_absolute_percentage_error(numeric_pred_mixed_ft_reasoning, test_hf.select(reasoning_indices))[:2])
print("Symmetric Mean Absolute Percentage Error of FlanT5 Mixed Fine-tune on Reasoning questions:", calculate_symm_mean_absolute_percentage_error(numeric_pred_mixed_ft_reasoning, test_hf.select(reasoning_indices)))

Accuracy of FlanT5 Baseline on Reasoning questions: 0.6897727272727273
Count for calculating Mean Absolute Percentage Error of all predictions: 872
Count for calculating Mean Absolute Percentage Error of non-outlier predictions: 849
Mean Absolute Percentage Error of FlanT5 Baseline on Reasoning questions: [428.7359057935905, 27.94687152059774]
Count for calculating Symmetric Mean Absolute Percentage Error of all predictions: 872
Count non-calculable predictions: 8
Symmetric Mean Absolute Percentage Error of FlanT5 Baseline on Reasoning questions: 27.23348078932998


In [None]:
print("Accuracy of FlanT5 Mixed Fine-tune on Non-Reasoning questions:", calculate_accuracy(predictions_mixed_ft_non_reasoning, test_hf.select(non_reasoning_indices)) )
print("Mean Absolute Percentage Error of FlanT5 Mixed Fine-tune on Non-Reasoning questions:", calculate_mean_absolute_percentage_error(numeric_pred_mixed_ft_non_reasoning, test_hf.select(non_reasoning_indices))[:2])
print("Symmetric Mean Absolute Percentage Error of FlanT5 Mixed Fine-tune on Non-Reasoning questions:", calculate_symm_mean_absolute_percentage_error(numeric_pred_mixed_ft_non_reasoning, test_hf.select(non_reasoning_indices)))

Accuracy of FlanT5 Baseline on Non-Reasoning questions: 0.8336853220696938
Count for calculating Mean Absolute Percentage Error of all predictions: 1845
Count for calculating Mean Absolute Percentage Error of non-outlier predictions: 1836
Mean Absolute Percentage Error of FlanT5 Baseline on Non-Reasoning questions: [38.512925808365345, 16.922000749169953]
Count for calculating Symmetric Mean Absolute Percentage Error of all predictions: 1846
Count non-calculable predictions: 48
Symmetric Mean Absolute Percentage Error of FlanT5 Baseline on Non-Reasoning questions: 21.942375232316888


In [None]:
prediction_reasoning_small_ft = csv_to_list(PROCESSED_PATH + "flant5_ft_reasoning_small_predictions.csv")

In [None]:
numeric_pred_reasoning_small_ft, unconvertible_pred_reasoning_small_ft = extract_numbers_from_list(prediction_reasoning_small_ft)
print("Number of words converted to numeric:",len(numeric_pred_reasoning_small_ft))
print("Number of words NOT converted to numeric:",len(unconvertible_pred_reasoning_small_ft))

Number of words converted to numeric: 2774
Number of words NOT converted to numeric: 0


In [None]:
print("Accuracy of FlanT5 Small Reasoning Fine-tune:", calculate_accuracy(prediction_reasoning_small_ft, test_hf) )
print("Mean Absolute Percentage Error of FlanT5 Small Reasoning Fine-tune:", calculate_mean_absolute_percentage_error(numeric_pred_reasoning_small_ft, test_hf)[:2])
print("Symmetric Mean Absolute Percentage Error of FlanT5 Small Reasoning Fine-tune:", calculate_symm_mean_absolute_percentage_error(numeric_pred_reasoning_small_ft, test_hf))

Accuracy of FlanT5 Small Reasoning Fine-tune: 0.6377072819033887
Count for calculating Mean Absolute Percentage Error of all predictions: 2747
Count for calculating Mean Absolute Percentage Error of non-outlier predictions: 2714
Mean Absolute Percentage Error of FlanT5 Small Reasoning Fine-tune: [259.15432058784984, 35.665033006882595]
Count for calculating Symmetric Mean Absolute Percentage Error of all predictions: 2748
Count non-calculable predictions: 26
Symmetric Mean Absolute Percentage Error of FlanT5 Small Reasoning Fine-tune: 46.456866318981724


In [None]:
predictions_reasoning_small_ft_reasoning = [prediction_reasoning_small_ft[i] for i in reasoning_indices]
predictions_reasoning_small_ft_non_reasoning = [prediction_reasoning_small_ft[i] for i in non_reasoning_indices]
numeric_pred_reasoning_small_ft_reasoning = [prediction_reasoning_small_ft[i] for i in reasoning_indices]
numeric_pred_reasoning_small_ft_non_reasoning = [prediction_reasoning_small_ft[i] for i in non_reasoning_indices]

In [None]:
print("Accuracy of FlanT5 Small Reasoning on Reasoning questions:", calculate_accuracy(predictions_reasoning_small_ft_reasoning, test_hf.select(reasoning_indices)) )
print("Mean Absolute Percentage Error of FlanT5 Small Reasoning on Reasoning questions:", calculate_mean_absolute_percentage_error(numeric_pred_reasoning_small_ft_reasoning, test_hf.select(reasoning_indices))[:2])
print("Symmetric Mean Absolute Percentage Error of FlanT5 Small Reasoning on Reasoning questions:", calculate_symm_mean_absolute_percentage_error(numeric_pred_reasoning_small_ft_reasoning, test_hf.select(reasoning_indices)))

Accuracy of FlanT5 Baseline on Reasoning questions: 0.6920454545454545
Count for calculating Mean Absolute Percentage Error of all predictions: 874
Count for calculating Mean Absolute Percentage Error of non-outlier predictions: 857
Mean Absolute Percentage Error of FlanT5 Baseline on Reasoning questions: [2226.330410996276, 34.54591474753349]
Count for calculating Symmetric Mean Absolute Percentage Error of all predictions: 874
Count non-calculable predictions: 6
Symmetric Mean Absolute Percentage Error of FlanT5 Baseline on Reasoning questions: 31.99321865277406


In [None]:
print("Accuracy of FlanT5 Small Reasoning on Non-Reasoning questions:", calculate_accuracy(predictions_reasoning_small_ft_non_reasoning, test_hf.select(non_reasoning_indices)) )
print("Mean Absolute Percentage Error of FlanT5 Small Reasoning on Non-Reasoning questions:", calculate_mean_absolute_percentage_error(numeric_pred_reasoning_small_ft_non_reasoning, test_hf.select(non_reasoning_indices))[:2])
print("Symmetric Mean Absolute Percentage Error of FlanT5 Small Reasoning on Non-Reasoning questions:", calculate_symm_mean_absolute_percentage_error(numeric_pred_reasoning_small_ft_non_reasoning, test_hf.select(non_reasoning_indices)))

Accuracy of FlanT5 Baseline on Non-Reasoning questions: 0.6124604012671595
Count for calculating Mean Absolute Percentage Error of all predictions: 1857
Count for calculating Mean Absolute Percentage Error of non-outlier predictions: 1842
Mean Absolute Percentage Error of FlanT5 Baseline on Non-Reasoning questions: [55594.79093800521, 37.40523001947186]
Count for calculating Symmetric Mean Absolute Percentage Error of all predictions: 1858
Count non-calculable predictions: 36
Symmetric Mean Absolute Percentage Error of FlanT5 Baseline on Non-Reasoning questions: 52.88041718514905


In [None]:
prediction_reasoning_ft = csv_to_list(PROCESSED_PATH + "flant5_ft_reasoning_predictions.csv")


In [None]:
numeric_pred_reasoning_ft, unconvertible_pred_reasoning_ft = extract_numbers_from_list(prediction_reasoning_ft)
print("Number of words converted to numeric:",len(numeric_pred_reasoning_ft))
print("Number of words NOT converted to numeric:",len(unconvertible_pred_reasoning_ft))

Number of words converted to numeric: 2774
Number of words NOT converted to numeric: 0


In [None]:
print("Accuracy of FlanT5 Large Reasoning Fine-tune:", calculate_accuracy(prediction_reasoning_ft, test_hf) )
print("Mean Absolute Percentage Error of FlanT5 Large Reasoning Fine-tune:", calculate_mean_absolute_percentage_error(numeric_pred_reasoning_ft, test_hf)[:2])
print("Symmetric Mean Absolute Percentage Error of FlanT5 Large Reasoning Fine-tune:", calculate_symm_mean_absolute_percentage_error(numeric_pred_reasoning_ft, test_hf))

Accuracy of FlanT5 Large Reasoning Fine-tune: 0.6333813987022351
Count for calculating Mean Absolute Percentage Error of all predictions: 2747
Count for calculating Mean Absolute Percentage Error of non-outlier predictions: 2725
Mean Absolute Percentage Error of FlanT5 Large Reasoning Fine-tune: [124.62071994544293, 30.06804481345757]
Count for calculating Symmetric Mean Absolute Percentage Error of all predictions: 2748
Count non-calculable predictions: 26
Symmetric Mean Absolute Percentage Error of FlanT5 Large Reasoning Fine-tune: 44.811418454589685


In [None]:
predictions_reasoning_large_ft_reasoning = [prediction_reasoning_ft[i] for i in reasoning_indices]
predictions_reasoning_large_ft_non_reasoning = [prediction_reasoning_ft[i] for i in non_reasoning_indices]
numeric_pred_reasoning_large_ft_reasoning = [prediction_reasoning_ft[i] for i in reasoning_indices]
numeric_pred_reasoning_large_ft_non_reasoning = [prediction_reasoning_ft[i] for i in non_reasoning_indices]

In [None]:
print("Accuracy of FlanT5 Large Reasoning Fine-Tune on Reasoning questions:", calculate_accuracy(predictions_reasoning_large_ft_reasoning, test_hf.select(reasoning_indices)) )
print("Mean Absolute Percentage Error of FlanT5 Large Reasoning Fine-Tune on Reasoning questions:", calculate_mean_absolute_percentage_error(numeric_pred_reasoning_large_ft_reasoning, test_hf.select(reasoning_indices))[:2])
print("Symmetric Mean Absolute Percentage Error of FlanT5 Large Reasoning Fine-Tune on Reasoning questions:", calculate_symm_mean_absolute_percentage_error(numeric_pred_reasoning_large_ft_reasoning, test_hf.select(reasoning_indices)))

Accuracy of FlanT5 Large Reasoning Fine-Tune on Reasoning questions: 0.7454545454545455
Count for calculating Mean Absolute Percentage Error of all predictions: 879
Count for calculating Mean Absolute Percentage Error of non-outlier predictions: 869
Mean Absolute Percentage Error of FlanT5 Large Reasoning Fine-Tune on Reasoning questions: [234.25623175671015, 22.207189231345726]
Count for calculating Symmetric Mean Absolute Percentage Error of all predictions: 879
Count non-calculable predictions: 1
Symmetric Mean Absolute Percentage Error of FlanT5 Large Reasoning Fine-Tune on Reasoning questions: 23.65742527143716


In [None]:
print("Accuracy of FlanT5 Large Reasoning Fine-Tune on Non-Reasoning questions:", calculate_accuracy(predictions_reasoning_large_ft_non_reasoning, test_hf.select(non_reasoning_indices)) )
print("Mean Absolute Percentage Error of FlanT5 Large Reasoning Fine-Tune on Non-Reasoning questions:", calculate_mean_absolute_percentage_error(numeric_pred_reasoning_large_ft_non_reasoning, test_hf.select(non_reasoning_indices))[:2])
print("Symmetric Mean Absolute Percentage Error of FlanT5 Large Reasoning Fine-Tune on Non-Reasoning questions:", calculate_symm_mean_absolute_percentage_error(numeric_pred_reasoning_large_ft_non_reasoning, test_hf.select(non_reasoning_indices)))

Accuracy of FlanT5 Large Reasoning Fine-Tune on Non-Reasoning questions: 0.5813093980992609
Count for calculating Mean Absolute Percentage Error of all predictions: 1866
Count for calculating Mean Absolute Percentage Error of non-outlier predictions: 1853
Mean Absolute Percentage Error of FlanT5 Large Reasoning Fine-Tune on Non-Reasoning questions: [74.82031006336436, 33.04344468687448]
Count for calculating Symmetric Mean Absolute Percentage Error of all predictions: 1867
Count non-calculable predictions: 27
Symmetric Mean Absolute Percentage Error of FlanT5 Large Reasoning Fine-Tune on Non-Reasoning questions: 53.56712313962281
