In [None]:
import warnings
warnings.filterwarnings('ignore')
import locale
locale.getpreferredencoding = lambda x=False: "UTF-8"
! pip install -q transformers datasets evaluate
from transformers import AutoModelForCausalLM, BloomTokenizerFast, set_seed
# device = 'cuda'
import torch
import gc
device = 'cuda'
from datasets import load_dataset

In [None]:
def get_model(model_name):
  """Load a pretrained causal language model and move it to ``device``.

  Args:
    model_name: Checkpoint identifier handed to
      ``AutoModelForCausalLM.from_pretrained``.

  Returns:
    The loaded model after ``.to(device)``.
  """
  loaded = AutoModelForCausalLM.from_pretrained(model_name)
  return loaded.to(device)

def tokenize(text):
   """Encode ``text`` with the shared ``tokenizer`` and move the ids to ``device``.

   Args:
     text: String to encode.

   Returns:
     The PyTorch tensor produced by ``tokenizer.encode(..., return_tensors="pt")``,
     placed on ``device``.
   """
   token_ids = tokenizer.encode(text, return_tensors="pt")
   return token_ids.to(device)

# Shared tokenizer, loaded once and read as a module-level global by the
# tokenizing helpers in this notebook.
# NOTE(review): the same "bigscience/bloom" tokenizer is used for every
# checkpoint listed in `models`; presumably they share one vocabulary — confirm.
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom")


In [None]:
import numpy as np

def get_classification(model, example, target_id):
    """Label one example by whether the model generates the target token.

    The example's ``'text'`` is used as the prompt. The model generates between
    1 and 5 new tokens, and the prediction is 1 when ``target_id`` appears among
    those new tokens, otherwise 0.

    Args:
        model: Object exposing ``generate`` (as returned by ``get_model``).
        example: Mapping with a ``'text'`` entry; it is updated in place.
        target_id: Token id to look for in the generated continuation.

    Returns:
        The same ``example`` with a ``'prediction'`` entry (0 or 1) added.
    """
    # Reuse the shared helper so prompts are encoded the same way everywhere.
    inputs = tokenize(example.get('text'))
    # Only the generated ids are inspected, so per-step scores are not requested.
    outputs = model.generate(
        inputs,
        return_dict_in_generate=True,
        min_new_tokens=1,
        max_new_tokens=5,
    )
    # Skip the prompt-length prefix so only newly generated ids are searched.
    generated_tokens = outputs.sequences[0][inputs.shape[1]:]
    example['prediction'] = int(target_id in generated_tokens)
    return example

def classify_tweets(model, dataset, target):
    """Add a ``'prediction'`` entry to every example of ``dataset``.

    Only the first token id of the tokenized ``target`` is used as the marker
    that ``get_classification`` searches for.

    Args:
        model: Model forwarded to ``get_classification``.
        dataset: Object exposing ``map``; each example is passed through
            ``get_classification``.
        target: String whose first token id marks a positive prediction.

    Returns:
        Whatever ``dataset.map`` returns for the per-example classifier.
    """
    first_target_id = tokenize(target)[0][0]

    def _classify(example):
        return get_classification(model, example, first_target_id)

    return dataset.map(_classify)


In [None]:
# Checkpoints to evaluate: three BLOOM sizes followed by the matching BLOOMZ sizes.
models = [
    'bigscience/bloom-560m',
    'bigscience/bloom-1b1',
    'bigscience/bloom-1b7',
    'bigscience/bloomz-560m',
    'bigscience/bloomz-1b1',
    'bigscience/bloomz-1b7',
]

In [None]:
import evaluate
def calc_f1_score(dataset):
    """Score ``dataset['prediction']`` against ``dataset['label']`` with the "f1" metric.

    Returns:
        The result of the ``evaluate`` "f1" metric's ``compute`` call.
    """
    metric = evaluate.load("f1")
    return metric.compute(
        predictions=dataset['prediction'],
        references=dataset['label'],
    )

def calc_accuracy(dataset):
    """Score ``dataset['prediction']`` against ``dataset['label']`` with the "accuracy" metric.

    Returns:
        The result of the ``evaluate`` "accuracy" metric's ``compute`` call.
    """
    metric = evaluate.load("accuracy")
    return metric.compute(
        predictions=dataset['prediction'],
        references=dataset['label'],
    )

def calc_precision(dataset):
    """Score ``dataset['prediction']`` against ``dataset['label']`` with the "precision" metric.

    Returns:
        The result of the ``evaluate`` "precision" metric's ``compute`` call.
    """
    metric = evaluate.load("precision")
    return metric.compute(
        predictions=dataset['prediction'],
        references=dataset['label'],
    )

def calc_recall(dataset):
    """Score ``dataset['prediction']`` against ``dataset['label']`` with the "recall" metric.

    Returns:
        The result of the ``evaluate`` "recall" metric's ``compute`` call.
    """
    metric = evaluate.load("recall")
    return metric.compute(
        predictions=dataset['prediction'],
        references=dataset['label'],
    )


In [None]:
import json
def evaluate_models(models, dataset_name, dataset, target=None):
  """Run every model over ``dataset`` and collect its classification metrics.

  For each model name the function loads the model, classifies the dataset,
  computes F1 / accuracy / precision / recall, writes the predictions to
  ``<model>_<dataset>.csv`` and rewrites ``evaluation_metrics_<dataset>.json``
  with the scalar metrics gathered so far (so partial results survive a crash
  on a later model).

  Args:
    models: Iterable of model checkpoint names.
    dataset_name: Name used in log output and in the output file names
      (any ``/`` is replaced by ``_`` in file names).
    dataset: Dataset forwarded to ``classify_tweets``.
    target: Target string forwarded to ``classify_tweets``. Must be given
      whenever ``models`` is non-empty.

  Returns:
    Dict mapping each model name to a dict with the raw metric results under
    ``'f1'``, ``'accuracy'``, ``'precision'``, ``'recall'`` and the prediction
    dataset under ``'dataset'``.

  Raises:
    ValueError: If ``models`` is non-empty and ``target`` is ``None``.
  """
  # Materialise once: `models` is iterated several times below.
  models = list(models)
  if models and target is None:
    # Fail before any model is downloaded or moved to the device.
    raise ValueError(
        "evaluate_models() needs a `target` string: classify_tweets() cannot "
        "produce predictions without one."
    )

  evaluation_data = {model_name: {} for model_name in models}
  evaluation_metrics = {model_name: {} for model_name in models}
  # Keep output files in the working directory even for "owner/name" datasets.
  file_dataset_name = str(dataset_name).replace('/', '_')

  for model_name in models:
    model = get_model(model_name)
    try:
      prediction_dataset = classify_tweets(model, dataset, target)
    finally:
      # Release the model even when classification fails, so the next
      # checkpoint is not loaded on top of it.
      del model
      gc.collect()
      torch.cuda.empty_cache()

    results = {
        'f1': calc_f1_score(prediction_dataset),
        'accuracy': calc_accuracy(prediction_dataset),
        'precision': calc_precision(prediction_dataset),
        'recall': calc_recall(prediction_dataset),
    }
    evaluation_data[model_name].update(results)
    # Expose the predictions so callers can plot or inspect them.
    evaluation_data[model_name]['dataset'] = prediction_dataset
    # Each metric result is a dict keyed by the metric's own name.
    evaluation_metrics[model_name] = {
        metric_name: result.get(metric_name)
        for metric_name, result in results.items()
    }

    # Drop any "owner/" prefix instead of assuming its length.
    short_model_name = model_name.rsplit('/', 1)[-1]
    prediction_dataset.to_csv(f'{short_model_name}_{file_dataset_name}.csv')
    print(f'{model_name} {dataset_name} evaluation complete.\n {evaluation_metrics[model_name]}')

    with open(f'evaluation_metrics_{file_dataset_name}.json', 'w') as f:
      json.dump(evaluation_metrics, f, indent=4)
  return evaluation_data

In [None]:
def transform_dataset(dataset, transform_function):
  """Apply ``transform_function`` to each example via ``dataset.map``.

  Args:
    dataset: Object exposing ``map``.
    transform_function: Callable taking one example and returning its
      transformed form.

  Returns:
    The result of ``dataset.map``.
  """
  def _apply(example):
    return transform_function(example)

  return dataset.map(_apply)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

def display_cm(dataset, model_name, class_labels=None):
  """Plot the confusion matrix of ``dataset['prediction']`` vs ``dataset['label']``.

  Args:
    dataset: Mapping-like object with ``'prediction'`` and ``'label'`` columns.
    model_name: Name shown in the figure title.
    class_labels: Labels (and their order) for the matrix axes. Defaults to
      ``[0, 1, 2]``.
  """
  if class_labels is None:
    class_labels = [0, 1, 2]
  class_labels = list(class_labels)

  cm = confusion_matrix(dataset['label'], dataset['prediction'], labels=class_labels)

  # Draw on an explicitly created axes: when no axes is passed,
  # ConfusionMatrixDisplay.plot() opens its own figure, which would leave a
  # separate blank figure behind and ignore the requested figsize.
  _, ax = plt.subplots(figsize=(8, 6))
  disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
  disp.plot(ax=ax)
  ax.set_title(f'{model_name} Confusion Matrix')
  plt.show()


In [None]:
def tokenization(example):
    """Run the shared ``tokenizer`` on the ``"text"`` field of ``example``.

    Returns:
        Whatever calling ``tokenizer`` on that field returns.
    """
    text = example["text"]
    return tokenizer(text)

In [None]:
# Dataset identifiers to evaluate; the loop below is a no-op while this is empty.
dataset_names = []
for dataset_name in dataset_names:
  dataset = load_dataset(dataset_name, split="test")
  # `Dataset.map` spells this keyword `batched`.
  dataset = dataset.map(tokenization, batched=True)
  # NOTE(review): classify_tweets() requires a `target` string, but none is
  # passed from here — confirm how the target is meant to be supplied.
  results_by_model = evaluate_models(models, dataset_name, dataset)
  # Distinct names for the overall results and the per-model entry, so the
  # loop does not rebind the mapping it is iterating over.
  for model_name, model_results in results_by_model.items():
    print(f'{model_name} F1: {model_results.get("f1")}')
    # Only plot when the per-model results include the prediction dataset.
    predictions = model_results.get('dataset')
    if predictions is not None:
      display_cm(predictions, model_name)