In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from collections import Counter, defaultdict
import string
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn.metrics import accuracy_score
import lime
from lime.lime_text import LimeTextExplainer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
dataset = load_dataset("snli")

# Initialize model + tokenizer
model_name = "google/electra-small-discriminator"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
if hasattr(model, 'electra'):
        for param in model.electra.parameters():
            if not param.is_contiguous():
                param.data = param.data.contiguous()
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def preprocess(example):
    return tokenizer(example['premise'], example['hypothesis'], truncation=True, padding='max_length', max_length=tokenizer.model_max_length)

dataset = dataset.filter(lambda ex: ex['label'] != -1)
encoded_dataset = dataset.map(preprocess, batched=True)
encoded_dataset = encoded_dataset.rename_column("label", "labels")  # Ensure labels are named correctly
encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['validation'],
)
trainer.train()

# Save the trained model and tokenizer
output_dir = "./electra-snli-model"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

In [6]:
output_dir = "./model"

model = AutoModelForSequenceClassification.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

In [7]:
#Evaluate model accuracy on SNLI validation

# Define the compute_metrics function
def compute_metrics(pred):
    predictions = pred.predictions.argmax(axis=-1)
    labels = pred.label_ids
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
)

# Reinitialize the Trainer with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=encoded_dataset['validation'],
    compute_metrics=compute_metrics,
)

# Evaluate the model
validation_results = trainer.evaluate()
print(f"Validation accuracy: {validation_results['eval_accuracy']:.4f}")


100%|██████████| 616/616 [13:47<00:00,  1.34s/it]

Validation accuracy: 0.8953





In [8]:
# Evaluate model accuracy on MNLI validation
mnli = load_dataset("multi_nli")

In [9]:
mnli = mnli.filter(lambda ex: ex['label'] != -1)
def preprocess_mnli(examples):
    return tokenizer(
        examples["premise"],  # MNLI uses "premise" and "hypothesis"
        examples["hypothesis"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

# Apply preprocessing
tokenized_mnli = mnli.map(preprocess_mnli, batched=True)

# Ensure column naming consistency
tokenized_mnli = tokenized_mnli.rename_column("label", "labels")  # Ensure the label column is named 'labels'
tokenized_mnli.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map: 100%|██████████| 392702/392702 [01:04<00:00, 6120.04 examples/s]
Map: 100%|██████████| 9815/9815 [00:01<00:00, 6294.71 examples/s]
Map: 100%|██████████| 9832/9832 [00:01<00:00, 5158.84 examples/s]


In [10]:
mnli_results = trainer.evaluate(eval_dataset=tokenized_mnli["validation_matched"])
print(f"MNLI Matched accuracy: {mnli_results['eval_accuracy']:.4f}")

mnli_results_mismatched = trainer.evaluate(eval_dataset=tokenized_mnli["validation_mismatched"])
print(f"MNLI Mismatched accuracy: {mnli_results_mismatched['eval_accuracy']:.4f}")


100%|██████████| 614/614 [02:11<00:00,  4.67it/s]


MNLI Matched accuracy: 0.7018


100%|██████████| 615/615 [02:15<00:00,  4.55it/s]

MNLI Mismatched accuracy: 0.7122





In [19]:
hans = load_dataset("hans", trust_remote_code=True)


Downloading data: 100%|██████████| 15.5M/15.5M [00:01<00:00, 11.4MB/s]
Downloading data: 100%|██████████| 15.5M/15.5M [00:01<00:00, 12.3MB/s]
Generating train split: 100%|██████████| 30000/30000 [00:01<00:00, 24605.39 examples/s]
Generating validation split: 100%|██████████| 30000/30000 [00:01<00:00, 25399.38 examples/s]


In [21]:
def preprocess_hans(examples):
    return tokenizer(
        examples["premise"],  # HANS uses "sentence1" and "sentence2"
        examples["hypothesis"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

# Apply preprocessing
tokenized_hans = hans.map(preprocess_hans, batched=True)

# Ensure column naming consistency
tokenized_hans = tokenized_hans.rename_column("label", "labels")  # Ensure the label column is named 'labels'
tokenized_hans.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map: 100%|██████████| 30000/30000 [00:02<00:00, 12069.72 examples/s]
Map: 100%|██████████| 30000/30000 [00:02<00:00, 12507.61 examples/s]


In [None]:
hans_results = trainer.evaluate(eval_dataset=tokenized_hans)


100%|██████████| 1875/1875 [06:56<00:00,  4.50it/s]
100%|██████████| 1875/1875 [06:51<00:00,  4.55it/s]


KeyError: 'eval_accuracy'

In [27]:
print(f"HANS accuracy: {hans_results['eval_validation_accuracy']:.4f}")
print(f"HANS accuracy: {hans_results['eval_train_accuracy']:.4f}")


HANS accuracy: 0.4933
HANS accuracy: 0.4935


In [13]:
anli = load_dataset("anli")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train_r1 split: 100%|██████████| 16946/16946 [00:00<00:00, 323808.78 examples/s]
Generating dev_r1 split: 100%|██████████| 1000/1000 [00:00<00:00, 249943.63 examples/s]
Generating test_r1 split: 100%|██████████| 1000/1000 [00:00<00:00, 333172.13 examples/s]
Generating train_r2 split: 100%|██████████| 45460/45460 [00:00<00:00, 1009985.06 examples/s]
Generating dev_r2 split: 100%|██████████| 1000/1000 [00:00<00:00, 333278.03 examples/s]
Generating test_r2 split: 100%|██████████| 1000/1000 [00:00<00:00, 333172.13 examples/s]
Generating train_r3 split: 100%|██████████| 100459/100459 [00:00<00:00, 1141309.76 examples/s]
Generating dev_r3 split: 100%|██████████| 1200/1200 [00:00<00:00, 299914.48 examples/s]
Generating test_r

In [None]:
def preprocess_anli(examples):
    return tokenizer(
        examples["premise"],  # ANLI uses "premise" and "hypothesis"
        examples["hypothesis"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

# Apply preprocessing to all rounds of ANLI
tokenized_anli = {}
for round_name in ["test_r1", "test_r2", "test_r3"]:
    tokenized_anli[round_name] = anli[round_name].map(preprocess_anli, batched=True)
    tokenized_anli[round_name] = tokenized_anli[round_name].rename_column("label", "labels")
    tokenized_anli[round_name].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7517.13 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 3815.91 examples/s]
Map: 100%|██████████| 1200/1200 [00:00<00:00, 8161.37 examples/s]


In [28]:
for round_name, dataset in tokenized_anli.items():
    anli_results = trainer.evaluate(eval_dataset=dataset)
    print(f"ANLI {round_name} accuracy: {anli_results['eval_accuracy']:.4f}")


100%|██████████| 63/63 [00:12<00:00,  4.97it/s]


ANLI dev_r1 accuracy: 0.3170


100%|██████████| 63/63 [00:13<00:00,  4.79it/s]


ANLI dev_r2 accuracy: 0.3210


100%|██████████| 75/75 [00:16<00:00,  4.67it/s]

ANLI dev_r3 accuracy: 0.3067





In [29]:
print(anli_results)

{'eval_loss': 2.0615222454071045, 'eval_model_preparation_time': 0.002, 'eval_accuracy': 0.30666666666666664, 'eval_runtime': 16.2791, 'eval_samples_per_second': 73.714, 'eval_steps_per_second': 4.607}
