In [1]:
!pip install transformers datasets



In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset

In [3]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [4]:
# Load the FEVER dataset.
# The dataset repository ships custom loading code; passing trust_remote_code=True
# accepts it up front so this cell does not stop at an interactive [y/N] prompt
# (which would block "Restart Kernel -> Run All").
fever = load_dataset("fever", "v1.0", trust_remote_code=True)

# Label mappings: FEVER's string labels <-> the integer class ids fed to the model
label2id = {"SUPPORTS": 0, "REFUTES": 1, "NOT ENOUGH INFO": 2}
id2label = {v: k for k, v in label2id.items()}

fever.py:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

The repository for fever contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/fever.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/33.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.53M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.60M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.17M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/311431 [00:00<?, ? examples/s]

Generating labelled_dev split:   0%|          | 0/37566 [00:00<?, ? examples/s]

Generating unlabelled_dev split:   0%|          | 0/19998 [00:00<?, ? examples/s]

Generating unlabelled_test split:   0%|          | 0/19998 [00:00<?, ? examples/s]

Generating paper_dev split:   0%|          | 0/18999 [00:00<?, ? examples/s]

Generating paper_test split:   0%|          | 0/18567 [00:00<?, ? examples/s]

In [5]:
# # Load tokenizer and model
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3, hidden_dropout_prob=0.2, attention_probs_dropout_prob=0.2)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Size the classification head from the label map instead of a hard-coded 3, and
# record the id <-> name mapping in the model config so both stay in sync with
# label2id / id2label.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Preprocess data
def preprocess_data(batch):
    """Tokenize a batch of claims and attach integer class labels.

    Args:
        batch: Mapping with a "claim" entry (texts) and a "label" entry
            (label strings that are keys of the module-level ``label2id``).

    Returns:
        The tokenizer output for the claims, padded/truncated to 128 tokens,
        with an added "labels" entry holding the class id of each example.
    """
    encoded = tokenizer(
        batch["claim"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    # Translate each string label to its class id; unknown labels raise KeyError.
    encoded["labels"] = list(map(label2id.__getitem__, batch["label"]))
    return encoded

In [7]:
# train_data = fever["train"].map(preprocess_data, batched=True)
# val_data = fever["labelled_dev"].map(preprocess_data, batched=True)

# train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# val_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [8]:
from datasets import Dataset

# Function to sample equal examples for each class
def sample_equal_classes(dataset, label_column, num_samples_per_class, seed=None):
    """Build a class-balanced subset of ``dataset``.

    For every class id in the module-level ``label2id`` mapping, keeps at most
    ``num_samples_per_class`` rows whose ``label_column`` value maps to that id,
    then concatenates the per-class subsets in ``label2id`` order.

    Args:
        dataset: A ``datasets.Dataset`` whose ``label_column`` values are keys
            of ``label2id``.
        label_column: Name of the column holding the string label.
        num_samples_per_class: Upper bound on the rows kept per class; a class
            with fewer rows contributes all of them.
        seed: Optional shuffle seed. With ``None`` (the default) the first rows
            of each class, in dataset order, are kept. With an integer, each
            class subset is shuffled with that seed before rows are taken, so
            the result is a reproducible random sample.

    Returns:
        A ``datasets.Dataset`` with the same columns as ``dataset``.
    """
    # Imported here because the cell-level import only brings in `Dataset`.
    from datasets import concatenate_datasets

    per_class_subsets = []
    for label in label2id.values():  # Iterate over label IDs
        # filter() runs eagerly, so the lambda sees the current value of `label`.
        class_samples = dataset.filter(lambda x: label2id[x[label_column]] == label)
        if seed is not None:
            class_samples = class_samples.shuffle(seed=seed)
        keep = min(num_samples_per_class, len(class_samples))
        per_class_subsets.append(class_samples.select(range(keep)))

    # Concatenate at the dataset level: this keeps the original column schema and
    # avoids materialising every column as a Python list just to rebuild it.
    return concatenate_datasets(per_class_subsets)

# Draw a class-balanced subset of each split: 10000 // 3 = 3,333 claims per class
# for training (9,999 in total) and 1000 // 3 = 333 per class for validation (999).
num_classes = len(label2id)
train_data = sample_equal_classes(
    fever["train"], "label", num_samples_per_class=10000 // num_classes
)
val_data = sample_equal_classes(
    fever["labelled_dev"], "label", num_samples_per_class=1000 // num_classes
)

# Tokenize the claims and attach integer labels
train_data = train_data.map(preprocess_data, batched=True)
val_data = val_data.map(preprocess_data, batched=True)

# Expose only the model inputs, as PyTorch tensors (set_format works in place)
model_columns = ("input_ids", "attention_mask", "labels")
for split_data in (train_data, val_data):
    split_data.set_format(type="torch", columns=list(model_columns))

# Batch the data; only the training split is shuffled
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=32)

print(f"Training examples: {len(train_data)}, Validation examples: {len(val_data)}")

Filter:   0%|          | 0/311431 [00:00<?, ? examples/s]

Filter:   0%|          | 0/311431 [00:00<?, ? examples/s]

Filter:   0%|          | 0/311431 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37566 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37566 [00:00<?, ? examples/s]

Filter:   0%|          | 0/37566 [00:00<?, ? examples/s]

Map:   0%|          | 0/9999 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Training examples: 9999, Validation examples: 999


In [9]:
# train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)
# val_dataloader = DataLoader(val_data, batch_size=128)

# Optimizer
# Use PyTorch's AdamW; the AdamW class exported by `transformers` is deprecated in
# favour of it. eps and weight_decay are set explicitly to the values the
# transformers class used by default (1e-6 and 0.0), because torch.optim.AdamW
# defaults to 1e-8 and 0.01 and would otherwise change the update rule.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Device placement: train on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
# from tqdm import tqdm

# for epoch in range(10):  # Simplified to 1 epochs
#     model.train()
#     total_loss = 0

#     # Wrap the DataLoader with TQDM for a progress bar
#     train_progress = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")

#     for batch in train_progress:
#         inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
#         labels = batch["labels"].to(device)

#         # Forward pass
#         outputs = model(**inputs, labels=labels)
#         loss = outputs.loss

#         # Backward pass
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()

#         # Update TQDM with the current batch loss
#         train_progress.set_postfix(loss=loss.item())

#     # Print epoch-level loss
#     print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader):.4f}")

In [11]:
from tqdm import tqdm

# Fine-tune for 10 epochs, tracking running loss and training accuracy per epoch.
for epoch in range(10):  # Number of epochs
    model.train()
    total_loss = 0
    num_correct = 0
    num_seen = 0

    # tqdm renders a per-epoch progress bar over the training batches
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")

    for batch in progress_bar:
        labels = batch["labels"].to(device)
        inputs = {name: tensor.to(device) for name, tensor in batch.items() if name != "labels"}

        # Forward pass: passing labels makes the model return the loss as well
        outputs = model(**inputs, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        # Running accuracy over the batches seen so far in this epoch
        num_correct += (logits.argmax(dim=1) == labels).sum().item()
        num_seen += labels.size(0)

        progress_bar.set_postfix(loss=batch_loss, accuracy=num_correct / num_seen)

    # Epoch-level accuracy over every training example seen this epoch
    epoch_accuracy = num_correct / num_seen

    # Mean batch loss and accuracy for the epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {epoch_accuracy:.4f}")

Epoch 1: 100%|██████████| 313/313 [01:52<00:00,  2.79it/s, accuracy=0.642, loss=0.799]


Epoch 1, Loss: 0.7928, Accuracy: 0.6424


Epoch 2: 100%|██████████| 313/313 [01:53<00:00,  2.77it/s, accuracy=0.795, loss=0.48] 


Epoch 2, Loss: 0.5087, Accuracy: 0.7950


Epoch 3: 100%|██████████| 313/313 [01:53<00:00,  2.77it/s, accuracy=0.869, loss=0.356]


Epoch 3, Loss: 0.3548, Accuracy: 0.8693


Epoch 4: 100%|██████████| 313/313 [01:53<00:00,  2.77it/s, accuracy=0.915, loss=0.239] 


Epoch 4, Loss: 0.2438, Accuracy: 0.9149


Epoch 5: 100%|██████████| 313/313 [01:53<00:00,  2.77it/s, accuracy=0.947, loss=0.143] 


Epoch 5, Loss: 0.1623, Accuracy: 0.9472


Epoch 6: 100%|██████████| 313/313 [01:53<00:00,  2.77it/s, accuracy=0.963, loss=0.143] 


Epoch 6, Loss: 0.1120, Accuracy: 0.9634


Epoch 7: 100%|██████████| 313/313 [01:53<00:00,  2.77it/s, accuracy=0.977, loss=0.00567]


Epoch 7, Loss: 0.0763, Accuracy: 0.9767


Epoch 8: 100%|██████████| 313/313 [01:53<00:00,  2.77it/s, accuracy=0.982, loss=0.0249] 


Epoch 8, Loss: 0.0583, Accuracy: 0.9825


Epoch 9: 100%|██████████| 313/313 [01:53<00:00,  2.77it/s, accuracy=0.987, loss=0.355]  


Epoch 9, Loss: 0.0436, Accuracy: 0.9874


Epoch 10: 100%|██████████| 313/313 [01:53<00:00,  2.77it/s, accuracy=0.99, loss=0.00149] 

Epoch 10, Loss: 0.0306, Accuracy: 0.9902





In [12]:
from sklearn.metrics import accuracy_score, classification_report


# Evaluate on the validation split: collect predictions and true labels, then
# report overall accuracy and per-class precision/recall/F1.
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}

        logits = model(**inputs).logits

        # Get predictions (the index of the highest logit)
        preds = torch.argmax(logits, dim=1)

        # Labels are only compared on the host, so they are never moved to `device`.
        all_preds.extend(preds.tolist())
        all_labels.extend(batch["labels"].tolist())

# Take class ids and display names from id2label so the report cannot drift from
# the label mapping, and still works if a class is absent from this split.
class_ids = sorted(id2label)
class_names = [id2label[class_id] for class_id in class_ids]

accuracy = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(all_labels, all_preds, labels=class_ids, target_names=class_names))

Validation Accuracy: 0.6206
Classification Report:
                 precision    recall  f1-score   support

       SUPPORTS       0.75      0.52      0.61       333
        REFUTES       0.75      0.56      0.64       333
NOT ENOUGH INFO       0.50      0.78      0.61       333

       accuracy                           0.62       999
      macro avg       0.67      0.62      0.62       999
   weighted avg       0.67      0.62      0.62       999



In [13]:
# Function to predict the class of a particular claim
def predict_claim_class(claim):
    """Predict the FEVER label for a single claim string.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and ``id2label``.
    The claim is tokenized with the same settings as the training data
    (padding/truncation to 128 tokens).

    Args:
        claim: The claim text to classify.

    Returns:
        The predicted label name looked up in ``id2label``.
    """
    # Preprocess the claim (same way as during training)
    inputs = tokenizer(claim, padding="max_length", truncation=True, max_length=128, return_tensors="pt").to(device)

    # Run inference in eval mode so dropout is disabled even if the model was
    # left in training mode; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    # Get the predicted class (index of max logit) and convert it to its label
    predicted_class_id = torch.argmax(logits, dim=1).item()
    return id2label[predicted_class_id]


In [14]:
# Example usage: classify one free-text claim and print the predicted label.
# Only the claim text is tokenized (no evidence passage is passed to the model),
# so the prediction is based on the claim's wording alone.
claim = "The Atlantic Ocean is the largest ocean on Earth."
predicted_label = predict_claim_class(claim)
print(f"The claim '{claim}' belongs to the class: {predicted_label}")

The claim 'The Atlantic Ocean is the largest ocean on Earth.' belongs to the class: NOT ENOUGH INFO
