In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import AlbertTokenizer, AlbertForSequenceClassification

In [4]:
# Load the FEVER dataset.
# The "fever" repository ships a loading script, so `load_dataset` stops and asks
# for interactive confirmation unless `trust_remote_code=True` is passed. Opting in
# here keeps "Restart & Run All" non-interactive; review the script at
# https://hf.co/datasets/fever before trusting it.
fever = load_dataset("fever", "v1.0", trust_remote_code=True)

# Label mappings: class name -> integer id used as the training target, and the inverse.
label2id = {"SUPPORTS": 0, "REFUTES": 1, "NOT ENOUGH INFO": 2}
id2label = {v: k for k, v in label2id.items()}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

fever.py:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

The repository for fever contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/fever.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/33.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.53M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.60M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.17M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/311431 [00:00<?, ? examples/s]

Generating labelled_dev split:   0%|          | 0/37566 [00:00<?, ? examples/s]

Generating unlabelled_dev split:   0%|          | 0/19998 [00:00<?, ? examples/s]

Generating unlabelled_test split:   0%|          | 0/19998 [00:00<?, ? examples/s]

Generating paper_dev split:   0%|          | 0/18999 [00:00<?, ? examples/s]

Generating paper_test split:   0%|          | 0/18567 [00:00<?, ? examples/s]

In [5]:
# One checkpoint name shared by the tokenizer and the classifier so the two
# cannot drift apart.
PRETRAINED_CHECKPOINT = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
# Pretrained encoder with a 3-way classification head on top.
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=3,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Preprocess data
def preprocess_data(batch):
    """Tokenize a batch of claims and attach integer class labels.

    Args:
        batch: Mapping with a "claim" entry (passed to the tokenizer) and a
            "label" entry (iterable of class names that are keys of ``label2id``).

    Returns:
        The tokenizer output with an added "labels" entry holding one integer
        id per example.
    """
    encoded = tokenizer(
        batch["claim"],
        padding="max_length",  # pad every sequence out to `max_length`
        truncation=True,
        max_length=128,
    )

    label_ids = []
    for class_name in batch["label"]:
        label_ids.append(label2id[class_name])
    encoded["labels"] = label_ids

    return encoded

In [7]:
# Number of examples in the "train" split.
len(fever["train"])

311431

In [8]:
# Number of examples in the "labelled_dev" split.
len(fever["labelled_dev"])

37566

In [9]:
from collections import Counter

# Class distribution of the raw training split.
# The label column is read once and reused for both the tally and the total.
train_labels = fever["train"]["label"]
label_counts = Counter(train_labels)
total_samples = len(train_labels)

# One line per class: absolute count and share of the split.
for label in label_counts:
    count = label_counts[label]
    percentage = (count / total_samples) * 100
    print(f"Class '{label}': {count} samples, {percentage:.2f}%")


Class 'SUPPORTS': 193756 samples, 62.21%
Class 'REFUTES': 70066 samples, 22.50%
Class 'NOT ENOUGH INFO': 47609 samples, 15.29%


In [10]:
from datasets import Dataset

# Function to sample equal examples for each class
def balance_classes(dataset, label_column, seed=None):
    """Downsample ``dataset`` so every class has as many rows as the rarest class.

    Args:
        dataset: Object exposing ``dataset[label_column]`` (an iterable with one
            hashable label per row) and ``dataset.select(indices)``, e.g. a
            ``datasets.Dataset``.
        label_column: Name of the column holding the class label.
        seed: ``None`` (default) keeps the first ``k`` rows of each class, where
            ``k`` is the size of the rarest class. An integer instead draws a
            reproducible random sample of ``k`` rows per class, so the result
            does not depend on how the source split happens to be ordered.

    Returns:
        ``dataset.select(...)`` over the kept rows, grouped by class in order of
        each class's first appearance; within a class, rows keep their original
        relative order. An empty input yields an empty selection.
    """
    import random  # local import: only needed for the optional seeded sampling

    # One pass over the label column collects the row indices of every class.
    # Dicts preserve insertion order, so classes stay in first-seen order.
    rows_by_label = {}
    for row_index, label in enumerate(dataset[label_column]):
        rows_by_label.setdefault(label, []).append(row_index)

    if not rows_by_label:
        # No rows at all: nothing to balance.
        return dataset.select([])

    per_class = min(len(rows) for rows in rows_by_label.values())
    rng = random.Random(seed) if seed is not None else None

    keep = []
    for rows in rows_by_label.values():
        if rng is None:
            keep.extend(rows[:per_class])
        else:
            keep.extend(sorted(rng.sample(rows, per_class)))

    # A single `select` replaces one `filter` pass per class and avoids
    # rebuilding every column as a concatenated Python list.
    return dataset.select(keep)

# Build the splits: the training split is class-balanced; validation is simply the
# first 14282 rows of labelled_dev and is not re-balanced.
train_data = balance_classes(fever["train"], "label")
val_data = fever["labelled_dev"].select(range(14282))

# Tokenize the claims and attach integer labels.
train_data = train_data.map(preprocess_data, batched=True)
val_data = val_data.map(preprocess_data, batched=True)

# Expose only the model inputs and targets, formatted as PyTorch tensors.
for split in (train_data, val_data):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Create DataLoaders; only the training loader shuffles.
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=32)

print(f"Training examples: {len(train_data)}, Validation examples: {len(val_data)}")

Filter:   0%|          | 0/311431 [00:00<?, ? examples/s]

Filter:   0%|          | 0/311431 [00:00<?, ? examples/s]

Filter:   0%|          | 0/311431 [00:00<?, ? examples/s]

Map:   0%|          | 0/142827 [00:00<?, ? examples/s]

Map:   0%|          | 0/14282 [00:00<?, ? examples/s]

Training examples: 142827, Validation examples: 14282


In [11]:
from collections import Counter

# Class distribution of the balanced training set.
# The label column is read once and reused for both the tally and the total.
balanced_labels = train_data["label"]
label_counts = Counter(balanced_labels)
total_samples = len(balanced_labels)

# One line per class: absolute count and share of the balanced set.
for label in label_counts:
    count = label_counts[label]
    percentage = (count / total_samples) * 100
    print(f"Class '{label}': {count} samples, {percentage:.2f}%")


Class 'SUPPORTS': 47609 samples, 33.33%
Class 'REFUTES': 47609 samples, 33.33%
Class 'NOT ENOUGH INFO': 47609 samples, 33.33%


In [12]:
# Optimizer.
# `transformers.AdamW` is deprecated in favour of PyTorch's implementation, so use
# torch.optim.AdamW. `eps` and `weight_decay` are set explicitly to the values the
# transformers class defaulted to (1e-6 and 0.0), because torch's own defaults
# differ (1e-8 and 0.01) and would otherwise change the training run.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [13]:
from tqdm import tqdm

# Fine-tune for a fixed number of passes over the training DataLoader.
NUM_EPOCHS = 5

for epoch in range(NUM_EPOCHS):
    model.train()  # switch to training mode at the start of every epoch
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    # Progress bar over the batches of this epoch.
    train_progress = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")

    for batch in train_progress:
        # Move the targets and every model input to the compute device.
        labels = batch["labels"].to(device)
        inputs = {}
        for key, val in batch.items():
            if key != "labels":
                inputs[key] = val.to(device)

        # Forward pass; the loss is read from the model output.
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass: clear old gradients, backpropagate, update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        # Running accuracy over the batches seen so far in this epoch.
        predictions = logits.argmax(dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.shape[0]

        # Show the current batch loss and the running accuracy on the bar.
        train_progress.set_postfix(loss=batch_loss, accuracy=correct_predictions / total_predictions)

    # Epoch summary: mean batch loss and accuracy over every example seen.
    epoch_accuracy = correct_predictions / total_predictions
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {epoch_accuracy:.4f}")

Epoch 1: 100%|██████████| 4464/4464 [44:36<00:00,  1.67it/s, accuracy=0.727, loss=0.365]


Epoch 1, Loss: 0.6314, Accuracy: 0.7270


Epoch 2: 100%|██████████| 4464/4464 [44:39<00:00,  1.67it/s, accuracy=0.821, loss=0.422]


Epoch 2, Loss: 0.4557, Accuracy: 0.8214


Epoch 3: 100%|██████████| 4464/4464 [44:37<00:00,  1.67it/s, accuracy=0.868, loss=0.593]


Epoch 3, Loss: 0.3486, Accuracy: 0.8679


Epoch 4: 100%|██████████| 4464/4464 [44:38<00:00,  1.67it/s, accuracy=0.9, loss=0.184]


Epoch 4, Loss: 0.2722, Accuracy: 0.8999


Epoch 5: 100%|██████████| 4464/4464 [44:40<00:00,  1.67it/s, accuracy=0.923, loss=0.748]

Epoch 5, Loss: 0.2131, Accuracy: 0.9233





In [14]:
from sklearn.metrics import accuracy_score, classification_report


# Evaluate on the validation DataLoader without tracking gradients.
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != "labels"}
        # The targets are only compared on the host afterwards, so they are not
        # copied to the compute device.
        labels = batch["labels"]

        outputs = model(**inputs)
        logits = outputs.logits

        # Get predictions (the index of the highest logit)
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.tolist())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Validation Accuracy: {accuracy:.4f}")
print("Classification Report:")
# Take the class ids and display names from `id2label` instead of repeating the
# names by hand: the report then always agrees with the training label mapping,
# and passing `labels` explicitly keeps it working when a class is absent from
# the evaluated rows.
class_ids = sorted(id2label)
print(
    classification_report(
        all_labels,
        all_preds,
        labels=class_ids,
        target_names=[id2label[class_id] for class_id in class_ids],
    )
)

Validation Accuracy: 0.6697
Classification Report:
                 precision    recall  f1-score   support

       SUPPORTS       0.75      0.64      0.69      5571
        REFUTES       0.79      0.70      0.74      5357
NOT ENOUGH INFO       0.47      0.67      0.55      3354

       accuracy                           0.67     14282
      macro avg       0.67      0.67      0.66     14282
   weighted avg       0.70      0.67      0.68     14282



In [None]:
# Function to predict the class of a particular claim
def predict_claim_class(claim):
    """Classify a single claim with the module-level ``model``.

    Args:
        claim: Claim text to classify; it is tokenized with the same settings
            as ``preprocess_data`` (padded/truncated to 128 tokens).

    Returns:
        The predicted class name, looked up in ``id2label``.
    """
    # Preprocess the claim (same way as during training)
    inputs = tokenizer(claim, padding="max_length", truncation=True, max_length=128, return_tensors="pt").to(device)

    # Force evaluation mode for the forward pass so the prediction does not depend
    # on whichever mode the model was last left in (training mode keeps dropout
    # active), then restore the previous mode for the caller.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    # Get the predicted class (index of max logit) and convert it to its label.
    predicted_class_id = torch.argmax(logits, dim=1).item()
    return id2label[predicted_class_id]


In [17]:
# Example usage: classify one free-text claim with predict_claim_class and print
# the label it returns. Only the claim text is passed in (no evidence passage).
claim = "The Atlantic Ocean is the largest ocean on Earth."
predicted_label = predict_claim_class(claim)
print(f"The claim '{claim}' belongs to the class: {predicted_label}")

The claim 'The Atlantic Ocean is the largest ocean on Earth.' belongs to the class: REFUTES


In [18]:
# Example usage: classify one free-text claim with predict_claim_class and print
# the label it returns.
claim= "FIFA World Cup in 2022 was won by France National team."
predicted_label = predict_claim_class(claim)
print(f"The claim '{claim}' belongs to the class: {predicted_label}")

The claim 'FIFA World Cup in 2022 was won by France National team.' belongs to the class: REFUTES


In [19]:
# Example usage: classify one free-text claim with predict_claim_class and print
# the label it returns.
# NOTE(review): the claim string ends with a trailing space, which is also echoed
# in the printed message — confirm whether that is intended.
claim="The sun appears yellow when observed from the Earth "
predicted_label = predict_claim_class(claim)
print(f"The claim '{claim}' belongs to the class: {predicted_label}")

The claim 'The sun appears yellow when observed from the Earth ' belongs to the class: SUPPORTS


In [20]:
# Example usage: classify one free-text claim with predict_claim_class and print
# the label it returns.
claim= "The Earth's shadow on the moon is approximately 14,500 miles"
predicted_label = predict_claim_class(claim)
print(f"The claim '{claim}' belongs to the class: {predicted_label}")

The claim 'The Earth's shadow on the moon is approximately 14,500 miles' belongs to the class: REFUTES


In [21]:
# Example usage: classify one free-text claim with predict_claim_class and print
# the label it returns.
# NOTE(review): the year is written "939" — presumably either a typo for 1939 or a
# deliberate perturbation to probe the model; confirm which.
claim="World War 2 began on September 1, 939, when Nazi Germany invaded Poland."
predicted_label = predict_claim_class(claim)
print(f"The claim '{claim}' belongs to the class: {predicted_label}")

The claim 'World War 2 began on September 1, 939, when Nazi Germany invaded Poland.' belongs to the class: SUPPORTS


In [22]:
# Example usage: classify one free-text claim with predict_claim_class and print
# the label it returns.
claim="India gained independence on August 15, 2007, after being ruled by the British for nearly 30 years."
predicted_label = predict_claim_class(claim)
print(f"The claim '{claim}' belongs to the class: {predicted_label}")

The claim 'India gained independence on August 15, 2007, after being ruled by the British for nearly 30 years.' belongs to the class: SUPPORTS


In [23]:
# Example usage: classify one free-text claim with predict_claim_class and print
# the label it returns.
# NOTE(review): the claim text reads "Eiffele" and ends in "archite" — it looks
# misspelled/truncated; confirm whether this is a deliberate noisy input.
claim="The Eiffel tower was designed by Gustave Eiffele, a French engineer and archite"
predicted_label = predict_claim_class(claim)
print(f"The claim '{claim}' belongs to the class: {predicted_label}")

The claim 'The Eiffel tower was designed by Gustave Eiffele, a French engineer and archite' belongs to the class: SUPPORTS


In [24]:
# Example usage: classify one free-text claim with predict_claim_class and print
# the label it returns.
# NOTE(review): the claim says "was invent by" — presumably "invented"; confirm
# whether the grammatical slip is intentional.
claim="The first practical telecommunications device was invent by Alexander Graham Bell"
predicted_label = predict_claim_class(claim)
print(f"The claim '{claim}' belongs to the class: {predicted_label}")

The claim 'The first practical telecommunications device was invent by Alexander Graham Bell' belongs to the class: SUPPORTS


In [25]:
# Example usage: classify one free-text claim with predict_claim_class and print
# the label it returns.
claim="The Nobel prize in literature in the year 2024 was awarded to Margaret Atwood"
predicted_label = predict_claim_class(claim)
print(f"The claim '{claim}' belongs to the class: {predicted_label}")

The claim 'The Nobel prize in literature in the year 2024 was awarded to Margaret Atwood' belongs to the class: SUPPORTS
