Code for Tokenization & Embedding Generation

In [2]:
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv("CLAN_data_cleaned.csv")

df.head()

Unnamed: 0,PID,Social Media Post,Normalized Claim
0,1,"president ""biden's plan would mean america's s...",biden s energy plan would get rid of seniors a...
1,2,important announcement coronavirus last evenin...,"if someone with the new coronavirus sneezes, i..."
2,3,heart is delighted to hear,heart is delighted to hear
3,4,an allowed appeal is one where the initial ref...,the vast majority of people coming across the ...
4,5,warm water therapy dr. d. mensah asare says th...,a widely popular social media post claims that...


Will Use Hugging Face Tokenizer class

In [5]:
from transformers import AutoTokenizer

# Load a pretrained tokenizer (choose based on your model)
MODEL_NAME = "facebook/bart-base"  # Change this if using BERT or T5
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Ensure all entries are strings
df["Social Media Post"] = df["Social Media Post"].astype(str)

# Tokenize the Social Media Posts
df["Tokenized"] = df["Social Media Post"].apply(
    lambda x: tokenizer(
        x, padding="max_length", truncation=True, max_length=128, return_tensors="pt"
    )
)

# Check tokenized output
df["Tokenized"].head()
df

Unnamed: 0,PID,Social Media Post,Normalized Claim,Tokenized
0,1,"president ""biden's plan would mean america's s...",biden s energy plan would get rid of seniors a...,"[input_ids, attention_mask]"
1,2,important announcement coronavirus last evenin...,"if someone with the new coronavirus sneezes, i...","[input_ids, attention_mask]"
2,3,heart is delighted to hear,heart is delighted to hear,"[input_ids, attention_mask]"
3,4,an allowed appeal is one where the initial ref...,the vast majority of people coming across the ...,"[input_ids, attention_mask]"
4,5,warm water therapy dr. d. mensah asare says th...,a widely popular social media post claims that...,"[input_ids, attention_mask]"
...,...,...,...,...
2285,2798,"foh with that weak malala slander, twitter fin...",malala yousafzai has been silent about the cri...,"[input_ids, attention_mask]"
2286,2802,the plight of hindu girls madly in love with m...,the plight of hindu girls madly in love with m...,"[input_ids, attention_mask]"
2287,2804,i am hearing that james comey has 50 counts of...,james comey has 50 counts of treason. and john...,"[input_ids, attention_mask]"
2288,2806,dr. lonnie herman reviews how your food is pre...,irradiated food causes cancer,"[input_ids, attention_mask]"


Conversion of Tokenized Data in Model Input format

In [7]:
import torch

# Ensure labels are strings
df["Normalized Claim"] = df["Normalized Claim"].astype(str)

# Encode Labels (if required for classification)
# If this is a text generation task, use tokenizer on Normalized Claim as well
labels = tokenizer(
    df["Normalized Claim"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

# Print shapes to verify
print("Tokenized input shape:", labels["input_ids"].shape)

Tokenized input shape: torch.Size([2290, 128])


In [8]:
labels['input_ids']

tensor([[    0,   428, 12145,  ...,     1,     1,     1],
        [    0,  1594,   951,  ...,     1,     1,     1],
        [    0, 12690,    16,  ...,     1,     1,     1],
        ...,
        [    0,   267, 12336,  ...,     1,     1,     1],
        [    0,   853,  7822,  ...,     1,     1,     1],
        [    0, 22118,   337,  ...,     1,     1,     1]])

Create Data Loader Class for Model

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

# Load tokenizer (Modify model name if needed)
MODEL_NAME = "facebook/bart-base"  # Change to "t5-small" if using T5
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


class ClaimDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        """
        Args:
            texts (list): List of input text strings.
            labels (list): List of target labels (normalized claims).
            tokenizer (AutoTokenizer): Pretrained tokenizer.
            max_length (int): Max token length.
        """
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Tokenizes the input text and label, and returns tensors.
        """
        text_encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        label_encoding = self.tokenizer(
            self.labels[idx],  # Labels should also be tokenized
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            "input_ids": text_encoding["input_ids"].squeeze(
                0
            ),  # (1, max_length) → (max_length,)
            "attention_mask": text_encoding["attention_mask"].squeeze(0),
            "labels": label_encoding["input_ids"].squeeze(
                0
            ),  # Labels must be tokenized for seq2seq
        }


def collate_fn(batch):
    """
    Custom collate function to stack tensors properly.
    """
    input_ids = torch.stack([b["input_ids"] for b in batch])
    attention_mask = torch.stack([b["attention_mask"] for b in batch])
    labels = torch.stack([b["labels"] for b in batch])

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


# Example Data (Replace with actual dataset)
texts = ["Covid-19 is a hoax!", "Vaccines contain microchips."]
labels = ["Covid-19 is not a hoax.", "Vaccines do not contain microchips."]

# Create Dataset
dataset = ClaimDataset(texts, labels, tokenizer)

# Create DataLoader
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Check a Sample Batch
for batch in dataloader:
    print(batch)
    break

{'input_ids': tensor([[    0,   347,  1417,   808,    12,  1646,    16,    10, 23195,   328,
             2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,  