In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [7]:
# Load the cleaned dataset from disk and report its (rows, columns) shape as a quick sanity check.
df_processed = pd.read_csv(filepath_or_buffer="datasets/cleaned_dataset.csv")
print(f"Dataset shape: {df_processed.shape}")

Dataset shape: (407047, 5)


In [8]:
# Pull the three columns used downstream out of the frame as plain Python lists.
premises, hypotheses, labels = (
    df_processed[column].tolist()
    for column in ("premise", "hypothesis", "label")
)

In [9]:
# First split: hold out 20% of the data as the test set, stratified by label.
(
    X_temp_premise, X_test_premise,
    X_temp_hypothesis, X_test_hypothesis,
    y_temp, y_test,
) = train_test_split(
    premises, hypotheses, labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)

# Second split: carve the validation set out of the remainder.
# 0.25 of the remaining 80% is 20% of the full dataset, leaving 60% for training.
(
    X_train_premise, X_val_premise,
    X_train_hypothesis, X_val_hypothesis,
    y_train, y_val,
) = train_test_split(
    X_temp_premise, X_temp_hypothesis, y_temp,
    stratify=y_temp,
    test_size=0.25,
    random_state=42,
)


In [18]:
pip install datasets

Collecting datasets
  Using cached datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-21.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cached fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cached aiohttp-3.12.15-cp311-cp311-win_amd64.whl.metadata (7.9 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cached aiohappyeyeballs-2.

In [10]:
from datasets import Dataset, DatasetDict

# Column-oriented payloads, one per split: each key maps to a list with one entry per example.
train_dict = {"premises": X_train_premise, "hypotheses": X_train_hypothesis, "labels": y_train}
val_dict = {"premises": X_val_premise, "hypotheses": X_val_hypothesis, "labels": y_val}
test_dict = {"premises": X_test_premise, "hypotheses": X_test_hypothesis, "labels": y_test}


In [12]:
# Wrap each split's column dict in a Dataset.
train_ds, val_ds, test_ds = (
    Dataset.from_dict(columns)
    for columns in (train_dict, val_dict, test_dict)
)

# Bundle the three splits under their conventional names for convenient access.
ds = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})

# Preview the first example of every split, one per line.
print(
    *(ds[split_name][0] for split_name in ("train", "validation", "test")),
    sep="\n",
)


{'premises': 'every single one of them is a tax-cutting, reform-the-government, conservative republican, gingrich declared on abc.', 'hypotheses': 'gingrich was made that he did not get a seat at the time.', 'labels': 1}
{'premises': 'and uh so i had her baby sitting but she was six months pregnant and it was getting too much for her so i just quit i would rather quit and take care of my own kids than let somebody else raise them', 'hypotheses': 'my babysitter was approaching her third trimester and struggling so decided to look after my kids instead', 'labels': 0}
{'premises': 'uh you can you can buy bags of silver coins a a bag has a thousand dollars face value in it and it is traded for silver', 'hypotheses': 'the bags are available for sale and you get them for just a thousand dollars.', 'labels': 0}


In [8]:
# Report how many examples ended up in each split (one line per split).
print(
    f"Train samples: {len(y_train)}",
    f"Validation samples: {len(y_val)}",
    f"Test samples: {len(y_test)}",
    sep="\n",
)

Train samples: 244227
Validation samples: 81410
Test samples: 81410


In [14]:
import torch

# Fixed sequence length (in tokens) that every encoded prompt is padded/truncated to.
max_length = 128

def preprocess_function(examples, batch_size=2):
    """Tokenize premise/hypothesis pairs into fixed-length, left-padded tensors.

    One prompt string is built per example, the prompts are tokenized in chunks
    of ``batch_size`` with the notebook-level ``tokenizer``, and every sequence
    is left-padded with ``tokenizer.pad_token_id`` (attention mask 0 on the
    padding) and then cut to its first ``max_length`` tokens.

    ``tokenizer`` is read from the notebook's global namespace and must be
    defined before this function is called.

    Args:
        examples: Mapping with "premises" and "hypotheses" entries, each a
            sequence of strings of equal length (a batch as passed by
            ``Dataset.map(..., batched=True)``).
        batch_size: Number of prompts handed to the tokenizer per call.

    Returns:
        Dict with "input_ids" and "attention_mask", each an integer tensor of
        shape ``(number of examples, max_length)``.

    Raises:
        ValueError: If the tokenizer has no pad token id, since padding would
            otherwise fail with an opaque tensor-construction error.
    """
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        raise ValueError(
            "tokenizer.pad_token_id is None; set a pad token on the tokenizer before preprocessing."
        )

    prompts = [
        f"Premise : {x} Hypothesis: {y} Label : "
        for x, y in zip(examples["premises"], examples["hypotheses"])
    ]

    input_rows = []
    mask_rows = []
    for start in range(0, len(prompts), batch_size):
        encoded = tokenizer(prompts[start:start + batch_size])
        for token_ids, mask in zip(encoded["input_ids"], encoded["attention_mask"]):
            # Left-pad up to max_length; sequences that are already too long get no padding.
            pad_len = max(max_length - len(token_ids), 0)
            padded_ids = [pad_id] * pad_len + list(token_ids)
            padded_mask = [0] * pad_len + list(mask)
            # NOTE(review): truncation keeps the head of the sequence, so prompts longer
            # than max_length lose the trailing "Label : " cue -- confirm this is intended.
            input_rows.append(torch.tensor(padded_ids[:max_length], dtype=torch.long))
            mask_rows.append(torch.tensor(padded_mask[:max_length], dtype=torch.long))

    if not input_rows:
        # torch.stack rejects an empty list, so build the empty result explicitly.
        return {
            "input_ids": torch.empty((0, max_length), dtype=torch.long),
            "attention_mask": torch.empty((0, max_length), dtype=torch.long),
        }

    # Stack (not cat) so each example stays its own row: (num_examples, max_length).
    return {
        "input_ids": torch.stack(input_rows, dim=0),
        "attention_mask": torch.stack(mask_rows, dim=0),
    }

In [15]:
# Tokenize each split on its own. The original premises/hypotheses/labels columns are
# removed, so only the fields returned by preprocess_function remain, and the on-disk
# cache is bypassed so every run re-tokenizes.
train_encodings = ds["train"].map(
    preprocess_function,
    desc="Tokenizing train",
    remove_columns=ds["train"].column_names,
    load_from_cache_file=False,
    batched=True,
)

val_encodings = ds["validation"].map(
    preprocess_function,
    desc="Tokenizing val",
    remove_columns=ds["validation"].column_names,
    load_from_cache_file=False,
    batched=True,
)

test_encodings = ds["test"].map(
    preprocess_function,
    desc="Tokenizing test",
    remove_columns=ds["test"].column_names,
    load_from_cache_file=False,
    batched=True,
)

Tokenizing train:   0%|                                                              | 0/244227 [00:00<?, ? examples/s]


TypeError: preprocess_function() missing 2 required positional arguments: 'hypotheses' and 'tokenizer'

In [None]:
# Run the tokenizer over `ds` in a single process, bypassing the cache and dropping
# the columns named in the train split.
# NOTE(review): `ds` is the DatasetDict holding train/validation/test, so this maps every
# split and rebinds `train_encodings` to the multi-split result -- confirm that is what
# downstream cells expect.
train_encodings = ds.map(
    preprocess_function,
    desc="Running tokenizer on dataset",
    remove_columns=ds["train"].column_names,
    load_from_cache_file=False,
    batched=True,
    num_proc=1,
)