In [1]:

from datasets import load_dataset

In [3]:
headline_data = load_dataset("csv", data_files={
    "train": "../data/headlines_train.csv", 
    "validation": "../data/headlines_valid.csv",
    "test": "../data/headlines_test.csv"
})

Downloading and preparing dataset csv/default to C:/Users/Turgut/.cache/huggingface/datasets/csv/default-e7242ce81c6adec5/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/Turgut/.cache/huggingface/datasets/csv/default-e7242ce81c6adec5/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
print(headline_data)

DatasetDict({
    train: Dataset({
        features: ['headline', 'is_sarcastic'],
        num_rows: 22802
    })
    validation: Dataset({
        features: ['headline', 'is_sarcastic'],
        num_rows: 2850
    })
    test: Dataset({
        features: ['headline', 'is_sarcastic'],
        num_rows: 2851
    })
})


In [7]:
headline_data = headline_data.rename_columns({'headline': 'text', 'is_sarcastic': 'label'})

In [9]:
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)



Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [44]:
def tokenize_text(batch):
    return tokenizer(batch["text"], truncation=True, padding=True)

In [None]:
headline_data["validation"][:100]
#headline_data.isna()

In [37]:
print(headline_data["test"].features)
print(headline_data["train"].features)
print(headline_data["validation"].features)

{'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}
{'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}
{'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}


In [41]:
from datasets import ClassLabel, Value

new_features = headline_data['train'].features.copy()
new_features['label'] = ClassLabel(num_classes=2, names=['neg', 'pos'])
headline_data = headline_data.cast(new_features)

Loading cached processed dataset at C:\Users\Turgut\.cache\huggingface\datasets\csv\default-e7242ce81c6adec5\0.0.0\eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d\cache-ff5f2d666f517a79.arrow
Loading cached processed dataset at C:\Users\Turgut\.cache\huggingface\datasets\csv\default-e7242ce81c6adec5\0.0.0\eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d\cache-b2a046fe2ceda68b.arrow
Loading cached processed dataset at C:\Users\Turgut\.cache\huggingface\datasets\csv\default-e7242ce81c6adec5\0.0.0\eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d\cache-dbd79ea28b6ee05c.arrow


In [42]:
print(headline_data["test"].features)
print(headline_data["train"].features)
print(headline_data["validation"].features)

{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}


In [46]:
headline_tokenized = headline_data.map(tokenize_text, batched=True, batch_size=None)

Map:   0%|          | 0/22802 [00:00<?, ? examples/s]

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [None]:
headline_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from torch.utils.data import DataLoader, Dataset

class HeadlineDataset(Dataset):
    def __init__(self, dataset_dict, partition_key="train"):
        self.partition = dataset_dict[partition_key]

    def __getitem__(self, index):
        return self.partition[index]

    def __len__(self):
        return self.partition.num_rows

In [None]:
train_dataset = HeadlineDataset(headline_tokenized, partition_key="train")
test_dataset = HeadlineDataset(headline_tokenized, partition_key="test")
valid_dataset = HeadlineDataset(headline_tokenized, partition_key="validation")

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=12,
    #shuffle=True, 
    num_workers=4
)

val_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=12,
    num_workers=4
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=12,
    num_workers=4
)

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)

In [None]:
import lightning as L
import torch
import torchmetrics


class LightningModel(L.LightningModule):
    def __init__(self, model, learning_rate=5e-5):
        super().__init__()

        self.learning_rate = learning_rate
        self.model = model

        self.val_acc = torchmetrics.Accuracy()
        self.test_acc = torchmetrics.Accuracy()

    def forward(self, input_ids, attention_mask, labels):
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)
        
    def training_step(self, batch, batch_idx):
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["label"])        
        self.log("train_loss", outputs["loss"])
        return outputs["loss"]  # this is passed to the optimizer for training

    def validation_step(self, batch, batch_idx):
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["label"])        
        self.log("val_loss", outputs["loss"], prog_bar=True)
        
        logits = outputs["logits"]
        predicted_labels = torch.argmax(logits, 1)
        self.val_acc(predicted_labels, batch["label"])
        self.log("val_acc", self.val_acc, prog_bar=True)
        
    def test_step(self, batch, batch_idx):
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["label"])        
        
        logits = outputs["logits"]
        predicted_labels = torch.argmax(logits, 1)
        self.test_acc(predicted_labels, batch["label"])
        self.log("accuracy", self.test_acc, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer
    

lightning_model = LightningModel(model)

In [None]:
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger


callbacks = [
    ModelCheckpoint(
        save_top_k=1, mode="max", monitor="val_acc"
    )  # save top 1 model
]
logger = CSVLogger(save_dir="logs/", name="my-model")

In [None]:
trainer = L.Trainer(
    max_epochs=3,
    callbacks=callbacks,
    accelerator="gpu",
    devices=[2],
    logger=logger,
    log_every_n_steps=10,
)

trainer.fit(model=lightning_model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)

In [None]:
trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")

In [None]:
trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")

In [None]:
trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")