In [1]:
import torch
from torch import optim, nn, utils
from torch.utils.data import Dataset, DataLoader
import lightning.pytorch as pl

import numpy as np

import math

from tqdm import tqdm

from transformers import XLNetTokenizer, XLNetModel

from helper import load_dataset

In [2]:
class XLNetSoftMaxLit(pl.LightningModule):
    """
    Reference
    https://machinelearningmastery.com/introduction-to-softmax-classifier-in-pytorch/
    """
    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        self.linear = torch.nn.Linear(n_inputs, n_outputs)
        self.softmax = nn.Softmax(dim=1)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        return self.softmax(self.linear(x))
        
    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.
        # it is independent of forward
        x, y = batch
        
        y_hat = self(x)

        loss = self.criterion(y_hat, y)
        
        self.log('train_loss', loss)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr = 0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False)
        return optimizer

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        self.log("val_loss", loss)
        
    def test_step(self, batch, batch_idx):
        x, y = batch
        y = torch.argmax(y, dim=1)
        y_hat = torch.argmax(self(x), dim=1)
        accuracy = torch.sum(y == y_hat).item() / (len(y) * 1.0)
        self.log('test_acc', accuracy)

In [3]:
class Data(Dataset):
    "The data for multi-class classification"
    def __init__(self, df, load_batch_size):        
        # Might be able to use XLNetTokenizerFast
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        cls_model = XLNetModel.from_pretrained('xlnet-base-cased')
        
        dim_0 = df['text'].shape[0]
        
        docs = df['text'].tolist()
        inputs = tokenizer(docs, return_tensors="pt", padding=True)

        cls_arr = []
        for i, (x, y) in zip(tqdm(range(math.ceil(len(df) / load_batch_size))), self._get_x_y_from_df_with_batch(df, load_batch_size)):
            cls = cls_model(**{'input_ids':inputs['input_ids'][x:y],'token_type_ids':inputs['token_type_ids'][x:y],'attention_mask':inputs['attention_mask'][x:y]}).last_hidden_state[:, 0, :].detach()
            cls_arr.append(cls)
        self.x = torch.concat(cls_arr)
        
        matrix = np.zeros((dim_0,2))
        for i, y in enumerate(df['label'].tolist()):
            matrix[i][y] = 1
        self.y = torch.from_numpy(matrix)
        self.len = dim_0
 
    def _get_x_y_from_df_with_batch(self, df, step_size):
        l = list(range(0, len(df), step_size))
        for ind, _ in enumerate(l):
            if l[ind] + step_size >= len(df):
                yield (l[ind], len(df))
            else:    
                yield (l[ind], l[ind + 1])

    def __getitem__(self, idx):
        "accessing one element in the dataset by index"
        return self.x[idx], self.y[idx] 
 
    def __len__(self):
        "size of the entire dataset"
        return self.len
    
# THIS IS LEGACY
class DataLit(pl.LightningDataModule):
    def __init__(self, batch_size = 4):
        super().__init__()
        self.batch_size = batch_size
    
    def setup(self, stage):
        df = load_dataset('../dataset/training.json', test=True)
        dataset = Data(df[:100], 30)  # 10 > 30 > 40 yes # 4 is the best
        self.train_dataset, self.val_dataset, self.test_dataset = utils.data.random_split(dataset,(0.8, 0.1, 0.1))
    
    def train_dataloader(self):
        return DataLoader(dataset = self.train_dataset, batch_size = self.batch_size, shuffle=True)
    
    def val_dataloader(self):
        return DataLoader(dataset = self.val_dataset, batch_size = self.batch_size, shuffle=True)
    
    def test_dataloader(self):
        return DataLoader(dataset = self.test_dataset, batch_size = self.batch_size, shuffle=True)

In [4]:
df = load_dataset('../dataset/training.json', test=True)
dataset = Data(df[:100], 30)  # 10 > 30 > 40 yes # 4 is the best
train_dataset, val_dataset, test_dataset = utils.data.random_split(dataset,(0.8, 0.1, 0.1))
batch_size = 4

train_dataloader = DataLoader(dataset = train_dataset, batch_size = batch_size, shuffle=True)
val_dataloader = DataLoader(dataset = val_dataset, batch_size = batch_size, shuffle=True)
test_dataloader = DataLoader(dataset = test_dataset, batch_size = batch_size, shuffle=True)

# THIS IS LEGACY
# dataloader = DataLit()

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:17<00:00,  4.31s/it]


In [5]:
model = XLNetSoftMaxLit(768, 2)
trainer = pl.Trainer(max_epochs=20)

# THIS IS LEGACY
# trainer.fit(model, dataloader)

trainer.fit(model=model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
2023-08-29 19:34:16.582750: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-29 19:34:16.610340: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To 

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


In [6]:
trainer.test(model, dataloaders=test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

[{'test_acc': 0.800000011920929}]

In [8]:
%reload_ext tensorboard
%tensorboard --logdir=lightning_logs/