In [None]:
!python -m pip install lightning
!pip install transformers tqdm torchmetrics
!pip install sentencepiece

In [None]:
import json
import pandas as pd

import torch

import lightning.pytorch as pl
import pytorch_lightning as plpl
from lightning.pytorch.callbacks import ModelCheckpoint

from torch import optim, nn, utils
from torch.utils.data import Dataset, DataLoader

from transformers import XLNetTokenizer, XLNetModel, AutoTokenizer, AlbertModel, AutoModel, DebertaV2Model, DebertaV2Tokenizer, ElectraModel, RobertaModel, AlbertTokenizer

import numpy as np

import math

from tqdm import tqdm

from argparse import ArgumentParser

import torch
from torchmetrics import Accuracy
from torch import nn
from torch.nn import functional as F
from torch.optim.optimizer import Optimizer

def load_dataset(path, test=True):
    '''Load a JSON-lines file into a DataFrame indexed by ``id``.

    Every non-blank line of ``path`` must be a JSON object holding the keys
    ``id`` and ``text`` and, when ``test`` is true, ``label``.

    Label convention: 0 if the text is AI-generated, 1 if it is human-generated.

    Args:
        path: Location of the JSON-lines file.
        test: When true (default) the ``label`` key is read as well; when false
            only ``id`` and ``text`` are read (unlabelled data).

    Returns:
        A ``pandas.DataFrame`` indexed by ``id`` with a ``text`` column and,
        when ``test`` is true, a ``label`` column.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    '''
    columns = ['id', 'text', 'label'] if test else ['id', 'text']
    data = []
    # JSON text is UTF-8; being explicit avoids depending on the platform locale.
    with open(path, encoding='utf-8') as f:
        # Stream the file instead of materialising every line first.
        for line in f:
            # A blank line (e.g. a trailing newline at the end of the file)
            # is not a record and would make json.loads raise.
            if not line.strip():
                continue
            line_dict = json.loads(line)
            data.append([line_dict[column] for column in columns])

    return pd.DataFrame(data, columns=columns).set_index('id')

In [None]:
class SoftMaxLit(pl.LightningModule):
    """Softmax classifier: one linear layer whose outputs are normalised over dim 1.

    ``forward`` returns class probabilities. The training and validation losses
    are computed on the raw linear outputs (logits), because
    ``nn.CrossEntropyLoss`` applies log-softmax itself; feeding it the already
    normalised probabilities would squash the loss and its gradients.

    The targets handed to the loss are the per-class rows produced by the
    dataloader (``test_step`` takes their argmax to recover the class index).

    Reference
    https://machinelearningmastery.com/introduction-to-softmax-classifier-in-pytorch/

    Args:
        n_inputs: Size of each input feature vector.
        n_outputs: Number of classes.
    """
    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        self.linear = torch.nn.Linear(n_inputs, n_outputs)
        self.softmax = nn.Softmax(dim=1)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return class probabilities for a batch of feature vectors."""
        return self.softmax(self.linear(x))

    def _loss(self, batch):
        """Cross-entropy between the logits for ``batch`` and its targets."""
        x, y = batch
        # Logits, not self(x): CrossEntropyLoss performs the softmax internally.
        return self.criterion(self.linear(x), y)

    def training_step(self, batch, batch_idx):
        """Compute, log (``train_loss``) and return the loss for one training batch."""
        loss = self._loss(batch)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return the AdamW optimizer over all parameters of this module."""
        optimizer = optim.AdamW(self.parameters(), lr = 0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False)
        return optimizer

    def validation_step(self, batch, batch_idx):
        """Log the loss of one validation batch as ``val_loss``."""
        loss = self._loss(batch)
        self.log("val_loss", loss)

    def test_step(self, batch, batch_idx):
        """Log the fraction of correctly classified samples as ``test_acc``."""
        x, y = batch
        # Targets arrive as one row per sample with one column per class;
        # argmax recovers the class index on both sides of the comparison.
        y = torch.argmax(y, dim=1)
        y_hat = torch.argmax(self(x), dim=1)
        accuracy = (y == y_hat).float().mean()
        self.log('test_acc', accuracy)

class Data(Dataset):
    """Feature/label dataset for two-class classification.

    Features (``x``) are either supplied directly or computed from ``df['text']``
    by running ``tokenizer`` and ``pretrained`` and keeping the last hidden state
    at sequence position 0 for every text. Labels (``y``) are one-hot rows with
    two columns built from ``df['label']``.

    Args:
        df: DataFrame with a ``text`` column and, when ``get_y`` is true, a
            ``label`` column whose values index one of the two classes.
        x: Optional precomputed feature tensor; when given, ``tokenizer``,
            ``pretrained`` and ``load_batch_size`` are not used.
        load_batch_size: Number of texts passed through ``pretrained`` at once.
        tokenizer: Callable turning a list of strings into model inputs.
        pretrained: Model called with those inputs; its result must expose
            ``last_hidden_state``.
        get_y: When false no labels are built and items are features only.
    """
    def __init__(self, df, *, x=None, load_batch_size=None, tokenizer=None, pretrained=None, get_y=True):
        if get_y:
            self.y, self.len = self._get_y_and_len_from_df(df)
        else:
            self.y = None
            self.len = df['text'].shape[0]

        if x is not None:
            self.x = x
        else:
            self.x = self._get_x_from_df(df, load_batch_size, tokenizer, pretrained)

    def _get_x_from_df(self, df, load_batch_size, tokenizer, pretrained):
        """Embed every text in ``df`` and return the position-0 hidden states as float32.

        Raises:
            ValueError: If ``tokenizer``, ``pretrained`` or a positive
                ``load_batch_size`` is missing.
        """
        if tokenizer is None or pretrained is None or not load_batch_size:
            raise ValueError('tokenizer, pretrained and a positive load_batch_size are required when x is not given')

        docs = df['text'].tolist()
        # truncation=True keeps over-long texts within the tokenizer's maximum
        # length instead of letting the encoder fail on them.
        inputs = tokenizer(docs, return_tensors="pt", padding=True, truncation=True)

        spans = list(self._get_x_y_from_df_with_batch(df, load_batch_size))
        cls_arr = []
        # Pure feature extraction: no autograd graph is needed, which keeps the
        # memory use of each encoder pass down.
        with torch.no_grad():
            for start, stop in tqdm(spans):
                batch = {k: inputs[k][start:stop] for k in inputs.keys()}
                # NOTE(review): position 0 is the classification slot for
                # BERT-style tokenizers; confirm it is the intended feature for
                # every encoder used with this class.
                cls_arr.append(pretrained(**batch).last_hidden_state[:, 0, :].detach())
        return torch.cat(cls_arr).type(torch.float32)

    def _get_y_and_len_from_df(self, df):
        """Return ``(one_hot_labels, n_rows)`` where the labels are float32 with two columns."""
        dim_0 = df['text'].shape[0]
        labels = df['label'].to_numpy(dtype=np.int64)
        matrix = np.zeros((dim_0, 2))
        # Set column `label` to 1 in every row in a single vectorised assignment.
        matrix[np.arange(dim_0), labels] = 1
        return torch.from_numpy(matrix).type(torch.float32), dim_0

    def _get_x_y_from_df_with_batch(self, df, step_size):
        """Yield ``(start, stop)`` row spans of at most ``step_size`` rows covering ``df``."""
        total = len(df)
        for start in range(0, total, step_size):
            # The last span is clipped to the end of the frame.
            yield (start, min(start + step_size, total))

    def __getitem__(self, idx):
        "accessing one element in the dataset by index; features only when there are no labels"
        if self.y is None:
            return self.x[idx]
        return self.x[idx], self.y[idx]

    def __len__(self):
        "size of the entire dataset"
        return self.len

    @staticmethod
    def concat(df, datasets):
        "concatenate dataset embeddings from x provided they are applied on the same df"
        x = torch.cat([dataset.x for dataset in datasets], 1)
        return Data(df, x=x)

# MODELS
class TransformerModel():
    """A pretrained encoder plus its tokenizer, selected by a short tag.

    ``MODELS`` maps each tag to the checkpoint name, the feature size recorded
    for it (``dim``) and the tokenizer / model classes used to load it.

    Documentation links kept from the original notes:
      XLNet:  https://huggingface.co/docs/transformers/model_doc/xlnet
      ALBERT: https://huggingface.co/docs/transformers/model_doc/albert

    Args:
        model_tag: One of the keys of ``MODELS``.

    Raises:
        ValueError: If ``model_tag`` is not a key of ``MODELS``.
    """

    MODELS = {
        'albert': {'name': 'albert-base-v2', 'dim': 768,'tokenizer': AutoTokenizer, 'pretrained': AlbertModel}, # AlbertTokenizer
        'electra': {'name': 'google/electra-small-discriminator', 'dim': 256,'tokenizer': AutoTokenizer, 'pretrained': ElectraModel},
        'roberta': {'name': 'roberta-base', 'dim': 768,'tokenizer': AutoTokenizer, 'pretrained': RobertaModel},
        'xlnet': {'name': 'xlnet-base-cased', 'dim': 768, 'tokenizer': AutoTokenizer, 'pretrained': XLNetModel}, # XLNetTokenizer
    }

    def __init__(self, model_tag):
        if model_tag not in self.MODELS:
            # Join the dict's keys: a dict itself has no `join` method.
            raise ValueError(f'Invalid model: {model_tag}. Valid models are: {" ".join(self.MODELS)}')

        spec = self.MODELS[model_tag]
        self.model_tag = model_tag
        self.dim = spec['dim']
        self.tokenizer = spec['tokenizer'].from_pretrained(spec['name'])
        self.pretrained = spec['pretrained'].from_pretrained(spec['name'])

    def dataset(self, df, dev, save=False, delete=False, get_y = True, load_batch_size=30):
        """Embed ``df['text']`` with this encoder and wrap the result in a ``Data`` set.

        Args:
            df: DataFrame with a ``text`` column (and ``label`` when ``get_y``).
            dev: Only used in the name of the file written when ``save`` is true.
            save: Write the feature tensor to
                ``pretrained--dev={dev}--model={model_tag}.pt``.
            delete: Drop the feature tensor from the returned dataset and empty
                the CUDA cache; the returned object then has no ``x``.
            get_y: Forwarded to ``Data``; false skips label construction.
            load_batch_size: Number of texts encoded per forward pass
                (original tuning note, meaning unconfirmed:
                "10 > 30 > 40 yes", "4 is the best").

        Returns:
            The ``Data`` instance that was built.
        """
        dataset = Data(df, load_batch_size=load_batch_size, tokenizer=self.tokenizer, pretrained=self.pretrained, get_y=get_y)

        if save:
            torch.save(dataset.x, f"pretrained--dev={dev}--model={self.model_tag}.pt")

        if delete:
            del dataset.x
            torch.cuda.empty_cache()

        return dataset

def get_dataloaders(dataset, batch_size, seed=None):
    """Split ``dataset`` 80/10/10 at random and wrap each part in a DataLoader.

    Args:
        dataset: Any sized dataset accepted by ``torch.utils.data.random_split``.
        batch_size: Batch size used by all three loaders.
        seed: Optional integer; when given, the split is drawn from a generator
            seeded with it so repeated calls return the same partition. With the
            default ``None`` the global random state is used.

    Returns:
        Dict with the keys ``'train'``, ``'val'`` and ``'test'``. Only the
        training loader shuffles; evaluation order does not affect the metrics,
        so the validation and test loaders iterate sequentially.
    """
    split_kwargs = {}
    if seed is not None:
        split_kwargs['generator'] = torch.Generator().manual_seed(seed)

    train_dataset, val_dataset, test_dataset = utils.data.random_split(dataset, (0.8, 0.1, 0.1), **split_kwargs)
    return {
        'train': DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True),
        'val': DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False),
        'test': DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False),
    }

In [None]:
torch.cuda.is_available()

In [None]:
torch.cuda.device_count()

In [None]:
# Use the current CUDA device when one is available; otherwise fall back to the
# CPU so the later `.to(device)` calls also work on a machine without a GPU.
device = torch.device('cuda', torch.cuda.current_device()) if torch.cuda.is_available() else torch.device('cpu')

In [None]:
device

In [None]:
# Smoke-test configuration, kept for reference. Uncomment it (and comment out
# the full-run block below) to check the pipeline on the first 100 rows only.
# BATCH_SIZE = 128
# NUM_EPOCH = 20
# DEV = True
# df = load_dataset('./training.json', test=True)[:100]

# Full-run configuration.

# Batch size handed to get_dataloaders.
BATCH_SIZE = 128
# max_epochs for each per-encoder classifier.
NUM_EPOCH = 300
# Embedded in the names of the cached feature files and of the checkpoints.
DEV = False

# Labelled training data: test=True makes load_dataset read the `label` column.
# Append [:100] when running the smoke-test configuration.
df = load_dataset('./training.json', test=True)

# Encoder tags (keys of TransformerModel.MODELS).
# Original note, meaning unconfirmed: albert: 128, electra: 64, roberta: 128, xlnet: 128
model_names = ['albert', 'electra', 'roberta', 'xlnet']

In [None]:
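# One-off feature extraction (slow). Uncomment and run once: with save=True each
# call writes `pretrained--dev={DEV}--model=<name>.pt`, the file that the
# training cell below reads back with torch.load.
# for cur_model_name in model_names:
#     TransformerModel(cur_model_name).dataset(df, DEV, save=True, delete=True)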

In [None]:
# Reference for the tokenizer error "requires the SentencePiece library":
# https://stackoverflow.com/questions/65445651/t5tokenizer-requires-the-sentencepiece-library-but-it-was-not-found-in-your-envi

# Train one softmax classifier per encoder on its cached features and remember
# the best checkpoint (lowest val_loss) of each, in the order of model_names.
checkpoints = []
for cur_model_name in model_names:
    cur_dim = TransformerModel.MODELS[cur_model_name]['dim']

    # Features cached by TransformerModel.dataset(..., save=True).
    cur_dataset_x = torch.load(f'pretrained--dev={DEV}--model={cur_model_name}.pt')
    cur_data = Data(df, x=cur_dataset_x)
    cur_dataloaders = get_dataloaders(cur_data, BATCH_SIZE)
    cur_model = SoftMaxLit(cur_dim, 2)
    checkpoint_callback = ModelCheckpoint(
        save_top_k=1,
        monitor='val_loss',
        mode='min',
        filename=f'model={cur_model_name}--dev={DEV}' + '--{epoch}-{step}--{val_loss:.2f}'
    )

    trainer = pl.Trainer(callbacks=[checkpoint_callback], max_epochs=NUM_EPOCH)
    trainer.fit(model=cur_model, train_dataloaders=cur_dataloaders['train'], val_dataloaders=cur_dataloaders['val'])

    checkpoints.append(checkpoint_callback.best_model_path)

    # load_from_checkpoint is a classmethod that builds a new module: call it on
    # the class, not on the instance that was just trained.
    best_model = SoftMaxLit.load_from_checkpoint(
        checkpoint_path=checkpoint_callback.best_model_path,
        n_inputs=cur_dim,
        n_outputs=2,
    )
    trainer.test(best_model, dataloaders=cur_dataloaders['test'])

    # Release the feature tensor before loading the next encoder's features.
    del cur_dataset_x
    del cur_data.x
    torch.cuda.empty_cache()

In [None]:
# Cached feature files, one per encoder, in the same order as model_names.
pretrained_datasets_x = [
    f"pretrained--dev={DEV}--model={model_name}.pt" for model_name in model_names
]

# `checkpoints` comes from the training cell above. Example of its contents:
# checkpoints = [
#     'lightning_logs/version_0/checkpoints/model=albert--dev=True--epoch=2-step=60--val_loss=0.39.ckpt',
#     'lightning_logs/version_1/checkpoints/model=electra--dev=True--epoch=2-step=60--val_loss=0.53.ckpt',
#     'lightning_logs/version_4/checkpoints/model=roberta--dev=True--epoch=3-step=80--val_loss=0.63.ckpt',
#     'lightning_logs/version_5/checkpoints/model=xlnet--dev=True--epoch=3-step=80--val_loss=0.62.ckpt'
# ]

# Run every base classifier over its encoder's features; the per-class
# probabilities of all base classifiers are concatenated column-wise below and
# become the input of the second-level classifier.
model_y_arr = []
for model_name, pretrained_dataset_x, ckpt in zip(model_names, pretrained_datasets_x, checkpoints):
    n_inputs = TransformerModel.MODELS[model_name]['dim']
    # load_from_checkpoint is a classmethod; the loaded module is moved to the
    # same device as its input so the forward pass cannot mix devices.
    model = SoftMaxLit.load_from_checkpoint(checkpoint_path=ckpt, n_inputs=n_inputs, n_outputs=2).to(device)
    model.eval()

    x = torch.load(pretrained_dataset_x).to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        y_hat = model(x)

    # Free up memory
    del x
    torch.cuda.empty_cache()

    model_y_arr.append(y_hat)

lr_dataset_x = torch.cat(model_y_arr, dim=1).detach()

In [None]:
# Epoch budget of the second-level classifier. A separate name is used so the
# NUM_EPOCH of the base-model training cell is not overwritten when that cell
# is re-run afterwards.
LR_NUM_EPOCH = 100

# NOTE(review): get_dataloaders draws a fresh random split here, while the base
# classifiers that produced lr_dataset_x were trained on other random splits of
# the same df, so rows in this 'test' part may have been seen during base-model
# training. Confirm whether the reported test_acc is meant to be held-out.
lr_dataset = Data(df, x=lr_dataset_x)
lr_dataloaders = get_dataloaders(lr_dataset, BATCH_SIZE)

lr_n_inputs = lr_dataset_x.shape[1]
lr_model = SoftMaxLit(lr_n_inputs, 2)

checkpoint_callback = ModelCheckpoint(
    save_top_k=1,
    monitor='val_loss',
    mode='min',
    filename=f'model=lr--dev={DEV}'
)

trainer = pl.Trainer(callbacks=[checkpoint_callback], max_epochs=LR_NUM_EPOCH)
trainer.fit(model=lr_model, train_dataloaders=lr_dataloaders['train'], val_dataloaders=lr_dataloaders['val'])

# load_from_checkpoint is a classmethod returning a new module: call it on the class.
best_lr_model = SoftMaxLit.load_from_checkpoint(
    checkpoint_path=checkpoint_callback.best_model_path,
    n_inputs=lr_n_inputs,
    n_outputs=2,
)
trainer.test(best_lr_model, dataloaders=lr_dataloaders['test'])

In [None]:
checkpoints

In [None]:
# Unlabelled data to predict on (test=False: no `label` column is read).
validation_df = load_dataset('./test_data.json', test=False)
DEV = False
# Original note, meaning unconfirmed: albert: 128, electra: 64, roberta: 128, xlnet: 128
model_names = ['albert', 'electra', 'roberta', 'xlnet']

# First level: embed the texts with each encoder and collect the probabilities
# of the matching base classifier (`checkpoints` comes from the training cell).
model_y_arr = []
for model_name, ckpt in zip(model_names, checkpoints):
    n_inputs = TransformerModel.MODELS[model_name]['dim']
    # load_from_checkpoint is a classmethod; the loaded module is moved to the
    # same device as its input.
    model = SoftMaxLit.load_from_checkpoint(checkpoint_path=ckpt, n_inputs=n_inputs, n_outputs=2).to(device)
    model.eval()

    x = TransformerModel(model_name).dataset(validation_df, DEV, save=False, delete=False, get_y=False).x.to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        y_hat = model(x)

    # Free up memory
    del x
    torch.cuda.empty_cache()

    model_y_arr.append(y_hat)
lr_dataset_x = torch.cat(model_y_arr, dim=1).detach()
x = lr_dataset_x.to(device)

# Second level: reload the best checkpoint recorded by the meta-classifier
# training cell instead of a hard-coded absolute path, which only exists on the
# machine and run that produced it.
lr_model = SoftMaxLit.load_from_checkpoint(
    checkpoint_path=checkpoint_callback.best_model_path,
    n_inputs=lr_dataset_x.shape[1],
    n_outputs=2,
).to(device)
lr_model.eval()
with torch.no_grad():
    validation_out = lr_model(x)
validation_out

In [None]:
x.shape

In [None]:
# Detach the scores from autograd, then pick the index of the highest-scoring
# class in every row.
validation_out = validation_out.detach()
out = validation_out.argmax(dim=1)

In [None]:
# Write one JSON object per line, e.g. {"id": 0, "label": 1}.
# The id is the sample's own id (the index that load_dataset set on
# validation_df), not its position in the output, so predictions stay attached
# to the right sample even when ids are not 0..n-1 in file order.
sample_ids = validation_df.index.tolist()
predicted_labels = out.tolist()
assert len(sample_ids) == len(predicted_labels), "one prediction is expected per input row"

# Mode 'w' truncates any previous file; the context manager closes the handle
# even if a write fails.
with open('answer.json', 'w') as f:
    for sample_id, label_out in zip(sample_ids, predicted_labels):
        f.write(json.dumps({"id": sample_id, "label": label_out}) + '\n')

In [None]:
len(out.tolist())

In [None]:
%reload_ext tensorboard
%tensorboard --logdir=lightning_logs/