In [1]:
import torch
from torch import optim, nn, utils
from torch.utils.data import Dataset, DataLoader
import lightning.pytorch as pl

from sklearn import model_selection
from cuml.svm import SVC
import cuml


import numpy as np

import math

from tqdm import tqdm

from transformers import XLNetTokenizer, XLNetModel, AutoTokenizer, AlbertModel, AutoModel, DebertaV2Model, DebertaV2Tokenizer, ElectraModel, RobertaModel, AlbertTokenizer

from helper import load_dataset

ModuleNotFoundError: No module named 'cupy'

In [None]:
class SoftMaxLit(pl.LightningModule):
    """
    Single linear layer followed by a softmax, trained with cross-entropy.

    ``forward`` returns class probabilities. The training and validation
    losses are computed on the raw linear output (logits) instead, because
    ``nn.CrossEntropyLoss`` already applies log-softmax internally; feeding it
    the output of ``forward`` would apply softmax twice.

    Reference
    https://machinelearningmastery.com/introduction-to-softmax-classifier-in-pytorch/
    """
    def __init__(self, n_inputs, n_outputs):
        """
        Args:
            n_inputs: size of each input feature vector.
            n_outputs: number of output classes.
        """
        super().__init__()
        self.linear = torch.nn.Linear(n_inputs, n_outputs)
        self.softmax = nn.Softmax(dim=1)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return the softmax (over dim 1) of the linear layer applied to ``x``."""
        return self.softmax(self.linear(x))

    def _loss(self, batch):
        """Cross-entropy between the logits for ``batch[0]`` and the targets ``batch[1]``."""
        x, y = batch
        # Logits, not self(x): the criterion performs the softmax itself.
        return self.criterion(self.linear(x), y)

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the loss for one training batch."""
        # training_step defines the train loop.
        # it is independent of forward
        loss = self._loss(batch)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return the AdamW optimizer over all parameters of this module."""
        optimizer = optim.AdamW(self.parameters(), lr = 0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.01, amsgrad=False)
        return optimizer

    def validation_step(self, batch, batch_idx):
        """Compute and log (as ``val_loss``) the loss for one validation batch."""
        self.log("val_loss", self._loss(batch))

    def test_step(self, batch, batch_idx):
        """Log (as ``test_acc``) the fraction of rows whose predicted class matches the target.

        The target class of a row is the argmax of ``y`` along dim 1; the
        predicted class is the argmax of the model output along dim 1.
        """
        x, y = batch
        y = torch.argmax(y, dim=1)
        y_hat = torch.argmax(self(x), dim=1)
        accuracy = torch.sum(y == y_hat).item() / len(y)
        self.log('test_acc', accuracy)

In [None]:
class Data(Dataset):
    """The data for multi-class classification.

    Holds a feature tensor ``x`` and a one-hot label tensor ``y`` with one row
    per example.

    Args:
        df: table with a ``'label'`` column (and a ``'text'`` column when the
            features have to be computed). May be omitted when both ``x`` and
            ``y`` are supplied.
        x: precomputed feature tensor. When given, the tokenizer / model are
            not used.
        load_batch_size: number of rows sent through ``cls_model`` at a time
            when features are computed.
        tokenizer: callable turning the list of texts into model inputs
            (called with ``return_tensors="pt"``).
        cls_model: model whose ``last_hidden_state[:, 0, :]`` is used as the
            feature vector of each text.
        y: precomputed label tensor. When given, labels are not read from
            ``df``.

    Raises:
        ValueError: if neither ``df`` nor ``y`` is given, or if features must
            be computed but ``df``, ``tokenizer``, ``cls_model`` or a positive
            ``load_batch_size`` is missing.
    """
    def __init__(self, df=None, *, x=None, load_batch_size=None, tokenizer=None, cls_model=None, y=None):
        if y is not None:
            self.y, self.len = y, len(y)
        elif df is not None:
            self.y, self.len = self._get_y_and_len_from_df(df)
        else:
            raise ValueError("Data needs either a dataframe `df` or precomputed labels `y`")

        if x is not None:
            self.x = x
        else:
            self.x = self._get_x_from_df(df, load_batch_size, tokenizer, cls_model)

    def _get_x_from_df(self, df, load_batch_size, tokenizer, cls_model):
        """Embed every text in ``df['text']`` and return the stacked first-position hidden states."""
        if df is None or tokenizer is None or cls_model is None:
            raise ValueError("df, tokenizer and cls_model are required when `x` is not given")
        if load_batch_size is None or load_batch_size <= 0:
            raise ValueError("load_batch_size must be a positive integer when `x` is not given")

        docs = df['text'].tolist()
        # truncation=True caps each text at the tokenizer's maximum model input
        # length (when it defines one) instead of passing over-long sequences
        # to the model.
        inputs = tokenizer(docs, return_tensors="pt", padding=True, truncation=True)

        n_batches = math.ceil(len(df) / load_batch_size)
        cls_arr = []
        # Pure feature extraction: no autograd graph is needed.
        with torch.no_grad():
            for start, stop in tqdm(self._get_x_y_from_df_with_batch(df, load_batch_size), total=n_batches):
                batch = {k: inputs[k][start:stop] for k in list(inputs.keys())}
                # NOTE(review): position 0 is taken as the summary token for every
                # model; XLNet puts its classification token last - confirm before
                # enabling the XLNet branch.
                cls = cls_model(**batch).last_hidden_state[:, 0, :].detach()
                cls_arr.append(cls)
        return torch.concat(cls_arr)

    def _get_y_and_len_from_df(self, df):
        """Return ``(one_hot, n_rows)`` where ``one_hot[i][label_i] == 1``; two classes are assumed."""
        dim_0 = df['text'].shape[0]
        matrix = np.zeros((dim_0, 2))
        for i, y in enumerate(df['label'].tolist()):
            matrix[i][y] = 1
        return torch.from_numpy(matrix), dim_0

    def _get_x_y_from_df_with_batch(self, df, step_size):
        """Yield ``(start, stop)`` row bounds covering ``df`` in chunks of at most ``step_size``."""
        n_rows = len(df)
        for start in range(0, n_rows, step_size):
            yield (start, min(start + step_size, n_rows))

    def __getitem__(self, idx):
        "accessing one element in the dataset by index"
        return self.x[idx], self.y[idx]

    def __len__(self):
        "size of the entire dataset"
        return self.len

    @staticmethod
    def concat(datasets):
        """Concatenate dataset embeddings along the feature axis.

        All datasets must describe the same rows in the same order; the labels
        of the result are taken from the first dataset rather than from any
        notebook-level dataframe.

        Raises:
            ValueError: if ``datasets`` is empty or the datasets differ in length.
        """
        datasets = list(datasets)
        if not datasets:
            raise ValueError("concat() needs at least one dataset")
        first = datasets[0]
        if any(len(dataset) != len(first) for dataset in datasets):
            raise ValueError("all datasets passed to concat() must have the same number of rows")
        x = torch.cat([dataset.x for dataset in datasets], 1)
        return Data(x=x, y=first.y)

# THIS IS LEGACY
class DataLit(pl.LightningDataModule):
    """Legacy Lightning data module wrapping the first 100 rows of the training set.

    Args:
        batch_size: batch size used by all three dataloaders.
        tokenizer: tokenizer handed to ``Data`` to encode the texts.
        cls_model: model handed to ``Data`` to compute the embeddings.
    """
    def __init__(self, batch_size = 4, tokenizer=None, cls_model=None):
        super().__init__()
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.cls_model = cls_model

    def setup(self, stage=None):
        """Load the dataframe, embed it and split it 80/10/10 into train/val/test."""
        if self.tokenizer is None or self.cls_model is None:
            raise ValueError("DataLit needs both a tokenizer and a cls_model to build its dataset")
        df = load_dataset('../dataset/training.json', test=True)
        # Data only accepts load_batch_size / tokenizer / cls_model as keywords.
        dataset = Data(df[:100], load_batch_size=30, tokenizer=self.tokenizer, cls_model=self.cls_model)  # 10 > 30 > 40 yes # 4 is the best
        self.train_dataset, self.val_dataset, self.test_dataset = utils.data.random_split(dataset,(0.8, 0.1, 0.1))

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(dataset = self.train_dataset, batch_size = self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Loader over the validation split (fixed order)."""
        return DataLoader(dataset = self.val_dataset, batch_size = self.batch_size, shuffle=False)

    def test_dataloader(self):
        """Loader over the test split (fixed order)."""
        return DataLoader(dataset = self.test_dataset, batch_size = self.batch_size, shuffle=False)

In [None]:
def get_dataloaders(df=None, tokenizer=None, cls_model=None, batch_size=None, dataset=None,
                    *, max_rows=100, load_batch_size=30, seed=None):
    """Build (or reuse) a dataset and return it with train/val/test dataloaders.

    Args:
        df: dataframe to embed; only used when ``dataset`` is not given.
        tokenizer: tokenizer passed to ``Data`` when the dataset is built here.
        cls_model: embedding model passed to ``Data`` when the dataset is built here.
        batch_size: batch size of the three dataloaders. ``None`` is forwarded
            to ``DataLoader`` unchanged, which disables automatic batching.
        dataset: an existing dataset to split instead of building one from ``df``.
        max_rows: number of leading rows of ``df`` to embed; ``None`` uses all rows.
        load_batch_size: number of rows sent through ``cls_model`` at a time.
        seed: optional integer seeding the split and the training shuffle so
            that repeated calls produce the same partition.

    Returns:
        dict with keys ``'data'`` (the full dataset), ``'train'``, ``'val'``
        and ``'test'`` (dataloaders over an 80/10/10 random split).

    Raises:
        ValueError: if neither ``dataset`` nor ``df`` is provided.
    """
    # `is None`, not truthiness: a Dataset's truth value comes from __len__.
    if dataset is None:
        if df is None:
            raise ValueError("get_dataloaders needs either `dataset` or `df`")
        rows = df if max_rows is None else df[:max_rows]
        dataset = Data(rows, load_batch_size = load_batch_size, tokenizer=tokenizer, cls_model=cls_model)  # 10 > 30 > 40 yes # 4 is the best

    generator = None if seed is None else torch.Generator().manual_seed(seed)
    split_kwargs = {} if generator is None else {'generator': generator}

    train_dataset, val_dataset, test_dataset = utils.data.random_split(dataset, (0.8, 0.1, 0.1), **split_kwargs)
    train_dataloader = DataLoader(dataset = train_dataset, batch_size = batch_size, shuffle=True, generator=generator)
    # Evaluation loaders keep a fixed order; shuffling them serves no purpose.
    val_dataloader = DataLoader(dataset = val_dataset, batch_size = batch_size, shuffle=False)
    test_dataloader = DataLoader(dataset = test_dataset, batch_size = batch_size, shuffle=False)
    return {'data': dataset, 'train': train_dataloader, 'val': val_dataloader, 'test': test_dataloader}

# THIS IS LEGACY
# dataloader = DataLit()

In [None]:
# from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
# import torch

# tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# config = AutoConfig.from_pretrained("roberta-base")
# config.is_decoder = True
# model = RobertaForCausalLM.from_pretrained("roberta-base", config=config)

# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# # outputs = model(**inputs)

# # prediction_logits = outputs.logits

# inputs

In [None]:
# Batch size shared by every dataloader built in this cell.
BATCH_SIZE = 4

df = load_dataset('../dataset/training.json', test=True)

# # # XLNet: https://huggingface.co/docs/transformers/model_doc/xlnet # size = 768
# # # Might be able to use XLNetTokenizerFast
# xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') # XLNetTokenizer
# xlnet_cls_model = XLNetModel.from_pretrained('xlnet-base-cased') # XLNetModel
# xlnet_dataloaders = get_dataloaders(df, xlnet_tokenizer, xlnet_cls_model, BATCH_SIZE)

# # ALBERT: https://huggingface.co/docs/transformers/model_doc/albert # size = 768
albert_tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
albert_cls_model = AlbertModel.from_pretrained("albert-base-v2") # AlbertModel
albert_dataloaders = get_dataloaders(df, albert_tokenizer, albert_cls_model, BATCH_SIZE)

# # ELECTRA: 256
electra_tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
electra_cls_model = ElectraModel.from_pretrained("google/electra-small-discriminator")
electra_dataloaders = get_dataloaders(df, electra_tokenizer, electra_cls_model, BATCH_SIZE)

# # Roberta: 768
# roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# roberta_cls_model = RobertaModel.from_pretrained("roberta-base")
# roberta_dataloaders = get_dataloaders(df, roberta_tokenizer, roberta_cls_model, BATCH_SIZE)

# Concatenate the per-model embeddings into one feature matrix for the SVM.
# svm_data = Data.concat([xlnet_dataloaders['data'], albert_dataloaders['data'], electra_dataloaders['data'], roberta_dataloaders['data']])
svm_data = Data.concat([albert_dataloaders['data'], electra_dataloaders['data']])
# Pass the batch size explicitly: get_dataloaders defaults it to None, which
# makes DataLoader yield single unbatched samples instead of batches.
svm_dataloaders = get_dataloaders(dataset = svm_data, batch_size = BATCH_SIZE)

# Every model above runs with a load batch size of 4, but DeBERTa-v2-xlarge
# does not work even with a batch size of 1.
# DebertaV2: 1536 # Can use DebertaV3
# debertav2_tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v2-xlarge")
# debertav2_cls_model = DebertaV2Model.from_pretrained("microsoft/deberta-v2-xlarge")
# debertav2_dataloaders = get_dataloaders(df, debertav2_tokenizer, debertav2_cls_model, BATCH_SIZE)

# dataloaders = roberta_dataloaders

In [None]:
# Rebuild the concatenated ALBERT + ELECTRA embedding dataset; this repeats the
# assignment already made in the data-loading cell.
svm_data = Data.concat([albert_dataloaders['data'], electra_dataloaders['data']])

In [None]:
# Features: the concatenated embeddings as a NumPy array.
svm_features = svm_data.x.detach().cpu().numpy()
# Labels: take exactly as many leading rows as there are feature rows, because
# the embeddings are computed on a leading slice of `df`, not on all of it.
svm_labels = np.asarray(df['label'].tolist()[:len(svm_features)])

# Only `model_selection` is imported from sklearn, so the function has to be
# reached through the module.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    svm_features,
    svm_labels,
    random_state=104,
    test_size=0.2,
    shuffle=True,
)

svc = SVC()
svc.fit(X_train, y_train)

In [None]:
# model = SoftMaxLit(768, 2)
# trainer = pl.Trainer(max_epochs=5)

# # THIS IS LEGACY
# # trainer.fit(model, dataloader)

# trainer.fit(model=model, train_dataloaders=dataloaders['train'], val_dataloaders=dataloaders['val'])

In [None]:
# trainer.test(model, dataloaders=dataloaders['test'])

In [None]:
# %reload_ext tensorboard
# %tensorboard --logdir=lightning_logs/

In [None]:
# np.concat([ALBERT_Data, XLNet_Data, ...])

# # Ensemble model using concatenation of embedding outputs
# SVM(x: [
#     0: concat([ALBERT_embedding, XLNet_embedding, ...]),
#     1: ...
#     2: ...
#     n: ...
# ], y: [0, 1, 1, 0, ...])

In [None]:
# # Ensemble - stacking:
# LogisticRegression([{'x': [xlnet_y_hat, albert_y_hat, roberta_y_hat, svm_y_hat, ..., detecllm_y_hat], 'y': 1}])