Reference implementation:

https://github.com/awsm-research/gpt2sp

In [None]:
from transformers.modeling_outputs import SequenceClassifierOutputWithPast
import torch.nn as nn
from transformers import GPT2Model, GPT2PreTrainedModel
import torch
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
import time
from transformers import GPT2Config
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import pandas as pd

import os

In [None]:
# Run configuration shared by the cells below.
PROJECT = 'mes_all'    # dataset name: selects the input CSV and names the outputs
OUTPUT = ''            # text report that the training cell appends to

# Optimisation hyper-parameters.
EPOCHS = 10
BATCH_SIZE = 256
LEARNING_RATE = 5e-4
SEQUENCE_LEN = 20      # every text is truncated / padded to this many tokens

# Use the GPU when one is visible, otherwise stay on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class GPT2SP(GPT2PreTrainedModel):
    """GPT-2 backbone followed by a bias-free MLP and a scoring head.

    The transformer's hidden states pass through ``dense1``
    (n_embd -> 4 * n_embd), ``dense2`` (4 * n_embd -> n_embd) and ``score``
    (n_embd -> num_labels); no activation is applied between these layers.
    For every sequence the score at index ``(number of non-padding tokens - 1)``
    is returned as its prediction. With ``num_labels == 1`` the loss is L1
    (mean absolute error), otherwise cross-entropy.
    """

    _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"]

    def __init__(self, config):
        """Build the backbone and the head layers from ``config``.

        Reads ``config.num_labels`` (head output size) and ``config.n_embd``
        (hidden size of the transformer).
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = GPT2Model(config)
        self.dense1 = nn.Linear(config.n_embd, 4 * config.n_embd, bias=False)
        self.dense2 = nn.Linear(4 * config.n_embd, config.n_embd, bias=False)
        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)

        self.init_weights()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Run the backbone and the head, and pool one prediction per sequence.

        All arguments except ``labels`` are forwarded unchanged to the wrapped
        :obj:`GPT2Model`.

        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (L1 / mean
            absolute error), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns a :obj:`SequenceClassifierOutputWithPast` when ``return_dict`` is true, otherwise a tuple
        ``(loss?, pooled_logits, *extra transformer outputs)`` where ``loss`` is present only if ``labels`` was given.

        Raises :obj:`AssertionError` for batches larger than one when ``config.pad_token_id`` is not set.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]

        # MLP Layer
        hidden_states = self.dense1(hidden_states)
        hidden_states = self.dense2(hidden_states)

        # One score vector per token position; pooled to one per sequence below.
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        assert (
            self.config.pad_token_id is not None or batch_size == 1
        ), "Cannot handle batch sizes > 1 if no padding token is defined."
        if self.config.pad_token_id is None:
            # No way to find padding: fall back to the final position.
            sequence_lengths = -1
        else:
            if input_ids is not None:
                # Index of each row's last token, counting only non-padding ids.
                sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
            else:
                # Imported here because this file has no module-level logger.
                import logging

                sequence_lengths = -1
                logging.getLogger(__name__).warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[range(batch_size), sequence_lengths]

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = nn.L1Loss()
                loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1))
            else:
                # `nn.` prefix is required: CrossEntropyLoss is not imported by name in this file.
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

In [None]:
def prepare_dataloader(seq, y, sampler_type, batch_size=None):
    """Wrap paired tensors in a ``DataLoader`` with the requested sampling order.

    Args:
        seq: tensor of inputs; its first dimension indexes the examples.
        y: tensor of targets with the same first dimension as ``seq``.
        sampler_type: ``'random'`` to shuffle the examples or ``'sequential'``
            to keep them in their given order.
        batch_size: examples per batch; defaults to the module-level
            ``BATCH_SIZE`` when omitted.

    Returns:
        A ``DataLoader`` yielding ``(seq_batch, y_batch)`` pairs.

    Raises:
        ValueError: if ``sampler_type`` is not one of the two supported names.
    """
    tensor_dataset = TensorDataset(seq, y)
    if sampler_type == 'random':
        sampler = RandomSampler(tensor_dataset)
    elif sampler_type == 'sequential':
        sampler = SequentialSampler(tensor_dataset)
    else:
        # Previously this fell through and failed later on an unbound `sampler`.
        raise ValueError(
            f"sampler_type must be 'random' or 'sequential', got {sampler_type!r}"
        )
    if batch_size is None:
        # Resolved at call time so later changes to BATCH_SIZE are honoured.
        batch_size = BATCH_SIZE
    return DataLoader(tensor_dataset, sampler=sampler, batch_size=batch_size)

def tokenization(tokenizer, text_list):
    """Encode ``text_list`` in one batch at a fixed length of ``SEQUENCE_LEN`` tokens.

    Longer texts are truncated and shorter ones padded up to that length.
    Returns whatever ``tokenizer.batch_encode_plus`` returns.
    """
    encode_options = {
        'truncation': True,
        'max_length': SEQUENCE_LEN,
        'padding': 'max_length',
    }
    return tokenizer.batch_encode_plus(text_list, **encode_options)

In [None]:
# Load the project's issues and drop rows whose story point is the -1 marker.
data = pd.read_csv(f'/kaggle/input/storypoint/{PROJECT}.csv')
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a selection of the original frame (which pandas
# reports as a chained-assignment / SettingWithCopy warning).
data = data[data['storypoint'] != -1].copy()
# An empty description is acceptable; any other missing value drops the row.
data['description'] = data['description'].fillna('')
data.dropna(inplace=True)
data['text'] = data['title'] + ' ' + data['description']
data['label'] = data['storypoint'].astype(float)
data = data[['text', 'label']]

# Split by row position, in file order: first 60% train, next 20% validation,
# last 20% test. `.iloc` states explicitly that the bounds are positions,
# not index labels (the index has gaps after the filtering above).
train_val_split_point = int(len(data) * 0.6)
val_test_split_point = int(len(data) * 0.8)
train_text = data['text'].iloc[:train_val_split_point]
train_labels = data['label'].iloc[:train_val_split_point]
val_text = data['text'].iloc[train_val_split_point:val_test_split_point]
val_labels = data['label'].iloc[train_val_split_point:val_test_split_point]
test_text = data['text'].iloc[val_test_split_point:]
test_labels = data['label'].iloc[val_test_split_point:]

In [None]:
# Load the pretrained tokenizer published under the name 'gpt2'.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# `tokenization` pads with padding='max_length'; this sets the pad token it will use.
# NOTE(review): '[PAD]' is not added to the vocabulary anywhere in this file — confirm that
# the id it resolves to matches the pad_token_id=50256 passed to GPT2Config.
tokenizer.pad_token = '[PAD]'

In [None]:
# Encode the training and validation texts, in that order.
tokens_train, tokens_val = (
    tokenization(tokenizer, split_text.tolist())
    for split_text in (train_text, val_text)
)

In [None]:
def _as_loader(encoded, labels, sampler_type):
    """Return ``(input_ids tensor, label tensor, DataLoader)`` for one split."""
    ids = torch.tensor(encoded['input_ids'])
    # Labels are converted to a LongTensor before batching.
    targets = torch.tensor(labels.tolist()).type(torch.LongTensor)
    return ids, targets, prepare_dataloader(ids, targets, sampler_type=sampler_type)


# Training batches are shuffled; validation and test keep their original order.
train_seq, train_y, train_dataloader = _as_loader(tokens_train, train_labels, 'random')

val_seq, val_y, val_dataloader = _as_loader(tokens_val, val_labels, 'sequential')

tokens_test = tokenization(tokenizer, test_text.tolist())
test_seq, test_y, test_dataloader = _as_loader(tokens_test, test_labels, 'sequential')

# The evaluation code iterates over a list of hold-out loaders; only one is used here.
all_test_dataloader = [test_dataloader]

In [None]:
# num_labels=1 selects GPT2SP's regression (L1) loss; pad_token_id is what the
# model compares input ids against to find each sequence's last real token.
config = GPT2Config(num_labels=1, pad_token_id=50256)
model = GPT2SP.from_pretrained('gpt2', config=config)
# Every batch is moved to DEVICE before the forward pass, so the weights must
# live there too; without this the first forward pass fails with a device
# mismatch whenever CUDA is available.
model.to(DEVICE)

# Optimise all model parameters; LEARNING_RATE is the base rate handed to the scheduler.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
# --- training setup ---
# The scheduler is stepped once per batch, so it must span every batch of every epoch.
total_steps = EPOCHS * len(train_dataloader)
# Learning-rate schedule without a warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
print("Start training ...")
writer = SummaryWriter(f'/kaggle/working/tb/{PROJECT}')

# Model-selection state: [lowest validation loss so far, epoch that produced it].
min_eval_loss_epoch = [10000, 0]

# One entry is appended to each of these per epoch.
time_records, MAE_RECORDS, MDAE_RECORDS = [], [], []
start_time = time.time()

# A checkpoint is written after every epoch; create its directory up front so
# the first torch.save does not fail on a fresh working directory.
os.makedirs('/kaggle/working/models/', exist_ok=True)

for e in range(EPOCHS):
    # ---TRAINING---
    # clean GPU memory
    torch.cuda.empty_cache()
    print(">>> epoch ", e)
    # set model into train mode
    model.train()
    total_train_loss = 0
    for batch in train_dataloader:
        b_input_ids = batch[0].to(DEVICE)
        b_labels = batch[1].to(DEVICE)
        model.zero_grad()
        result = model(b_input_ids, labels=b_labels, return_dict=True)
        loss = result.loss
        total_train_loss += loss.item()
        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        # clean memory
        del batch, b_input_ids, b_labels, result, loss

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(" Average training MAE loss: {0:.2f}".format(avg_train_loss))
    writer.add_scalar('loss/train', avg_train_loss, e)

    # Cumulative wall-clock time up to the end of this epoch's training phase.
    time_records.append(time.time() - start_time)

    # ---EVAL---
    print("---Evaluating ...")
    # set model into eval mode
    model.eval()
    # Clear the gradients left over from the last training step; evaluation needs none.
    model.zero_grad()
    total_eval_loss = 0
    # No backward pass happens here, so skip autograd bookkeeping entirely.
    with torch.no_grad():
        for batch in val_dataloader:
            b_input_ids = batch[0].to(DEVICE)
            b_labels = batch[1].to(DEVICE)
            result = model(b_input_ids, labels=b_labels, return_dict=True)
            total_eval_loss += result.loss.item()
            # clean memory
            del batch, b_input_ids, b_labels, result
    avg_eval_loss = total_eval_loss / len(val_dataloader)
    print(" Average eval MAE loss: {0:.2f}".format(avg_eval_loss))

    # `<=` means a later epoch wins a tie on validation loss.
    if avg_eval_loss <= min_eval_loss_epoch[0]:
        min_eval_loss_epoch[0] = avg_eval_loss
        min_eval_loss_epoch[1] = e

    writer.add_scalar('loss/eval', avg_eval_loss, e)
    # save model state to dict
    torch.save(model.state_dict(), '/kaggle/working/models/' + 'epo_' + str(e))

    print("===============================")

    # testing on holdout data
    for holdout_loader in all_test_dataloader:
        predictions = []
        true_labels = []
        for batch in holdout_loader:
            b_input_ids, b_labels = (t.to(DEVICE) for t in batch)
            with torch.no_grad():
                outputs = model(b_input_ids)
            predictions.append(outputs['logits'].detach().cpu().numpy())
            true_labels.append(b_labels.to('cpu').numpy())

        # calculate errors: one absolute error per test example
        distance_records = [
            abs(predicted - actual)
            for batch_predictions, batch_labels in zip(predictions, true_labels)
            for predicted, actual in zip(batch_predictions, batch_labels)
        ]

        ## MAE = mean value of all absolute errors (stored in distance_records)
        MAE = np.mean(np.array(distance_records))
        ## MdAE = median value of all absolute errors (stored in distance_records)
        MdAE = np.median(np.array(distance_records))

        MAE_RECORDS.append(MAE)
        MDAE_RECORDS.append(MdAE)

        OUTPUT += 'Epochs ' + str(e) + '\n'
        OUTPUT += 'MAE: ' + str(MAE) + '\n'
        OUTPUT += 'MdAE: ' + str(MdAE) + '\n\n'
        print('MAE: ', MAE)
        print('MdAE: ', MdAE)
writer.flush()
writer.close()
    
# select model: tag the checkpoint of the epoch with the lowest validation loss
best_epoch = min_eval_loss_epoch[1]
os.rename('/kaggle/working/models/epo_' + str(best_epoch),
          f'/kaggle/working/models/{PROJECT}_epo_' + str(best_epoch))

# del unwanted models
# Covers every epoch of this run, and still at least the first 20 indices as before.
for i in range(max(EPOCHS, 20)):
    try:
        os.remove("/kaggle/working/models/epo_" + str(i))
    except OSError:
        # Best-effort cleanup: the file is missing (e.g. the checkpoint renamed
        # above) or cannot be removed. Only filesystem errors are skipped, so
        # interrupts and programming errors still surface.
        continue

# Append the selected epoch's metrics and the run settings to the report.
OUTPUT += 'MAE: ' + str(MAE_RECORDS[best_epoch]) \
        + '  MdAE: ' + str(MDAE_RECORDS[best_epoch]) + '\n'
OUTPUT += 'training time: ' + str(time_records[best_epoch]) + '\n'
OUTPUT += 'Epochs: ' + str(best_epoch) + '\n'
OUTPUT += 'batch size: ' + str(BATCH_SIZE)
print('all done for one project')