In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainingArguments, Trainer, DataCollator
import torch
import numpy as np
from torch.utils.data import Dataset

In [None]:
# Mount Google Drive; the dataset files and model outputs used below live in a Drive folder.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Work from the project folder on Drive; the relative file names used later resolve against it.
%cd '/content/drive/My Drive/TUM/SS22/LDSI_LAB/Implementations/Seq2Seq/'

/content/drive/My Drive/TUM/SS22/LDSI_LAB/Implementations/Seq2Seq


### Create data for Hugging Face

In [None]:
import json

In [None]:
# File names are relative to the current working directory.
train_path = 'train.json'
dev_path = 'dev.json'

# Context managers close the file handles as soon as parsing is done
# (the bare open() calls left them open for the lifetime of the kernel).
with open(train_path, encoding='utf-8') as train_file:
  train_data_raw = json.load(train_file)
with open(dev_path, encoding='utf-8') as dev_file:
  dev_data_raw = json.load(dev_file)

In [None]:
# Start with two independent empty lists; they are filled with the training data further down.
train_sentences_list, train_labels_list = [], []

In [None]:
def create_sentence_and_labels_list(data_raw):
  """Flatten annotated documents into parallel sentence and label lists.

  For every document only the first entry of ``'annotations'`` is read. Each
  item of that annotation's ``'result'`` contributes its ``value['text']`` as
  the sentence and the first element of ``value['labels']`` as the label.

  Args:
    data_raw: sequence of document dicts with the nesting described above.

  Returns:
    A ``(sentences, labels)`` tuple of two equally long lists, in document order.
  """
  sentences, labels = [], []
  for document in data_raw:
    for item in document['annotations'][0]['result']:
      value = item['value']
      text, first_label = value['text'], value['labels'][0]
      sentences.append(text)
      labels.append(first_label)
  return sentences, labels

In [None]:
# Flatten the raw training documents into parallel sentence / label lists.
train_sentences_list, train_labels_list = create_sentence_and_labels_list(train_data_raw)

In [None]:
# Same flattening for the dev split.
dev_sentences_list, dev_labels_list = create_sentence_and_labels_list(dev_data_raw)

In [None]:
#train_sentences_list[54]

In [None]:
# Checkpoint name shared by the tokenizer and the model loaded later.
model_checkpoint = "t5-base"
# NOTE(review): loading this tokenizer emits a warning that inputs are not automatically
# truncated to 512 tokens; pass `truncation`/`max_length` when encoding.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
#a = tokenizer("[156D-E](3) The fact that the amount realised is in excess of\nthe tax leviable and not as amount which was not at all\npayable as tax, would not make any difference.")

In [None]:
class TrainingDataset(Dataset):
  """Training split as a torch ``Dataset`` that tokenizes each example on access.

  Every item is a dict with ``"input_ids"`` (token ids of the sentence) and
  ``"labels"`` (token ids of the label string). Both are truncated to
  ``max_length`` tokens; without truncation the tokenizer reported inputs
  longer than the 512-token limit of the model.

  Args:
    sentences: input texts; defaults to the notebook-level ``train_sentences_list``.
    labels: target label strings; defaults to the notebook-level ``train_labels_list``.
    tokenizer_fn: callable returning a mapping with an ``'input_ids'`` entry;
      defaults to the notebook-level ``tokenizer``.
    max_length: maximum number of token ids kept per text.

  Raises:
    ValueError: if ``sentences`` and ``labels`` differ in length.
  """

  def __init__(self, sentences=None, labels=None, tokenizer_fn=None, max_length=512):
    # Fall back to the notebook globals so that `TrainingDataset()` keeps working.
    self.x = train_sentences_list if sentences is None else sentences
    self.y = train_labels_list if labels is None else labels
    if len(self.x) != len(self.y):
      raise ValueError(
          f"sentences and labels differ in length: {len(self.x)} != {len(self.y)}")
    self.tokenizer_fn = tokenizer_fn
    self.max_length = max_length
    self.n_samples = len(self.x)

  def __len__(self):
    return self.n_samples

  def _encode(self, text):
    """Return the (truncated) token ids of ``text``."""
    # Resolve the global tokenizer lazily, as the item lookup always did.
    tok = tokenizer if self.tokenizer_fn is None else self.tokenizer_fn
    return tok(text, truncation=True, max_length=self.max_length)['input_ids']

  def __getitem__(self, index):
    input_ids = self._encode(self.x[index])
    label_ids = self._encode(self.y[index])
    return {"input_ids":input_ids,"labels":label_ids}
  

  


In [None]:
class DevDataset(Dataset):
  """Dev split as a torch ``Dataset`` that tokenizes each example on access.

  Every item is a dict with ``"input_ids"`` (token ids of the sentence) and
  ``"labels"`` (token ids of the label string). Both are truncated to
  ``max_length`` tokens; without truncation the tokenizer reported inputs
  longer than the 512-token limit of the model.

  Args:
    sentences: input texts; defaults to the notebook-level ``dev_sentences_list``.
    labels: target label strings; defaults to the notebook-level ``dev_labels_list``.
    tokenizer_fn: callable returning a mapping with an ``'input_ids'`` entry;
      defaults to the notebook-level ``tokenizer``.
    max_length: maximum number of token ids kept per text.

  Raises:
    ValueError: if ``sentences`` and ``labels`` differ in length.
  """

  def __init__(self, sentences=None, labels=None, tokenizer_fn=None, max_length=512):
    # Fall back to the notebook globals so that `DevDataset()` keeps working.
    self.x = dev_sentences_list if sentences is None else sentences
    self.y = dev_labels_list if labels is None else labels
    if len(self.x) != len(self.y):
      raise ValueError(
          f"sentences and labels differ in length: {len(self.x)} != {len(self.y)}")
    self.tokenizer_fn = tokenizer_fn
    self.max_length = max_length
    self.n_samples = len(self.x)

  def __len__(self):
    return self.n_samples

  def _encode(self, text):
    """Return the (truncated) token ids of ``text``."""
    # Resolve the global tokenizer lazily, as the item lookup always did.
    tok = tokenizer if self.tokenizer_fn is None else self.tokenizer_fn
    return tok(text, truncation=True, max_length=self.max_length)['input_ids']

  def __getitem__(self, index):
    input_ids = self._encode(self.x[index])
    label_ids = self._encode(self.y[index])
    return {"input_ids":input_ids,"labels":label_ids}

  

    

In [None]:
# Load the pretrained seq2seq model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Fine-tuning configuration for the seq2seq labeling run.
batch_size = 8
model_name = model_checkpoint.split("/")[-1]  # keep only the part after the last "/"
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-seq2seq-labeling",
    # --- optimisation ---
    # NOTE(review): the logged validation loss does not improve over epochs 1-3;
    # this learning rate may be too high for fine-tuning -- confirm.
    learning_rate=4e-3,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- evaluation ---
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # --- checkpointing ---
    # NOTE(review): with save_strategy="no", save_total_limit presumably has no effect -- confirm.
    save_strategy="no",
    save_total_limit=1,
    load_best_model_at_end=False,
    push_to_hub=False,
)
# The collator batches the variable-length token id lists produced by the datasets.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Wire model, arguments, data and tokenizer together, then run the fine-tuning loop.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=TrainingDataset(),
    eval_dataset=DevDataset(),
    tokenizer=tokenizer,
)

trainer.train()

***** Running training *****
  Num examples = 28986
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 36240


Epoch,Training Loss,Validation Loss
1,0.3883,0.41366
2,0.4054,0.446413
3,0.3911,0.420851


Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors
***** Running Evaluation *****
  Num examples = 2879
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2879
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2879
  Batch size = 8


In [None]:
# Persist the fine-tuned model into the Drive project folder (the same directory the notebook cd'ed into).
trainer.save_model('/content/drive/My Drive/TUM/SS22/LDSI_LAB/Implementations/Seq2Seq/')

### Getting Outputs

In [None]:
# Dev examples (tokenized on access) that the prediction loop below iterates over.
devdataset = DevDataset()

In [None]:
# Prefer the GPU, but fall back to the CPU so the following cells also run on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Small English spaCy pipeline; `nlp` is used by spacy_tokenize below.
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
def spacy_tokenize(txt, pipeline=None):
    """Tokenize ``txt`` and return a cleaned list of token strings.

    Cleaning rules, applied per token:
      * punctuation (``pos_ == 'PUNCT'``) is dropped;
      * whitespace tokens such as newlines are dropped. The previous check
        compared the part-of-speech fields against ``'\\n'`` and so never matched;
      * numbers (``pos_ == 'NUM'``) become ``<NUMk>`` where ``k`` is ``len(token)``;
      * every other token is replaced by its lemma.

    Args:
        txt: text to tokenize.
        pipeline: optional callable turning text into an iterable of tokens;
            defaults to the notebook-level ``nlp`` pipeline.

    Returns:
        List of cleaned token strings in document order.
    """
    # Resolve the global pipeline lazily so the function can be defined before `nlp` exists.
    doc = (nlp if pipeline is None else pipeline)(txt)
    clean_tokens = []
    for t in doc:
        if t.pos_ == 'PUNCT' or t.is_space:
            continue
        if t.pos_ == 'NUM':
            clean_tokens.append(f'<NUM{len(t)}>')
        else:
            clean_tokens.append(t.lemma_)
    return clean_tokens

In [None]:
# Generate one predicted label string per dev example.
model.eval()  # make sure dropout is disabled while generating
preds = list()
for i in range(len(devdataset)):
  data = devdataset[i]
  # Shape (1, seq_len): a batch holding a single example, placed on the model's device.
  input_ids = torch.tensor(data['input_ids']).view(1, -1).to(model.device)
  with torch.no_grad():
    pred = model.generate(input_ids=input_ids)
  # Let the tokenizer strip special tokens such as <pad> and </s>. The earlier
  # approach re-tokenized the decoded string and sliced token 3 by position, which
  # depended on how the special tokens were split and failed on short outputs.
  pred_label = tokenizer.decode(pred[0], skip_special_tokens=True).strip()
  preds.append(pred_label)
  


In [None]:
# Output file for the predicted dev labels (relative to the current working directory).
pred_name = 'pred_t5_16_10.json'

In [None]:
# Overwrite instead of append: appending a second JSON array on a re-run
# would leave a file that is no longer valid JSON.
with open(pred_name, 'w') as jsonfile:
  json.dump(preds, jsonfile)

In [None]:
from sklearn.metrics import f1_score

# F1 of the predicted labels against the gold dev labels, under three averaging schemes.
f1_weighted, f1_macro, f1_micro = (
    f1_score(dev_labels_list, preds, average=scheme)
    for scheme in ('weighted', 'macro', 'micro')
)

In [None]:
# Report the three F1 variants computed above.
print(f'Weighted AVG F1: {f1_weighted!s}')
print(f'Macro AVG F1: {f1_macro!s}')
print(f'Micro AVG F1: {f1_micro!s}')