In [None]:
import os
import json

In [None]:
# Mount Google Drive into the Colab runtime and switch into the project folder,
# so that the relative file names used in later cells (e.g. 'train.txt') resolve there.
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/MyDrive/TUM/SS22/LDSI_LAB/Implementations/Hugging_face_Pubmed'

# Data Preprocessing

In [None]:
# List the contents of the current working directory.
!ls

In [None]:
### Get the delimiter

# The delimiter is taken to be the content of the line at 0-based index 13 of
# train.txt (trailing whitespace stripped).
# NOTE(review): presumably this is the separator line between records — confirm
# against the data file.
delim = ''
filename = 'train.txt'
with open(filename) as file:
  print(filename)
  # Stream the file and stop as soon as the target line is reached instead of
  # loading every line into memory first.
  for i, raw_line in enumerate(file):
    if i == 13:
      delim = raw_line.rstrip()
      break


## Get Training Sentences

In [None]:

# Parse train.txt into two parallel lists: one label and one sentence per data line.
# Data lines have the form '<label>\t<sentence>'. Blank lines, lines equal to
# `delim` and lines starting with '#' are skipped.
filename = 'train.txt'
train_sentences = list()
train_labels = list()
with open(filename) as file:
  print(filename)
  for line_no, raw_line in enumerate(file, start=1):
    a = raw_line.rstrip()

    # `not a` guards against blank lines even when `delim` is not the empty
    # string; startswith() is safe on empty input, unlike a[0].
    if not a or a == delim or a.startswith('#'):
      continue

    line = a.split('\t')
    if len(line) < 2:
      # Fail with a message that points at the offending line rather than a
      # bare IndexError.
      raise ValueError(f"{filename}:{line_no}: expected '<label>\\t<sentence>', got {a!r}")

    train_labels.append(line[0])
    train_sentences.append(line[1])


In [None]:

# Persist the parsed training data as JSON.
# Open with 'w' (truncate), not 'a': appending on a re-run would write a second
# JSON document into the same file and make it unreadable for json.load().
with open('train_sentences.json', 'w') as jsonfile:
  json.dump(train_sentences, jsonfile)
with open('train_labels.json', 'w') as jsonfile:
  json.dump(train_labels, jsonfile)


## Get Dev Sentences

In [None]:

# Parse dev.txt into two parallel lists: one label and one sentence per data line.
# Data lines have the form '<label>\t<sentence>'. Blank lines, lines equal to
# `delim` and lines starting with '#' are skipped.
filename = 'dev.txt'
dev_sentences = list()
dev_labels = list()
with open(filename) as file:
  print(filename)
  for line_no, raw_line in enumerate(file, start=1):
    a = raw_line.rstrip()

    # `not a` guards against blank lines even when `delim` is not the empty
    # string; startswith() is safe on empty input, unlike a[0].
    if not a or a == delim or a.startswith('#'):
      continue

    line = a.split('\t')
    if len(line) < 2:
      # Fail with a message that points at the offending line rather than a
      # bare IndexError.
      raise ValueError(f"{filename}:{line_no}: expected '<label>\\t<sentence>', got {a!r}")

    dev_labels.append(line[0])
    dev_sentences.append(line[1])


In [None]:
# Persist the parsed dev data as JSON.
# Open with 'w' (truncate), not 'a': appending on a re-run would write a second
# JSON document into the same file and make it unreadable for json.load().
with open('dev_sentences.json', 'w') as jsonfile:
  json.dump(dev_sentences, jsonfile)
with open('dev_labels.json', 'w') as jsonfile:
  json.dump(dev_labels, jsonfile)

# Loading JSON

In [None]:
# Reload the cached sentence/label lists from disk.
# Each file is opened in a `with` block so its handle is closed right after parsing.
with open('train_sentences.json') as jsonfile:
  train_sentences = json.load(jsonfile)
with open('train_labels.json') as jsonfile:
  train_labels = json.load(jsonfile)
with open('dev_sentences.json') as jsonfile:
  dev_sentences = json.load(jsonfile)
with open('dev_labels.json') as jsonfile:
  dev_labels = json.load(jsonfile)

In [None]:
# Sanity check: number of training sentences loaded from train_sentences.json.
len(train_sentences)

In [None]:
# Install the Hugging Face `transformers` package into the runtime (version is not pinned).
!pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainingArguments, Trainer, DataCollator
import torch
import numpy as np
from torch.utils.data import Dataset

In [None]:
# Name of the pretrained checkpoint; the same name is used later to load the model.
model_checkpoint = "t5-small"
# Tokenizer matching the checkpoint; used by the dataset classes and for decoding predictions.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
class TrainingDataset(Dataset):
  """Dataset of (sentence, label) pairs tokenized on access for seq2seq training.

  Args:
    sentences: Optional sequence of input sentences. Defaults to the
      notebook-level ``train_sentences`` list.
    labels: Optional sequence of label strings, parallel to ``sentences``.
      Defaults to the notebook-level ``train_labels`` list.

  Raises:
    ValueError: If ``sentences`` and ``labels`` differ in length.
  """

  def __init__(self, sentences=None, labels=None):
    # Fall back to the notebook globals so the zero-argument call keeps working.
    self.x = train_sentences if sentences is None else sentences
    self.y = train_labels if labels is None else labels
    if len(self.x) != len(self.y):
      raise ValueError(
          f"sentences and labels must have the same length, "
          f"got {len(self.x)} and {len(self.y)}")
    self.n_samples = len(self.x)

  def __len__(self):
    """Return the number of (sentence, label) pairs."""
    return self.n_samples

  def __getitem__(self, index):
    """Tokenize the pair at ``index`` with the notebook-level ``tokenizer``.

    Returns:
      dict with the sentence's token ids under ``"input_ids"`` and the label's
      token ids under ``"labels"``.
    """
    input_ids = tokenizer(self.x[index])['input_ids']
    label_ids = tokenizer(self.y[index])['input_ids']
    return {"input_ids":input_ids,"labels":label_ids}

In [None]:
class DevDataset(Dataset):
  """Dataset of (sentence, label) pairs tokenized on access for evaluation.

  Args:
    sentences: Optional sequence of input sentences. Defaults to the
      notebook-level ``dev_sentences`` list.
    labels: Optional sequence of label strings, parallel to ``sentences``.
      Defaults to the notebook-level ``dev_labels`` list.

  Raises:
    ValueError: If ``sentences`` and ``labels`` differ in length.
  """

  def __init__(self, sentences=None, labels=None):
    # Fall back to the notebook globals so the zero-argument call keeps working.
    self.x = dev_sentences if sentences is None else sentences
    self.y = dev_labels if labels is None else labels
    if len(self.x) != len(self.y):
      raise ValueError(
          f"sentences and labels must have the same length, "
          f"got {len(self.x)} and {len(self.y)}")
    self.n_samples = len(self.x)

  def __len__(self):
    """Return the number of (sentence, label) pairs."""
    return self.n_samples

  def __getitem__(self, index):
    """Tokenize the pair at ``index`` with the notebook-level ``tokenizer``.

    Returns:
      dict with the sentence's token ids under ``"input_ids"`` and the label's
      token ids under ``"labels"``.
    """
    input_ids = tokenizer(self.x[index])['input_ids']
    label_ids = tokenizer(self.y[index])['input_ids']
    return {"input_ids":input_ids,"labels":label_ids}

In [None]:
# Load the pretrained seq2seq model for the checkpoint named in `model_checkpoint`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Hyper-parameters and run naming for fine-tuning.
batch_size = 8  # used for both the train and the eval per-device batch size below
model_name = 't5-small-paheli'
args = Seq2SeqTrainingArguments(
    f"{model_name}-seq2seq-labeling",  # first positional argument (the output directory)
    evaluation_strategy = "epoch",
    # NOTE(review): 2e-3 looks high for fine-tuning a pretrained model — confirm it is intentional.
    learning_rate = 2e-3,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    weight_decay = 0.01,
    # NOTE(review): with save_strategy="no" this limit presumably has no effect — confirm.
    save_total_limit = 1,
    save_strategy="no",
    num_train_epochs = 6,
    predict_with_generate = True,
    push_to_hub = False,
    load_best_model_at_end=False,
)
# Collator that batches the variable-length examples returned by the dataset classes.
data_collator = DataCollatorForSeq2Seq(tokenizer, model = model)

In [None]:
# Build the trainer from the model, the training arguments, fresh instances of
# both dataset classes, the collator and the tokenizer.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=TrainingDataset(),
    eval_dataset=DevDataset(),
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Run fine-tuning with the settings in `args`.
trainer.train()

In [None]:
# Save the fine-tuned model into the project folder on Google Drive
# (the same absolute path that the notebook changed into at the top).
trainer.save_model('/content/drive/MyDrive/TUM/SS22/LDSI_LAB/Implementations/Hugging_face_Pubmed')

In [None]:
# Load the small English spaCy pipeline; `nlp` is used by spacy_tokenize().
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
# Dev examples to run predictions on.
devdataset = DevDataset()
# Use the GPU when one is available and fall back to the CPU otherwise, so that
# moving tensors with `.to(device)` does not fail on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
def spacy_tokenize(txt):
    """Tokenize ``txt`` with the notebook-level spaCy pipeline ``nlp`` and normalise the tokens.

    Punctuation tokens (``pos_ == 'PUNCT'``) and whitespace tokens are dropped,
    numeric tokens (``pos_ == 'NUM'``) are replaced by the placeholder
    ``<NUM{k}>`` with ``k = len(token)``, and every other token is replaced by
    its ``lemma_``.

    Args:
        txt: Text to tokenize.

    Returns:
        List of normalised token strings, in document order.
    """
    clean_tokens = []
    for t in nlp(txt):
        # The original compared the POS tag (and the integer POS id) against
        # newline strings, which can never match; `is_space` is the token
        # attribute that actually identifies whitespace/newline tokens.
        if t.pos_ == 'PUNCT' or t.is_space:
            continue
        if t.pos_ == 'NUM':
            clean_tokens.append(f'<NUM{len(t)}>')
        else:
            clean_tokens.append(t.lemma_)
    return clean_tokens

In [None]:
# Generate one predicted label string per dev example.
# Put the model in inference mode so dropout is disabled while generating.
model.eval()
dev_preds = list()
for i in range(len(devdataset)):
  x = devdataset[i]['input_ids']
  # Shape the token ids as a batch of one sequence on the chosen device.
  pred = model.generate(input_ids = torch.tensor(x).to(device).view(1,-1))
  # Decode without special tokens (padding / end-of-sequence markers) so the
  # text is the predicted label itself. This replaces picking the 4th spaCy
  # token and chopping its last 3 characters, which depended on how the
  # special-token markup happened to be split and raised IndexError on short
  # outputs.
  pred_label = tokenizer.decode(pred.squeeze(0), skip_special_tokens=True).strip()
  dev_preds.append(pred_label)

In [None]:
from sklearn.metrics import f1_score

# F1 of the dev predictions against the gold dev labels under the three
# averaging schemes, computed in the order weighted, macro, micro.
f1_weighted, f1_macro, f1_micro = [
    f1_score(dev_labels, dev_preds, average=averaging)
    for averaging in ('weighted', 'macro', 'micro')
]

In [None]:
# Report the three F1 variants, one per line.
print(f'Weighted AVG F1: {f1_weighted!s}')
print(f'Macro AVG F1: {f1_macro!s}')
print(f'Micro AVG F1: {f1_micro!s}')