In [1]:
! pip install rouge_score
! pip install -U accelerate



In [2]:
!pip install transformers datasets



In [3]:
!pip install wandb



In [4]:
!pip install sacremoses



In [5]:
import glob
import wandb
import matplotlib.pyplot as plt

from io import StringIO
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt
from datasets import load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch
from datasets import Dataset
from sklearn.metrics import f1_score
import numpy as np
import os
from transformers import BioGptTokenizer, BioGptForCausalLM

In [6]:
import accelerate

In [7]:
tqdm.pandas()

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
os.environ["WANDB_PROJECT"] = "BioGPT" # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint" # log all model checkpoints

In [10]:
def read_paper(path):
  f = open(path, 'r', encoding="utf-8")
  text = str(f.read())
  f.close()
  return text

In [11]:
def create_list(folder_path):
  temp_papers = []
  for filename in tqdm(glob.glob(folder_path + "/*.txt")):
      temp_papers.append(read_paper(filename))
  for filename in tqdm(glob.glob(folder_path + "/*.xml")):
      temp_papers.append(read_paper(filename))
  return temp_papers

In [12]:
train_path = '/content/drive/MyDrive/train'
val_path = '/content/drive/MyDrive/validation'
test_path = '/content/drive/MyDrive/test'

In [13]:
list_train = create_list(train_path)
list_val = create_list(val_path)
list_test = create_list(test_path)

100%|██████████| 445/445 [00:00<00:00, 745.07it/s]
100%|██████████| 226/226 [00:00<00:00, 1213.36it/s]
100%|██████████| 41/41 [00:00<00:00, 524.56it/s]
100%|██████████| 62/62 [00:00<00:00, 1256.96it/s]
100%|██████████| 19/19 [00:00<00:00, 614.41it/s]
100%|██████████| 86/86 [00:00<00:00, 1196.76it/s]


In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [15]:
model = "microsoft/biogpt"

tokenizer = BioGptTokenizer.from_pretrained(model)

model_compactbiobert = BioGptForCausalLM.from_pretrained(model).to(device)

In [16]:
encoded_data_train = tokenizer.batch_encode_plus(
    list_train,
    add_special_tokens=True,
    return_attention_mask=True,
    truncation=True,
    padding='longest',
    max_length=512,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    list_val,
    add_special_tokens=True,
    return_attention_mask=True,
    truncation=True,
    padding='longest',
    max_length=512,
    return_tensors='pt'
)

In [17]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False,mlm_probability=0.15, return_tensors="pt")

In [18]:
savePath = "/content/drive/MyDrive/models/"

In [19]:
trainingArguments = TrainingArguments(
    savePath + "checkpoints",
    logging_steps=1,
    overwrite_output_dir=True,
    num_train_epochs=100,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    do_train=True,
    do_eval=True,
    warmup_steps=5000,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=64,
    metric_for_best_model='eval_loss',
    weight_decay=1e-4,
    fp16=True,
    report_to="wandb",
    gradient_checkpointing=True,
    optim="adamw_torch",
    save_total_limit=2,
    remove_unused_columns=True,
    evaluation_strategy = 'steps',
    eval_steps = 1,
    save_strategy = "steps",
    load_best_model_at_end=True
)

In [20]:
dataset_train = Dataset.from_dict(encoded_data_train)
dataset_val = Dataset.from_dict(encoded_data_val)

In [21]:
trainer = Trainer(model=model_compactbiobert, args=trainingArguments,
                  tokenizer=tokenizer, data_collator=data_collator,
                  train_dataset=dataset_train,eval_dataset=dataset_val)

In [22]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mgravisandeep[0m ([33mravigorti[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
1,2.9159,4.370745
2,1.5246,4.370831
3,1.3894,4.370677
4,2.9159,4.370786
5,0.1399,4.370757
6,2.7746,4.3706
7,1.6804,4.370542
8,1.2346,4.370265
9,2.9119,4.370379
10,0.2731,4.370254


TrainOutput(global_step=100, training_loss=1.648592367693782, metrics={'train_runtime': 6271.6038, 'train_samples_per_second': 10.699, 'train_steps_per_second': 0.016, 'total_flos': 3.801264812772557e+16, 'train_loss': 1.648592367693782, 'epoch': 61.0})

In [23]:
trainer.save_model(savePath + "Biogpt/")