# Fine Tuning GPT Models

## Set Up

In [1]:
# !pip install transformers datasets accelerate evaluate sacremoses

In [2]:
#Am I running a GPU and what type is it?
!nvidia-smi

Mon Jul  3 01:40:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A10G         On   | 00000000:00:1E.0 Off |                    0 |
|  0%   35C    P0    42W / 300W |      0MiB / 23028MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import torch

# Clear out cuda
torch.cuda.empty_cache()

if torch.cuda.is_available():     
    device = torch.device("cuda")
    print('Number of GPU(s) available:', torch.cuda.device_count())
    print('GPU device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available')
    device = torch.device("cpu")

Number of GPU(s) available: 1
GPU device name: NVIDIA A10G


In [4]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BioGptTokenizer, BioGptForCausalLM
from transformers import TrainingArguments, Trainer
from transformers import AdamW, get_cosine_schedule_with_warmup, DataCollatorForLanguageModeling
from datasets import load_metric, load_dataset

import re
import random
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
import time

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_recall_curve, average_precision_score, auc

from logging import warning
import warnings
warnings.filterwarnings('ignore')

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7fcc1cc25e50>

In [5]:
%cd ..

/home/ubuntu/capstone-project


## Build Dataset

In [6]:
# Import data
train_file = 'data/binary_ddi/ddi_train_balanced_150k.csv'
dev_file = 'data/binary_ddi/ddi_val_binary.csv'
test_file = 'data/binary_ddi/ddi_test_binary.csv'

dataset = load_dataset('csv',
                       sep="\t",
                       data_files={'train': train_file, 'validation': dev_file,'test': test_file})

Found cached dataset csv (/home/ubuntu/.cache/huggingface/datasets/csv/default-7c7fda65b4fd85b3/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['d1', 'd2', 'd1_name', 'd2_name', 'interaction', 'interaction_type', 'severity', 'smiles1', 'smiles2'],
        num_rows: 300000
    })
    validation: Dataset({
        features: ['d1', 'd2', 'd1_name', 'd2_name', 'interaction', 'interaction_type', 'severity', 'smiles1', 'smiles2'],
        num_rows: 238072
    })
    test: Dataset({
        features: ['d1', 'd2', 'd1_name', 'd2_name', 'interaction', 'interaction_type', 'severity', 'smiles1', 'smiles2'],
        num_rows: 186753
    })
})

In [8]:
# Tokenizer
# tokenizer = AutoTokenizer.from_pretrained("stanford-crfm/BioMedLM",
#                                           bos_token='<|startoftext|>',
#                                           eos_token='<|endoftext|>',
#                                           pad_token='<|pad|>',
#                                           cls_token='<|cls|>',
#                                           sep_token='<|sep|>'
#                                          )

# tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt",
#                                           bos_token='<|startoftext|>',
#                                           eos_token='<|endoftext|>',
#                                           pad_token='<|pad|>',
#                                           cls_token='<|cls|>',
#                                           sep_token='<|sep|>'
#                                          )

# tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt",
#                                           bos_token='<|startoftext|>',
#                                           eos_token='<|endoftext|>',
#                                           pad_token='<|pad|>',
#                                           cls_token='<|cls|>',
#                                           sep_token='<|sep|>'
#                                          )

# tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B",
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125m",
                                          bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>',
                                          pad_token='<|pad|>',
                                          cls_token='<|cls|>',
                                          sep_token='<|sep|>'
                                         )

# tokenizer = GPT2Tokenizer.from_pretrained("gpt2",
#                                           bos_token='<|startoftext|>',
#                                           eos_token='<|endoftext|>',
#                                           pad_token='<|pad|>',
#                                           cls_token='<|cls|>',
#                                           sep_token='<|sep|>'
#                                          )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
## Test tokenizer
test_smiles1 = "[H][C@]12CC[C@]3([H])[C@]([H])(C[C@@H](O)[C@]4(C)[C@H](CC[C@]34O)C3=CC(=O)OC3)[C@@]1(C)CC[C@@H](C2)O[C@H]1C[C@H](O)[C@H](O[C@H]2C[C@H](O)[C@H](O[C@H]3C[C@H](O)[C@H](O)[C@@H](C)O3)[C@@H](C)O2)[C@@H](C)O1"
test_smiles2 = "[Cl-].[Cl-].[223Ra++]"
source_inputs = '<|startoftext|>' + test_smiles1[:25] + ',<|sep|>' + test_smiles2[:25] + ',<|cls|> Interaction: ' + '1' + '<|endoftext|>'
model_inputs = tokenizer(source_inputs, return_tensors='pt')
model_inputs

{'input_ids': tensor([[50257,    58,    39,  7131,    34,    31,    60,  1065,  4093,    58,
            34,    31,    60,    18, 26933,    39, 12962,    58,    34,    31,
          4357, 50258,    58,  2601,    12,    60,  3693,  2601,    12,    60,
          3693, 22047, 21762,  4880,  4357, 50260,  9492,  2673,    25,   352,
         50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [10]:
print(tokenizer('<|startoftext|>'))
print(tokenizer('<|sep|>'))
print(tokenizer('<|cls|>'))
print(tokenizer('<|endoftext|>'))

{'input_ids': [50257], 'attention_mask': [1]}
{'input_ids': [50258], 'attention_mask': [1]}
{'input_ids': [50260], 'attention_mask': [1]}
{'input_ids': [50256], 'attention_mask': [1]}


In [11]:
print(tokenizer.tokenize(source_inputs)[:20])
print(tokenizer.tokenize(source_inputs)[-30:])

['<|startoftext|>', '[', 'H', '][', 'C', '@', ']', '12', 'CC', '[', 'C', '@', ']', '3', '([', 'H', '])', '[', 'C', '@']
['@', ']', '3', '([', 'H', '])', '[', 'C', '@', '],', '<|sep|>', '[', 'Cl', '-', ']', '.[', 'Cl', '-', ']', '.[', '223', 'Ra', '++', '],', '<|cls|>', 'Inter', 'action', ':', 'Ġ1', '<|endoftext|>']


In [12]:
def preprocess_data(examples, tokenizer=tokenizer, data='train'):
    max_input_length = 128

    source_inputs = ['<|startoftext|>' + sm1[:60] + ',<|sep|>' + sm2[:60] + ',<|cls|> Interaction: ' 
                     for (sm1, sm2) in zip(examples['smiles1'], examples['smiles2'])]
    target_inputs = [str(label) for label in examples['interaction_type']]

    # Add labels into training set source inputs
    if data == 'train' or data =='validation':  
        source_inputs = [source_inputs[i] + target_inputs[i] +'<|endoftext|>' for i in range(len(source_inputs))]

    # Tokenize imputs and labels
    model_inputs = tokenizer(source_inputs, 
                             max_length=max_input_length, 
                             padding="max_length", 
                             truncation=True, 
                             return_tensors='pt') 

    # label_inputs = ['<|startoftext|>' + sm1[:250] + ',<|sep|>' + sm2[:250] + ',<|cls|> Interaction: ' + str(label) + +'<|endoftext|>' 
    #                  for (sm1, sm2, label) in zip(examples['smiles1'], examples['smiles2'], examples['interaction_type'])]
    # label_tokens = tokenizer(label_inputs, max_length=max_input_length, padding="max_length", truncation=True)
    
    # Add labels to model_inputs
    # model_inputs["labels"] = label_tokens.input_ids
  
    return model_inputs

In [13]:
encoded_train_ds = dataset['train'].map(lambda x: preprocess_data(x, data='train'), batched=True, remove_columns=dataset['train'].column_names)
encoded_val_ds = dataset['validation'].map(lambda x: preprocess_data(x, data='validation'), batched=True, remove_columns=dataset['validation'].column_names)
encoded_test_ds = dataset['test'].map(lambda x: preprocess_data(x,data='test'), batched=True, remove_columns=dataset['test'].column_names)

encoded_train_ds.set_format(type="torch")
encoded_val_ds.set_format(type="torch")
encoded_test_ds.set_format(type="torch")

Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

Map:   0%|          | 0/238072 [00:00<?, ? examples/s]

Map:   0%|          | 0/186753 [00:00<?, ? examples/s]

In [14]:
encoded_train_ds

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 300000
})

In [15]:
# Smaller subset of validation set used for faster training
encoded_val_ds_small = encoded_val_ds.select(indices=range(30000))
encoded_val_ds_small

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 30000
})

In [16]:
# Check train dataset model input
print(tokenizer.decode(encoded_train_ds[1]['input_ids'], skip_special_tokens=True))

[K+].[K+].OP([O-])([O-])=O,[H][C@@]1(CCC2=CC=CC=C2N(CC(O)=O)C1=O)N[C@@H](CCC1=CC=CC=C1),Interaction: 1


In [17]:
# Check test dataset model input
print(tokenizer.decode(encoded_test_ds[1]['input_ids'], skip_special_tokens=True))

[La+3].[La+3].[O-]C([O-])=O.[O-]C([O-])=O.[O-]C([O-])=O,CC1CN(CCN1)C1=C(F)C=C2C(=O)C(=CN(C2=C1)C1=C(F)C=C(F)C=C1)C(O,Interaction: 


## Build Trainer

In [18]:
# Define hyperparameters
BATCH_SIZE = 1
EPOCHS = 2
LEARNING_RATE = 5e-5
MOMENTUM = 0.9
WEIGHT_DECAY = 0.01

In [19]:
# Initialize
# model = AutoModelForCausalLM.from_pretrained("stanford-crfm/BioMedLM")
# model = AutoModelForCausalLM.from_pretrained("microsoft/biogpt")
# model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")
# model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125m")
# model = GPT2LMHeadModel.from_pretrained("gpt2")
model.to(device)
# model.resize_token_embeddings(len(tokenizer))

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=30000)
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [20]:
# Load Metric
metric = load_metric('glue', 'mrpc')
metric

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

In [21]:
# Test metric calculation
fake_preds = ['1', '0', '1', '1']
fake_labels = ['0', '0', '1', '1']
metric.compute(predictions=fake_preds, references=fake_labels)

{'accuracy': 0.75, 'f1': 0.8}

In [22]:
def compute_metrics(eval_pred, tokenizer=tokenizer):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_preds = [re.findall('Interaction: (.*)', pred)[-1] for pred in list(decoded_preds)]
    
    # labels = np.where(labels != -100, labels, gpt2tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_labels = [re.findall('Interaction: (.*)', label)[-1] for label in list(decoded_labels)]
    
    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    
    return result

In [23]:
# Trainer Argument
model_dir = "models/BioGPT"

args = TrainingArguments(
    output_dir=model_dir,
    overwrite_output_dir=True,
    do_train=True,
    evaluation_strategy="steps",
    eval_steps=200,
    logging_strategy="steps",
    logging_steps=200,
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    num_train_epochs=EPOCHS,
    # metric_for_best_model="accuracy",
)

In [24]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train_ds, 
    eval_dataset=encoded_val_ds,
    # data_collator=data_collator,
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler),
    # compute_metrics=compute_metrics
)

In [25]:
# os.environ["CUDA_LAUNCH_BLOCKING"] = '1'

In [26]:
trainer.train()

/opt/conda/conda-bld/pytorch_1686274778240/work/aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [12,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1686274778240/work/aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [12,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1686274778240/work/aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [12,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1686274778240/work/aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [12,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1686274778240/work/aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [12,0,0], thread: [4,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`