In [1]:
# Attach Google Drive to the Colab runtime; the project directory entered in
# the next cell lives under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [14]:
# Work from the project folder on the mounted Drive: every relative path used
# below ('data/...', 'models/...', 'results', 'logs') resolves against it.
%cd /content/drive/MyDrive/Project

/content/drive/MyDrive/Project


In [36]:
%%capture
!pip install transformers datasets evaluate accelerate

In [40]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from datasets import Dataset
from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer,
                          DataCollatorWithPadding,
                          TrainingArguments,
                          Trainer)

from utils import compute_metrics

import logging
logging.disable(logging.WARNING)

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [38]:
# Directory (relative to the working directory) from which both the tokenizer
# and the sequence-classification model are restored.
checkpoint = 'models/DeepChem/ChemBERTa-10M-MLM_BBB'

# Tokenizer first: the padding collator is built directly from it.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer)

# Classification model loaded from the same checkpoint directory.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [19]:
# Restore the three dataset splits previously written with save_to_disk.
# NOTE(review): 'data/generated_train_dataset' is the directory a later cell in
# this notebook writes, so on a fresh kernel this load only succeeds if that
# directory already exists from an earlier run. `train_dataset` is not
# referenced again in the visible cells (the Trainer is given
# `generated_train_dataset`).
train_dataset, valid_dataset, test_dataset = [
    Dataset.load_from_disk(path)
    for path in (
        'data/generated_train_dataset',
        'data/valid_dataset',
        'data/test_dataset',
    )
]

In [8]:
# Read the generated training examples, forcing 'text' to str and 'label' to int.
generated_train_dataset = pd.read_csv(
    filepath_or_buffer='data/generated_train_dataset.csv',
    dtype={'text': str, 'label': int},
)

In [10]:
# Convert the pandas frame into a Hugging Face Dataset. The same name is rebound
# from a DataFrame to a Dataset, so the isinstance guard keeps a second run of
# this cell from passing an already-converted Dataset back into from_pandas.
if isinstance(generated_train_dataset, pd.DataFrame):
    generated_train_dataset = Dataset.from_pandas(generated_train_dataset)

In [25]:
def tokenize_function(examples):
    """Tokenize the "text" entry of a batch with the notebook-level tokenizer.

    Args:
        examples: Batch mapping supplied by ``Dataset.map``; only its
            ``"text"`` entry is read.

    Returns:
        The result of the tokenizer call for that batch. Truncation is
        enabled and no explicit ``max_length`` is passed.
    """
    batch_text = examples["text"]
    return tokenizer(batch_text, truncation=True)


# batched=True: the function receives batches of rows rather than single rows.
generated_train_dataset = generated_train_dataset.map(
    tokenize_function,
    batched=True,
)

Map:   0%|          | 0/3418 [00:00<?, ? examples/s]

In [34]:
# Persist the tokenized dataset. This is the same directory the earlier
# load_from_disk cell reads `train_dataset` from.
generated_train_dataset.save_to_disk(dataset_path='data/generated_train_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/3418 [00:00<?, ? examples/s]

In [31]:
# Fine-tuning configuration: 10 epochs at lr 1e-5 with 10% warmup; logging and
# evaluation both run once per epoch, and the seed is fixed.
training_args = TrainingArguments(output_dir="results",
                                  gradient_accumulation_steps=1,
                                  learning_rate=1e-5,
                                  weight_decay=0.01,
                                  adam_epsilon=1e-8,
                                  max_grad_norm=1.0,
                                  num_train_epochs=10,
                                  warmup_ratio=0.1,
                                  logging_dir="logs",
                                  logging_first_step=True,
                                  logging_strategy="epoch",
                                  eval_strategy="epoch",
                                  dataloader_num_workers=2,
                                  seed=42)

# Train on the tokenized generated set and evaluate on the validation split;
# metrics come from utils.compute_metrics.
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=generated_train_dataset,
                  eval_dataset=valid_dataset,
                  tokenizer=tokenizer,
                  data_collator=data_collator,
                  compute_metrics=compute_metrics)

# NOTE(review): in the recorded run the validation loss rises every epoch
# (0.41 -> 0.64) while the training loss falls, and the weights saved below are
# the last-epoch ones. Consider fewer epochs, early stopping, or
# load_best_model_at_end if the best validation checkpoint is wanted.
trainer.train()

# `checkpoint` already starts with 'models/', so prefixing it again would write
# to 'models/models/...'. Append the suffix to the checkpoint path so the new
# model is stored next to the one it was initialised from.
trainer.save_model(f'{checkpoint}_drugbank')

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2182,0.414638,0.901478,0.934641,0.928571,0.940789
2,0.1923,0.486803,0.8867,0.925566,0.910828,0.940789
3,0.1782,0.520633,0.896552,0.931596,0.922581,0.940789
4,0.1806,0.576596,0.8867,0.923588,0.932886,0.914474
5,0.1667,0.58503,0.896552,0.931148,0.928105,0.934211
6,0.1494,0.618724,0.891626,0.927152,0.933333,0.921053
7,0.1373,0.624024,0.891626,0.927632,0.927632,0.927632
8,0.1626,0.627478,0.891626,0.927632,0.927632,0.927632
9,0.136,0.63345,0.891626,0.927632,0.927632,0.927632
10,0.1455,0.640356,0.891626,0.927632,0.927632,0.927632


In [32]:
# Score the trained model on the held-out test split; the returned metrics
# carry the default "eval_" key prefix.
results = trainer.evaluate(eval_dataset=test_dataset)

In [33]:
# Show the metrics returned by trainer.evaluate for the test split.
print(results)

{'eval_loss': 1.0072150230407715, 'eval_accuracy': 0.8349753694581281, 'eval_f1': 0.8903436988543371, 'eval_precision': 0.8831168831168831, 'eval_recall': 0.8976897689768977, 'eval_runtime': 6.0556, 'eval_samples_per_second': 67.045, 'eval_steps_per_second': 8.422, 'epoch': 10.0}
