In [20]:
import os

from sklearn.metrics import average_precision_score, roc_auc_score
import wandb

from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, IntervalStrategy

from transformers import T5ForConditionalGeneration
from t5chem import T5ForProperty, SimpleTokenizer

import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F


In [2]:
auc = evaluate.load("roc_auc")
accuracy = evaluate.load("accuracy")
metric = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precison = evaluate.load("precision")
recall = evaluate.load("recall")

In [3]:
pretrain_path = "/home/eyal/Desktop/University/Research/DeepWithdrawn/models/pretrain/simple/"
split_type = 'db_agree_no_dups'
dataset_name = 'NCATS'

In [45]:
model = T5ForProperty.from_pretrained(pretrain_path, num_labels=2,
                                      id2label={0: 'Not Withdrawn', 1:'Withdrawn'},
                                      label2id={'Not Withdrawn': 0, 'Withdrawn': 1})  # for non-seq2seq task

In [46]:
tokenizer = SimpleTokenizer(vocab_file=os.path.join(pretrain_path, 'vocab.pt'))

In [47]:
model

T5ForProperty(
  (shared): Embedding(100, 256)
  (encoder): T5Stack(
    (embed_tokens): Embedding(100, 256)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=256, out_features=512, bias=False)
              (k): Linear(in_features=256, out_features=512, bias=False)
              (v): Linear(in_features=256, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=256, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=256, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=256, bias=False)
              (dropout): Dropout(p=0.1, inpla

In [48]:
inputs = tokenizer.encode("Classification:COC(=O)c1cccc(C(=O)OC)c1>CN(C)N.Cl.O>COC(=O)c1cccc(C(=O)O)c1", return_tensors='pt')

In [49]:
outputs = model(inputs)

In [50]:
outputs.logits.shape

torch.Size([1, 256])

In [4]:
dataset = load_dataset('csv', data_files={'train': f'split/{split_type}/{dataset_name}/train2.csv',
                                          'validation': f'split/{split_type}/{dataset_name}/val.csv',
                                          'test': f'split/{split_type}/{dataset_name}/test.csv',})

Using custom data configuration default-2e9352474f27d6c8


Downloading and preparing dataset csv/default to /home/eyal/.cache/huggingface/datasets/csv/default-2e9352474f27d6c8/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/eyal/.cache/huggingface/datasets/csv/default-2e9352474f27d6c8/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
dataset = dataset.rename_column('withdrawn_class', 'labels').\
            remove_columns(['Unnamed: 0', 'index', 'length', 'inchikey', 'name', 'groups', 'source']).\
            with_format('torch')

In [6]:
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
model = AutoModelForSequenceClassification.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k", num_labels=2,
                                                           id2label={0: 'Not Withdrawn', 1:'Withdrawn'},
                                                           label2id={'Not Withdrawn': 0, 'Withdrawn': 1})

Some weights of the model checkpoint at seyonec/PubChem10M_SMILES_BPE_450k were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE_45

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["smiles"], padding="max_length", truncation=True, max_length=300)

In [8]:
dataset = dataset.map(tokenize_function, batched=True).remove_columns(['smiles'])

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)
    auc_score = auc.compute(prediction_scores=logits[:, 1], references=labels)
    f1_score = f1.compute(predictions=predictions, references=labels)
    aupr = average_precision_score(y_score=logits[:, 1], y_true=labels)
    precision_score = precison.compute(predictions=predictions, references=labels)
    recall_score = recall.compute(predictions=predictions, references=labels)
    return {**f1_score , **{'PR-AUC': aupr}, **accuracy_score, **auc_score, **precision_score, **recall_score}

In [11]:
training_args = TrainingArguments(
    output_dir=f"./results/{split_type}/{dataset_name}",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy=IntervalStrategy.STEPS,
    save_strategy=IntervalStrategy.STEPS,
    report_to='wandb',
    run_name=f'ChemBERTa {split_type} {dataset_name}',
    logging_steps=50,
    save_steps=50
    
    
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset={'Validation': dataset["validation"], 'Test': dataset["test"]},
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [13]:
trainer.train()

***** Running training *****
  Num examples = 1883
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 351
  Number of trainable parameters = 83450882
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33meyalmazuz[0m. Use [1m`wandb login --relogin`[0m to force relogin
wandb: ERROR Failed to sample metric: Not Supported


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Validation F1,Validation Pr-auc,Validation Accuracy,Validation Roc Auc,Validation Precision,Validation Recall,Validation Runtime,Validation Samples Per Second,Validation Steps Per Second,Test F1,Test Pr-auc,Test Accuracy,Test Roc Auc,Test Precision,Test Recall
50,0.4987,0.549636,0.43871,0.715529,0.815287,0.832553,0.944444,0.285714,12.3273,38.208,9.572,0.238284,0.31808,0.733685,0.651014,0.348028,0.181159
100,0.3618,0.657026,0.635897,0.768095,0.849257,0.849575,0.815789,0.521008,12.4063,37.965,9.511,0.367139,0.366768,0.689808,0.656516,0.345784,0.391304
150,0.3257,0.745371,0.63,0.776755,0.842887,0.865594,0.777778,0.529412,12.4233,37.912,9.498,0.370522,0.379519,0.658428,0.659785,0.321492,0.437198
200,0.2675,0.753218,0.690722,0.80384,0.872611,0.887462,0.893333,0.563025,12.4404,37.861,9.485,0.382911,0.384871,0.67509,0.673062,0.339888,0.438406
250,0.2437,0.860846,0.704225,0.808082,0.866242,0.886769,0.797872,0.630252,12.4289,37.895,9.494,0.411846,0.386955,0.647042,0.674646,0.333833,0.53744
300,0.2386,0.910115,0.697674,0.805883,0.861996,0.886077,0.78125,0.630252,12.435,37.877,9.489,0.416515,0.385281,0.642877,0.673683,0.333576,0.554348
350,0.2116,0.86614,0.7,0.811317,0.872611,0.890518,0.864198,0.588235,12.4334,37.882,9.491,0.410697,0.384042,0.657317,0.673305,0.339652,0.519324


***** Running Evaluation *****
  Num examples = 471
  Batch size = 4
***** Running Evaluation *****
  Num examples = 3601
  Batch size = 4
Saving model checkpoint to ./results/db_agree_no_dups/NCATS/checkpoint-50
Configuration saved in ./results/db_agree_no_dups/NCATS/checkpoint-50/config.json
Model weights saved in ./results/db_agree_no_dups/NCATS/checkpoint-50/pytorch_model.bin
tokenizer config file saved in ./results/db_agree_no_dups/NCATS/checkpoint-50/tokenizer_config.json
Special tokens file saved in ./results/db_agree_no_dups/NCATS/checkpoint-50/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 471
  Batch size = 4
***** Running Evaluation *****
  Num examples = 3601
  Batch size = 4
Saving model checkpoint to ./results/db_agree_no_dups/NCATS/checkpoint-100
Configuration saved in ./results/db_agree_no_dups/NCATS/checkpoint-100/config.json
Model weights saved in ./results/db_agree_no_dups/NCATS/checkpoint-100/pytorch_model.bin
tokenizer config file saved in 

TrainOutput(global_step=351, training_loss=0.3067149913888372, metrics={'train_runtime': 1161.4185, 'train_samples_per_second': 4.864, 'train_steps_per_second': 0.302, 'total_flos': 437608121263200.0, 'train_loss': 0.3067149913888372, 'epoch': 2.99})