### Environment Setup

Dependencies follow the upstream `environment.yml`:
https://github.com/seyonechithrananda/bert-loves-chemistry/blob/master/environment.yml

In [None]:
!pip install --pre deepchem
import deepchem
deepchem.__version__

In [None]:
!pip install transformers
!pip install simpletransformers
!pip install datasets
!pip install wandb

In [None]:
import sys
!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']

In [None]:
!git clone https://github.com/seyonechithrananda/bert-loves-chemistry.git

### Import libraries & Parameters setting

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from typing import List

from rdkit import Chem
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline, RobertaModel, RobertaTokenizer
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# import MoleculeNet loaders from DeepChem
from deepchem.molnet import load_tox21

# import MoleculeNet dataloader from bert-loves-chemistry fork
from chemberta.utils.molnet_dataloader import load_molnet_dataset, write_molnet_dataset_for_chemprop

In [None]:
# Output / logging locations.
project_name = "ChemBERTa_"       # distinguishes the dataset; passed on as the W&B project name
output_path = './output_bbbp'
model_name = 'model_1'

# Directory handed to the trainer as `best_model_dir`; fold outputs go below it.
model_folder = os.path.join(output_path, model_name)

# Sibling directory for evaluation artefacts. exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race of a manual exists() test.
evaluation_folder = os.path.join(output_path, model_name + '_evaluation')
os.makedirs(evaluation_folder, exist_ok=True)

# Training hyper-parameters.
EPOCHS = 200
BATCH_SIZE = 256
patience = 15              # forwarded as `early_stopping_patience`
optimizer = "AdamW"
learning_rate = 0.00001
manual_seed = 112

print(model_folder)

### Dataset loading & splitting
- deepchem/molnet/load_function 확인

In [None]:
# Load the "bbbp" dataset through the bert-loves-chemistry helper; the second
# element of the returned triple holds the train / valid / test frames.
tasks, splits, transformers = load_molnet_dataset("bbbp", tasks_wanted=None)
train_df, valid_df, test_df = splits

# Report the number of rows in every split.
for caption, frame in (
    ("train set: ", train_df),
    ("valid set: ", valid_df),
    ("test set:  ", test_df),
):
    print(f"{caption}{frame.shape[0]}")

### Classification with ChemBERTa
- DeepChem + **RoBERTa (BERT의 변형 모델)**
- **Tokenizer**: **RobertaTokenizerFast**

In [None]:
import logging

# Quieten the transformers library: only warnings and above get through.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Everything else is reported from INFO upwards.
logging.basicConfig(level=logging.INFO)

In [None]:
# log in to Weights & Biases for experiment tracking
# !wandb login (your_token)

In [None]:
from sklearn.metrics import confusion_matrix

def compute_metrics(preds, labels):
    """Compute binary confusion-matrix counts, sensitivity and specificity.

    Args:
        preds: Either an ``(n_samples, n_classes)`` array of class scores,
            which is decoded with ``argmax`` over the class axis, or an
            ``(n_samples,)`` array of already decoded class labels.
        labels: ``(n_samples,)`` array of true labels (0 = negative,
            1 = positive).

    Returns:
        dict with the integer counts ``"TN"``, ``"FP"``, ``"FN"``, ``"TP"`` and
        the floats ``"sensitivity"`` (TP / (TP + FN)) and ``"specificity"``
        (TN / (TN + FP)). A rate whose denominator is zero is reported as NaN.

    Side effects:
        Logs the metrics to Weights & Biases when the package is importable
        and a run is active; otherwise logging is skipped silently.
    """
    scores = np.asarray(preds)
    y_true = np.asarray(labels).ravel()
    # Score matrices are decoded to labels; 1-D input is taken as labels already.
    y_pred = scores.argmax(axis=1) if scores.ndim > 1 else scores.ravel()

    # Count the four cells directly so that a batch containing a single class
    # still yields all four numbers (a 1x1 confusion matrix cannot be
    # unpacked into TN/FP/FN/TP).
    true_pos, true_neg = (y_true == 1), (y_true == 0)
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)
    TN = int(np.sum(true_neg & pred_neg))
    FP = int(np.sum(true_neg & pred_pos))
    FN = int(np.sum(true_pos & pred_neg))
    TP = int(np.sum(true_pos & pred_pos))

    # Undefined rates (no positives / no negatives) become NaN without a
    # divide-by-zero warning.
    sensitivity = TP / (TP + FN) if (TP + FN) else float("nan")
    specificity = TN / (TN + FP) if (TN + FP) else float("nan")

    metrics = {
        "TN": TN,
        "FP": FP,
        "FN": FN,
        "TP": TP,
        "sensitivity": sensitivity,
        "specificity": specificity,
    }

    # Log metrics to wandb. The import is local because the notebook never
    # imports wandb at module level, and wandb.log() raises when no run exists.
    try:
        import wandb
    except ImportError:
        wandb = None
    if wandb is not None and wandb.run is not None:
        wandb.log(metrics)

    return metrics

In [None]:
# Weights & Biases: name the run after the model.
wandb_kwargs = {'name' : model_name}

# Log once per epoch. `logging_steps` is compared against the integer global
# step with a modulo, so it must be a positive integer; a float such as
# len(train_df) / BATCH_SIZE would almost never divide the step counter evenly.
# NOTE(review): `train_df` here is whatever split is in memory when this cell
# runs; training later uses folds of a separately loaded CSV -- confirm that
# this is the size the logging interval should be based on.
steps_per_epoch = max(1, int(np.ceil(len(train_df) / BATCH_SIZE)))

classification_args = {
    # evaluation during training
    'evaluate_each_epoch': True,
    'evaluate_during_training_verbose': True,
    'evaluate_during_training': True,
    # checkpointing: keep only the best model
    'best_model_dir': model_folder,
    'no_save': False,
    'save_eval_checkpoints': False,
    'save_model_every_epoch': False,
    'save_best_model': True,
    'save_steps': -1,
    # schedule and early stopping on the evaluation loss
    'num_train_epochs': EPOCHS,
    'use_early_stopping': True,
    'early_stopping_patience': patience,
    'early_stopping_delta': 0.001,
    'early_stopping_metric': 'eval_loss',
    'early_stopping_metric_minimize': True,
    'early_stopping_consider_epochs': True,
    # optimisation
    'fp16': False,
    'optimizer': optimizer,
    'adam_betas': (0.95, 0.999),
    'learning_rate': learning_rate,
    'manual_seed': manual_seed,
    'train_batch_size': BATCH_SIZE,
    'eval_batch_size': BATCH_SIZE,
    'logging_steps': steps_per_epoch,
    'auto_weights': True,
    # experiment tracking
    'wandb_project': project_name,
    'wandb_kwargs': wandb_kwargs,
    # NOTE(review): presumably meant to register the custom metrics function;
    # verify that the library actually reads this key -- extra metrics are
    # normally passed as keyword arguments to eval_model / train_model.
    'compute_metrics': compute_metrics
}

In [None]:
# Build a RoBERTa-type classifier on top of the pretrained ChemBERTa
# checkpoint, configured with the arguments assembled above.
model = ClassificationModel(
    model_type='roberta',
    model_name='DeepChem/ChemBERTa-77M-MLM',
    args=classification_args,
)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Seed the shuffling so the fold assignment is reproducible between runs.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=manual_seed)
acc_list = []

# NOTE(review): absolute, machine-specific path -- consider moving it to a
# configurable data directory so the notebook runs on other machines.
train_data = pd.read_csv('C:/Users/Admin/Desktop/ChemBERTa/clintox5.csv')
X = train_data  # full dataset (SMILES plus the other columns)
y = train_data['label']  # stratification target

for fold_number, (train_index, val_index) in enumerate(skf.split(X, y), start=1):
    # Fold-local names: the notebook-level `train_df` is left untouched.
    fold_train_df = train_data.iloc[train_index]
    val_df = train_data.iloc[val_index]

    fold_output_dir = os.path.join(model_folder, f"fold_{fold_number}")

    # Start every fold from the pretrained checkpoint. Reusing one model
    # object would carry weights trained on earlier folds (which contain this
    # fold's validation rows) into the next fold and inflate the scores.
    model = ClassificationModel('roberta', 'DeepChem/ChemBERTa-77M-MLM', args=classification_args)

    # Monitor training on this fold's own hold-out rows rather than on an
    # unrelated validation split left over from an earlier cell.
    results = model.train_model(fold_train_df, eval_df=val_df, output_dir=fold_output_dir)
    result, model_outputs, wrong_predictions = model.eval_model(val_df, acc=accuracy_score)
    print(result['acc'])
    acc_list.append(result['acc'])

for i, result in enumerate(acc_list, 1):
    print(f"Fold-{i}: {result}")

# Take the fold count from the splitter itself (the original referenced an
# undefined name `n` here).
print(f"{skf.get_n_splits()}-fold CV accuracy result: Mean: {np.mean(acc_list)} Standard deviation:{np.std(acc_list)}")

### Evaluation

In [None]:
# Score the held-out test split, adding plain accuracy as an extra metric.
# NOTE(review): this evaluates the in-memory `model` as left by the training
# cell, not a checkpoint reloaded from `best_model_dir` -- confirm that this
# is the model that should be reported.
test_eval = model.eval_model(test_df, acc=sklearn.metrics.accuracy_score)
result, model_outputs, wrong_predictions = test_eval
print(result)

In [None]:
import matplotlib.pyplot as plt

# Hard-coded results, one value per stage on the x axis.
x = ['Base', 'Interval 1', 'Interval 2', 'Interval 3', 'Interval 4', 'Interval 5']
y_accuracy = [0.922, 0.941, 0.936, 0.945, 0.947, 0.949]
y_sensitivity = [1.0, 0.315, 0.612, 0.685, 0.704, 0.712]
y_specificity = [0.0, 0.524, 0.894, 0.881, 0.780, 0.747]

# One row per curve: (values, legend label, colour, vertical offset of the
# value annotation above each marker).
series = [
    (y_accuracy, 'Accuracy', 'black', 0.03),
    (y_sensitivity, 'Sensitivity', 'blue', 0.03),
    (y_specificity, 'Specificity', 'red', 0.015),
]

plt.figure(figsize=(10, 6))

# Draw all curves first ...
for values, label, colour, _ in series:
    plt.plot(x, values, label=label, color=colour, marker='o')

# ... then write each value just above its marker, in the curve's colour.
for values, _, colour, offset in series:
    for position, value in enumerate(values):
        plt.text(position, value + offset, f'{value:.3f}', color=colour, fontsize=8, ha='center')

# Axis labels, title and legend.
plt.xlabel('Intervals')
plt.ylabel('Values')
plt.title('BBBP - (Accuracy, Sensitivity, Specificity)')   # dataset identifier
plt.legend()

# y ticks from 0.0 to 1.0 in steps of 0.1.
plt.yticks([tick * 0.1 for tick in range(11)])

# Write the figure to disk before showing it.
plt.savefig("aplot3.png", dpi=300)

plt.show()