# BCM-DTI


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from SSCNN_train import test
import pickle
import torch
from SSCNN_model import SSCNN_DTI
from SSCNN_utils import load_train_val_test_set
from SSCNN_dataset import NewDataset
from torch.utils.data import DataLoader
import sys, json, torch, pickle
from sklearn.metrics import matthews_corrcoef
from SSCNN_utils import get_one_bcm
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef
)


# Step 1 - Comparing my results with those in the literature

In [None]:
#!git clone https://github.com/ld199609/BCM-DTI.git

The model is executed through the SSCNN_train.py script, which handles the training process and produces the history data later visualized in the notebook. Some changes were made to the SSCNN_train.py file in order to use exactly the same split that I used in the Barlow model so that I could compare results.

# Step 2 - Results of training the model with my datasets

Preparation of the Papyrus dataset for the BCM-DTI model

In [None]:
def generate_data_txt_from_csv(csv_path, output_path):
    """Convert a CSV file into a space-separated text file, one row per line.

    Every CSV row is written as ``<smiles> <sequence> <label> <split>``, with
    the label converted to ``int``.

    Args:
        csv_path: Path of the input CSV. It must provide the columns
            ``smiles``, ``sequence``, ``label`` and ``split``.
        output_path: Path of the text file to write (UTF-8). An existing file
            is overwritten.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
        ValueError: If a label cannot be converted to ``int`` (e.g. NaN).
    """
    required = ("smiles", "sequence", "label", "split")
    df = pd.read_csv(csv_path)

    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_path}: missing required column(s) {missing}")

    # Build every line before opening the output file, so that a malformed
    # CSV never leaves a truncated or half-written file behind. Iterating the
    # columns in lockstep avoids creating one Series per row (iterrows).
    lines = [
        f"{smiles} {sequence} {int(label)} {split}\n"
        for smiles, sequence, label, split in zip(
            df["smiles"], df["sequence"], df["label"], df["split"]
        )
    ]

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(lines)


# Export the Papyrus CSV as the space-separated text file "Papyrus.txt".
generate_data_txt_from_csv(
    csv_path="Papyrus_Barlow.csv",
    output_path="Papyrus.txt",
)


Preparation of the Papyrus + TB dataset for the BCM-DTI model

In [None]:


def generate_data_txt_from_csv(csv_path, output_path):
    """Convert a CSV file into a space-separated text file, one row per line.

    Every CSV row is written as ``<smiles> <sequence> <label> <split>``, with
    the label converted to ``int``. A confirmation message is printed once the
    file has been written.

    Args:
        csv_path: Path of the input CSV. It must provide the columns
            ``smiles``, ``sequence``, ``label`` and ``split``.
        output_path: Path of the text file to write (UTF-8). An existing file
            is overwritten.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
        ValueError: If a label cannot be converted to ``int`` (e.g. NaN).
    """
    required = ("smiles", "sequence", "label", "split")
    df = pd.read_csv(csv_path)

    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_path}: missing required column(s) {missing}")

    # Build every line before opening the output file, so that a malformed
    # CSV never leaves a truncated or half-written file behind. Iterating the
    # columns in lockstep avoids creating one Series per row (iterrows).
    lines = [
        f"{smiles} {sequence} {int(label)} {split}\n"
        for smiles, sequence, label, split in zip(
            df["smiles"], df["sequence"], df["label"], df["split"]
        )
    ]

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

    print(f"✓ Arquivo 'data.txt' criado em {output_path}")


# Export the Papyrus + TB CSV as the text file "Papyrus_TB_Barlow.txt".
generate_data_txt_from_csv(
    csv_path="Papyrus_TB_Barlow.csv",
    output_path="Papyrus_TB_Barlow.txt",
)

Preparation of the TB dataset for the BCM-DTI model

In [None]:


def generate_data_txt_from_csv(csv_path, output_path):
    """Convert a CSV file into a space-separated text file, one row per line.

    Every CSV row is written as ``<smiles> <sequence> <label> <split>``, with
    the label converted to ``int``. A confirmation message is printed once the
    file has been written.

    Args:
        csv_path: Path of the input CSV. It must provide the columns
            ``smiles``, ``sequence``, ``label`` and ``split``.
        output_path: Path of the text file to write (UTF-8). An existing file
            is overwritten.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
        ValueError: If a label cannot be converted to ``int`` (e.g. NaN).
    """
    required = ("smiles", "sequence", "label", "split")
    df = pd.read_csv(csv_path)

    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_path}: missing required column(s) {missing}")

    # Build every line before opening the output file, so that a malformed
    # CSV never leaves a truncated or half-written file behind. Iterating the
    # columns in lockstep avoids creating one Series per row (iterrows).
    lines = [
        f"{smiles} {sequence} {int(label)} {split}\n"
        for smiles, sequence, label, split in zip(
            df["smiles"], df["sequence"], df["label"], df["split"]
        )
    ]

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

    print(f"✓ Arquivo 'data.txt' criado em {output_path}")


# Export the TB CSV as the space-separated text file "TB.txt".
generate_data_txt_from_csv(
    csv_path="TB_BARLOW.csv",
    output_path="TB.txt",
)

# Fine-tuning

Fine-tuning adapts the pre-trained BCM-DTI model, originally trained on the Papyrus dataset, to the specific context of predicting drug-protein interactions associated with tuberculosis. The model's performance was evaluated on the tuberculosis test set before and after fine-tuning.

In [None]:

# Select one channel-size preset for the drug and protein branches.
n = 2  # index of the preset to keep

# Before the selection each entry holds several presets (each preset being a
# list of channel sizes); afterwards it holds the chosen flat list. Selecting
# only when the indexed element is itself a list makes re-running this cell a
# no-op, instead of indexing the flat list again and silently replacing the
# preset with a single integer.
for _key in ('d_channel_size', 'p_channel_size'):
    _preset = args[_key][n]
    if isinstance(_preset, (list, tuple)):
        args[_key] = _preset

print(args['d_channel_size'])
print(args['p_channel_size'])


[25, 128, 256, 512]
[462, 128, 256, 512]


In [None]:
# Evaluate the checkpoint stored in "TB.pkl" on the test split of the TB data.
#
# NOTE(review): SSCNN_args, words2idx_d_old and words2idx_p_old are neither
# imported nor defined in the visible part of this notebook -- confirm that an
# earlier cell provides them before running this one.
args = SSCNN_args()
args['max_drug_seq'] = 44
args['max_protein_seq'] = 462
# +1 on top of the vocabulary size (presumably for a padding index -- verify).
args['input_d_dim'] = len(words2idx_d_old) + 1
args['input_p_dim'] = len(words2idx_p_old) + 1
# The first channel of preset 2 is set to the maximum sequence length, then
# preset 2 becomes the active channel configuration of each branch.
args['d_channel_size'][2][0] = args['max_drug_seq']
args['p_channel_size'][2][0] = args['max_protein_seq']
args['d_channel_size'] = args['d_channel_size'][2]
args['p_channel_size'] = args['p_channel_size'][2]
args['dataset_name'] = "TB"

# Use the GPU when one is available and fall back to the CPU otherwise.
# map_location lets a checkpoint that was saved from a GPU be loaded on a
# CPU-only machine; on a GPU machine the behaviour is unchanged.
# NOTE(review): `test` comes from SSCNN_train; whether it moves the batches to
# the model's device is not visible here -- verify before relying on CPU runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SSCNN_DTI(args)
model.load_state_dict(torch.load("TB.pkl", map_location=device))
model.to(device)

# Token -> index vocabularies. pickle executes code on load, so these files
# must come from a trusted source.
with open("words2idx_d_papyrus.pkl", "rb") as f:
    words2idx_d = pickle.load(f)

with open("words2idx_p_papyrus.pkl", "rb") as f:
    words2idx_p = pickle.load(f)

input_path = "TB"
# Only the test split is used below; the remaining return values are kept so
# that they stay available to other cells.
trainSmiles, trainProtein, trainLabel, \
valSmiles, valProtein, valLabel, \
testSmiles, testProtein, testLabel, \
frag_set_d, frag_set_p, \
frag_len_d, frag_len_p, _, _ = load_train_val_test_set(
    input_path, decompose="bcm", decompose_protein="category",
    unseen_smiles=False, k=3,
)

testDataset = NewDataset(
    testSmiles, testProtein, testLabel,
    words2idx_d, words2idx_p,
    args['max_drug_seq'], args['max_protein_seq'],
)
# shuffle=False keeps the evaluation order fixed.
test_loader = DataLoader(testDataset, batch_size=args['batch_size'], shuffle=False)

model.eval()
with torch.no_grad():
    auc, auprc, acc, recall, precision, f1, logits, loss = test(test_loader, model)

print(f"\nResults on dataset TB:\n"
      f"AUROC: {auc:.4f}\n"
      f"AUPRC: {auprc:.4f}\n"
      f"Prec: {precision:.4f}\n"
      f"Recall: {recall:.4f}\n"
      f"F1 Score: {f1:.4f}\n"
      f"Accc: {acc:.4f}\n"
      f"Loss: {loss:.4f}\n")


Tuberculosis metrics on the Papyrus + TB dataset

In [None]:
# Score the rows of "metricstb.csv" with the checkpoint "Papyrus_TB.pkl",
# compute classification metrics and save predictions and metrics to disk.
df = pd.read_csv("metricstb.csv")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Token -> index vocabularies. pickle executes code on load, so these files
# must come from a trusted source.
with open("words2idx_d_Papyrus_TB.pkl", "rb") as f:
    words2idx_d = pickle.load(f)
with open("words2idx_p_Papyrus_TB.pkl", "rb") as f:
    words2idx_p = pickle.load(f)
max_drug_seq = 44
max_prot_seq = 462

if "label" not in df.columns:
    raise ValueError(" 'label' not exists.")

# Rows without a label cannot be compared with a ground truth: drop them so
# that the inputs, the labels and the exported frame stay aligned.
mask_valid = df["label"].notna()
if not mask_valid.all():
    df = df.loc[mask_valid].reset_index(drop=True)

smiles_list = df["smiles"].tolist()
prot_list = df["sequence"].tolist()
# Labels are cast to int and clamped to the range [0, 1].
labels_real = df["label"].astype(int).clip(0, 1).tolist()

sm_frag, pr_frag, _ = get_one_bcm(smiles_list, prot_list, labels_real, decompose2="category", k=3)

with open("config_Papyrus_TB.json") as f:
    args_config = json.load(f)

dataset_inf = NewDataset(sm_frag, pr_frag, labels_real, words2idx_d, words2idx_p, max_drug_seq, max_prot_seq)
# shuffle=False keeps the scores in the same order as the rows of `df`.
loader_inf = DataLoader(dataset_inf, batch_size=64, shuffle=False)

model = SSCNN_DTI(args_config)
model.load_state_dict(torch.load("Papyrus_TB.pkl", map_location=device))
model.to(device).eval()

# Collect the raw model outputs of every batch first.
raw_batches = []
with torch.no_grad():
    # Each batch unpacks into four items; the first and third are fed to the
    # model as drug and protein inputs.
    for d, _, p, _ in loader_inf:
        out = model(d.long().to(device), p.long().to(device))
        # squeeze() turns a single-sample batch into a 0-d tensor, which
        # cannot be concatenated or extended as a sequence; atleast_1d keeps
        # one score per sample in that case and changes nothing otherwise.
        s = torch.atleast_1d(out.squeeze())
        raw_batches.append(s.detach().cpu())

raw_scores = torch.cat(raw_batches) if raw_batches else torch.empty(0)

# Decide ONCE, over all outputs, whether the model produced logits (values
# outside [0, 1]) or probabilities. Deciding per batch could apply the sigmoid
# to some batches and not to others, which would make scores incomparable.
if raw_scores.numel() and ((raw_scores.min() < 0) or (raw_scores.max() > 1)):
    raw_scores = torch.sigmoid(raw_scores)

scores = raw_scores.numpy().tolist()

y_true = np.array(labels_real, dtype=int)
y_score = np.array(scores, dtype=float)

# Hard predictions at the 0.5 threshold.
y_pred = (y_score >= 0.5).astype(int)

# The ranking metrics below are only computed when both classes are present.
has_both_classes = np.unique(y_true).size > 1

metrics = {}
if has_both_classes:
    metrics["AUC"] = float(roc_auc_score(y_true, y_score))
    metrics["AP"] = float(average_precision_score(y_true, y_score))  # PR-AUC
else:
    metrics["AUC"] = None
    metrics["AP"] = None

metrics["Accuracy"] = float(accuracy_score(y_true, y_pred))
metrics["F1"] = float(f1_score(y_true, y_pred, zero_division=0))
metrics["Precision"] = float(precision_score(y_true, y_pred, zero_division=0))
metrics["Recall"] = float(recall_score(y_true, y_pred, zero_division=0))
mcc = matthews_corrcoef(y_true, y_pred)
metrics["MCC"] = float(mcc)

for k, v in metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

# Export the per-row scores and predictions next to the input columns.
df_out = df.copy()
df_out["score_bcmdti"] = y_score
df_out["pred_bcmdti"] = y_pred
df_out.to_csv("predic_bcmdti4.csv", index=False)

with open("metric_bcmdti4.json", "w") as f:
    json.dump(metrics, f, indent=2)