# Barlow Twins

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import pickle
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, roc_curve, precision_recall_curve, matthews_corrcoef, ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler




In [None]:
from utils.sequence import uniprot2sequence, encode_sequences
from utils.chem import *
from utils.parallel import *
from utils.sequence import encode_sequences
from utils.chem import get_mols, get_fp  

In [None]:
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score, average_precision_score
from rdkit import Chem
from xgboost import XGBClassifier
from rdkit.Chem import AllChem
import torch
import esm
from base_model import BaseModel
from preprocessor import Preprocessor
from barlow_twins import BarlowTwins



### Preparation of PAPYRUS dataset for Barlow Twins model

Rename the columns to the input format expected by the Barlow Twins model (`smiles`, `sequence` and `label`), remove rows containing NaN values, create a stratified `split` column, and truncate protein sequences and SMILES strings.

In [None]:
# Load the merged Papyrus table and print its column names.
sample_data = pd.read_csv("Papyrus_merge.csv")
print(sample_data.columns)

In [None]:
# Map the Papyrus column names onto the three columns kept for the model.
sample_data.rename(columns={"SMILES": "smiles", "Sequence": "sequence", 'pchembl_value_Mean': 'label'}, inplace=True)
final_columns = ["smiles", "sequence", 'label']

# Drop incomplete rows here: later cells call len() on every sequence / SMILES
# (which raises on NaN) and binarize `label` with a `>` comparison (which would
# silently turn a missing label into class 0).
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning.
sample_data = sample_data[final_columns].dropna(subset=final_columns).copy()

### Split

In [None]:
# `label` still holds the continuous pChEMBL value at this point (it is only
# binarized in the Threshold cell further down), so the split is stratified on
# the activity class implied by the same 6.5 cut-off instead of the raw values.
activity_cutoff = 6.5

# 70 % train / 30 % held out.
train_df, temp_df = train_test_split(
    sample_data,
    test_size=0.3,
    # The original passed train_df['label'] here, i.e. the variable this very
    # call creates, which raises NameError on a fresh kernel.
    stratify=sample_data['label'] > activity_cutoff,
    random_state=42
)

# Held-out 30 % is halved into validation and test (15 % / 15 % overall).
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'] > activity_cutoff,
    random_state=42
)

# assign() returns new frames, avoiding chained-assignment warnings on the
# objects returned by train_test_split.
train_df = train_df.assign(split='train')
val_df = val_df.assign(split='val')
test_df = test_df.assign(split='test')

sample_data = pd.concat([train_df, val_df, test_df]).reset_index(drop=True)


### Truncate

In [None]:
# Length statistics and histogram over the distinct protein sequences.
unique_seqs = sample_data['sequence'].unique()
seq_lengths = list(map(len, unique_seqs))

for stat_label, stat_fn in (("Min:", np.min), ("Mean:", np.mean), ("Max:", np.max)):
    print(stat_label, stat_fn(seq_lengths))

# The 95th percentile is printed and also drawn as a vertical marker.
seq_p95 = np.percentile(seq_lengths, 95)
print("95th percentile:", seq_p95)

plt.figure(figsize=(10,6))
sns.histplot(seq_lengths, bins=50, kde=True, color="steelblue")
marker_label = f"95th percentile = {int(seq_p95)}"
plt.axvline(seq_p95, color="red", linestyle="--", linewidth=2, label=marker_label)

plt.title("Distribution of protein sequence lengths (before truncation)", fontsize=14)
plt.xlabel("Sequence length (aa)", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.legend()
plt.show()


In [None]:
# Length statistics and histogram over the distinct SMILES strings.
unique_smiles = sample_data['smiles'].unique()
smiles_lengths = list(map(len, unique_smiles))

for stat_label, stat_fn in (("Min:", np.min), ("Mean:", np.mean), ("Max:", np.max)):
    print(stat_label, stat_fn(smiles_lengths))

# `threshold` is the 95th-percentile length, shown as the red dashed line.
threshold = np.percentile(smiles_lengths, 95)
print("95th percentile:", threshold)

plt.figure(figsize=(10,6))
sns.histplot(smiles_lengths, bins=50, kde=True, color="darkgreen")
marker_label = f"95th percentile = {int(threshold)}"
plt.axvline(threshold, color="red", linestyle="--", linewidth=2, label=marker_label)

plt.title("Distribution of SMILES string lengths (before padding)", fontsize=14)
plt.xlabel("SMILES length (characters)", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Maximum number of residues kept per protein.
# NOTE(review): presumably the 95th-percentile length printed above - confirm.
max_len = 1385

# Truncate the sequences of the combined table in place. The original stored
# the result in train_df['sequence'], which left sample_data (the table that is
# binarized and exported below) untruncated and overwrote train_df with values
# aligned on sample_data's reset index.
sample_data['sequence'] = sample_data['sequence'].str[:max_len]


### Threshold

In [None]:
# Binarize the label: values strictly above 6.5 become 1, everything else 0
# (a NaN label compares False and therefore also becomes 0).
# This rebinds `threshold`, which the SMILES-length cell used for a percentile.
threshold = 6.5
sample_data["label"] = (sample_data["label"] > threshold).astype(int)

In [None]:
# Count how many rows carry each label value.
cont = sample_data["label"].value_counts()
print(cont)

In [None]:
# Output path for the processed Papyrus table; the export itself is disabled.
test_dataset_path = "Papyrus_Barlow.csv"
#sample_data.to_csv(test_dataset_path, index=False)


# Step 1 – Comparing my results with those in the literature

##  BindingDB

### Split and preprocess

In [None]:
# Directory that holds the three pre-split BindingDB CSV files.
base_path = "BindingDB"

# Join directory and file name with os.path.join: plain string concatenation
# produced "BindingDBtrain.csv" (no path separator) in the original.
train_df = pd.read_csv(os.path.join(base_path, "train.csv"))
val_df   = pd.read_csv(os.path.join(base_path, "val.csv"))
test_df  = pd.read_csv(os.path.join(base_path, "test.csv"))

# Tag every row with the partition it came from before merging.
train_df["split"] = "train"
val_df["split"] = "val"
test_df["split"] = "test"

full_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

print(full_df.head())

In [2]:
# Rename a column called "smile" to "smiles"; rename() leaves the frame
# unchanged when no such column exists.
full_df.rename(columns={"smile": "smiles"}, inplace=True)

In [8]:
# Lower-case the capitalized column names to smiles / sequence / label.
full_df.rename(columns={"SMILES": "smiles","Sequence": "sequence","Label": "label" }, inplace=True)

In [None]:
# Length overview of the distinct BindingDB protein sequences.
unique_seqs = full_df['sequence'].unique()
seq_lengths = list(map(len, unique_seqs))

# Longest first.
sorted_lengths = sorted(seq_lengths, reverse=True)
print(sorted_lengths)

for stat_label, stat_fn in (("medium size:", np.mean), ("min size:", np.min), ("max size:", np.max)):
    print(stat_label, stat_fn(seq_lengths))

In [None]:
# 95th percentile of the sequence lengths collected in the previous cell.
print("Percentil 95:", np.percentile(seq_lengths, 95))

In [20]:
# Keep at most `max_len` residues per protein sequence.
max_len = 1341


def _clip_sequence(seq):
    """Return `seq` unchanged when short enough, else its first `max_len` residues."""
    if len(seq) > max_len:
        return seq[:max_len]
    return seq


full_df['sequence'] = full_df['sequence'].apply(_clip_sequence)

In [None]:
# Output path for the processed BindingDB table; the export is disabled here.
test_dataset_path = "Binding_BARLOW.csv"
#full_df.to_csv(test_dataset_path, index=False)

## Model train

In [None]:
#!git clone https://github.com/maxischuh/BarlowDTI.git

The model is executed through the pretraining_pipeline.py script, which handles the training process and produces the history data later visualized in the notebook.

#### Load and visualize the training history 

In [None]:
# Write the processed BindingDB table (without the index column) to CSV.
test_dataset_path = "Binding_BARLOW.csv"
full_df.to_csv(test_dataset_path, index=False)

In [None]:
import json

# Directory of the trained run whose loss curves are plotted.
path = "stash/bin4"

with open(os.path.join(path, "history.json"), "rb") as f:
    raw_history = f.read()

# The file carries a .json extension but the original read it with pickle,
# which fails on real JSON. Parse it as JSON first and fall back to pickle so
# that both on-disk formats load.
try:
    history = json.loads(raw_history)
except ValueError:  # covers JSONDecodeError and UnicodeDecodeError
    # NOTE: unpickling executes arbitrary code - only load trusted run folders.
    history = pickle.loads(raw_history)

plt.plot(history["train_loss"], label="Train Loss")
# The validation curve is optional in the history.
if "validation_loss" in history:
    plt.plot(history["validation_loss"], label="Val Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.title("history train")
plt.show()

### Preprocessor

In [None]:

import pickle

# Settings handed to the project's Preprocessor for the BindingDB table.
preprocessor_settings = dict(
    path="Binding_BARLOW.csv",
    radius=2,
    n_bits=1024,
    aa_embedding="prot_t5",
    num_workers=4,
)
preprocessor = Preprocessor(**preprocessor_settings)

# Persist the preprocessor object so it can be reloaded later.
with open("preprocessor.pkl", "wb") as handle:
    pickle.dump(preprocessor, handle)

### Classifier

In [None]:
# Use the first GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = BarlowTwins()
model.load_model("stash/bin4")

# Per-sample data exposed by the preprocessor.
fps, aas, splits, labels = (
    preprocessor.fp,
    preprocessor.aa,
    preprocessor.split,
    preprocessor.label,
)


def _collect_split(split_name):
    """Return (indices, stacked fingerprints, stacked protein vectors, labels) for one split."""
    positions = [pos for pos, tag in enumerate(splits) if tag == split_name]
    fp_block = np.stack([fps[pos] for pos in positions])
    aa_block = np.stack([aas[pos] for pos in positions])
    split_labels = [labels[pos] for pos in positions]
    return positions, fp_block, aa_block, split_labels


train_idx, fp_train, aa_train, y_train = _collect_split("train")
test_idx, fp_test, aa_test, y_test = _collect_split("test")

def zero_shot_batched(model, mol_array, aa_array, batch_size=256, device="cuda"):
    """Run ``model.zero_shot`` over the inputs in consecutive batches.

    Args:
        model: Object exposing ``zero_shot(mol_batch, aa_batch, device=...)``.
        mol_array: Molecule features; sliced along the first axis.
        aa_array: Protein features; sliced with the same row ranges.
        batch_size: Number of rows passed to the model per call.
        device: Forwarded unchanged to ``model.zero_shot``.

    Returns:
        The per-batch outputs stacked vertically with ``np.vstack``.
    """
    total_rows = mol_array.shape[0]

    # Gradients are never needed here, so the whole sweep runs under no_grad.
    with torch.no_grad():
        chunks = [
            model.zero_shot(
                mol_array[start:start + batch_size],
                aa_array[start:start + batch_size],
                device=device,
            )
            for start in range(0, total_rows, batch_size)
        ]

    return np.vstack(chunks)

# Embed both partitions with the loaded Barlow Twins model.
X_train, X_test = (
    zero_shot_batched(model, fp_part, aa_part, device=device)
    for fp_part, aa_part in ((fp_train, aa_train), (fp_test, aa_test))
)

# Gradient-boosted classifier trained on the embeddings.
clf = XGBClassifier(n_estimators=500, max_depth=5, random_state=42)
clf.fit(X_train, y_train)

# Probability of the positive class for every test sample.
preds = clf.predict_proba(X_test)[:, 1]
roc = roc_auc_score(y_test, preds)
pr = average_precision_score(y_test, preds)

for report_line in (f" ROC-AUC: {roc:.4f}", f" PR-AUC:  {pr:.4f}"):
    print(report_line)


### Metrics

In [None]:
# Hard class predictions at a 0.5 probability cut-off.
y_pred_binary = (preds >= 0.5).astype(int)

f1 = f1_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
acc = accuracy_score(y_test, y_pred_binary)
cm = confusion_matrix(y_test, y_pred_binary)

for metric_prefix, metric_value in (
    ("📊 Accuracy:  ", acc),
    ("🎯 Precision: ", precision),
    ("🔁 Recall:    ", recall),
    ("🧮 F1 Score:  ", f1),
):
    print(f"{metric_prefix}{metric_value:.4f}")


def _finish_plot(title, grid):
    """Apply the title and grid setting to the current figure, then show it."""
    plt.title(title)
    plt.grid(grid)
    plt.show()


# Confusion Matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")
_finish_plot("Confusion Matrix", False)

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, preds)
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc)
roc_display.plot()
_finish_plot("ROC Curve", True)

# Precision-Recall Curve
prec, rec, _ = precision_recall_curve(y_test, preds)
pr_display = PrecisionRecallDisplay(precision=prec, recall=rec, average_precision=pr)
pr_display.plot()
_finish_plot("Precision-Recall Curve", True)


# Step 2 – Comparison between a base model with ESM + Morgan fingerprints and the Barlow model

#### Base with simple sequence encoding Papyrus

In [None]:

# Load the processed Papyrus table, discard rows missing any of the three
# required fields, and store the label as an integer.
df = (
    pd.read_csv("Papyrus_Barlow.csv")
    .dropna(subset=["smiles", "sequence", "label"])
    .astype({"label": int})
)


def smiles_to_ecfp(smiles, radius=2, nBits=1024):
    """Convert a SMILES string into a Morgan bit-vector fingerprint array.

    Args:
        smiles: SMILES string handed to ``Chem.MolFromSmiles``.
        radius: Morgan radius passed to RDKit.
        nBits: Fingerprint length.

    Returns:
        ``np.array`` of the RDKit bit vector, or an all-zero array of length
        ``nBits`` when RDKit does not return a usable molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        # Unparseable SMILES: fall back to an empty fingerprint.
        return np.zeros(nBits)
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
    return np.array(bit_vector)

def simple_seq_encode(seq, max_len=1500):
    """Encode a protein sequence as a fixed-length vector of residue indices.

    The 20 letters of "ACDEFGHIKLMNPQRSTVWY" map to 1..20 (case-insensitive);
    any other character maps to 0. The result is cut to ``max_len`` entries
    and right-padded with zeros when the sequence is shorter.

    Args:
        seq: Amino-acid sequence string.
        max_len: Length of the returned vector.

    Returns:
        ``np.array`` holding the encoded, truncated/padded sequence.
    """
    residue_ids = dict(zip("ACDEFGHIKLMNPQRSTVWY", range(1, 21)))
    encoded = [residue_ids.get(residue, 0) for residue in seq.upper()][:max_len]
    # A non-positive repeat count yields an empty list, so nothing is appended
    # when the sequence already fills the vector.
    encoded.extend([0] * (max_len - len(encoded)))
    return np.array(encoded)

# Morgan fingerprint for every compound.
df["fp"] = df["smiles"].apply(smiles_to_ecfp)

# ESM-2 embedding demo on a single placeholder sequence.
# The network is bound to `esm_model` instead of `model`, so the BarlowTwins
# instance stored in `model` by the Classifier cell is not overwritten; the
# zero_shot_batched(model, ...) calls at the end of this cell rely on it.
# NOTE(review): `protein_embeddings` is not used by the baseline features below.
esm_model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
esm_model.eval()

# The original list ended with a bare `...` (Ellipsis) element, which the batch
# converter cannot unpack as a (name, sequence) pair; only real pairs are kept.
sequences = [("protein1", "MESYHKPDQQLKDL...")]
batch_labels, batch_strs, batch_tokens = batch_converter(sequences)

with torch.no_grad():
    results = esm_model(batch_tokens, repr_layers=[33], return_contacts=False)


token_representations = results["representations"][33]

# Mean-pool over positions 1..len(seq), skipping the token at position 0.
protein_embeddings = []
for i, (_, seq) in enumerate(sequences):
    emb = token_representations[i, 1:len(seq)+1].mean(0)
    protein_embeddings.append(emb.numpy())

# Index-encoded protein sequence for every row.
df["seq_encoded"] = df["sequence"].apply(simple_seq_encode)

# Feature matrix: fingerprint columns followed by encoded-sequence columns.
X_baseline = np.stack(df["fp"].values)
X_seq = np.stack(df["seq_encoded"].values)
X_raw = np.concatenate([X_baseline, X_seq], axis=1)
y = df["label"].values

# Boolean row masks taken from the stored `split` column.
train_idx = df["split"] == "train"
val_idx = df["split"] == "val"
test_idx = df["split"] == "test"

X_train_raw = X_raw[train_idx]
X_val_raw = X_raw[val_idx]
X_test_raw = X_raw[test_idx]

y_train = y[train_idx]
y_val = y[val_idx]
y_test = y[test_idx]


# Scaling statistics are fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_val_scaled = scaler.transform(X_val_raw)
X_test_scaled = scaler.transform(X_test_raw)

def evaluate(model, X_train, X_test, y_train, y_test, name):
    """Fit `model` on the training data and print its test-set metrics.

    Args:
        model: Estimator with ``fit`` and either ``predict_proba`` or
            ``decision_function``.
        X_train, y_train: Training features and binary labels.
        X_test, y_test: Test features and binary labels.
        name: Label printed in the report header.

    Returns:
        None. The metrics are printed.
    """
    model.fit(X_train, y_train)
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
        preds = (scores >= 0.5).astype(int)
    else:
        # Margin classifiers: the sign of decision_function is the predicted
        # class. The original min-max scaled the margins and cut at 0.5, which
        # makes the cut-off depend on the most extreme test samples and divides
        # by zero when all margins are equal. ROC-AUC only depends on the
        # ranking, so the raw margins are used for it directly.
        scores = model.decision_function(X_test)
        preds = (scores > 0).astype(int)
    print(f"\n Resultados para {name}")
    print(f"AUC:       {roc_auc_score(y_test, scores):.4f}")
    print(f"F1 Score:  {f1_score(y_test, preds):.4f}")
    print(f"Precision: {precision_score(y_test, preds):.4f}")
    print(f"Recall:    {recall_score(y_test, preds):.4f}")
    print(f"Accuracy:  {accuracy_score(y_test, preds):.4f}")
    print(f"MCC:       {matthews_corrcoef(y_test, preds):.4f}")

# Baseline 1: gradient boosting on the scaled fingerprint + sequence features.
baseline_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
evaluate(baseline_xgb, X_train_scaled, X_test_scaled, y_train, y_test, "Baseline XGBoost")

# Baseline 2: linear SVM trained with SGD (hinge loss).
baseline_svm = SGDClassifier(loss="hinge", max_iter=1000, tol=1e-3)
evaluate(baseline_svm, X_train_scaled, X_test_scaled, y_train, y_test, "Baseline Linear SVM (SGD)")

# Recompute the batched zero-shot outputs for the fp/aa arrays.
X_train, X_test = (
    zero_shot_batched(model, fp_part, aa_part, device=device)
    for fp_part, aa_part in ((fp_train, aa_train), (fp_test, aa_test))
)




#### Base with simple sequence encoding Papyrus + TB

In [None]:
# Load the Papyrus + TB table, drop incomplete rows, integer labels.
df = (
    pd.read_csv("Papyrus_TB_Barlow.csv")
    .dropna(subset=["smiles", "sequence", "label"])
    .astype({"label": int})
)

# Per-row features: Morgan fingerprint and index-encoded sequence.
df["fp"] = df["smiles"].apply(smiles_to_ecfp)
df["seq_encoded"] = df["sequence"].apply(simple_seq_encode)

X_baseline = np.stack(df["fp"].values)
X_seq = np.stack(df["seq_encoded"].values)
X_raw = np.hstack([X_baseline, X_seq])
y = df["label"].values

# Boolean masks for the three stored partitions.
split_column = df["split"]
train_idx, val_idx, test_idx = (split_column == tag for tag in ("train", "val", "test"))

X_train_raw, X_val_raw, X_test_raw = (X_raw[mask] for mask in (train_idx, val_idx, test_idx))
y_train, y_val, y_test = (y[mask] for mask in (train_idx, val_idx, test_idx))

# Standardize with statistics learned on the training rows only.
scaler = StandardScaler().fit(X_train_raw)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train_raw, X_val_raw, X_test_raw)
)

baseline_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
evaluate(baseline_xgb, X_train_scaled, X_test_scaled, y_train, y_test, "Baseline XGBoost")

baseline_svm = SGDClassifier(loss="hinge", max_iter=1000, tol=1e-3)
evaluate(baseline_svm, X_train_scaled, X_test_scaled, y_train, y_test, "Baseline Linear SVM (SGD)")

# Recompute the batched zero-shot outputs for the fp/aa arrays.
X_train, X_test = (
    zero_shot_batched(model, fp_part, aa_part, device=device)
    for fp_part, aa_part in ((fp_train, aa_train), (fp_test, aa_test))
)


#### Base with simple sequence encoding TB


In [None]:

# Load the TB table, drop incomplete rows, integer labels.
df = (
    pd.read_csv("TB_BARLOW.csv")
    .dropna(subset=["smiles", "sequence", "label"])
    .astype({"label": int})
)

# Per-row features: Morgan fingerprint and index-encoded sequence.
df["fp"] = df["smiles"].apply(smiles_to_ecfp)
df["seq_encoded"] = df["sequence"].apply(simple_seq_encode)

X_baseline = np.stack(df["fp"].values)
X_seq = np.stack(df["seq_encoded"].values)
X_raw = np.hstack([X_baseline, X_seq])
y = df["label"].values

# Boolean masks for the three stored partitions.
split_column = df["split"]
train_idx, val_idx, test_idx = (split_column == tag for tag in ("train", "val", "test"))

X_train_raw, X_val_raw, X_test_raw = (X_raw[mask] for mask in (train_idx, val_idx, test_idx))
y_train, y_val, y_test = (y[mask] for mask in (train_idx, val_idx, test_idx))

# Standardize with statistics learned on the training rows only.
scaler = StandardScaler().fit(X_train_raw)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train_raw, X_val_raw, X_test_raw)
)

baseline_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
evaluate(baseline_xgb, X_train_scaled, X_test_scaled, y_train, y_test, "Baseline XGBoost")

baseline_svm = SGDClassifier(loss="hinge", max_iter=1000, tol=1e-3)
evaluate(baseline_svm, X_train_scaled, X_test_scaled, y_train, y_test, "Baseline Linear SVM (SGD)")




#### Base with ESM TB


In [None]:
# Load the TB table, drop incomplete rows, integer labels.
df = pd.read_csv("TB_BARLOW.csv")
df = df.dropna(subset=["smiles", "sequence", "label"])
df["label"] = df["label"].astype(int)

# Morgan fingerprint for every compound.
df["fp"] = df["smiles"].apply(smiles_to_ecfp)



model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
batch_converter = alphabet.get_batch_converter()

# Device selection. The original hard-coded "cuda:2", which fails on hosts with
# fewer than three GPUs, and called .half() even on the CPU fallback.
if torch.cuda.is_available():
    # Keep GPU 2 when it exists; otherwise fall back to the first GPU.
    gpu_index = 2 if torch.cuda.device_count() > 2 else 0
    device = torch.device(f"cuda:{gpu_index}")
    model = model.half()
else:
    # Stay in float32 on CPU.
    # NOTE(review): half-precision kernels are not available for every CPU op
    # in all PyTorch versions - confirm before re-enabling .half() here.
    device = torch.device("cpu")
model = model.to(device)
model.eval()


# One (name, sequence) pair per data row, in row order.
sequences = [("protein"+str(i), seq) for i, seq in enumerate(df["sequence"].tolist())]

protein_embeddings = []
batch_size = 16

for i in tqdm(range(0, len(sequences), batch_size)):
    batch_seqs = sequences[i:i+batch_size]
    batch_labels, batch_strs, batch_tokens = batch_converter(batch_seqs)

    with torch.no_grad():
        results = model(
            batch_tokens.to(device),
            repr_layers=[6],
            return_contacts=False
        )

    token_representations = results["representations"][6]

    # Mean-pool over positions 1..len(seq), skipping the token at position 0.
    for j, (_, seq) in enumerate(batch_seqs):
        emb = token_representations[j, 1:len(seq)+1].mean(0)
        protein_embeddings.append(emb.cpu().numpy())


df["esm_emb"] = protein_embeddings


# Feature matrix: fingerprint columns followed by the pooled ESM embedding.
X_ecfp = np.stack(df["fp"].values)
X_esm = np.stack(df["esm_emb"].values)
X_raw = np.concatenate([X_ecfp, X_esm], axis=1)
y = df["label"].values

train_idx = df["split"] == "train"
test_idx = df["split"] == "test"

X_train_raw = X_raw[train_idx]
X_test_raw = X_raw[test_idx]
y_train = y[train_idx]
y_test = y[test_idx]

# Scaling statistics are fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

def evaluate(model, X_train, X_test, y_train, y_test, name):
    """Fit `model` on the training data and return its test-set metrics.

    Args:
        model: Estimator with ``fit`` and either ``predict_proba`` or
            ``decision_function``.
        X_train, y_train: Training features and binary labels.
        X_test, y_test: Test features and binary labels.
        name: Stored under the "Modelo" key of the result.

    Returns:
        dict with the keys "Modelo", "AUC", "F1 Score", "Precision",
        "Recall", "Accuracy" and "MCC".
    """
    model.fit(X_train, y_train)
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
        preds = (scores >= 0.5).astype(int)
    else:
        # Margin classifiers: the sign of decision_function is the predicted
        # class. The original min-max scaled the margins and cut at 0.5, which
        # makes the cut-off depend on the most extreme test samples and divides
        # by zero when all margins are equal. ROC-AUC only depends on the
        # ranking, so the raw margins are used for it directly.
        scores = model.decision_function(X_test)
        preds = (scores > 0).astype(int)
    return {
        "Modelo": name,
        "AUC": roc_auc_score(y_test, scores),
        "F1 Score": f1_score(y_test, preds),
        "Precision": precision_score(y_test, preds),
        "Recall": recall_score(y_test, preds),
        "Accuracy": accuracy_score(y_test, preds),
        "MCC": matthews_corrcoef(y_test, preds)
    }

# Fresh result containers for this run.
results, results_tb = [], []

# ESM + ECFP features with gradient boosting.
esm_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
results_tb.append(
    evaluate(esm_xgb, X_train_scaled, X_test_scaled, y_train, y_test, "ESM + ECFP XGBoost")
)

# ESM + ECFP features with a linear SVM trained by SGD (hinge loss).
esm_svm = SGDClassifier(loss="hinge", max_iter=1000, tol=1e-3)
results_tb.append(
    evaluate(esm_svm, X_train_scaled, X_test_scaled, y_train, y_test, "ESM + ECFP Linear SVM")
)



#### Base with ESM Papyrus + TB

In [None]:
# Load the Papyrus + TB table, drop incomplete rows, integer labels.
df = pd.read_csv("Papyrus_TB_Barlow.csv")
df = df.dropna(subset=["smiles", "sequence", "label"])
df["label"] = df["label"].astype(int)


# Morgan fingerprint for every compound.
df["fp"] = df["smiles"].apply(smiles_to_ecfp)




model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
batch_converter = alphabet.get_batch_converter()

# Device selection. The original hard-coded "cuda:2", which fails on hosts with
# fewer than three GPUs, and called .half() even on the CPU fallback.
if torch.cuda.is_available():
    # Keep GPU 2 when it exists; otherwise fall back to the first GPU.
    gpu_index = 2 if torch.cuda.device_count() > 2 else 0
    device = torch.device(f"cuda:{gpu_index}")
    model = model.half()
else:
    # Stay in float32 on CPU.
    # NOTE(review): half-precision kernels are not available for every CPU op
    # in all PyTorch versions - confirm before re-enabling .half() here.
    device = torch.device("cpu")
model = model.to(device)
model.eval()


# One (name, sequence) pair per data row, in row order.
sequences = [("protein"+str(i), seq) for i, seq in enumerate(df["sequence"].tolist())]

protein_embeddings = []
batch_size = 16

for i in tqdm(range(0, len(sequences), batch_size)):
    batch_seqs = sequences[i:i+batch_size]
    batch_labels, batch_strs, batch_tokens = batch_converter(batch_seqs)

    with torch.no_grad():
        results = model(
            batch_tokens.to(device),
            repr_layers=[6],
            return_contacts=False
        )

    token_representations = results["representations"][6]

    # Mean-pool over positions 1..len(seq), skipping the token at position 0.
    for j, (_, seq) in enumerate(batch_seqs):
        emb = token_representations[j, 1:len(seq)+1].mean(0)
        protein_embeddings.append(emb.cpu().numpy())


df["esm_emb"] = protein_embeddings


# Feature matrix: fingerprint columns followed by the pooled ESM embedding.
X_ecfp = np.stack(df["fp"].values)
X_esm = np.stack(df["esm_emb"].values)
X_raw = np.concatenate([X_ecfp, X_esm], axis=1)
y = df["label"].values

train_idx = df["split"] == "train"
test_idx = df["split"] == "test"

X_train_raw = X_raw[train_idx]
X_test_raw = X_raw[test_idx]
y_train = y[train_idx]
y_test = y[test_idx]

# Scaling statistics are fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Both result lists are reset before this dataset's runs are appended.
results = []
results_tb = []

results_tb.append(evaluate(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    X_train_scaled, X_test_scaled, y_train, y_test, "ESM + ECFP XGBoost"
))

results_tb.append(evaluate(
    SGDClassifier(loss="hinge", max_iter=1000, tol=1e-3),
    X_train_scaled, X_test_scaled, y_train, y_test, "ESM + ECFP Linear SVM"
))



#### Base with ESM Papyrus

In [None]:
# Load the Papyrus table, drop incomplete rows, integer labels.
# NOTE(review): this is an absolute, user-specific path, while other cells read
# "Papyrus_Barlow.csv" from the working directory - confirm which file is meant.
df = pd.read_csv("/home/resperanca/Tuberculosis_Tese/Data/Tuberculosis_Data/Papyrus_BARLOW.csv")
df = df.dropna(subset=["smiles", "sequence", "label"])
df["label"] = df["label"].astype(int)

# Morgan fingerprint for every compound.
df["fp"] = df["smiles"].apply(smiles_to_ecfp)


model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
batch_converter = alphabet.get_batch_converter()

# Device selection. The original hard-coded "cuda:2", which fails on hosts with
# fewer than three GPUs, and called .half() even on the CPU fallback.
if torch.cuda.is_available():
    # Keep GPU 2 when it exists; otherwise fall back to the first GPU.
    gpu_index = 2 if torch.cuda.device_count() > 2 else 0
    device = torch.device(f"cuda:{gpu_index}")
    model = model.half()
else:
    # Stay in float32 on CPU.
    # NOTE(review): half-precision kernels are not available for every CPU op
    # in all PyTorch versions - confirm before re-enabling .half() here.
    device = torch.device("cpu")
model = model.to(device)
model.eval()


# One (name, sequence) pair per data row, in row order.
sequences = [("protein"+str(i), seq) for i, seq in enumerate(df["sequence"].tolist())]

protein_embeddings = []
batch_size = 16

for i in tqdm(range(0, len(sequences), batch_size)):
    batch_seqs = sequences[i:i+batch_size]
    batch_labels, batch_strs, batch_tokens = batch_converter(batch_seqs)

    with torch.no_grad():
        results = model(
            batch_tokens.to(device),
            repr_layers=[6],
            return_contacts=False
        )

    token_representations = results["representations"][6]

    # Mean-pool over positions 1..len(seq), skipping the token at position 0.
    for j, (_, seq) in enumerate(batch_seqs):
        emb = token_representations[j, 1:len(seq)+1].mean(0)
        protein_embeddings.append(emb.cpu().numpy())


df["esm_emb"] = protein_embeddings


# Feature matrix: fingerprint columns followed by the pooled ESM embedding.
X_ecfp = np.stack(df["fp"].values)
X_esm = np.stack(df["esm_emb"].values)
X_raw = np.concatenate([X_ecfp, X_esm], axis=1)
y = df["label"].values

train_idx = df["split"] == "train"
test_idx = df["split"] == "test"

X_train_raw = X_raw[train_idx]
X_test_raw = X_raw[test_idx]
y_train = y[train_idx]
y_test = y[test_idx]

# Scaling statistics are fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Both result lists are reset before this dataset's runs are appended.
results = []
results_tb = []

results_tb.append(evaluate(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    X_train_scaled, X_test_scaled, y_train, y_test, "ESM + ECFP XGBoost"
))

results_tb.append(evaluate(
    SGDClassifier(loss="hinge", max_iter=1000, tol=1e-3),
    X_train_scaled, X_test_scaled, y_train, y_test, "ESM + ECFP Linear SVM"
))



#### Barlow with Papyrus (for the other two datasets, just change `test_path`). The XGBoost and SVM models were obtained using the barlowdti_xxl.py file.

In [None]:


# Test table and the directory of the trained Barlow Twins model.
test_path = "tb_test_BARLOW.csv"
barlow_model_path = "Papyrus"

bt_model = BarlowTwins()
bt_model.load_model(barlow_model_path)

test_df = pd.read_csv(test_path)

# Compound side: parse every SMILES, then 1024-bit Morgan fingerprints (radius 2).
test_mols = list(map(Chem.MolFromSmiles, test_df["smiles"]))
test_ecfp = np.array(
    [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024) for mol in test_mols]
)

# Protein side: encoder output converted to one array per sequence.
encoded_sequences = encode_sequences(test_df["sequence"].tolist(), encoder="prost_t5")
test_emb = np.array([np.array(vec) for vec in encoded_sequences])

# Joint vectors from the Barlow Twins model, plus the ground-truth labels.
test_vectors = bt_model.zero_shot(test_ecfp, test_emb)
true_labels = test_df["label"].values




XGBoost

In [None]:
# Restore the trained XGBoost classifier from disk.
xgb_model = XGBClassifier()
xgb_model.load_model("Papyrus_barlowdti_xxl_model_tb.json")


print("\nXGBOOST-TB")
# Positive-class probability and the hard call at a 0.5 cut-off.
gbm_preds = xgb_model.predict_proba(test_vectors)[:, 1]
gbm_binary = (gbm_preds >= 0.5).astype(int)

# Ranking metrics use the probabilities, the rest use the hard calls.
roc = roc_auc_score(true_labels, gbm_preds)
pr_auc = average_precision_score(true_labels, gbm_preds)
acc = accuracy_score(true_labels, gbm_binary)
prec = precision_score(true_labels, gbm_binary)
rec = recall_score(true_labels, gbm_binary)
f1 = f1_score(true_labels, gbm_binary)
mcc = matthews_corrcoef(true_labels, gbm_binary)
cm = confusion_matrix(true_labels, gbm_binary)

for metric_name, metric_value in (
    ("ROC-AUC", roc),
    ("PR-AUC", pr_auc),
    ("ACCURACY", acc),
    ("PRECISION", prec),
    ("RECALL", rec),
    ("F1-Score", f1),
    ("MCC", mcc),
):
    print(f"{metric_name}: {metric_value:.4f}")


In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve of the XGBoost probabilities with its area under the curve.
fpr, tpr, _ = roc_curve(true_labels, gbm_preds)
roc_auc = auc(fpr, tpr)
curve_label = f"ROC curve (AUC = {roc_auc:.3f})"

plt.figure()
plt.plot(fpr, tpr, label=curve_label)
# Diagonal reference line.
plt.plot([0,1], [0,1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve – XGBoost – Papyrus")
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score

# Precision-recall curve of the XGBoost probabilities with its average precision.
# `prec` and `rec` are rebound to the curve arrays here.
prec, rec, _ = precision_recall_curve(true_labels, gbm_preds)
ap = average_precision_score(true_labels, gbm_preds)
curve_label = f"PR curve (AP = {ap:.3f})"

plt.figure()
plt.plot(rec, prec, label=curve_label)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve – XGBoost – Papyrus")
plt.legend()
plt.show()

In [None]:
# Confusion matrix of the XGBoost hard calls, drawn as an annotated heatmap.
cm = confusion_matrix(true_labels, gbm_binary)
column_names = ["Pred 0","Pred 1"]
row_names = ["True 0","True 1"]
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=column_names, yticklabels=row_names)
plt.title("Confusion Matrix – XGBoost – Papyrus")
plt.show()

SVM

In [None]:
import joblib

# Restore the trained SVM together with the scaler saved alongside it.
svm = joblib.load("Papyrus_barlowdti_xxl_model_svm.pkl")
scaler = joblib.load("Papyrus_barlowdti_xxl_svm_scaler.pkl")

test_bt_scaled = scaler.transform(test_vectors)

# Signed margins; a non-negative margin is counted as the positive class.
decision_scores = svm.decision_function(test_bt_scaled)
binary_preds = (decision_scores >= 0).astype(int)

# Ranking metrics use the margins, the rest use the hard calls.
roc = roc_auc_score(true_labels, decision_scores)
pr_auc = average_precision_score(true_labels, decision_scores)
acc = accuracy_score(true_labels, binary_preds)
prec = precision_score(true_labels, binary_preds)
rec = recall_score(true_labels, binary_preds)
f1 = f1_score(true_labels, binary_preds)
mcc = matthews_corrcoef(true_labels, binary_preds)
cm = confusion_matrix(true_labels, binary_preds)

for metric_name, metric_value in (
    ("ROC-AUC", roc),
    ("PR-AUC", pr_auc),
    ("ACCURACY", acc),
    ("PRECISION", prec),
    ("RECALL", rec),
    ("F1-Score", f1),
    ("MCC", mcc),
):
    print(f"{metric_name}: {metric_value:.4f}")

# Annotated confusion-matrix heatmap.
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens")
plt.title("Matriz de Confusão - SVM Linear")
plt.xlabel("Predito")
plt.ylabel("Real")
plt.show()


In [None]:
# ROC curve of the SVM margins with its area under the curve.
fpr, tpr, _ = roc_curve(true_labels, decision_scores)
roc_auc = auc(fpr, tpr)
curve_label = f"ROC curve (AUC = {roc_auc:.3f})"

plt.figure()
plt.plot(fpr, tpr, label=curve_label)
# Diagonal reference line.
plt.plot([0,1], [0,1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve – SVM – Papyrus")
plt.legend()
plt.show()

In [None]:
# Precision-recall curve of the SVM margins with its average precision.
# `prec` and `rec` are rebound to the curve arrays here.
prec, rec, _ = precision_recall_curve(true_labels, decision_scores)
ap = average_precision_score(true_labels, decision_scores)
curve_label = f"PR curve (AP = {ap:.3f})"

plt.figure()
plt.plot(rec, prec, label=curve_label)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve – SVM – Papyrus")
plt.legend()
plt.show()

In [None]:
# Confusion matrix of the SVM hard calls, drawn as an annotated heatmap.
cm = confusion_matrix(true_labels, binary_preds)
column_names = ["Pred 0","Pred 1"]
row_names = ["True 0","True 1"]
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=column_names, yticklabels=row_names)
plt.title("Confusion Matrix – SVM – Papyrus")
plt.show()