In [1]:
import xml.etree.ElementTree as ET
import csv
import pandas as pd
import requests
from tqdm import tqdm
import numpy as np
import re
from chembl_webresource_client.new_client import new_client
import yaml

# DrugBank

I applied to DrugBank for access to their full dataset. Once the request was accepted, I downloaded the file containing all the data and preprocessed it.

In [None]:
# Load and manipulate the file

# Parse the DrugBank XML export and keep its root element for the cells below.
tree = ET.parse('/home/resperanca/Tuberculosis_Tese/drugbank.xml')
root = tree.getroot()
# Namespace map for the `db:` prefix used in the find()/findall() paths of the drug loop.
ns = {'db': 'http://www.drugbank.ca'}
# NOTE(review): not referenced anywhere else in the visible notebook — confirm before removing.
tuberculosis_drugs = []

In [None]:

# Search terms (lower-cased) that mark a DrugBank record as tuberculosis-related.
keywords = [
    term.lower()
    for term in (
        'tuberculosis', 'mycobacterium tuberculosis', 'mtb',
        'pulmonary tuberculosis', 'extrapulmonary tuberculosis', 'latent tuberculosis',
        'active tuberculosis', 'multidrug-resistant tuberculosis', 'extensively drug-resistant tuberculosis',
        'tb-mdr', 'tb-xdr', 'tuberculose', 'coinfection', 'tb/hiv',
    )
]

# Accumulators filled by the drug loop: unique TB targets and TB-related drugs.
tb_targets, tb_drugs = [], []

#DRUGS

def _text_of(parent, path, default=''):
    """Return the stripped text of the first child of `parent` matching `path`, or `default`."""
    node = parent.find(path, ns)
    if node is None or not node.text:
        return default
    return node.text.strip()


def _mentions_tb(text):
    """True when `text` contains any of the TB keywords (case-insensitive substring match)."""
    lowered = (text or '').lower()
    return any(k in lowered for k in keywords)


def _property_value(drug, section, kind_name):
    """Return the value of the first <property> of kind `kind_name` inside `section`, or ''."""
    for prop in drug.findall(f'db:{section}/db:property', ns):
        if _text_of(prop, 'db:kind') == kind_name:
            return _text_of(prop, 'db:value')
    return ''


def _polypeptide_sequence(polypeptide):
    """Return the amino-acid sequence of a <polypeptide> element without its FASTA header."""
    # DrugBank stores the protein as FASTA text in <amino-acid-sequence>; the
    # <sequence> lookup is kept only as a fallback for other layouts.
    raw = _text_of(polypeptide, 'db:amino-acid-sequence') or _text_of(polypeptide, 'db:sequence')
    lines = raw.splitlines()
    if lines and lines[0].startswith('>'):
        lines = lines[1:]
    return ''.join(line.strip() for line in lines)


# Accessions already stored in tb_targets, so the duplicate check is a set lookup
# instead of a scan over the whole list for every target of every drug.
seen_target_ids = {t['uniprot_id'] for t in tb_targets}

for drug in root.findall('db:drug', ns):
    # Prefer the primary DrugBank accession; fall back to the first listed one.
    primary_id = drug.find("db:drugbank-id[@primary='true']", ns)
    if primary_id is None:
        primary_id = drug.find('db:drugbank-id', ns)
    drugbank_id = primary_id.text if primary_id is not None and primary_id.text else ''
    name = _text_of(drug, 'db:name')

    # A drug is TB-related when its indication or one of its categories mentions TB.
    indication_text = _text_of(drug, 'db:indication')
    found = _mentions_tb(indication_text)
    for category in drug.findall('db:categories/db:category', ns):
        # The category name lives in a nested <category> child; the outer element's
        # own text is checked too so either layout is recognised.
        if _mentions_tb(_text_of(category, 'db:category')) or _mentions_tb(category.text):
            found = True
            break

    smiles = _property_value(drug, 'calculated-properties', 'SMILES')
    moa_text = _text_of(drug, 'db:mechanism-of-action')
    ic50 = _property_value(drug, 'experimental-properties', 'IC50')

    # Targets
    related_target_ids = []
    for target_type in ['targets', 'enzymes', 'carriers', 'transporters']:
        singular = target_type[:-1]  # 'targets' -> 'target', 'enzymes' -> 'enzyme', ...
        for target in drug.findall(f'db:{target_type}/db:{singular}', ns):
            target_name = _text_of(target, 'db:name')
            org = _text_of(target, 'db:organism')

            # Filter only TB-related targets
            if not (_mentions_tb(target_name) or _mentions_tb(org)):
                continue

            polypeptide = target.find('db:polypeptide', ns)
            uniprot_id = polypeptide.get('id', '') if polypeptide is not None else ''
            if not uniprot_id:
                # Without an accession the target can neither be stored nor linked.
                continue

            if uniprot_id not in seen_target_ids:
                action_types = [
                    a.text for a in target.findall('db:actions/db:action', ns) if a.text
                ]
                # <known-action> is a child element; the attribute lookup is only a fallback.
                pharmacological_action = (
                    _text_of(target, 'db:known-action')
                    or target.attrib.get('known-action', 'unknown')
                )
                tb_targets.append({
                    'target_name': target_name,
                    'organism': org,
                    'uniprot_id': uniprot_id,
                    'sequence': _polypeptide_sequence(polypeptide),
                    'target_type': singular,
                    'action_types': ";".join(action_types),
                    'pharmacological_action': pharmacological_action
                })
                seen_target_ids.add(uniprot_id)

            related_target_ids.append(uniprot_id)

    # If the drug is related to TB, store
    if found:
        tb_drugs.append({
            'drugbank_id': drugbank_id,
            'name': name,
            'smiles': smiles,
            'indication': indication_text,
            'mechanism_of_action': moa_text,
            'ic50': ic50,
            'targets': related_target_ids
        })



In the next step, the script runs two nested loops. The outer loop iterates over all the TB-related drugs stored in the `tb_drugs` list; the inner loop iterates over all the tuberculosis-related molecular targets stored in the `tb_targets` list. For each pair, it checks whether the target's `uniprot_id` is present in the drug's target list (`drug["targets"]`). If it is, there is a known interaction between the drug and the target, and the interaction variable is set to 1. Otherwise, it is set to 0, indicating that there is no known interaction.

In [None]:
# DRUG × TARGET Matrix
# One CSV row per (TB drug, TB target) pair. "Interaction" is 1 when the target's
# UniProt ID is among the drug's recorded targets, otherwise 0.
drug_fields = ('drugbank_id', 'name', 'smiles', 'indication', 'mechanism_of_action', 'ic50')
target_fields = (
    'target_name', 'organism', 'uniprot_id', 'sequence',
    'target_type', 'action_types', 'pharmacological_action',
)

with open('tb_drug_target_matrix.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(
        ['DrugBank ID', 'Drug Name', 'SMILES', 'Indication', 'Mechanism of Action', 'IC50']
        + ['Target Name', 'Organism', 'UniProt ID', 'Target Sequence', 'Target Type',
           'Action Type(s)', 'Pharmacological Action']
        + ['Dataset', 'Interaction']
    )

    for drug in tb_drugs:
        drug_cells = [drug[field] for field in drug_fields]
        for target in tb_targets:
            target_cells = [target[field] for field in target_fields]
            interaction = int(target['uniprot_id'] in drug['targets'])
            writer.writerow(drug_cells + target_cells + ['DrugBank', interaction])

print(f" {len(tb_drugs)} drogas relacionadas com TB exportadas.")
print(f" {len(tb_targets)} alvos relacionados com TB exportados.")

### Get Seq Uniprot


In [None]:
df = pd.read_csv('tb_drugbank_2.csv')

# UniProt accession -> sequence. Failed lookups are cached as '' so the same
# accession is never requested twice (the table repeats each target once per drug).
seq_cache = {}

if 'Target Sequence' not in df.columns:
    df['Target Sequence'] = ""
# A column with no values may be read back from CSV as a float column; make it
# object-typed so sequence strings can be written into it.
df['Target Sequence'] = df['Target Sequence'].astype(object)


for i, row in tqdm(df.iterrows(), total=len(df)):
    raw_id = row['UniProt ID']

    # Test for a missing value BEFORE converting to text: str(NaN) is the non-empty
    # string 'nan', which would otherwise be sent to UniProt as an accession.
    if pd.isna(raw_id):
        continue
    uniprot_id = str(raw_id).strip()
    if uniprot_id == '':
        continue

    if uniprot_id not in seq_cache:
        sequence = ''
        url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                lines = response.text.splitlines()
                # First FASTA line is the '>' header; the remaining lines are the sequence.
                if lines and lines[0].startswith('>'):
                    sequence = ''.join(lines[1:]).strip()
            else:
                print(f" {uniprot_id} não encontrado (status {response.status_code})")
        except requests.exceptions.RequestException as e:
            print(f" Erro ao buscar {uniprot_id}: {e}")
        seq_cache[uniprot_id] = sequence

    # Only overwrite when a sequence was actually retrieved.
    if seq_cache[uniprot_id]:
        df.at[i, 'Target Sequence'] = seq_cache[uniprot_id]


#df.to_csv("tb_drugbank_2.csv", index=False)



In [None]:
# Distinct ligands (by SMILES) and distinct targets (by sequence) in the current table.
num_unique_drugs, num_unique_targets = (
    df[column].nunique() for column in ('SMILES', 'Target Sequence')
)

print(f"Unique Drugs (SMILES): {num_unique_drugs}")
print(f"Unique Targets: {num_unique_targets}")

# BINDING

For BindingDB, I downloaded the file containing all the data and then preprocessed it.

In [None]:
tsv_file_path = '/home/resperanca/Tuberculosis_Tese/BindingDB_All_202406.tsv'
csv_file_path = '/home/resperanca/Tuberculosis_Tese/Binding.csv'

try:
    df = pd.read_csv(tsv_file_path, sep='\t')

except pd.errors.ParserError as e:
    # The parser message names the offending line ("... in line 1234, saw ...").
    # Extract it defensively instead of assuming the exact wording.
    line_match = re.search(r'line (\d+)', str(e))
    if line_match:
        error_line = int(line_match.group(1))
        # Stream the file and stop at the reported line; loading every line into
        # memory just to print one of them is unnecessary for a file this large.
        with open(tsv_file_path, 'r', errors='replace') as file:
            for line_number, problematic_line in enumerate(file, start=1):
                if line_number == error_line:
                    print(f"problematic line: {problematic_line}")
                    break
    else:
        print(f"Could not locate the problematic line: {e}")

    # Ignore the problematic line
    df = pd.read_csv(tsv_file_path, sep='\t', on_bad_lines='skip')

# Store a CSV copy and read it back for a quick preview.
df.to_csv(csv_file_path, index=False)
Binding = pd.read_csv(csv_file_path)
print(Binding.head())

In [28]:
# Show every column name available in the loaded BindingDB table.
print(list(df.columns))

['BindingDB Reactant_set_id', 'Ligand SMILES', 'Ligand InChI', 'Ligand InChI Key', 'BindingDB MonomerID', 'BindingDB Ligand Name', 'Target Name', 'Target Source Organism According to Curator or DataSource', 'Ki (nM)', 'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)', 'kon (M-1-s-1)', 'koff (s-1)', 'pH', 'Temp (C)', 'Curation/DataSource', 'Article DOI', 'BindingDB Entry DOI', 'PMID', 'PubChem AID', 'Patent Number', 'Authors', 'Institution', 'Link to Ligand in BindingDB', 'Link to Target in BindingDB', 'Link to Ligand-Target Pair in BindingDB', 'Ligand HET ID in PDB', 'PDB ID(s) for Ligand-Target Complex', 'PubChem CID', 'PubChem SID', 'ChEBI ID of Ligand', 'ChEMBL ID of Ligand', 'DrugBank ID of Ligand', 'IUPHAR_GRAC ID of Ligand', 'KEGG ID of Ligand', 'ZINC ID of Ligand', 'Number of Protein Chains in Target (>1 implies a multichain complex)', 'BindingDB Target Chain Sequence', 'PDB ID(s) of Target Chain', 'UniProt (SwissProt) Recommended Name of Target Chain', 'UniProt (SwissProt) Entry Name of Targ

In [None]:
input_path = '/home/resperanca/Tuberculosis_Tese/Binding.csv'
output_path = '/home/resperanca/Tuberculosis_Tese/tb_bindingdb_2.csv'

# Lower-cased TB search terms, combined below into one alternation pattern.
keywords = [
    term.lower()
    for term in (
        'tuberculosis', 'mycobacterium tuberculosis', 'mtb',
        'pulmonary tuberculosis', 'latent tuberculosis', 'tb-mdr',
        'tb-xdr', 'tuberculose', 'coinfection', 'tb/hiv'
    )
]
tb_pattern = '|'.join(keywords)

df = pd.read_csv(input_path)

# Keep a row when either the target name or the source organism mentions TB.
organism_column = 'Target Source Organism According to Curator or DataSource'
name_hits = df['Target Name'].fillna('').str.lower().str.contains(tb_pattern)
organism_hits = df[organism_column].fillna('').str.lower().str.contains(tb_pattern)
filtered = df[name_hits | organism_hits].copy()

# Make sure we always have the same name in the column to make it easier to merge in the future.
# Keys give the BindingDB columns to keep (in order); values are the shared names.
column_map = {
    'BindingDB Reactant_set_id': 'Ligand ID',
    'BindingDB Ligand Name': 'Ligand Name',
    'Ligand SMILES': 'SMILES',
    'Target Name': 'Target Name',
    organism_column: 'Organism',
    'UniProt (SwissProt) Primary ID of Target Chain': 'UniProt ID',
    'IC50 (nM)': 'IC50'
}
filtered = filtered[list(column_map)].rename(columns=column_map)

# Placeholder for the sequence (filled by the UniProt lookup) and a provenance tag.
filtered['Target Sequence'] = ''
filtered['Dataset'] = 'BindingDB'



### Get Seq Uniprot

In [None]:
# UniProt accession -> sequence ('' when the lookup failed or returned no FASTA).
seq_cache = {}

for i, row in tqdm(filtered.iterrows(), total=len(filtered)):
    raw_uid = row['UniProt ID']

    # Test for a missing value BEFORE converting to text: str(NaN) is the non-empty
    # string 'nan', which would otherwise be sent to UniProt as an accession.
    if pd.isna(raw_uid):
        continue
    uid = str(raw_uid).strip()
    if not uid:
        continue

    if uid not in seq_cache:
        sequence = ''
        try:
            url = f"https://rest.uniprot.org/uniprotkb/{uid}.fasta"
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                lines = response.text.splitlines()
                # First FASTA line is the '>' header; the remaining lines are the sequence.
                if lines and lines[0].startswith('>'):
                    sequence = ''.join(lines[1:]).strip()
        except requests.exceptions.RequestException as e:
            print(f"Erro ao buscar {uid}: {e}")
        # Cache failures as well, so each accession is requested at most once.
        seq_cache[uid] = sequence

    filtered.at[i, 'Target Sequence'] = seq_cache[uid]

filtered.to_csv(output_path, index=False)


The next step is to transform the IC50 values into pChEMBL. This function is useful when you have IC50 values (an indicator of a drug's potency) in different formats and want to standardise them in pChEMBL, a logarithmic scale where higher values indicate greater potency.

In [None]:
# Reload the TB subset of BindingDB from disk; its 'IC50' column is converted below.
df = pd.read_csv('/home/resperanca/Tuberculosis_Tese/tb_bindingdb_2.csv')

def to_pchembl(value, unit="nM"):
    """
    Convert an IC50/Ki/Kd value to pChEMBL, i.e. -log10 of the molar concentration.

    Parameters
    ----------
    value : str or number
        The measurement. The first number found in its text form is used, so
        plain decimals (" 12.5"), leading-dot decimals (".5") and scientific
        notation ("1e3", "2.5E-2") are all accepted.
    unit : str, default "nM"
        Concentration unit of `value`: M, mM, µM/uM or nM (case-insensitive;
        long forms such as "nanomolar" or "umol/l" are accepted too).

    Returns
    -------
    float
        The pChEMBL value, or np.nan when the value is censored (contains
        '>', '<' or '~'), not numeric, not strictly positive, not finite,
        or given in an unknown unit.
    """
    unit_factors = {
        "m": 1.0, "mol/l": 1.0, "molar": 1.0,
        "mm": 1e-3, "mmol/l": 1e-3, "millimolar": 1e-3,
        "µm": 1e-6, "μm": 1e-6, "um": 1e-6, "umol/l": 1e-6, "micromolar": 1e-6,
        "nm": 1e-9, "nmol/l": 1e-9, "nanomolar": 1e-9,
    }
    try:
        value_str = str(value).strip()
        unit = str(unit).lower().strip()

        # Censored measurements only bound the true value, so they are skipped.
        if any(sym in value_str for sym in [">", "<", "~"]):
            return np.nan

        # Optional sign, digits with optional fraction (or a bare fraction), and an
        # optional exponent. Without the exponent part "1e3" would be read as 1,
        # and without the sign "-5" would be read as 5.
        match = re.search(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?', value_str)
        if not match:
            return np.nan

        val = float(match.group())
        if val <= 0 or not np.isfinite(val):
            return np.nan

        factor = unit_factors.get(unit)
        if factor is None:
            return np.nan  # unknown unit

        return -np.log10(val * factor)
    except Exception:
        return np.nan

# Convert the BindingDB IC50 column (taken from 'IC50 (nM)', hence the default
# unit of nM) with the to_pchembl function defined in this cell.
df['pChEMBL'] = df['IC50'].apply(to_pchembl)

output_path = '/home/resperanca/Tuberculosis_Tese/tb_bindingdb_2.csv'
df.to_csv(output_path, index=False)


# Chembl



For ChEMBL, we use the official Python client library provided by ChEMBL, which allows the ChEMBL database to be queried directly from Python scripts. The ChEMBL database contains detailed information on chemical compounds, biological targets, bioactivities, assays and much more.

In [None]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
import numpy as np
import yaml, time, math
import requests

# Look up every ChEMBL target whose organism field mentions M. tuberculosis.
organism_name = "Mycobacterium tuberculosis"
target_resource = new_client.target

targets = target_resource.filter(organism__icontains=organism_name)

# The server-side match is case-insensitive; keep only exact-case matches locally.
filtered_targets = []
for t in targets:
    if organism_name in (t.get("organism") or ""):
        filtered_targets.append(t)

# Lookup tables keyed by target ChEMBL ID: preferred name, and the accession of
# the first listed target component ("" when there is no component list).
id_name = {}
id_uniprot = {}
for t in filtered_targets:
    chembl_key = t["target_chembl_id"]
    id_name[chembl_key] = t.get("pref_name", "")
    components = t.get("target_components")
    if isinstance(components, list) and components:
        id_uniprot[chembl_key] = components[0].get("accession", "")
    else:
        id_uniprot[chembl_key] = ""

with open("id_name_chembl.yaml", "w") as f:
    yaml.dump(id_name, f)


def to_molar(value, units):
    """
    Convert a concentration to mol/L.

    Parameters
    ----------
    value : number or numeric string
        The concentration in `units`.
    units : str or None
        M, mM, µM/uM or nM (case-insensitive; long forms such as "mol/l",
        "millimolar" or "nanomolar" are accepted).

    Returns
    -------
    float or None
        The molar concentration, or None when `value` is not numeric, not
        finite, not strictly positive, or `units` is not recognised.
    """
    unit_factors = {
        "m": 1.0, "mol/l": 1.0, "molar": 1.0,
        "mm": 1e-3, "mmol/l": 1e-3, "mmolar": 1e-3, "millimolar": 1e-3,
        "um": 1e-6, "µm": 1e-6, "μm": 1e-6, "umol/l": 1e-6, "micromolar": 1e-6,
        "nm": 1e-9, "nmol/l": 1e-9, "nanomolar": 1e-9,
    }
    # A malformed value must not abort the caller's download loop.
    try:
        v = float(value)
    except (TypeError, ValueError):
        return None
    if v <= 0 or not math.isfinite(v):
        return None

    factor = unit_factors.get(str(units or "").strip().lower())
    if factor is None:
        return None
    return v * factor

def pchembl_from(value_molar):
    """Return -log10 of a molar concentration, or None when it is missing or not positive."""
    if not value_molar or not value_molar > 0:
        return None
    return -np.log10(value_molar)


# Download IC50/Ki/Kd measurements for every selected target and turn each
# uncensored, convertible record into one row.
activity = new_client.activity
activity_types = ["IC50", "Ki", "Kd"]
# Relations that only bound the true value; such records are skipped.
censored_relations = {">", "<", ">=", "<=", ">>", "<<", "~"}

rows = []
for t in filtered_targets:
    chembl_id = t["target_chembl_id"]
    target_name = id_name.get(chembl_id, "")
    uniprot_id = id_uniprot.get(chembl_id, "")
    organism = t.get("organism", "")

    # Ask the server for the wanted activity types only, instead of downloading
    # every activity of the target and discarding most of them locally.
    acts = activity.filter(
        target_chembl_id=chembl_id, standard_type__in=activity_types
    ).only(
        ["canonical_smiles", "standard_type", "standard_value", "standard_units",
         "standard_relation", "relation"]
    )

    for a in acts:
        typ = a.get("standard_type")
        if typ not in activity_types:
            continue

        # standard_relation qualifies standard_value; fall back to the published
        # relation when the standardised one is absent.
        rel = (a.get("standard_relation") or a.get("relation") or "").strip()
        if rel in censored_relations:
            continue

        smi = a.get("canonical_smiles") or ""
        val = a.get("standard_value")
        units = a.get("standard_units")
        if not smi or val is None:
            continue

        # Skip records whose value cannot be parsed rather than aborting the run.
        try:
            std_value = float(val)
            molar = to_molar(val, units)
        except (TypeError, ValueError):
            continue
        pchem = pchembl_from(molar)
        if pchem is None:
            continue

        rows.append({
            "SMILES": smi,
            "Target Name": target_name,
            "Organism": organism,
            "UniProt ID": uniprot_id,
            "Type": typ,
            "Std Value": std_value,
            "Std Units": units,
            "Value (M)": molar,
            "pChEMBL": pchem,
            "Dataset": "ChEMBL"
        })

# For each (SMILES, UniProt ID, Type) keep only the row with the highest pChEMBL.
df = (
    pd.DataFrame(rows)
    .sort_values(["pChEMBL"], ascending=False)
    .drop_duplicates(subset=["SMILES", "UniProt ID", "Type"], keep="first")
    .reset_index(drop=True)
)

# Binary label: 1 when pChEMBL is at least 6.5, otherwise 0.
df["Label"] = df["pChEMBL"].ge(6.5).astype(int)

def fetch_uniprot_seq(uid):
    """
    Download the protein sequence for a UniProt accession.

    Returns the sequence as one string, or "" when `uid` is empty, the request
    fails, the status is not 200, or the reply does not start with a FASTA header.
    """
    if not uid:
        return ""
    fasta_url = f"https://rest.uniprot.org/uniprotkb/{uid}.fasta"
    try:
        reply = requests.get(fasta_url, timeout=10)
        if reply.status_code != 200:
            return ""
        fasta_lines = reply.text.splitlines()
        has_header = bool(fasta_lines) and fasta_lines[0].startswith(">")
        # Everything after the '>' header line is sequence data.
        return "".join(fasta_lines[1:]).strip() if has_header else ""
    except requests.RequestException:
        # Best effort: a failed lookup simply yields no sequence.
        return ""

# Make sure the sequence column exists before filling it.
if "Target Sequence" not in df:
    df["Target Sequence"] = ""

has_accession = df["UniProt ID"].fillna("").ne("")
missing = ~has_accession

# One lookup per distinct accession; every row of that accession gets the result.
for uid in df.loc[has_accession, "UniProt ID"].unique():
    seq = fetch_uniprot_seq(uid)
    if not seq:
        continue
    df.loc[df["UniProt ID"] == uid, "Target Sequence"] = seq


df.to_csv("tb_chembl_curated.csv", index=False)
print(f"{len(df)} bioactivity rows (unique SMILES–target–type) guardados.")


### Get Seq Uniprot

In [None]:
# Second pass over the ChEMBL table: fill in sequences that are still missing.
# UniProt accession -> sequence ('' when the lookup failed), so each accession
# is requested at most once, whether or not the request succeeds.
seq_cache = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    uid = row['UniProt ID']
    if pd.isna(uid) or not str(uid).strip():
        continue
    uid = str(uid).strip()

    # Rows that already carry a sequence need no new download.
    existing = row['Target Sequence']
    if isinstance(existing, str) and existing.strip():
        seq_cache.setdefault(uid, existing.strip())
        continue

    if uid not in seq_cache:
        seq = ''
        try:
            r = requests.get(f"https://rest.uniprot.org/uniprotkb/{uid}.fasta", timeout=10)
            if r.status_code == 200:
                lines = r.text.splitlines()
                # First FASTA line is the '>' header; the remaining lines are the sequence.
                if lines and lines[0].startswith('>'):
                    seq = ''.join(lines[1:]).strip()
        except requests.exceptions.RequestException as e:
            print(f"Erro ao buscar sequência para {uid}: {e}")
        seq_cache[uid] = seq

    if seq_cache[uid]:
        df.at[i, 'Target Sequence'] = seq_cache[uid]


df.to_csv('tb_chembl_2.csv', index=False)


In [41]:
# Reload the ChEMBL table from disk.
df = pd.read_csv('tb_chembl_2.csv')

The next step is to transform the IC50 values into pChEMBL. This function is useful when you have IC50 values (an indicator of a drug's potency) in different formats and want to standardise them in pChEMBL, a logarithmic scale where higher values indicate greater potency.

In [None]:

# The ChEMBL rows built in this notebook store their measurement as
# 'Std Value'/'Value (M)' together with a ready-made 'pChEMBL' column, so there is
# normally no 'IC50' column to convert. Recompute only when one is present, using
# to_pchembl (the conversion function defined in the BindingDB section).
if 'IC50' in df.columns:
    df['pChEMBL'] = df['IC50'].apply(to_pchembl)

df.to_csv('tb_chembl_2.csv', index=False)


## Concatenate chembl with binding

In [3]:
# Load the ChEMBL and BindingDB tables saved by the earlier sections.
# NOTE(review): the BindingDB file was written with an absolute path earlier; this
# relative read assumes the working directory is that same folder — confirm.
df_che = pd.read_csv('tb_chembl_2.csv')
df_bin = pd.read_csv('tb_bindingdb_2.csv')

In [None]:

# Distinct ligands (by SMILES) and distinct targets (by sequence) in BindingDB.
unique_per_column = {
    column: df_bin[column].nunique() for column in ('SMILES', 'Target Sequence')
}
num_unique_drugs = unique_per_column['SMILES']
num_unique_targets = unique_per_column['Target Sequence']
print(f"Unique Drugs (SMILES): {num_unique_drugs}")
print(f"Unique Targets: {num_unique_targets}")

In [None]:
# Drop the BindingDB-only ligand identifiers, then stack the ChEMBL rows on top
# of the BindingDB rows with a fresh 0..n-1 index.
df_bin_cleaned = df_bin.drop(['Ligand ID', 'Ligand Name'], axis=1)
tb_bind_chem = pd.concat((df_che, df_bin_cleaned), ignore_index=True)
tb_bind_chem.to_csv("tb_bind_chem.csv", index=False)
print(tb_bind_chem.head())


In [None]:

# Distinct ligands (by SMILES) and distinct targets (by sequence) after merging.
smiles_column = tb_bind_chem['SMILES']
sequence_column = tb_bind_chem['Target Sequence']
num_unique_drugs = smiles_column.nunique()
num_unique_targets = sequence_column.nunique()

print(f"Unique Drugs (SMILES): {num_unique_drugs}")
print(f"Unique Targets: {num_unique_targets}")

### Remove duplicates with matching SMILES, target sequence and pChEMBL

In [None]:
# Rows sharing SMILES, target sequence and pChEMBL are kept once (first occurrence).
tb_bind_chem_unique = tb_bind_chem.drop_duplicates(subset=['SMILES', 'Target Sequence', 'pChEMBL'])

For repeated cases, where the same SMILES and the same target appear with different pChEMBL values, we keep the median pChEMBL per group.

In [None]:
# Median pChEMBL of every (SMILES, Target Sequence) pair, aligned to the rows.
group_medians = tb_bind_chem_unique.groupby(['SMILES', 'Target Sequence'])['pChEMBL'].transform('median')

# Rows whose SMILES or sequence is missing belong to no group and (depending on
# the pandas version) come back as NaN from transform; keep their own value
# instead of silently erasing it.
pchembl_medians = group_medians.fillna(tb_bind_chem_unique['pChEMBL'])

# assign() returns a new frame, which avoids writing into the result of
# drop_duplicates (SettingWithCopyWarning).
tb_bind_chem_unique = tb_bind_chem_unique.assign(pChEMBL=pchembl_medians)

# After the median replacement every pair carries one value, so this keeps one row per pair.
tb_bind_chem_clean = tb_bind_chem_unique.drop_duplicates(subset=['SMILES', 'Target Sequence', 'pChEMBL']).copy()
tb_bind_chem_clean.to_csv("tb_bind_chem_clean.csv", index=False)

In [26]:
# Reload the de-duplicated ChEMBL + BindingDB table from disk.
df = pd.read_csv('tb_bind_chem_clean.csv')

In [None]:
# Binary column
# Build the labelled table from the frame the labels are computed on. Assigning a
# Series computed from `df` (index 0..n-1 after the CSV reload) to a frame that
# still carries the sparse index left by drop_duplicates aligns on index labels,
# which puts labels on the wrong rows and leaves NaN where no label matches.
# NOTE(review): rows with a missing pChEMBL compare False and are labelled 0 —
# confirm that unknown potency should count as a negative.
tb_bind_chem_clean = df.assign(interaction=(df['pChEMBL'] >= 6).astype(int))



In [None]:
# Interactions: class balance of the binary label
interaction_counts = tb_bind_chem_clean['interaction'].value_counts()
negatives = interaction_counts.get(0, 0)
positives = interaction_counts.get(1, 0)
print(f"Negativs (0): {negatives}")
print(f"Positivs (1): {positives}")


In [29]:
# Save the labelled ChEMBL + BindingDB table; the DrugBank merge below reads this file.
tb_bind_chem_clean.to_csv("tb_bind_chem_final.csv", index=False)

## Concatenate DrugBank with BindingDB + ChEMBL

In [3]:
# DrugBank table and the labelled ChEMBL + BindingDB table, both read from disk.
df_drug = pd.read_csv('tb_drugbank_2.csv')
df_2 = pd.read_csv('tb_bind_chem_final.csv')

In [35]:
# Use the same label column name ('interaction') as the ChEMBL + BindingDB table.
df_drug = df_drug.rename(columns={'Interaction': 'interaction'})

In [None]:
# Remove the DrugBank-only descriptive columns, then append the DrugBank rows to
# the ChEMBL + BindingDB rows with a fresh 0..n-1 index.
drugbank_only_columns = ['Drug Name', 'DrugBank ID', 'Indication', 'Mechanism of Action']
df_drug_cleaned = df_drug.drop(drugbank_only_columns, axis=1)

df_combined = pd.concat((df_2, df_drug_cleaned), ignore_index=True)

### Remove duplicates with matching SMILES, target sequence and interaction

In [None]:

# Keep one row per (SMILES, Target Sequence, interaction) combination.
# NOTE(review): because 'interaction' is part of the key, a pair labelled both 0
# and 1 survives as two rows — confirm that is intended.
dedupe_key = ['SMILES', 'Target Sequence', 'interaction']
tb_final = df_combined.drop_duplicates(dedupe_key)
tb_final.to_csv("tb_final.csv", index=False)


In [5]:
# Load the merged table written by the de-duplication step. The filtered file
# ('tb_final_filtrado.csv') is only produced by the target-removal cell further
# down, so reading it here fails (or picks up a stale copy) on a fresh
# top-to-bottom run. The removal filter gives the same result on either input.
tb_final = pd.read_csv('tb_final.csv')

In [None]:

# Distinct ligands (by SMILES) and distinct targets (by sequence) in the final table.
num_unique_drugs = len(tb_final['SMILES'].dropna().unique())
num_unique_targets = len(tb_final['Target Sequence'].dropna().unique())

print(f"Unique Drugs (SMILES): {num_unique_drugs}")
print(f"Unique Targets: {num_unique_targets}")

In [None]:
# Interactions: class balance of the final table
interaction_counts = tb_final['interaction'].value_counts()

negatives = interaction_counts.get(0, 0)
positives = interaction_counts.get(1, 0)
print(f"Negativs (0): {negatives}")
print(f"Positivs (1): {positives}")

In [4]:
# Distinct target names in the final table (presumably the list that was
# cross-referenced by hand before the removal step — confirm).
target_names_unicos = tb_final['Target Name'].unique()

### After manual cross-referencing, unrelated targets were removed

In [None]:
# Target names judged unrelated during the manual cross-referencing.
targets_remover = [
    'Mycobacterium tuberculosis variant bovis',
    'Mycobacterium tuberculosis',
    'Thymidylate synthase',
    'BirA bifunctional protein',
    'Hypoxanthine-guanine phosphoribosyltransferase',
    'tRNA (guanine-N(1)-)-methyltransferase',
    '1 4-dihydroxy-2-naphthoate octaprenyltransferase',
    'Cytochrome P450 144',
    'Cytochrome P450 130',
    'DNA-directed RNA polymerase subunit beta',
    'Possible cellulase CelA1 (Endoglucanase) (Endo-1 4-beta-glucanase) (FI-cmcase) (Carboxymethyl cellulase)',
    'Cyclase',
    'Conserved protein',
    'ATP-dependent dethiobiotin synthetase BioD',
    'Invasin',
    'Hydroxymycolate synthase MmaA4',
    'Possible exported protein'
]

# Compare on whitespace-trimmed names and keep every row that is NOT in the list.
trimmed_names = tb_final['Target Name'].str.strip()
keep_rows = ~trimmed_names.isin(targets_remover)
df_filtrado = tb_final[keep_rows]

df_filtrado.to_csv('/home/resperanca/Tuberculosis_Tese/tb_final_filtrado.csv', index=False)


In [18]:
# Reload the filtered table.
# NOTE(review): the file was written with an absolute path above; this relative
# read assumes the working directory is that same folder — confirm.
df= pd.read_csv('tb_final_filtrado.csv')

The tuberculosis dataset had to be adapted so that it can be used by the Barlow Twins model.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Rename to the column names expected downstream and keep only those columns.
df = df.rename(columns={
    "SMILES": "smiles",
    "Target Sequence": "sequence",
    "interaction": "label"
})

df = df[["smiles", "sequence", "label"]]

# Drop incomplete rows BEFORE splitting. Removing rows without a SMILES or a
# sequence only after the split would change the 70/15/15 proportions and the
# per-split class balance that stratification is supposed to guarantee.
df = df.dropna(subset=["smiles", "sequence", "label"])

# 70 % train, then the remaining 30 % halved into validation and test;
# both splits are stratified on the label and use a fixed seed.
train_df, temp_df = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df["label"]
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df["label"]
)

# assign() returns new frames, so no value is written into a slice of `df`.
train_df = train_df.assign(split="train")
val_df = val_df.assign(split="val")
test_df = test_df.assign(split="test")

final_df = pd.concat([train_df, val_df, test_df])
final_df = final_df[["smiles", "sequence", "label", "split"]]
final_df.to_csv("tb_fina_BARLOW.csv", index=False)


In [33]:
# Reload the split dataset from disk.
df = pd.read_csv('tb_fina_BARLOW.csv')

In [None]:
# Keep only rows where both the SMILES and the sequence are real strings
# (missing cells are not str instances, so they fail this check).
smiles_is_text = df["smiles"].apply(lambda cell: isinstance(cell, str))
sequence_is_text = df["sequence"].apply(lambda cell: isinstance(cell, str))
df = df[smiles_is_text & sequence_is_text]

In [35]:
# Overwrite the split dataset with the string-only rows.
df.to_csv("tb_fina_BARLOW.csv", index=False)