In [None]:
import pandas as pd
#import sys
#import os
import numpy as np
from tdc.multi_pred import DTI


# Dataset Papyrus

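Optional setup: if `papyrus_scripts` cannot be imported, the cell below uninstalls any existing papyrus-scripts, installs rdkit-pypi with pip and papyrus-scripts from its GitHub repository, and then restarts the Jupyter kernel. The cell is currently commented out; uncomment it to run the setup.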

In [None]:
# Optional setup, currently disabled (every line is commented out).
# When uncommented: try to import papyrus_scripts and, if that fails, remove any
# existing papyrus-scripts, install rdkit-pypi and the papyrus-scripts GitHub
# tarball, then restart the kernel (presumably so the new install is importable).
#try:
    #import papyrus_scripts
#except:
    #!pip uninstall papyrus-scripts -y
    #!pip install rdkit-pypi
    #!pip install https://github.com/OlivierBeq/Papyrus-scripts/tarball/master --no-cache-dir
    #get_ipython().kernel.do_shutdown(True)

### Extract drugs and proteins

Loads drug and protein data using the read_papyrus and read_protein_set functions.

In [None]:
from papyrus_scripts.reader import read_papyrus, read_protein_set

# Load the Papyrus drug data with the reader's non-3D setting (is3d=False).
# chunksize=None and source_path=None are passed through as-is; presumably this
# reads everything in one go from the default data location -- TODO confirm
# against the papyrus_scripts documentation.
sample_data = read_papyrus(
    is3d=False,
    chunksize=None,
    source_path=None,
)

In [None]:
# Load the Papyrus protein set (source_path=None: presumably the default data
# location -- TODO confirm against the papyrus_scripts documentation).
protein_data = read_protein_set(source_path=None)

# Preview the first five rows.
protein_data.head(5)

### Merge drugs and proteins

Converts IDs to strings, expands protein IDs into separate rows, merges drug and protein datasets by ID, removes rows without sequences, and prepares the data for export.

In [None]:
# TID is the ID used as the join key; cast it to str in both tables so the
# key dtypes match for the merge.
sample_data['TID'] = sample_data['TID'].astype(str)
protein_data['TID'] = protein_data['TID'].astype(str)

# A protein row may list several ';'-separated TIDs: split them and give each
# TID its own row so it can be matched individually.
protein_data_exploded = protein_data.assign(
    TID=protein_data['TID'].str.split(';')
).explode('TID')

# Attach the protein sequence to every drug row (left join keeps all drug rows;
# unmatched TIDs get NaN in 'Sequence').
# Dropping a pre-existing 'Sequence' column first makes this cell safe to re-run:
# without it a second run would produce 'Sequence_x'/'Sequence_y' and the later
# dropna(subset=['Sequence']) would fail.
sample_data = (
    sample_data
    .drop(columns='Sequence', errors='ignore')
    .merge(protein_data_exploded[['TID', 'Sequence']], on='TID', how='left')
)

# Last expression of the cell: rendered as a rich table by the notebook.
sample_data.head()

In [44]:
# Discard rows (axis=0) that have no protein sequence in the 'Sequence' column.
sample_data = sample_data.dropna(axis=0, subset=['Sequence'])

In [None]:
# Optional export, currently disabled: uncomment to write the merged Papyrus
# table to "Papyrus_merge.csv" (UTF-8, without the index column).
#sample_data.to_csv("Papyrus_merge.csv", index=False, encoding="utf-8")

# Dataset Davis

In [None]:
# Build the TDC drug-target-interaction loader for DAVIS ...
data_DAVIS = DTI(name='DAVIS')

# ... and fetch its data into DAVIS.
DAVIS = data_DAVIS.get_data()

In [None]:
# Count distinct values in the "Drug" and "Target" columns of DAVIS.
unique_smiles_count = DAVIS["Drug"].nunique()
unique_target_count = DAVIS["Target"].nunique()

# Report both counts.
print(f"Unique SMILES: {unique_smiles_count}")
print(f"Unique Targets: {unique_target_count}")

# Dataset KIBA

In [None]:
# Build the TDC drug-target-interaction loader for KIBA ...
data_KIBA = DTI(name='KIBA')

# ... and fetch its data into KIBA.
KIBA = data_KIBA.get_data()

In [None]:
# Count distinct values in the "Drug" and "Target" columns of KIBA.
unique_smiles_count = KIBA["Drug"].nunique()
unique_target_count = KIBA["Target"].nunique()

# Report both counts.
print(f"Unique SMILES: {unique_smiles_count}")
print(f"Unique Targets: {unique_target_count}")

# Dataset BindingDB

In [None]:
# Build the TDC drug-target-interaction loader for BindingDB_Kd ...
dataBD_KD = DTI(name='BindingDB_Kd')

# ... and fetch its data into BD_KD.
BD_KD = dataBD_KD.get_data()

In [None]:
# Count distinct SMILES using the "Drug" column, the same column the DAVIS and
# KIBA cells count under this label. The previous version counted "Drug_ID",
# which did not match the printed "Unique SMILES" label.
unique_smiles_count = BD_KD["Drug"].nunique()
print(f"Unique SMILES: {unique_smiles_count}")

# Count distinct values in the "Target" column.
unique_target_count = BD_KD["Target"].nunique()
print(f"Unique Targets: {unique_target_count}")

In [None]:
# Build the TDC drug-target-interaction loader for BindingDB_IC50 ...
dataBD_IC50 = DTI(name='BindingDB_IC50')

# ... and fetch its data into BD_IC50.
BD_IC50 = dataBD_IC50.get_data()


In [None]:
# Count distinct SMILES using the "Drug" column, the same column the DAVIS and
# KIBA cells count under this label. The previous version counted "Drug_ID",
# which did not match the printed "Unique SMILES" label.
unique_smiles_count = BD_IC50["Drug"].nunique()
print(f"Unique SMILES: {unique_smiles_count}")

# Count distinct values in the "Target" column.
unique_target_count = BD_IC50["Target"].nunique()
print(f"Unique Targets: {unique_target_count}")

In [None]:
# Build the TDC drug-target-interaction loader for BindingDB_Ki ...
dataBD_KI = DTI(name='BindingDB_Ki')

# ... and fetch its data into BD_KI.
BD_KI = dataBD_KI.get_data()

In [None]:
# Count distinct SMILES using the "Drug" column, the same column the DAVIS and
# KIBA cells count under this label. The previous version counted "Drug_ID",
# which did not match the printed "Unique SMILES" label.
unique_smiles_count = BD_KI["Drug"].nunique()
print(f"Unique SMILES: {unique_smiles_count}")

# Count distinct values in the "Target" column.
unique_target_count = BD_KI["Target"].nunique()
print(f"Unique Targets: {unique_target_count}")

# Dataset Metz

In [None]:
# Read the Metz dataset from a CSV file (path is relative to the working directory).
file_path = "Metz.csv"
METZ = pd.read_csv(file_path)

# Show the number of rows, then the column labels.
print(len(METZ))
print(METZ.columns)

In [None]:
# Count distinct values in the "SMILES" column of METZ.
unique_smiles_count = METZ["SMILES"].nunique()
print(f"Unique SMILES: {unique_smiles_count}")

# Count distinct values in the "ProteinSequence" column. This is stored in
# unique_target_count (as in the other dataset cells) rather than overwriting
# unique_smiles_count, so both variables hold the Metz values after this cell.
unique_target_count = METZ["ProteinSequence"].nunique()
print(f"Unique Targets: {unique_target_count}")

# Dataset Human


In [None]:
# Read the Human dataset from a CSV file (path is relative to the working directory).
file_path = "Human.csv"
df_Human = pd.read_csv(file_path)

# Show the number of rows, then the column labels.
print(len(df_Human))
print(df_Human.columns)

In [None]:
# Count distinct values in the "compound_iso_smiles" and "target_sequence" columns.
unique_smiles_count = df_Human["compound_iso_smiles"].nunique()
unique_target_count = df_Human["target_sequence"].nunique()

# Report both counts.
print(f"Unique SMILES: {unique_smiles_count}")
print(f"Unique Targets: {unique_target_count}")

# Dataset C. elegans

In [None]:
# Read the C. elegans dataset from a CSV file (path is relative to the working directory).
file_path = "C_elegans.csv"
df_elangs = pd.read_csv(file_path)

# Show the number of rows, then the column labels.
print(len(df_elangs))
print(df_elangs.columns)

In [None]:
# Count distinct values in the "smile" and "protein" columns.
unique_smiles_count = df_elangs["smile"].nunique()
unique_target_count = df_elangs["protein"].nunique()

# Report both counts.
print(f"Unique SMILES: {unique_smiles_count}")
print(f"Unique Targets: {unique_target_count}")

# 