In [1]:
import os
import time
import torch
from datasets import load_from_disk
from smiles_integration_app import get_drugs_with_edsnlp, get_smiles, create_enriched_reports_dataset

# Intégration des SMILES dans les comptes-rendus médicaux

## Dictionnaires

In [14]:
import pandas as pd

# The three dictionary files are read as tab-separated tables with explicit
# column names (pandas therefore treats the first line of each file as data).
DICO_COLUMNS = ['id', 'smiles', 'synonyms']

# Each file is opened in a `with` block so its handle is closed even when
# pd.read_csv raises; the handle names (chembl, drugbank, dico) are kept.
with open('../data/chembl_smiles_synonyms.txt', 'rt') as chembl:  # ChEMBL dictionary
    df_chembl = pd.read_csv(chembl, sep='\t', names=DICO_COLUMNS)

with open('../data/drugbank_smiles.txt', 'rt') as drugbank:  # DrugBank dictionary
    df_drugbank = pd.read_csv(drugbank, sep='\t', names=DICO_COLUMNS)

with open('../data/dico.txt', 'rt') as dico:  # dico.txt dictionary
    df_dico = pd.read_csv(dico, sep='\t', names=DICO_COLUMNS)

In [15]:
# Report the number of rows in each dictionary; every label names the
# DataFrame that is actually being counted.
print(f"{df_drugbank.shape[0]} médicament dans DrugBank")
print(f"{df_chembl.shape[0]} médicament dans ChEMBL")
print(f"{df_dico.shape[0]} médicament dans dico")

12699 médicament dans DrugBank
84877 médicament dans DrugBank
96206 médicament dans DrugBank
96206 médicament dans DrugBank


In [16]:
# Combined row count of the DrugBank and ChEMBL frames, shown as the cell output.
len(df_drugbank) + len(df_chembl)

97576

In [18]:
# Preview the first five rows of the dico.txt frame.
df_dico.head(n=5)

Unnamed: 0,id,smiles,synonyms
0,0,BC#N.C=CCCCCN(C)C,['HEX-5-ENYLDIMETHYLAMINE CYANOBORANE']
1,1,BC#N.CCCCCCCCCCCCCCCCCN(C)C,['HEPTADECYLDIMETHYLAMINE CYANOBORANE']
2,2,BC#N.CCCCCCCCCCCCCCCN(C)C,['DIMETHYLPENTADECYLAMINE CYANOBORANE']
3,3,BC#N.CCCCCCCCCCCCCN(C)C,['DIMETHYLTRIDECYLAMINE CYANOBORANE']
4,4,BC#N.CCCCCCCCCCCCN(C)C,['DODECYLDIMETHYLAMINE CYANOBORANE']


## Intégration - test des fonctions

In [6]:
# Switch the working directory to the scripts folder.
os.chdir("/home/bourhani@clb.loc/SMILES/scripts")

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Load the dataset saved on disk at this path.
# (An earlier variant read the reports from an Excel file with pandas instead.)
file = "/home/bourhani@clb.loc/saepred/data_test/featurized/OncoBERT_nobias_2LAB/train" 
dataset = load_from_disk(file)

# Last expression: display the dataset summary (features and row count).
dataset

Using device: cuda


Dataset({
    features: ['ippr', 'text', 'labels', 'dt_since_first', 'dt_since_last', 'ipp_id', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 559820
})

In [8]:
# ----------------------------- DRUGS DETECTION -----------------------------
# Apply get_drugs_with_edsnlp to the reports in batches and keep the resulting dataset.
# NOTE(review): the recorded output shows "Adding terms into the pipeline" once per
# 1000-example batch, so the pipeline is presumably rebuilt on every call - confirm
# in get_drugs_with_edsnlp and build it once if so.
drugs_dataset = dataset.map(
    get_drugs_with_edsnlp,
    batched=True,
    desc="Detect the drugs in the reports",
)

Detect the drugs in the reports:   0%|          | 0/559820 [00:00<?, ? examples/s]

Adding terms into the pipeline: 100%|██████████| 1968/1968 [00:00<00:00, 2248.98it/s]
100%|██████████| 1000/1000 [00:03<00:00, 272.13it/s]
Adding terms into the pipeline: 100%|██████████| 1968/1968 [00:00<00:00, 2228.45it/s]
100%|██████████| 1000/1000 [00:04<00:00, 235.95it/s]
Adding terms into the pipeline: 100%|██████████| 1968/1968 [00:00<00:00, 2196.03it/s]
100%|██████████| 1000/1000 [00:04<00:00, 226.30it/s]
Adding terms into the pipeline: 100%|██████████| 1968/1968 [00:00<00:00, 2054.41it/s]
100%|██████████| 1000/1000 [00:04<00:00, 231.58it/s]
Adding terms into the pipeline: 100%|██████████| 1968/1968 [00:00<00:00, 2111.29it/s]
100%|██████████| 1000/1000 [00:03<00:00, 257.40it/s]
Adding terms into the pipeline: 100%|██████████| 1968/1968 [00:00<00:00, 2156.02it/s]
100%|██████████| 1000/1000 [00:04<00:00, 237.26it/s]
Adding terms into the pipeline: 100%|██████████| 1968/1968 [00:00<00:00, 2217.18it/s]
100%|██████████| 1000/1000 [00:04<00:00, 221.90it/s]
Adding terms into the pipel

KeyboardInterrupt: 

In [20]:
# ----------------------------- SMILES TRANSFORMATION -----------------------------
# Apply get_smiles to the detected drugs in batches and keep the resulting dataset.
# NOTE(review): drugs_dataset_cuda is created here but the map below runs on
# drugs_dataset, not on the torch-formatted copy - confirm which one is intended.
drugs_dataset_cuda = drugs_dataset.with_format("torch", device=device)
smiles_dataset = drugs_dataset.map(
    get_smiles,
    batched=True,
    desc="Get the smiles formula for each drugs",
)

In [11]:
# Reload the dataset stored at this path; this rebinds smiles_dataset.
smiles_dataset = load_from_disk(
    "/home/bourhani@clb.loc/saepred/data_test/featurized/smiles_dataset/train"
)

# Last expression: display the dataset summary.
smiles_dataset

Dataset({
    features: ['ippr', 'text', 'labels', 'dt_since_first', 'dt_since_last', 'ipp_id', '__index_level_0__', 'input_ids', 'attention_mask', 'drugs', 'smiles'],
    num_rows: 559820
})

In [12]:
# Destination path handed to create_enriched_reports_dataset
# (the recorded output shows a "Saving the dataset" progress bar).
save_path = "/home/bourhani@clb.loc/saepred/data_test/featurized/OncoBERT_nobias_2LAB_smiles/train"
enriched_dataset = create_enriched_reports_dataset(
    smiles_dataset,
    save_path,
)

Replacing drug name by their smiles:   0%|          | 0/559820 [00:00<?, ? examples/s]

Saving the dataset (0/6 shards):   0%|          | 0/559820 [00:00<?, ? examples/s]

### Vérifications

In [13]:
# Read the enriched dataset back from disk for the checks below.
train = load_from_disk(
    "/home/bourhani@clb.loc/saepred/data_test/featurized/OncoBERT_nobias_2LAB_smiles/train"
)

In [14]:
# Find the row whose "drugs" list is the longest.
# The column is read once and scanned in a single pass. max() returns the first
# item with the largest key, so the index is the first longest entry - the same
# row that max(..., key=len) followed by .index(...) selects - without a second
# linear search that compares whole lists.
drugs_column = train["drugs"]
max_drugs_index, max_drugs_entry = max(
    enumerate(drugs_column),
    key=lambda indexed_entry: len(indexed_entry[1]),
)

In [15]:
# Print the drugs, SMILES, original text and enriched text of the selected row,
# separated by blank lines. The row is fetched once and reused.
# NOTE(review): this prints the full text of a report - check the output for
# patient information before sharing the notebook.
selected_row = train[max_drugs_index]
for column_name in ('drugs', 'smiles', 'text'):
    print(selected_row[column_name])
    print("")
print(selected_row['enriched_text'])

['ACICLOVIR', 'ACICLOVIR', 'ALBUMINE', 'ALBUMINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARACYTINE', 'ARTISIAL', 'ARTISIAL', 'ARTISIAL', 'ARTISIAL', 'ARTISIAL', 'AXEPIM', 'AXEPIM', 'AXEPIM', 'BETADINE', 'CANDESARTAN', 'CANDESARTAN', 'CANDESARTAN', 'CANDESARTAN', 'CANDESARTAN', 'CANDESARTAN', 'CANDESARTAN', 'CANDESARTAN', 'CEFEPIME', 'CEFEPIME', 'CEFEPIME', 'CEFOTAXIME', 'CEFOTAXIME', 'CIFLOX', 'CIFLOX', 'CIFLOX', 'CIPROFLOXACINE', 'CIPROFLOXACINE', 'CIPROFLOXACINE', 'CIPROFLOXACINE', 'CIPROFLOXACINE', 'COBALT', 'COBALT', 'EUPANTOL', 'EUPANTOL', 'FASTURTEC', 'FASTURTEC', 'FENTANYL', 'FLAGYL', 'FLAGYL', 'FLAGYL', 'FLAGYL', 'FLAGYL', 'FLUCONAZOLE', 'FLUCONAZOLE', 'FLUCONAZOLE', 'FLUCONAZOLE', 'FLUCONAZOLE', 'FORLAX', 'FORLAX', 'FUNGIZONE', 'GLUCOSE', 'GLUCOSE', 'HYDROCORTISONE', 'HYDROCORTISONE', 'HYDROCORTISONE', 'HYDROCORTISONE', '