# Preprocessing Step  

The metrics scripts require output data in the format: `cOS_{name_generator}_{type_cluster}_{number_of_cluster}_one_column.csv`.  

This notebook preprocesses the data for specific generators like Molpher, DrugEx, and REINVENT, which produce output with more than just a single column of SMILES.  


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Scaffolds.MurckoScaffold import GetScaffoldForMol
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
from rdkit.Chem.Scaffolds.MurckoScaffold import MakeScaffoldGeneric

# Convert output sets to format compatibile with scripts to calculate metrics

## Molpher

In [14]:
receptor = 'Leukocyte_elastase' 
type_cluster = 'dis'
number = 4

In [15]:
df = pd.read_csv(f"data/output_sets_new/{receptor}/cOS_Molpher_{type_cluster}_{number}_all_columns.csv", header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,CHEMBL3902357,CHEMBL3926710,COCCCN1C(=O)N(c2cccc(C(F)(F)F)c2)C2=C(C(=O)CC2...,CCOC(=O)C1=C(C)N(c2cccc(C(F)(F)F)c2)C(=O)N(C)C...,COCCCN1C(=O)N(C2=CC=CC(C(F)(F)F)=C2)C2=C(C(=O)...,0.790164,16,52.810135,C1CCC(C2CCC(C3CCCCC3)C3CCCC23)CC1,C1CCC(C2CCC(C3CCCCC3C3CCCCC3)CC2)CC1
1,CHEMBL3902357,CHEMBL3926710,COCCCN1C(=O)N(c2cccc(C(F)(F)F)c2)C2=C(C(=O)CC2...,CCOC(=O)C1=C(C)N(c2cccc(C(F)(F)F)c2)C(=O)N(C)C...,COCCCN1C(=O)N(C2=CC=CC(C(F)(F)F)=C2)C(C)=C(C(C...,0.790164,16,52.810135,C1CCC(C2CCC(C3CCCCC3)C3CCCC23)CC1,C1CCC(C2CCC(C3CCCCC3C3CCCCC3)CC2)CC1
2,CHEMBL3902357,CHEMBL3926710,COCCCN1C(=O)N(c2cccc(C(F)(F)F)c2)C2=C(C(=O)CC2...,CCOC(=O)C1=C(C)N(c2cccc(C(F)(F)F)c2)C(=O)N(C)C...,COCCCN1C(=O)N(C2=CC=CC(C(F)(F)F)=C2)C(C)=C(C(=...,0.790164,16,52.810135,C1CCC(C2CCC(C3CCCCC3)C3CCCC23)CC1,C1CCC(C2CCC(C3CCCCC3C3CCCCC3)CC2)CC1
3,CHEMBL3902357,CHEMBL3926710,COCCCN1C(=O)N(c2cccc(C(F)(F)F)c2)C2=C(C(=O)CC2...,CCOC(=O)C1=C(C)N(c2cccc(C(F)(F)F)c2)C(=O)N(C)C...,COCCCCN1C(=O)N(C2=CC=CC(C(F)(F)F)=C2)C(C)=C(C(...,0.790164,16,52.810135,C1CCC(C2CCC(C3CCCCC3)C3CCCC23)CC1,C1CCC(C2CCC(C3CCCCC3C3CCCCC3)CC2)CC1
4,CHEMBL3902357,CHEMBL3926710,COCCCN1C(=O)N(c2cccc(C(F)(F)F)c2)C2=C(C(=O)CC2...,CCOC(=O)C1=C(C)N(c2cccc(C(F)(F)F)c2)C(=O)N(C)C...,COC(=O)C1=C(C)N(C2=CC=CC(C(F)(F)F)=C2)C(=O)N(C...,0.790164,16,52.810135,C1CCC(C2CCC(C3CCCCC3)C3CCCC23)CC1,C1CCC(C2CCC(C3CCCCC3C3CCCCC3)CC2)CC1
...,...,...,...,...,...,...,...,...,...,...
1721136,CHEMBL293138,CHEMBL408621,CC(SC(=O)c1cccs1)C(=O)NCC(=O)N1CSCC1C(=O)O,CC1(C)C2CC[C@]1(C)C(OC(=O)/C=C/c1cc(O)c(O)c(O)...,CC1=CC(C=CC(=O)OC2CC3CCC2(C)C3(C)C)=CC=C1O,0.199458,76,2319.140899,C(CCCC1CCCC1)CCCC1CCCC1,C1CCC(CCCCC2CC3CCC2C3)CC1
1721137,CHEMBL293138,CHEMBL408621,CC(SC(=O)c1cccs1)C(=O)NCC(=O)N1CSCC1C(=O)O,CC1(C)C2CC[C@]1(C)C(OC(=O)/C=C/c1cc(O)c(O)c(O)...,CC1=CC(C=CC(=O)OC2CC3CCC2(C)C3(C)C)=CC(O)=C1O,0.199458,76,2319.140899,C(CCCC1CCCC1)CCCC1CCCC1,C1CCC(CCCCC2CC3CCC2C3)CC1
1721138,CHEMBL293138,CHEMBL408621,CC(SC(=O)c1cccs1)C(=O)NCC(=O)N1CSCC1C(=O)O,CC1(C)C2CC[C@]1(C)C(OC(=O)/C=C/c1cc(O)c(O)c(O)...,CCC1=CC(C=CC(=O)OC2CC3CCC2(C)C3(C)C)=CC(O)=C1O,0.199458,76,2319.140899,C(CCCC1CCCC1)CCCC1CCCC1,C1CCC(CCCCC2CC3CCC2C3)CC1
1721139,CHEMBL293138,CHEMBL408621,CC(SC(=O)c1cccs1)C(=O)NCC(=O)N1CSCC1C(=O)O,CC1(C)C2CC[C@]1(C)C(OC(=O)/C=C/c1cc(O)c(O)c(O)...,CC1(C)C2CCC1(C)C(OC(=O)C=CC1=CC(O)=C(O)C(CO)=C...,0.199458,76,2319.140899,C(CCCC1CCCC1)CCCC1CCCC1,C1CCC(CCCCC2CC3CCC2C3)CC1


In [16]:
morphs = df[4]
len(morphs)

1721141

In [17]:
morphs.to_csv(f"data/output_sets_new/{receptor}/cOS_Molpher_{type_cluster}_{number}_one_column.csv", header=None, index = False)

## DrugEx

In [86]:
numbers = [0,1,2,3,4]
receptor = 'Leukocyte_elastase' 
type_cluster = 'sim'
generator = 'DrugEx_RNN_epsilon_0.6' # oprions: DrugEx | DrugEx3(with new parameters, update numbers of epochs)

In [87]:
for number in numbers:
    df = pd.read_csv(f"data/output_sets/{receptor}/{generator}/cOS_{generator}_{type_cluster}_{number}_all_columns.csv")
    delete_row = df.index[df.SMILES=='SMILES'].tolist()
    df = df.drop(delete_row)
    morphs = df.SMILES.drop_duplicates(keep='last')
    print(len(morphs))

    morphs.to_csv(f"data/output_sets/{receptor}/{generator}/cOS_{generator}_{type_cluster}_{number}_one_column.csv", header=False, index=False)

  df = pd.read_csv(f"data/output_sets/{receptor}/{generator}/cOS_{generator}_{type_cluster}_{number}_all_columns.csv")


922155


  df = pd.read_csv(f"data/output_sets/{receptor}/{generator}/cOS_{generator}_{type_cluster}_{number}_all_columns.csv")


909276


  df = pd.read_csv(f"data/output_sets/{receptor}/{generator}/cOS_{generator}_{type_cluster}_{number}_all_columns.csv")


914099


  df = pd.read_csv(f"data/output_sets/{receptor}/{generator}/cOS_{generator}_{type_cluster}_{number}_all_columns.csv")


909200


  df = pd.read_csv(f"data/output_sets/{receptor}/{generator}/cOS_{generator}_{type_cluster}_{number}_all_columns.csv")


910148


# REINVENT

In [55]:
number = 1
receptor = 'Leukocyte_elastase' 
type_cluster = 'dis'
generator = 'REINVENT' 

In [56]:
df = pd.read_csv(f"data/output_sets/{receptor}/{generator}/cOS_{generator}_{type_cluster}_{number}_all_columns.csv")
df

Unnamed: 0,Agent,Prior,Target,Score,SMILES,SMILES_state,Scaffold,SAScore,SAScore (raw),QED,QED (raw),QSAR,QSAR (raw),Alerts,Alerts (raw),step
0,37.8266,37.8266,58.9304,0.755914,CC(=O)C1=C(C)N(c2cccc(C(F)(F)F)c2)C(=O)N(C(=O)...,1,O=C(NCc1cnnnc1)N1C(=O)N(c2ccccc2)C=CC1c1ccccc1...,0.841697,4.1938,0.199819,0.1998,0.886507,8.5534,1.0,1.0,1
1,18.4896,18.4896,89.1868,0.841222,CC(=O)C1=C(C)N(c2cccc(C(F)(F)F)c2)C(=O)N(C(=O)...,1,O=C(NCc1cnoc1)N1C(=O)N(c2ccccc2)C=CC1c1ccccc1S...,0.879382,3.9470,0.352420,0.3524,0.940555,8.8293,1.0,1.0,1
2,17.3997,17.3997,82.6228,0.781425,CC(C)N1CCN(C(=O)c2cc(C#N)ccc2C2C3=C(CCCC3=O)N(...,1,O=C1CCCC2=C1C(c1ccccc1C(=O)N1CCNCC1)NC(=O)N2c1...,0.924723,3.5392,0.486663,0.4867,0.796847,8.2842,1.0,1.0,1
3,24.6254,24.6254,-24.6254,0.000000,CC1=C(C#N)=C(C)N(c2cccc(C(F)(F)F)c2)C(=O)N1C(=...,0,,0.000000,0.0000,0.000000,0.0000,0.000000,0.0000,0.0,0.0,1
4,19.7625,19.7625,87.0788,0.834698,CCN=c1cccc(-c2cc(C#N)ccc2C2C(C#N)=C(C)N(c3cccc...,1,N=c1cccc(-c2ccccc2C2C=CN(c3ccccc3)C(=O)N2C2CN(...,0.864719,4.0499,0.369902,0.3699,0.928189,8.7503,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1083656,41.6454,41.6454,-38.9689,0.020910,C=CC12CCC(S(=O)(=O)NCC)=C(C)C1CCC1C3CCC(C(=O)N...,1,C1=CC2CCC3C4CCCC4CCC3C2CC1,0.719745,4.7627,0.498409,0.4984,0.004836,5.6679,1.0,1.0,1
1083657,25.2745,25.2745,76.3833,0.794202,N#Cc1ccc(C2C3=C(CCC3=O)N(c3cccc(C(F)(F)F)c3)C(...,1,O=C1CCC2=C1C(c1ccccc1C1=CCCNC1)N(C(=O)NC1CCS(=...,0.820223,4.3134,0.428449,0.4284,0.859450,8.4578,1.0,1.0,1
1083658,29.5946,29.5946,-29.5946,0.000000,N#Cc1ccc(C2C3=C(CCC3=O)N(c3cccc(C(F)(F)F)c3)C(...,1,O=C1CCC2=C1C(c1ccccc1C1=CCC(OC(=O)N3CCCC3)CC1)...,0.000000,0.0000,0.000000,0.0000,0.000000,0.0000,0.0,0.0,1
1083659,22.2524,22.2524,85.7259,0.843580,CC1=C(C#N)C(c2ccc(C#N)c(CNCC3CCOCC3)c2)N(C)C(=...,1,O=C1NC(c2cccc(CNCC3CCOCC3)c2)C=CN1c1ccccc1,0.926750,3.5161,0.539610,0.5396,0.875346,8.5118,1.0,1.0,1


In [57]:
df_sorted = df.sort_values(by=['Score'], ascending=False)

# Vybrání nejlepšího řádku pro každou unikátní hodnotu ve sloupci "ID"
df_unique = df_sorted.drop_duplicates(subset=['SMILES'], keep='first')
len(df_unique)

688234

In [58]:
morphs = df_unique.SMILES

In [59]:
morphs.to_csv(f"data/output_sets/{receptor}/{generator}/cOS_{generator}_{type_cluster}_{number}_one_column.csv", header=False, index=False)