In [1]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import itertools
from rdkit import DataStructs
from rdkit.Chem.Scaffolds.MurckoScaffold import MakeScaffoldGeneric
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
from rdkit.Chem import AllChem
import numpy as np

# Showing scaffolds in each cluster for each generator

In [94]:

# Cluster assignments (K-Medoids split) for the receptor under study.
receptor = 'Leukocyte_elastase'
clusters_csv = f"data/input_recall_sets_new/{receptor}/{receptor}_split_to_clusters_using_KMedoids.csv"
df = pd.read_csv(clusters_csv)
df

Unnamed: 0,molregno,stand_type,pchembl_value,stand_value,canonical_smiles,stand_inchi,chembl_id,tid,pref_name,scaffolds_csk,mfp,clusters
0,2135949,IC50,7.18,66.0,CC1CC2=C(C1=O)C(c1ccc(C#N)cc1)NC(=O)N2c1cccc(C...,XVAKYHYZPKKXPJ-UHFFFAOYSA-N,CHEMBL3926578,235,Leukocyte elastase,C1CCC(C2CCC(C3CCCCC3)C3CCCC23)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0
1,2108566,IC50,7.70,20.0,Cc1c(-c2cccc(C(F)(F)F)c2)c(=O)c(C(=O)NCc2ccc(C...,GVZLHJFVWYZJOL-UHFFFAOYSA-N,CHEMBL3899195,235,Leukocyte elastase,C1CCC(C2CCCC(CCCC3CCC(CC4CCCC4)CC3)C2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,1
2,2116233,IC50,7.38,42.0,COCCn1cc(C(=O)NCc2ccc(S(C)(=O)=O)cc2)c(=O)c(-c...,MBIXKVGDRHLSCV-UHFFFAOYSA-N,CHEMBL3906862,235,Leukocyte elastase,C1CCC(CCCC2CCCC(C3CCCCC3)C2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,1
3,2526639,IC50,7.05,89.0,Cc1cccc(C(=O)n2cc(C#N)c3ncccc32)c1,YYSDWURRUMMAIG-UHFFFAOYSA-N,CHEMBL4790048,235,Leukocyte elastase,C1CCC(CC2CCC3CCCCC32)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,4
4,2192565,IC50,8.42,3.8,N#Cc1ccc(C2C3=C(CCC3=O)N(c3cccc(C(F)(F)F)c3)C(...,FFGCQYSSTYWKTM-UHFFFAOYSA-N,CHEMBL3983194,235,Leukocyte elastase,C1CCC(CCC2CC(C3CCCCC3)C3CCCC3C2C2CCCCC2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,1
...,...,...,...,...,...,...,...,...,...,...,...,...
211,2165415,IC50,8.52,3.0,CCOC(=O)C1=C(C)N(c2cccc(C(F)(F)F)c2)C(=O)N(C)C...,FPTMCPUSBLSGFP-UHFFFAOYSA-N,CHEMBL3956044,235,Leukocyte elastase,C1CCC(C2CCC(C3CCCCC3CCC3CC3)CC2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,2
212,2320305,IC50,6.10,800.0,O=C(Nc1ccccc1)c1cn(C(=O)C2CC2)c2ncccc12,YAQCMERNUDLGCS-UHFFFAOYSA-N,CHEMBL4280854,235,Leukocyte elastase,C1CCC(CCC2CC(CC3CC3)C3CCCCC23)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,4
213,69173,IC50,6.64,230.0,Cc1ccc(N2C(=O)N(c3ccc(C)cc3)C2=O)cc1,AZZFGWWOGRBVOV-UHFFFAOYSA-N,CHEMBL296771,235,Leukocyte elastase,C1CCC(C2CC(C3CCCCC3)C2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,1
214,2137653,IC50,8.41,3.9,CN1C(=O)N(c2cccc(C(F)(F)F)c2)C2=C(C(=O)CC2)C1c...,OZHWFTPSRFSWRF-UHFFFAOYSA-N,CHEMBL3928282,235,Leukocyte elastase,C1CCC(C2CCC(C3CCCCC3CC3CC4CCCCC4C3)C3CCCC23)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0


In [95]:
# Render the CSK scaffolds of every cluster as one grid image per cluster.
# The draw option is global state, so it is set once instead of on every iteration.
IPythonConsole.drawOptions.drawMolsSameScale=False
# Take the cluster ids from the data rather than assuming exactly five clusters.
for x in sorted(df.clusters.unique()):
    df_ = df[df.clusters==x]
    mols=[Chem.MolFromSmiles(z) for z in df_.scaffolds_csk]
    # With IPythonConsole imported, MolsToGridImage is wrapped and draws at most
    # `maxMols` (default 50) molecules; lift the cap so no scaffold is dropped.
    d=Draw.MolsToGridImage(mols,molsPerRow=8, subImgSize=(180, 180), returnPNG=False,
                           maxMols=len(mols))
    d.save(f"img/clusters/scaffolds_in_cluster_{x}_{receptor}.png")

# Calculate statistics about end runs for Molpher

In [39]:
# Hand-entered "good run" counts, one per split index 0-4.
# Presumably the number of distinct (start, stop) pairs found in each Molpher
# output file - TODO confirm the source of these numbers.
(
    good_runs_0,
    good_runs_1,
    good_runs_2,
    good_runs_3,
    good_runs_4,
) = (11039, 11013, 11093, 7755, 5814)


In [48]:
# Hand-entered total run counts, one per split index 0-4.
# NOTE(review): indices 0 and 2 carry the same value (26732) - confirm this is
# not a copy-paste slip.
(
    all_runs_0,
    all_runs_1,
    all_runs_2,
    all_runs_3,
    all_runs_4,
) = (26732, 28730, 26732, 27390, 25440)

In [49]:
# Empty run-summary table with the three report columns.
summary_columns = ['all_runs', 'good_runs', 'percentage']
df = pd.DataFrame(columns=summary_columns)

In [50]:
# Build the summary table from scratch in this cell: appending with
# df.loc[len(df.index)] made the cell non-idempotent (every re-run added five
# more rows to whatever `df` already held).
run_counts = [
    (all_runs_0, good_runs_0),
    (all_runs_1, good_runs_1),
    (all_runs_2, good_runs_2),
    (all_runs_3, good_runs_3),
    (all_runs_4, good_runs_4),
]
df = pd.DataFrame(
    [
        # percentage = share of good runs among all runs, one decimal place
        [all_runs, good_runs, f"{((good_runs/all_runs)*100):.1f}%"]
        for all_runs, good_runs in run_counts
    ],
    columns = ['all_runs', 'good_runs', 'percentage'],
)
df

Unnamed: 0,all_runs,good_runs,percentage
0,26732,11039,41.3%
1,28730,11013,38.3%
2,26732,11093,41.5%
3,27390,7755,28.3%
4,25440,5814,22.9%


In [47]:
# Persist the run summary. index_label=False leaves the header field for the
# index column out of the CSV.
runs_csv = "data/information_about_runs/Glucocorticoid_receptor/dis/information_about_runs.csv"
df.to_csv(runs_csv, index_label=False)


In [38]:
# Scratch check of two-decimal float formatting.
print("{:.2f}".format(31.655))

31.66


In [78]:
number = 4
type_split = 'sim'

# Column names for the header-less Molpher output file, in file order.
molpher_columns = ['chembl_start', 'chembl_stop', 'start_smiles', 'stop_smiles', 'morph', 'tanimoto', 'length', 'time']
dff = pd.read_csv(f'data/output_sets/Leukocyte_elastase/cOS_Molpher_{type_split}_{number}_all_columns.csv', header = None)
dff.columns = molpher_columns

# Number of distinct (chembl_start, chembl_stop) pairs present in the output.
pair_counts = dff.groupby(['chembl_start', 'chembl_stop']).count()
print(len(pair_counts))

# Input set for the same split; its row count is the cell's displayed result.
dff = pd.read_csv(f"data/input_recall_sets/Leukocyte_elastase/IS_Molpher_Leukocyte_elastase_{type_split}_{number}.csv")
len(dff)

13970


30450

In [79]:
# FOR SIM SPLIT
# Hand-entered per-split counts (split indices 0-4) for the Leukocyte elastase
# similarity split, summarised and written to CSV below.

good_runs_0 = 13130
good_runs_1 = 13664
good_runs_2 = 13026
good_runs_3 = 14398
good_runs_4 = 13970


all_runs_0 = 29070
all_runs_1 = 29756
all_runs_2 = 30102
all_runs_3 = 30450
all_runs_4 = 30450

# Build the table in one go instead of growing the frame row by row with five
# copy-pasted df.loc[...] assignments.
run_counts = [
    (all_runs_0, good_runs_0),
    (all_runs_1, good_runs_1),
    (all_runs_2, good_runs_2),
    (all_runs_3, good_runs_3),
    (all_runs_4, good_runs_4),
]
df = pd.DataFrame(
    [
        # percentage = share of good runs among all runs, one decimal place
        [all_runs, good_runs, f"{((good_runs/all_runs)*100):.1f}%"]
        for all_runs, good_runs in run_counts
    ],
    columns = ['all_runs', 'good_runs', 'percentage'],
)

# `type_split` comes from an earlier cell and selects the output directory;
# it is expected to be 'sim' for these numbers.
df.to_csv(f"data/information_about_runs/Leukocyte_elastase/{type_split}/information_about_runs.csv", index_label = False)


# Create statistics about input and between split

In [111]:
# Reload the K-Medoids cluster table so this section starts from the raw file.
receptor = 'Leukocyte_elastase'
clusters_path = f"data/input_recall_sets_new/{receptor}/{receptor}_split_to_clusters_using_KMedoids.csv"
df = pd.read_csv(clusters_path)
df

Unnamed: 0,molregno,stand_type,pchembl_value,stand_value,canonical_smiles,stand_inchi,chembl_id,tid,pref_name,scaffolds_csk,mfp,clusters
0,2135949,IC50,7.18,66.0,CC1CC2=C(C1=O)C(c1ccc(C#N)cc1)NC(=O)N2c1cccc(C...,XVAKYHYZPKKXPJ-UHFFFAOYSA-N,CHEMBL3926578,235,Leukocyte elastase,C1CCC(C2CCC(C3CCCCC3)C3CCCC23)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0
1,2108566,IC50,7.70,20.0,Cc1c(-c2cccc(C(F)(F)F)c2)c(=O)c(C(=O)NCc2ccc(C...,GVZLHJFVWYZJOL-UHFFFAOYSA-N,CHEMBL3899195,235,Leukocyte elastase,C1CCC(C2CCCC(CCCC3CCC(CC4CCCC4)CC3)C2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,1
2,2116233,IC50,7.38,42.0,COCCn1cc(C(=O)NCc2ccc(S(C)(=O)=O)cc2)c(=O)c(-c...,MBIXKVGDRHLSCV-UHFFFAOYSA-N,CHEMBL3906862,235,Leukocyte elastase,C1CCC(CCCC2CCCC(C3CCCCC3)C2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,1
3,2526639,IC50,7.05,89.0,Cc1cccc(C(=O)n2cc(C#N)c3ncccc32)c1,YYSDWURRUMMAIG-UHFFFAOYSA-N,CHEMBL4790048,235,Leukocyte elastase,C1CCC(CC2CCC3CCCCC32)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,4
4,2192565,IC50,8.42,3.8,N#Cc1ccc(C2C3=C(CCC3=O)N(c3cccc(C(F)(F)F)c3)C(...,FFGCQYSSTYWKTM-UHFFFAOYSA-N,CHEMBL3983194,235,Leukocyte elastase,C1CCC(CCC2CC(C3CCCCC3)C3CCCC3C2C2CCCCC2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,1
...,...,...,...,...,...,...,...,...,...,...,...,...
211,2165415,IC50,8.52,3.0,CCOC(=O)C1=C(C)N(c2cccc(C(F)(F)F)c2)C(=O)N(C)C...,FPTMCPUSBLSGFP-UHFFFAOYSA-N,CHEMBL3956044,235,Leukocyte elastase,C1CCC(C2CCC(C3CCCCC3CCC3CC3)CC2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,2
212,2320305,IC50,6.10,800.0,O=C(Nc1ccccc1)c1cn(C(=O)C2CC2)c2ncccc12,YAQCMERNUDLGCS-UHFFFAOYSA-N,CHEMBL4280854,235,Leukocyte elastase,C1CCC(CCC2CC(CC3CC3)C3CCCCC23)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,4
213,69173,IC50,6.64,230.0,Cc1ccc(N2C(=O)N(c3ccc(C)cc3)C2=O)cc1,AZZFGWWOGRBVOV-UHFFFAOYSA-N,CHEMBL296771,235,Leukocyte elastase,C1CCC(C2CC(C3CCCCC3)C2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,1
214,2137653,IC50,8.41,3.9,CN1C(=O)N(c2cccc(C(F)(F)F)c2)C2=C(C(=O)CC2)C1c...,OZHWFTPSRFSWRF-UHFFFAOYSA-N,CHEMBL3928282,235,Leukocyte elastase,C1CCC(C2CCC(C3CCCCC3CC3CC4CCCCC4C3)C3CCCC23)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0


In [112]:
# Morgan bit-vector fingerprints (radius 10, 2048 bits) of the CSK scaffolds,
# one list per cluster 0-4.
df_0, df_1, df_2, df_3, df_4 = [
    [
        AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi), 10, nBits=2048)
        for smi in df[df.clusters == cluster_id].scaffolds_csk.tolist()
    ]
    for cluster_id in range(5)
]


In [121]:

# For every ordered pair of different clusters, record the highest Tanimoto
# similarity between any scaffold fingerprint of one and any of the other.
# `results` holds one column per cluster plus an 'index' column of row labels;
# the diagonal (a cluster against itself) is marked with "-".
all_df = [df_0,df_1,df_2,df_3,df_4]
n_clusters = len(all_df)
results = {'index': [f"cluster_{k}_max" for k in range(n_clusters)]}

for y in range(n_clusters):
    column = []
    for x in range(n_clusters):
        if x == y:
            column.append("-")
            continue
        similarities = [
            DataStructs.TanimotoSimilarity(scaf_1, scaf_2)
            for scaf_1 in all_df[y]
            for scaf_2 in all_df[x]
        ]
        # Deliberately not named `max`: rebinding the builtin at notebook scope
        # breaks every later call to max() in the same kernel.
        max_similarity = np.max(similarities)
        column.append(max_similarity)
    results[f"cluster_{y}"] = column

In [122]:
# Display the between-cluster maxima held in `results`.
results

{'index': ['cluster_0_max',
  'cluster_1_max',
  'cluster_2_max',
  'cluster_3_max',
  'cluster_4_max'],
 'cluster_0': ['-',
  0.7333333333333333,
  0.6161616161616161,
  0.34782608695652173,
  0.8791208791208791],
 'cluster_1': [0.7333333333333333,
  '-',
  0.9733333333333334,
  0.5714285714285714,
  0.918918918918919],
 'cluster_2': [0.6161616161616161,
  0.9733333333333334,
  '-',
  0.6017699115044248,
  0.918918918918919],
 'cluster_3': [0.34782608695652173,
  0.5714285714285714,
  0.6017699115044248,
  '-',
  0.4336283185840708],
 'cluster_4': [0.8791208791208791,
  0.918918918918919,
  0.918918918918919,
  0.4336283185840708,
  '-']}

In [124]:

# Tabulate the between-cluster maxima: one column per cluster, rows labelled
# by the entries of the 'index' list.
dfff = pd.DataFrame(results).set_index('index')
dfff
#dfff.to_csv(f"data/information_about_clusters/{receptor}/max_value_between_clusters.csv")

In [None]:
# Draft: walk over all ordered pairs drawn from `df`.
# The loop was indented under a plain statement, which is an IndentationError;
# it now sits at cell level.
# NOTE(review): iterating a DataFrame yields its column labels, so these are
# pairs of column names; if molecule ids were intended, permute a column
# (e.g. df.chembl_id) instead - confirm.
a = list(itertools.permutations(df,2))
for x in a:
    start_id = x[0]
    stop_id = x[1]

In [19]:
# Toy inputs for trying out the pairing helpers below.
dff = list(range(1, 6))
dff_1 = [3, 5, 7]

In [17]:
# All unordered pairs of elements of `dff`.
a = [*itertools.combinations(dff, 2)]
a

[(1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (2, 3),
 (2, 4),
 (2, 5),
 (3, 4),
 (3, 5),
 (4, 5)]

In [21]:
# Every (element of dff, element of dff_1) pair, with dff varying slowest.
unique_combinations = list(itertools.product(dff, dff_1))

print(unique_combinations)

[(1, 3), (1, 5), (1, 7), (2, 3), (2, 5), (2, 7), (3, 3), (3, 5), (3, 7), (4, 3), (4, 5), (4, 7), (5, 3), (5, 5), (5, 7)]


# Create statistics inside each cluster

In [151]:
# Reload the K-Medoids cluster table for the within-cluster statistics.
receptor = 'Leukocyte_elastase'
kmedoids_csv = f"data/input_recall_sets_new/{receptor}/{receptor}_split_to_clusters_using_KMedoids.csv"
df = pd.read_csv(kmedoids_csv)
df

Unnamed: 0,molregno,stand_type,pchembl_value,stand_value,canonical_smiles,stand_inchi,chembl_id,tid,pref_name,scaffolds_csk,mfp,clusters
0,2135949,IC50,7.18,66.0,CC1CC2=C(C1=O)C(c1ccc(C#N)cc1)NC(=O)N2c1cccc(C...,XVAKYHYZPKKXPJ-UHFFFAOYSA-N,CHEMBL3926578,235,Leukocyte elastase,C1CCC(C2CCC(C3CCCCC3)C3CCCC23)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0
1,2108566,IC50,7.70,20.0,Cc1c(-c2cccc(C(F)(F)F)c2)c(=O)c(C(=O)NCc2ccc(C...,GVZLHJFVWYZJOL-UHFFFAOYSA-N,CHEMBL3899195,235,Leukocyte elastase,C1CCC(C2CCCC(CCCC3CCC(CC4CCCC4)CC3)C2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,1
2,2116233,IC50,7.38,42.0,COCCn1cc(C(=O)NCc2ccc(S(C)(=O)=O)cc2)c(=O)c(-c...,MBIXKVGDRHLSCV-UHFFFAOYSA-N,CHEMBL3906862,235,Leukocyte elastase,C1CCC(CCCC2CCCC(C3CCCCC3)C2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,1
3,2526639,IC50,7.05,89.0,Cc1cccc(C(=O)n2cc(C#N)c3ncccc32)c1,YYSDWURRUMMAIG-UHFFFAOYSA-N,CHEMBL4790048,235,Leukocyte elastase,C1CCC(CC2CCC3CCCCC32)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,4
4,2192565,IC50,8.42,3.8,N#Cc1ccc(C2C3=C(CCC3=O)N(c3cccc(C(F)(F)F)c3)C(...,FFGCQYSSTYWKTM-UHFFFAOYSA-N,CHEMBL3983194,235,Leukocyte elastase,C1CCC(CCC2CC(C3CCCCC3)C3CCCC3C2C2CCCCC2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,1
...,...,...,...,...,...,...,...,...,...,...,...,...
211,2165415,IC50,8.52,3.0,CCOC(=O)C1=C(C)N(c2cccc(C(F)(F)F)c2)C(=O)N(C)C...,FPTMCPUSBLSGFP-UHFFFAOYSA-N,CHEMBL3956044,235,Leukocyte elastase,C1CCC(C2CCC(C3CCCCC3CCC3CC3)CC2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,2
212,2320305,IC50,6.10,800.0,O=C(Nc1ccccc1)c1cn(C(=O)C2CC2)c2ncccc12,YAQCMERNUDLGCS-UHFFFAOYSA-N,CHEMBL4280854,235,Leukocyte elastase,C1CCC(CCC2CC(CC3CC3)C3CCCCC23)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,4
213,69173,IC50,6.64,230.0,Cc1ccc(N2C(=O)N(c3ccc(C)cc3)C2=O)cc1,AZZFGWWOGRBVOV-UHFFFAOYSA-N,CHEMBL296771,235,Leukocyte elastase,C1CCC(C2CC(C3CCCCC3)C2)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,1
214,2137653,IC50,8.41,3.9,CN1C(=O)N(c2cccc(C(F)(F)F)c2)C2=C(C(=O)CC2)C1c...,OZHWFTPSRFSWRF-UHFFFAOYSA-N,CHEMBL3928282,235,Leukocyte elastase,C1CCC(C2CCC(C3CCCCC3CC3CC4CCCCC4C3)C3CCCC23)CC1,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,0


In [152]:
# Fingerprint the CSK scaffolds of each cluster (Morgan bit vectors, radius 10,
# 2048 bits) and expose them as df_0 ... df_4.
fps_by_cluster = {}
for cluster_id in range(5):
    scaffolds = df[df.clusters==cluster_id].scaffolds_csk.tolist()
    fps_by_cluster[cluster_id] = [
        AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smi),10, nBits=2048)
        for smi in scaffolds
    ]
df_0, df_1, df_2, df_3, df_4 = (fps_by_cluster[k] for k in range(5))


In [153]:
# Mean / min / max Tanimoto similarity over all unordered scaffold pairs
# inside each cluster. `tanimoto_array` has an 'index' column of row labels
# and one three-element column per cluster.
df_all = [df_0, df_1, df_2, df_3, df_4]
tanimoto_array = {"index": ["mean", "min", "max"]}

for x, cluster_fps in enumerate(df_all):
    # Stream the pairs instead of materialising the full list of combinations,
    # and convert to an array once rather than once per statistic.
    arr = np.array([
        DataStructs.TanimotoSimilarity(fp_a, fp_b)
        for fp_a, fp_b in itertools.combinations(cluster_fps, 2)
    ])
    if arr.size == 0:
        # A cluster with fewer than two scaffolds has no pairs; min()/max() of
        # an empty array would raise, so report NaN for all three statistics.
        tanimoto_array[f"cluster_{x}"] = [np.nan, np.nan, np.nan]
    else:
        tanimoto_array[f"cluster_{x}"] = [arr.mean(), arr.min(), arr.max()]
        

In [154]:
# Display the within-cluster statistics held in `tanimoto_array`.
tanimoto_array

{'index': ['mean', 'min', 'max'],
 'cluster_0': [0.23029342266808148, 0.06164383561643835, 0.9811320754716981],
 'cluster_1': [0.2236857710009376, 0.061224489795918366, 0.9666666666666667],
 'cluster_2': [0.23487403269231502, 0.08391608391608392, 0.9782608695652174],
 'cluster_3': [0.1983618482477161, 0.015306122448979591, 1.0],
 'cluster_4': [0.20949814592237753, 0.055776892430278883, 0.9866666666666667]}

In [155]:
# Turn the statistics dict into a table whose rows are labelled mean/min/max.
# Note that this rebinds `df`, which until now held the cluster table.
df = pd.DataFrame(tanimoto_array).set_index('index')

In [156]:
# Write the within-cluster statistics next to the other cluster reports.
# NOTE(review): the file name spells "incide" (sic); it is kept unchanged so
# existing consumers of the path keep working - confirm before renaming.
statistics_csv = f"data/information_about_clusters/{receptor}/statistic_incide_cluster.csv"
df.to_csv(statistics_csv)

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from scipy.stats import entropy  # Pro výpočet KL divergence
import sys
import pandas as pd

def smiles_to_fingerprint(smiles_list, radius=3, n_bits=2048):
    """Convert a list of SMILES strings to Morgan fingerprints.

    SMILES that RDKit cannot parse (``MolFromSmiles`` returns ``None``) are
    skipped, so the result may hold fewer entries than ``smiles_list``.

    Args:
        smiles_list: iterable of SMILES strings.
        radius: Morgan fingerprint radius.
        n_bits: length of each bit-vector fingerprint.

    Returns:
        ``np.array`` built from the list of fingerprints (presumably one row
        of ``n_bits`` 0/1 values per parsed molecule - TODO confirm); an empty
        input yields an empty array.
    """
    parsed = (Chem.MolFromSmiles(smiles) for smiles in smiles_list)
    fingerprints = [
        AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        for mol in parsed
        if mol is not None
    ]
    return np.array(fingerprints)

def calculate_distribution(fingerprints):
    """Compute the normalised per-bit histogram of a set of fingerprints.

    The fingerprints are summed along axis 0 (how often each bit is set) and
    the result is divided by its grand total so that it sums to 1.

    Args:
        fingerprints: array-like of fingerprints, one per row.

    Returns:
        Array of per-bit relative frequencies.

    Raises:
        ValueError: if no bit is set at all (empty or all-zero input); the
            normalisation would otherwise be 0/0 and silently yield NaN.
    """
    summed_fp = np.sum(fingerprints, axis=0)
    total = np.sum(summed_fp)
    if total == 0:
        raise ValueError(
            "cannot normalise fingerprints: no bits are set (empty or all-zero input)"
        )
    return summed_fp / total  # normalisation

def calculate_kl_divergence(distribution_p, distribution_q):
    """Compute the KL divergence between two probability distributions.

    Both inputs are first passed through ``smooth_distribution`` so that
    neither contains zeros, then handed to ``scipy.stats.entropy``.

    Args:
        distribution_p: first distribution (the ``pk`` argument of ``entropy``).
        distribution_q: second distribution (the ``qk`` argument of ``entropy``).

    Returns:
        The value returned by ``entropy(p, q)``.
    """
    return entropy(
        smooth_distribution(distribution_p),
        smooth_distribution(distribution_q),
    )


def smooth_distribution(distribution, epsilon=1e-10):
    """Make sure the distribution contains no zero values.

    Adds ``epsilon`` to every entry; the result is not re-normalised here.
    """
    smoothed = distribution + epsilon
    return smoothed

In [3]:

type_cluster = 'dis'
num = 0
generator = 'Molpher'

# Generated molecules (output set) and reference molecules (recall set); the
# SMILES are taken from column 0 of each header-less CSV.
output_path = f"data/output_sets/Glucocorticoid_receptor/cOS_{generator}_{type_cluster}_{num}_one_column.csv"
recall_path = f"data/input_recall_sets/Glucocorticoid_receptor/cRS_Glucocorticoid_receptor_{type_cluster}_{num}.csv"
output_set = pd.read_csv(output_path, header=None)[0].tolist()
recall_set = pd.read_csv(recall_path, header=None)[0].tolist()

print("----------------------------------------------------------------------------------------------------------------")
print(f"{generator} {type_cluster} {num}")

# Convert the SMILES to fingerprints
fingerprints_1 = smiles_to_fingerprint(output_set)
fingerprints_2 = smiles_to_fingerprint(recall_set)




----------------------------------------------------------------------------------------------------------------
Molpher dis 0


In [4]:
# Number of fingerprints built from the output set; smiles_to_fingerprint
# skips unparsable SMILES, so compare this with len(output_set).
len(fingerprints_1)

916551

In [5]:
# Number of SMILES read for the output set; an equal fingerprint count means
# none of them was dropped as unparsable.
len(output_set)

916551

In [6]:
# Výpočet distribuce
distribution_1 = calculate_distribution(fingerprints_1)
distribution_2 = calculate_distribution(fingerprints_2)



In [7]:
# Inspect the normalised bit distribution computed from fingerprints_1 (output set).
distribution_1

array([0.00010575, 0.00523781, 0.00105501, ..., 0.0001443 , 0.00038624,
       0.00015004])

In [8]:
# Inspect the normalised bit distribution computed from fingerprints_2 (recall set).
distribution_2

array([0.        , 0.00140287, 0.00024398, ..., 0.        , 0.00079292,
       0.00012199])

In [9]:
# Výpočet KL divergence
kl_divergence = calculate_kl_divergence(distribution_1, distribution_2)

print(f"KL Divergence: {kl_divergence}")

KL Divergence: 2.6624849764176273


In [20]:
# Empty results table: one row per generator with its KL divergence.
kl_columns = ['generator', 'KL']
df_KL = pd.DataFrame(columns=kl_columns)
df_KL


Unnamed: 0,generator,KL


In [21]:
# Record the KL divergence under the generator chosen in the configuration
# cell rather than repeating the literal 'Molpher', so the label cannot drift
# from the data that was actually loaded.
# Re-running this cell appends another row.
df_KL.loc[len(df_KL)] = [generator, kl_divergence]
df_KL

Unnamed: 0,generator,KL
0,Molpher,2.662485
