In [9]:
import glob
import pandas as pd
import numpy as np

from rdkit import DataStructs
from rdkit.Chem import AllChem, rdFingerprintGenerator

### Confirmation of duplicates

In [None]:
# Directory holding the per-fold SMILES archives; only this one path needs editing.
base_dir = '../../data/preprocessed/random/'
dfs = []

# Visit every (fold, split) combination: folds 0 to 4, each with a train and a test file.
for fold in range(5):
    for data_type in ['train', 'test']:
        # Filename pattern for this fold/split -- CHANGE HERE for another naming scheme.
        file_path = f"{base_dir}fold{fold}_smiles_{data_type}.npz"

        # glob only returns paths that exist, so a missing file is skipped rather
        # than raising; sorting keeps the load order deterministic if the pattern
        # is ever changed to contain wildcards.
        for data_file in sorted(glob.glob(file_path)):
            # NOTE(review): allow_pickle=True can execute arbitrary code while
            # unpickling object arrays -- only load archives from a trusted source.
            with np.load(data_file, allow_pickle=True) as data:
                # Materialise the 'smiles' array while the archive is still open.
                dfs.append(pd.DataFrame(data['smiles']))

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# report the actual problem (nothing matched under base_dir) instead.
if not dfs:
    raise FileNotFoundError(
        f"No fold*_smiles_(train|test).npz files found under {base_dir!r}"
    )

# Stack all folds/splits into one frame with a fresh 0..n-1 index.
final_df = pd.concat(dfs, ignore_index=True)

In [11]:
# Give the single unnamed column (label 0) a readable name.
final_df = final_df.rename(columns={0: 'smiles'})
# Keep only the first occurrence of every SMILES string; the original row
# labels are retained (the index is not reset here).
final_df = final_df.drop_duplicates(subset='smiles', keep='first')
print(len(final_df))
final_df

13329


Unnamed: 0,smiles
0,Cc1cc(O)cc(C)c1Cl
1,CC(Oc1ccc(Cl)cc1Cl)C(=O)O
2,COC(=O)C=Cc1ccccc1
3,CC(C)NCC(O)c1ccc2ccccc2c1
4,COc1ccc2c(c1)OC(C)(C)C(c1ccccc1)C2c1ccc(OCCN2C...
...,...
13324,O=C(C[S+]([O-])Cc1ccco1)NC/C=C\COc1cc(CN2CCCCC...
13325,CC(C)=CC(NC(=O)C1=C([O-])c2sc(Cl)cc2S(=O)(=O)N...
13326,CCCCCCCCOC(=O)C1=C(C)NC(C)=C(C(=O)NC2CC2)C1c1c...
13327,N#C/N=c1\ccccn1Cc1ccc(Cl)cc1


In [12]:
# Flag every row whose SMILES occurs more than once (keep=False marks all copies).
is_repeated = final_df.duplicated(subset='smiles', keep=False)
duplicates = final_df[is_repeated]

# Share of rows that belong to a duplicated group; an empty frame reports 0.
total_rows = len(final_df)
if total_rows > 0:
    duplicate_percentage = (len(duplicates) / total_rows) * 100
else:
    duplicate_percentage = 0

# Report the result with two decimals.
print(f"Percentage of duplicates in 'smiles': {duplicate_percentage:.2f}%")

Percentage of duplicates in 'smiles': 0.00%


### Modification of smiles

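- Remove every `@` character from the SMILES strings (this strips the `@`/`@@` chirality tags), then re-check for duplicates.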

In [13]:
# Delete every literal '@' from the SMILES strings (plain-text match, not a regex);
# no other character is touched.
smiles_without_at = final_df['smiles'].str.replace('@', '', regex=False)
final_df['smiles'] = smiles_without_at

In [14]:
# Rows whose SMILES now collides with at least one other row
# (keep=False marks every member of a duplicated group).
duplicate_mask = final_df.duplicated(subset='smiles', keep=False)
duplicates = final_df[duplicate_mask]

# Percentage of rows involved in a collision; guard against an empty frame.
row_count = len(final_df)
duplicate_percentage = 0
if row_count > 0:
    duplicate_percentage = (len(duplicates) / row_count) * 100
print(f"Percentage of duplicates in 'smiles': {duplicate_percentage:.2f}%")

Percentage of duplicates in 'smiles': 0.35%


In [15]:
# Keep the first row of each duplicated SMILES and renumber the rows 0..n-1,
# so that later positional lookups line up with the row labels.
final_df = (
    final_df
    .drop_duplicates(subset='smiles', keep='first')
    .reset_index(drop=True)
)
print(len(final_df))

13304


### Tanimoto similarity on full dataset

In [16]:
# Build one Morgan count fingerprint (radius 2) per de-duplicated SMILES.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2)

fps = []
parsed_smiles = []  # SMILES that RDKit could parse, kept position-aligned with `fps`

for smiles in final_df['smiles']:
    mol = AllChem.MolFromSmiles(smiles)
    if mol is None:
        # Unparseable strings are reported and left out of BOTH lists.
        print(f"Invalid SMILES: {smiles}")
        continue
    fps.append(morgan_generator.GetCountFingerprint(mol))
    parsed_smiles.append(smiles)

# Later cells look up all_smiles[i] using positions in `fps`. Building it from
# the parsed strings only (with a fresh 0..n-1 index) guarantees that
# all_smiles[i] is the molecule behind fps[i] even when some SMILES were skipped.
all_smiles = pd.Series(parsed_smiles, name='smiles', dtype=object)

#### Identify only pairs with a Tanimoto similarity of exactly 1

In [27]:
# Collect every pair of molecules whose fingerprints have a Tanimoto similarity
# of exactly 1.
results_sim = []

# Positional view of the SMILES; works whether all_smiles is a Series or a list.
smiles_by_position = list(all_smiles)

# fps[i] must belong to smiles_by_position[i]. If SMILES were skipped while the
# fingerprints were built, the two sequences drift apart and every reported pair
# would name the wrong molecules, so stop instead of writing wrong results.
if len(smiles_by_position) != len(fps):
    raise ValueError(
        f"fps has {len(fps)} entries but all_smiles has {len(smiles_by_position)}; "
        "they are not position-aligned"
    )

for i in range(len(fps) - 1):
    try:
        # One bulk call compares fps[i] with every later fingerprint, so each
        # unordered pair is evaluated exactly once.
        row_similarities = DataStructs.BulkTanimotoSimilarity(fps[i], fps[i + 1:])
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C can still interrupt the
        # run, and report what went wrong for this row before moving on.
        print('Error in', i, exc)
        continue

    for offset, pair_similarity in enumerate(row_similarities):
        if pair_similarity == 1:
            j = i + 1 + offset  # position of the partner molecule in `fps`
            results_sim.append({
                'smile1': smiles_by_position[i],
                'smile2': smiles_by_position[j],
                'similarity': pair_similarity,
            })

In [30]:
# Convert the collected pairs to a DataFrame. The columns are fixed explicitly so
# the CSV still gets its 'smile1,smile2,similarity' header when no pair was found
# (an empty list would otherwise produce a column-less frame).
similarity = pd.DataFrame(results_sim, columns=['smile1', 'smile2', 'similarity'])
# The row index is written as the first (unnamed) CSV column, as before.
similarity.to_csv('sim_random_smiles.csv')