In [1]:
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import rdFMCS

In [3]:
# Location of the expanded Astex table (includes a SMILES column used later on).
astex_csv = ('../../../Template_Based_Docking_Project_GitRepo/Expanded_Astex_Diverse_Set/'
             'expanded_Astex_with_smiles.csv')
df = pd.read_csv(astex_csv)

# Keep only the rows whose Uniprot_ID matches an entry name found
# directly under ../data/aligned.
aligned_entries = os.listdir('../data/aligned')
has_aligned_entry = df['Uniprot_ID'].isin(aligned_entries)
df = df[has_aligned_entry]

In [5]:
base = '../data/aligned/'

# Collect one RMSD table per (Uniprot_ID, Protein_ID, Ligand_Name) row of `df`.
# Rows whose table cannot be loaded are skipped (best effort), but each skip is
# recorded so that missing or broken inputs are visible instead of silently lost.
rmsd_frames = []
skipped_entries = []  # (unp, protein, ligand, repr(error)) for every skipped row

for unp, protein, ligand in df[['Uniprot_ID', 'Protein_ID', 'Ligand_Name']].values:
    try:
        complex_name = protein + '_' + ligand
        path = base + unp + '/' + complex_name + '/rmsd_all_in_' + complex_name + '.csv'
        rmsd_table = pd.read_csv(path)
        # Tag every row with the target it came from.
        rmsd_table['uniprot_id'] = unp
    except Exception as err:
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit
        # still stop the cell as expected.
        skipped_entries.append((unp, protein, ligand, repr(err)))
        continue
    rmsd_frames.append(rmsd_table)

if skipped_entries:
    print('Skipped', len(skipped_entries), 'of', len(df), 'entries without a readable RMSD table')

if not rmsd_frames:
    # pd.concat would raise an opaque "No objects to concatenate" here.
    raise ValueError('No RMSD tables could be loaded from ' + base)

# ignore_index=True yields a fresh 0..n-1 index, same as concat + reset_index(drop=True).
rmsd_df = pd.concat(rmsd_frames, ignore_index=True)

# Free the per-file frames; only the concatenated table is needed from here on.
del rmsd_frames
del rmsd_table

In [6]:
# before: (27609, 4)
# NOTE(review): "before" presumably refers to the shape seen in an earlier run
# of this notebook — confirm.
# Display the (rows, columns) of the concatenated RMSD table as a sanity check.
rmsd_df.shape

(78625, 4)

In [7]:
# Persist the concatenated RMSD table. The index is written as well
# (pandas default), so the CSV gets an unnamed leading column.
rmsd_base_csv = '../data/rmsd_values_base.csv'
rmsd_df.to_csv(rmsd_base_csv)

In [8]:
# Attach the template/docked SMILES and the maximum common substructure (MCS)
# SMARTS string to every row of `rmsd_df`.

# use precalculated mcs to speed up
precalculated_df = pd.read_csv('../../Analysis_of_Docking/data/pre-calculated/rmsd_values_plus_smiles.csv')
precalculated_triples = precalculated_df[['template', 'docked', 'mcs_smartsString']].values

# Register every precalculated pair under both orientations. The reversed keys
# are added second, so they take precedence if a key appears in both passes.
pairs_ligands = {t + '_' + d: mcs_smarts for t, d, mcs_smarts in precalculated_triples}
pairs_ligands.update({d + '_' + t: mcs_smarts for t, d, mcs_smarts in precalculated_triples})

# Ligand name -> SMILES, built once instead of scanning `df` for every row.
# keep='first' selects the same entry as taking `.values[0]` of a filtered lookup.
smiles_by_ligand = (df.drop_duplicates(subset='Ligand_Name', keep='first')
                      .set_index('Ligand_Name')['Smiles']
                      .to_dict())

rmsd_df['smiles_template'] = None
rmsd_df['smiles_docked'] = None
rmsd_df['mcs_smartsString'] = None

# Memo of MCS searches done in this cell: (smiles_template, smiles_docked) -> SMARTS.
# The search depends only on the two SMILES strings, so each pair is computed once.
mcs_by_smiles = {}

for index, template, docked in rmsd_df[['template', 'docked']].itertuples():

    # The ligand name is the second '_'-separated field of the template/docked id.
    # A ligand that is absent from `df` raises KeyError naming that ligand.
    smiles_template = smiles_by_ligand[template.split('_')[1]]
    smiles_docked = smiles_by_ligand[docked.split('_')[1]]

    rmsd_df.at[index, 'smiles_template'] = smiles_template
    rmsd_df.at[index, 'smiles_docked'] = smiles_docked

    pair_key = '_'.join([template, docked])
    if pair_key in pairs_ligands:
        rmsd_df.at[index, 'mcs_smartsString'] = pairs_ligands[pair_key]
        continue

    smiles_pair = (smiles_template, smiles_docked)
    if smiles_pair not in mcs_by_smiles:
        try:
            mcs = rdFMCS.FindMCS([Chem.MolFromSmiles(smiles_template), Chem.MolFromSmiles(smiles_docked)],
                                 matchValences=True)
            mcs_by_smiles[smiles_pair] = mcs.smartsString
        except Exception as e:
            # Failures are not memoised: the row keeps mcs_smartsString = None
            # and is reported so it can be inspected.
            print('Exception', e)
            print(index, template, docked)
            continue

    rmsd_df.at[index, 'mcs_smartsString'] = mcs_by_smiles[smiles_pair]

print('Done!')


Done!


In [25]:
# Persist the RMSD table together with the SMILES / MCS columns; the index is not written.
rmsd_plus_smiles_csv = '../data/rmsd_values_plus_smiles.csv'
rmsd_df.to_csv(rmsd_plus_smiles_csv, index=False)