In [191]:
import pandas as pd
import rdkit
from rdkit import Chem
import pickle

In [182]:
def find_duplicates(data, smi_col='SMILES', name_col='Name'):
    """Print every pair of rows in ``data`` that resolve to the same RDKit SMILES.

    Each entry of ``data[smi_col]`` is parsed with ``Chem.MolFromSmiles`` and written
    back out with ``Chem.MolToSmiles``; rows whose resulting strings are equal are
    reported as duplicates of each other.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing at least the columns ``smi_col`` and ``name_col``.
    smi_col : str, optional
        Column holding the SMILES strings to compare.
    name_col : str, optional
        Column holding the label printed for each row.

    Returns
    -------
    None
        Results are printed, one line per duplicate pair, in the form
        ``<name> (<i>) and <name> (<j>) are duplicates`` with ``i < j``. The numbers
        are 0-based row positions, so the function also works when ``data`` does not
        carry a default integer index.

    Notes
    -----
    Rows whose SMILES is not a string, or cannot be parsed by RDKit, are reported on
    their own line and excluded from the comparison instead of aborting the scan.
    """
    names = list(data[name_col])

    # Group row positions by canonical SMILES: one pass instead of comparing all pairs.
    rows_by_smiles = {}
    for pos, smi in enumerate(data[smi_col]):
        # MolFromSmiles returns None (it does not raise) for SMILES it cannot parse,
        # and MolToSmiles(None) would then fail with an unhelpful error.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            print(f'{names[pos]} ({pos}) has a SMILES that could not be parsed: {smi!r}')
            continue
        rows_by_smiles.setdefault(Chem.MolToSmiles(mol), []).append(pos)

    # Each unordered pair is listed once, sorted by row position.
    pairs = sorted(
        (first, second)
        for rows in rows_by_smiles.values()
        for k, first in enumerate(rows)
        for second in rows[k + 1:]
    )
    for first, second in pairs:
        print(f'{names[first]} ({first}) and {names[second]} ({second}) are duplicates')

## Datasets

### Amino

In [117]:
# Build the cleaned amino dataset and write it to datasets/amino.csv.
amino_data = pd.read_csv('/Users/matt/Git/MattPhD/Datasets/Amino/Amino_All.csv')
amino_data = amino_data.drop(columns=['ID', 'Charged SMILES'])
# The remaining columns are renamed by position, so this list must follow the file's column order.
amino_data.columns = ['Name','Concentration (mM)', 'SMILES', 'Exp. % MGS', 'Exp. error', 'Set']
# Percentage strings such as '12.3%' become floats rounded to one decimal place.
amino_data['Exp. % MGS'] = amino_data['Exp. % MGS'].map(lambda pct: round(float(pct.strip('%')), 1))
amino_data['Exp. error'] = amino_data['Exp. error'].map(lambda pct: round(float(pct.strip('%')), 1))
# Lower-case the names and set labels; 'prediction' is shortened to 'predict'.
amino_data['Name'] = amino_data['Name'].map(lambda name: str(name).lower())
amino_data['Set'] = amino_data['Set'].map(lambda label: label.lower().replace('prediction', 'predict'))
amino_data = amino_data[['Name','SMILES','Concentration (mM)','Exp. % MGS', 'Exp. error', 'Set']]
amino_data.to_csv('datasets/amino.csv', index=False)

In [125]:
# Print any amino entries whose SMILES become identical after an RDKit round trip.
find_duplicates(amino_data)

### Glyco

In [129]:
# Build the cleaned glyco dataset and write it to datasets/glyco.csv.
glyco_data = pd.read_csv('/Users/matt/Git/MattPhD/Datasets/BenV0/Glyco.csv')
glyco_data = glyco_data.drop(columns=['ID','Formula','Class','Sugar'])
# Lower-case the names and set labels; 'prediction' is shortened to 'predict'.
glyco_data['Name'] = glyco_data['Name'].map(lambda name: str(name).lower())
glyco_data['Set'] = glyco_data['Set'].map(lambda label: label.lower().replace('prediction', 'predict'))
glyco_data.to_csv('datasets/glyco.csv', index=False)

In [130]:
# Print any glyco entries whose SMILES become identical after an RDKit round trip.
find_duplicates(glyco_data)

In [168]:
# Build the cleaned glyco2 dataset and write it to datasets/glyco2.csv.
# IDs listed here are removed from the table before anything else happens.
glyco2_duplicates = ['i169','i170','i171','i172','i173','i196','i197','i198','i199','i200','i233','i234']
glyco2_data = (
    pd.read_csv('/Users/matt/Git/MattPhD/Datasets/BenV2/Glyco2.csv')
    .set_index('ID')
    .drop(glyco2_duplicates)
    # Back to a fresh 0..n-1 index; 'ID' returns as a column and is discarded below.
    .reset_index()
    .drop(columns=['ID','Formula','Class','Sugar','MGS SEM','Literature reference','Literature ID','Simulated'])
)
glyco2_data['Name'] = glyco2_data['Name'].map(lambda name: str(name).lower())
# Columns are renamed by position, then reordered for the output file.
glyco2_data.columns = ['Name','SMILES','Exp. % MGS', 'Concentration (mM)']
glyco2_data = glyco2_data[['Name','SMILES','Concentration (mM)','Exp. % MGS']]
glyco2_data.to_csv('datasets/glyco2.csv', index=False)

In [180]:
# Print any glyco2 entries whose SMILES become identical after an RDKit round trip.
find_duplicates(glyco2_data)

In [181]:
# Overwrite datasets/glyco.csv (written earlier from the full frame) with only the
# first 124 rows of glyco_data.
# NOTE(review): the glyco descriptor cells further down keep the first 123 rows
# (iloc[:123]) — confirm that 124 is intended here and the two counts should differ.
glyco_data.iloc[:124, :].to_csv("datasets/glyco.csv", index=False)

## Descriptors
### Amino

In [383]:
# Reference tables used by the amino descriptor cells below:
#   amino_mgs  - its index decides which descriptor rows are kept
#   amino_data - ID-indexed table used to translate IDs into names
# The pickle is opened in a `with` block so the file handle is closed after loading.
with open("/Users/matt/Git/MattPhD/Datasets/Amino/AminoMGS.pkl", "rb") as fh:
    amino_mgs = pickle.load(fh)
amino_data = pd.read_csv("/Users/matt/Git/MattPhD/Datasets/Amino/Amino.csv").set_index('ID')

In [384]:
# header=1: column names are taken from the second line of the file.
amino_pred = pd.read_csv('/Users/matt/Git/MattPhD/Datasets/Amino/amino_pred.csv', header=1)
# Map each 'Molport ID' to its 'Name'; if an ID occurs more than once the last row wins.
amino_pred_ids = dict(zip(amino_pred['Molport ID'], amino_pred['Name']))

In [385]:
# Make the names reachable through 'Alternative ID' as well as 'Molport ID'.
# Rows without an alternative ID are skipped, and when an alternative ID occurs more
# than once the name from its first row is used. This is done in a single pass rather
# than re-scanning the column for every row, and without shadowing the builtin `id`.
alt_id_rows = (
    amino_pred
    .dropna(subset=['Alternative ID'])
    .drop_duplicates(subset='Alternative ID', keep='first')
)
amino_pred_ids.update(zip(alt_id_rows['Alternative ID'], alt_id_rows['Name']))

In [394]:
# Symmetry functions
amino_sf = pickle.load(open('/Users/matt/Git/MattPhD/Descriptors/wACSF/Data/AminoSymMD.pkl','rb'))
amino_sf.drop([index for index in amino_sf.index if index not in amino_mgs.index], inplace=True)
amino_sf['Name'] = [amino_data.loc[i]['Name'] for i in amino_sf.index]
amino_sf.set_index('Name', inplace=True)

amino_sf_pred = pickle.load(open("/Users/matt/Git/MattPhD/Descriptors/wACSF/Data/AminoFilteredSym.pkl", "rb"))
amino_sf_pred = amino_sf_pred[amino_sf_pred.index.isin(amino_pred_ids)]
amino_sf_pred['Name'] = [amino_pred_ids[id] for id in amino_sf_pred.index]
amino_sf_pred.set_index('Name', inplace=True)

amino_sf = pd.concat([amino_sf, amino_sf_pred], axis=0)
amino_sf.to_csv('descriptors/wacsf_amino.csv', index=True)

In [411]:
# SOAPs
amino_soaps = pickle.load(open('/Users/matt/Git/MattPhD/Descriptors/SOAPs/Data/AminoSOAPs_GA_1_6_7_8.pkl','rb'))
amino_soaps.drop([index for index in amino_soaps.index if index not in amino_mgs.index], inplace=True)
amino_soaps['Name'] = [amino_data.loc[i]['Name'] for i in amino_soaps.index]
amino_soaps.set_index('Name', inplace=True)

amino_soaps_pred = pickle.load(open("/Users/matt/Git/MattPhD/Descriptors/SOAPs/Data/AminoFilteredSOAPs_GA_Scaled_1_6_7_8.pkl","rb"))
amino_soaps_pred = amino_soaps_pred[amino_soaps_pred.index.isin(amino_pred_ids)]
amino_soaps_pred['Name'] = [amino_pred_ids[id] for id in amino_soaps_pred.index]
amino_soaps_pred.set_index('Name',inplace=True)

amino_soaps = pd.concat([amino_soaps, amino_soaps_pred], axis=0)
amino_soaps.to_csv('descriptors/soaps_amino.csv', index=True)

In [412]:
# Hydration indices
amino_hi = pickle.load(open('/Users/matt/Git/MattPhD/Descriptors/HydrationIndex/Data/AminoHydIdx_All.pkl', 'rb'))
amino_hi['Name'] = [amino_data.loc[i]['Name'] for i in amino_hi.index]
amino_hi.set_index('Name', inplace=True)
amino_hi.to_csv('descriptors/hydidx_amino.csv')


In [415]:
# Hydration histograms
amino_hh = pickle.load(open('/Users/matt/Git/MattPhD/Descriptors/RadPDF/Data/AminoRadPDF.pkl', 'rb'))
amino_hh.drop(['a061','a065','a066','a067'], errors='ignore', inplace=True)
amino_hh['Name'] = [amino_data.loc[i]['Name'] for i in amino_hh.index]
amino_hh.set_index('Name', inplace=True)
amino_hh.to_csv('descriptors/hydhist_amino.csv')


### Glyco

In [429]:
# Reload the raw glyco2 table keyed by ID; the descriptor cells below use it to
# translate IDs into names.
glyco2_data = pd.read_csv('/Users/matt/Git/MattPhD/Datasets/BenV2/Glyco2.csv').set_index('ID')
glyco2_duplicates = ['i169','i170','i171','i172','i173','i196','i197','i198','i199','i200','i233','i234']
# Drop the list defined on the line above. The original referred to an undefined name
# `duplicates`, which raises NameError on a fresh kernel.
glyco2_data = glyco2_data.drop(glyco2_duplicates)

In [445]:
# Symmetry functions
glyco_sf = pickle.load(open('/Users/matt/Git/MattPhD/Descriptors/wACSF/Data/BenV0SymDFT.pkl','rb'))
glyco_sf = glyco_sf.iloc[:123,:]
glyco_sf.drop(glyco2_duplicates,inplace=True, errors='ignore')
glyco_sf['Name'] = [glyco2_data.loc[i,'Name'] for i in glyco_sf.index]
glyco_sf.set_index('Name',inplace=True)
glyco_sf.to_csv('descriptors/wacsf_glyco.csv')

glyco2_sf = pickle.load(open('/Users/matt/Git/MattPhD/Descriptors/wACSF/Data/BenV2SymMD.pkl','rb'))
glyco2_sf.drop(glyco2_duplicates,inplace=True, errors='ignore')
glyco2_sf['Name'] = [glyco2_data.loc[i,'Name'] for i in glyco2_sf.index]
glyco2_sf.set_index('Name', inplace=True)
glyco2_sf.to_csv('descriptors/wacsf_glyco2.csv')

In [451]:
# SOAPs
glyco_soaps = pickle.load(open('/Users/matt/Git/MattPhD/Descriptors/SOAPs/Data/BenV0SOAPs_GA.pkl','rb'))
glyco_soaps['Name'] = [glyco2_data.loc[i,'Name'] for i in glyco_soaps.index]
glyco_soaps.set_index('Name', inplace=True)
glyco_soaps.to_csv('descriptors/soaps_glyco.csv')

glyco2_soaps = pickle.load(open('/Users/matt/Git/MattPhD/Descriptors/SOAPs/Data/BenV2_GA_1_6_8.pkl','rb'))
glyco2_soaps.drop(glyco2_duplicates, inplace=True, errors='ignore')
glyco2_soaps['Name'] = [glyco2_data.loc[i,'Name'] for i in glyco2_soaps.index]
glyco2_soaps.set_index('Name', inplace=True)
glyco2_soaps.to_csv('descriptors/soaps_glyco2.csv')

In [470]:
# Hydration indices
glyco2_hi = pickle.load(open('/Users/matt/Git/MattPhD/Descriptors/HydrationIndex/Data/BenV2HydIdx_All.pkl', 'rb'))
glyco2_hi.drop(glyco2_duplicates, inplace=True, errors='ignore')
glyco2_hi['Name'] = [glyco2_data.loc[i,'Name'] for i in glyco2_hi.index]
glyco2_hi.set_index('Name', inplace=True)
glyco2_hi.to_csv('descriptors/hydidx_glyco2.csv', index=True)

glyco_hi = glyco2_hi.iloc[:123, :]
glyco_hi.to_csv('descriptors/hydidx_glyco.csv', index=True)

In [471]:
# Hydration histograms
glyco2_hh = pickle.load(open('/Users/matt/Git/MattPhD/Descriptors/RadPDF/Data/BenV2RadPDF.pkl', 'rb'))
glyco2_hh.drop(glyco2_duplicates, inplace=True, errors='ignore')
glyco2_hh['Name'] = [glyco2_data.loc[i,'Name'] for i in glyco2_hh.index]
glyco2_hh.set_index('Name', inplace=True)
glyco2_hh.to_csv('descriptors/hydhist_glyco2.csv', index=True)

glyco_hh = glyco2_hh.iloc[:123, :]
glyco_hh.to_csv('descriptors/hydhist_glyco.csv', index=True)