In [39]:

import csv
import pandas as pd
import random
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
import matplotlib.pyplot as plt
import seaborn as sns
from mol2vec.features import mol2alt_sentence, MolSentence, DfVec, sentences2vec
from mol2vec.helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MolStandardize

In [40]:
def mol2alt_sentence(mol, radius):
    """Build the "alternating" Morgan-identifier sentence of a molecule.

    The Morgan fingerprint of ``mol`` is computed once, and the identifiers it
    reports are re-ordered atom by atom (in ``mol.GetAtoms()`` order): for each
    atom the identifier at radius 0 comes first, then radius 1, and so on up to
    ``radius``. An (atom, radius) pair for which the fingerprint reports no
    identifier is skipped, so the result can be shorter than
    ``n_atoms * (radius + 1)``.

    NOTE: this definition replaces the ``mol2alt_sentence`` imported from
    ``mol2vec.features`` at the top of the notebook.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        Molecule to describe.
    radius : int or float
        Maximum fingerprint radius. It is truncated with ``int()`` before use,
        so ``1`` and ``1.0`` behave the same.

    Returns
    -------
    list of str
        Identifiers as strings, ordered atom 0 radius 0, atom 0 radius 1, ...,
        atom 1 radius 0, ...
    """
    # Use the same integer radius for the RDKit call and for the radii we read
    # back, instead of handing RDKit the raw (possibly float) value.
    max_radius = int(radius)
    radii = list(range(max_radius + 1))

    # bitInfo is filled as {identifier: ((atom_idx, radius), ...)}
    info = {}
    AllChem.GetMorganFingerprint(mol, max_radius, bitInfo=info)

    # {atom index: {radius: identifier or None}}
    dict_atoms = {atom.GetIdx(): {r: None for r in radii} for atom in mol.GetAtoms()}
    for identifier, occurrences in info.items():
        for atom_idx, radius_at in occurrences:
            dict_atoms[atom_idx][radius_at] = identifier

    # Flatten in alternating order. Compare against None explicitly: a plain
    # truthiness test would also discard a valid identifier whose value is 0.
    alternating_sentence = []
    for per_radius in dict_atoms.values():
        for r in radii:
            identifier = per_radius[r]
            if identifier is not None:
                alternating_sentence.append(str(identifier))
    return alternating_sentence

In [41]:
# Symbol -> per-atom "weight" added up by isValidSmiles. Uppercase symbols carry
# their atomic number; lowercase (aromatic) symbols are mapped to 0, so they add
# nothing to the weight or heavy-atom totals.
# NOTE(review): as a consequence a purely aromatic string such as "c1ccccc1"
# scores 0 and fails the minimum-weight check below -- confirm this is intended.
allowedAtomsDict = {
    'H' : 1,'h' : 0,
    'B' : 5,'b' : 0,
    'C' : 6,'c' : 0,
    'N' : 7,'n' : 0,
    'O' : 8,'o' : 0,
    'F' : 9,'f' : 0,
    'P' : 15,'p': 0,
    'S' : 16,'s': 0,
    'Cl': 17,'Br' : 35
}

# Every ASCII letter (plus the two-letter symbols); anything that is not found
# in here is treated as SMILES punctuation (digits, brackets, bonds, charges...).
word = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZzBrCl"


def isValidCharacter(c):
    """Return True if token ``c`` may appear in an accepted SMILES string.

    ``c`` is either a single character or a two-character token that
    isValidSmiles built for a symbol ending in 'l' or 'r'.

    * Two-character tokens are valid only if they are exactly "Cl" or "Br".
      The previous substring test accepted any pair that does not occur in
      ``word`` (e.g. "Al", "Sr", "#l").
    * A single character is valid if it is not a letter, or if it is one of
      the allowed element letters.
    """
    if len(c) > 1:
        return c in ("Cl", "Br")
    return c not in word or c in "HhBbCcNnOoFfPpSsClBr"


def isValidSmiles(smiles,atom_weight = 600,heavy_atom_count = 50):
    '''
    Character-level filter for a SMILES string.

    Returns True only when
        1. every letter belongs to an allowed element (H, B, C, N, O, F, P, S,
           Cl, Br and their lowercase forms), and
        2. the summed allowedAtomsDict values lie in [3, atom_weight], and
        3. the number of atoms with a value > 1 is at most heavy_atom_count.

    The string is NOT parsed with RDKit here; callers that need a parseable
    molecule must check Chem.MolFromSmiles themselves.

    Known limitation: symbols are examined letter by letter, so a bracket
    element whose letters are each individually allowed (e.g. "[Co]", "[Sn]")
    is still accepted and scored as its separate letters.
    '''
    t_weight = 0
    heavyAtomCount = 0

    # Scan right to left so that a trailing 'l' / 'r' can be glued to the
    # letter in front of it ("Cl", "Br").
    pos = len(smiles) - 1
    while pos >= 0:
        token = smiles[pos]
        if token == 'r' or token == 'l':
            # The old bound check compared against the wrong end of the string
            # and was never true, so Cl/Br were never recognised or counted.
            # "#" stands in when there is no preceding character.
            token = (smiles[pos - 1] if pos >= 1 else "#") + token
            pos -= 1
        pos -= 1

        if not isValidCharacter(token):
            return False

        atom_value = allowedAtomsDict.get(token)
        if atom_value is not None:
            t_weight += atom_value
            if atom_value > 1:
                heavyAtomCount += 1

    return 3 <= t_weight <= atom_weight and heavyAtomCount <= heavy_atom_count

In [42]:
# Shared LargestFragmentChooser instance, created once at module level;
# standardizeAndcanonical calls its choose() method on each parsed molecule.
lfc = MolStandardize.fragment.LargestFragmentChooser()
def standardizeAndcanonical(smi):
    """Reduce a SMILES string to its largest fragment and re-serialise it.

    The string is parsed with ``Chem.MolFromSmiles``, passed through the
    module-level ``lfc`` (LargestFragmentChooser) and written back with
    ``Chem.MolToSmiles`` (which, per the RDKit documentation, produces
    canonical SMILES by default).

    Parameters
    ----------
    smi : str
        Input SMILES, possibly containing several '.'-separated fragments.

    Returns
    -------
    str
        SMILES of the largest fragment. If ``smi`` cannot be parsed, or parses
        to a molecule with no atoms, it is returned unchanged so that the
        caller's own validity checks can reject it instead of this function
        failing inside ``lfc.choose``.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None or mol.GetNumAtoms() == 0:
        return smi
    largest_fragment = lfc.choose(mol)
    return Chem.MolToSmiles(largest_fragment)

In [43]:
# Spot check on ten salt-form molecules: strip the counter-ion, then print how
# many identifiers mol2alt_sentence yields at radius 0 and at radius 1.
smiles = [
    'COC(=O)CCC(=O)CN.Cl',
    'C[C@@H](O)[C@H]1C(=O)N2C(C(=O)O)=C(S[C@@H]3CN[C@H](Cc4c(CO)c[n+](C)n4C)C3)[C@H](C)[C@H]12.[Cl-]',
    'CC(C)NCCNC(=O)CN(CC(=O)N(C)C1Cc2ccccc2C1)c1cc(Cl)ccc1Oc1ccc(Cl)cc1.Cl',
    'Cc1nn(CC(=O)O)c2nc3ccccc3c(NCCCN(C)C)c12.Cl',
    'COc1ccc(C(N)CCc2ccc(C)o2)cc1.Cl',
    'Cc1c[nH]c2cc(/C=C/C(=O)NC3CCC(CCN4CCc5ccc(C#N)cc5CC4)CC3)ccc12.Cl',
    'COc1ccccc1CN(C)CC1CC1c1cc(F)ccc1OC.Cl',
    'CC(CCc1ccccc1)NCC(O)CON=C1c2ccccc2-c2ccccc21.Cl',
    'COc1ccc2c(c1)CC(NCc1ccccc1)CC2.Cl',
    'CNCCCNc1c2ccccc2nc2cccc([N+](=O)[O-])c12.Cl',
]
for smi in smiles:
    # keep only the largest fragment before fingerprinting
    smi = standardizeAndcanonical(smi)
    mol = Chem.MolFromSmiles(smi)
    sent_0, sent_1 = (mol2alt_sentence(mol, r) for r in (0, 1))
    print(*map(len, (sent_0, sent_1)))

10 20
31 62
40 80
25 50
18 36
36 72
24 48
30 60
20 40
23 46


In [None]:
# Filter the raw pre-training corpus down to molecules that
#   * parse with RDKit,
#   * pass the element / size filter isValidSmiles, and
#   * have both a radius-0 and a radius-1 identifier for every atom.
path = "../dataset/pretrain/pretrain_data.txt"

PROGRESS_EVERY = 100000    # report after this many processed molecules
EXPECTED_TOTAL = 4000000   # denominator of the progress fraction; presumably the approximate corpus size

valid_smiles = []
total = 0    # molecules kept
total2 = 0   # molecules that passed isValidSmiles and parsed

with open(path,"r") as f:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in f:
        raw_smi = line.strip()
        # Skip blank and unparseable lines up front so one bad record cannot
        # abort the whole run inside the standardisation step.
        if not raw_smi or Chem.MolFromSmiles(raw_smi) is None:
            continue

        smi = standardizeAndcanonical(raw_smi)
        if not isValidSmiles(smi):
            continue

        t = Chem.MolFromSmiles(smi)
        if t is None:  # cannot be processed
            continue

        total2 += 1
        sentence_rid_0 = mol2alt_sentence(t,0)
        sentence_rid_1 = mol2alt_sentence(t,1)

        # Keep the molecule only when the radius-1 sentence is exactly twice
        # as long as the radius-0 one, i.e. no atom lost an identifier (this is
        # the relation the ten-molecule spot check prints). The previous
        # condition used `!=`, which kept only the molecules that violate it,
        # and indexed sentence_rid_0[0], which fails on an empty sentence.
        if sentence_rid_0 and 2*len(sentence_rid_0) == len(sentence_rid_1):
            total += 1
            valid_smiles.append(smi)

        # Report only when the counter has just advanced; checking at loop
        # level re-printed on every skipped line while total2 sat on a multiple.
        if total2 % PROGRESS_EVERY == 0:
            print(total == total2)
            print(total2 / EXPECTED_TOTAL)
print(total,total2)

In [None]:
# Shuffle the accepted molecules and write an 80/20 train/test split.
path_train = "../dataset/pretrain/pretrain_data_train.txt"
path_test = "../dataset/pretrain/pretrain_data_test.txt"

SPLIT_SEED = 42        # fixed seed so the split is reproducible across runs
TRAIN_FRACTION = 0.8

valid_id = list(range(len(valid_smiles)))
# A dedicated Random instance keeps the split independent of global RNG state.
random.Random(SPLIT_SEED).shuffle(valid_id)

n_train = int(len(valid_id) * TRAIN_FRACTION)
train_ids = valid_id[:n_train]
# Start the test slice exactly at n_train: the old `+ 1` dropped one molecule
# from both files.
test_ids = valid_id[n_train:]
assert len(train_ids) + len(test_ids) == len(valid_smiles)

with open(path_train,"w") as f:
    # Index through the shuffled id; the old code wrote valid_smiles[i], which
    # ignored the shuffle and emitted the molecules in their original order.
    f.writelines(valid_smiles[idx] + "\n" for idx in train_ids)

with open(path_test,"w") as f:
    f.writelines(valid_smiles[idx] + "\n" for idx in test_ids)