In [2]:
from pymongo import MongoClient
import csv

# Connection to MongoDB
client = MongoClient('localhost', 27017)  # Update with your MongoDB host and port
db = client['belka']  # Database where you want to store the molecules
collection = db['molecules_collection']  # Collection for storing each molecule

# Load each (SMILES list, SwissADME CSV) file pair and store one document per molecule.
# NOTE: this cell is not idempotent -- running it again inserts every molecule a second time.
for i in range(1, 12):  # Files are numbered from 1 to 11
    smiles_file = f'./u_files/u{i}.txt'
    properties_file = f'./u_files/swissadme{i}.csv'

    with open(smiles_file, 'r') as file_smiles, open(properties_file, newline='') as file_props:
        smiles_lines = file_smiles.readlines()
        property_rows = list(csv.DictReader(file_props))

    # SMILES lines and CSV rows are paired purely by position. zip() stops at the
    # shorter input, so a length mismatch would otherwise drop rows without a trace.
    if len(smiles_lines) != len(property_rows):
        print(
            f"Warning: {smiles_file} has {len(smiles_lines)} lines but "
            f"{properties_file} has {len(property_rows)} rows; "
            "unmatched trailing entries are skipped."
        )

    # Build the documents for this file
    molecule_docs = [
        {'SMILES': smiles.strip(), 'properties': props}
        for smiles, props in zip(smiles_lines, property_rows)
    ]

    # One batched round-trip per file instead of one insert per molecule.
    # insert_many rejects an empty list, so skip files that produced no pairs.
    if molecule_docs:
        collection.insert_many(molecule_docs)

print("Data insertion complete.")

Data insertion complete.


In [9]:
from pymongo import MongoClient
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, rdMolDescriptors
from tqdm.auto import tqdm
import pickle
from bson.binary import Binary

from rdkit.Chem import AllChem, MACCSkeys
from rdkit.Avalon.pyAvalonTools import GetAvalonFP
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem, Descriptors, rdMolDescriptors
from rdkit.Chem import AllChem, Descriptors, rdMolDescriptors
from rdkit.Chem.Descriptors3D import NPR1, NPR2, PMI1, PMI2, PMI3, RadiusOfGyration, InertialShapeFactor, Eccentricity, Asphericity, SpherocityIndex
from rdkit.Chem.rdPartialCharges import ComputeGasteigerCharges


# Connect to the local MongoDB instance and select the molecule collection
client = MongoClient(host='localhost', port=27017)
db = client['belka']
collection = db['molecules_collection']

# Materialise every stored molecule document in memory (empty filter = all documents)
molecules = [document for document in collection.find({})]

# Define processing function
def _fingerprint_to_binary(fingerprint):
    """Serialize a fingerprint bit vector as a BSON Binary holding a pickled list of bools."""
    # NOTE: readers must unpickle this payload; only unpickle data from a trusted database.
    bits = list(map(bool, fingerprint.ToList()))
    return Binary(pickle.dumps(bits, protocol=pickle.HIGHEST_PROTOCOL))


def compute_properties(molecule, random_seed=42):
    """Compute fingerprints and 2D/3D descriptors for one molecule document.

    Args:
        molecule: Mapping with a 'SMILES' key holding the molecule's SMILES string.
        random_seed: Seed for the ETKDG conformer embedding, so the 3D descriptors
            are reproducible between runs. Pass -1 for an unseeded embedding.

    Returns:
        dict of new fields to store on the document: pickled fingerprint bit lists
        ('ECFP', 'MACCS', 'Torsion', 'Avalon', 'RDK'), scalar 2D and 3D descriptors,
        and 'Partial_Charges' (one Gasteiger charge per atom, explicit hydrogens
        included).

    Raises:
        ValueError: If the SMILES cannot be parsed or no 3D conformer can be embedded.
    """
    smiles = molecule['SMILES']
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a clear message.
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    mol = Chem.AddHs(mol)  # Add Hydrogens

    # Fingerprints.
    # NOTE(review): these are computed on the hydrogen-added molecule, which differs
    # from fingerprints of the implicit-H molecule -- confirm this is intended.
    ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    maccs = MACCSkeys.GenMACCSKeys(mol)
    rdk_fp = Chem.rdmolops.RDKFingerprint(mol)
    torsion = Chem.rdMolDescriptors.GetHashedTopologicalTorsionFingerprint(mol)
    avalon = GetAvalonFP(mol)

    # 2D descriptors (independent of the conformer, so computed once)
    mol_weight = Descriptors.MolWt(mol)
    log_p = Descriptors.MolLogP(mol)
    tpsa = rdMolDescriptors.CalcTPSA(mol)
    num_rotatable_bonds = Descriptors.NumRotatableBonds(mol)

    # Generate and optimize a 3D conformation
    embed_params = AllChem.ETKDG()
    embed_params.randomSeed = random_seed
    if AllChem.EmbedMolecule(mol, embed_params) == -1:
        # -1 means no conformer was generated; the 3D descriptors below need one.
        raise ValueError(f"Could not embed a 3D conformer for SMILES: {smiles!r}")
    AllChem.MMFFOptimizeMolecule(mol)

    # 3D descriptors
    pmi1, pmi2, pmi3 = PMI1(mol), PMI2(mol), PMI3(mol)
    npr1, npr2 = NPR1(mol), NPR2(mol)  # Normalized principal moments ratio
    rg = RadiusOfGyration(mol)  # Radius of gyration
    inertial_shape_factor = InertialShapeFactor(mol)
    eccentricity = Eccentricity(mol)
    asphericity = Asphericity(mol)
    spherocity_index = SpherocityIndex(mol)

    # Gasteiger partial charges, one entry per atom.
    # NOTE(review): GetProp yields the charges as strings, so they are stored as
    # strings; kept as-is for compatibility with existing readers.
    ComputeGasteigerCharges(mol)
    partial_charges = [atom.GetProp('_GasteigerCharge') for atom in mol.GetAtoms()]

    # LogD approximation: CalcCrippenDescriptors returns (logP, MR), so index 0 is the
    # Crippen logP used as the stand-in. Index 1 is molar refractivity, which would
    # just duplicate 'Volume' below.
    logd = rdMolDescriptors.CalcCrippenDescriptors(mol)[0]

    # Compute volumetric properties
    volume = Descriptors.MolMR(mol)  # Molar refractivity approximates volume

    # New fields to append to the document
    new_fields = {
        'ECFP': _fingerprint_to_binary(ecfp),
        'MACCS': _fingerprint_to_binary(maccs),
        'TPSA': tpsa,
        'LogP': log_p,
        'Torsion': _fingerprint_to_binary(torsion),
        'Avalon': _fingerprint_to_binary(avalon),
        'RDK': _fingerprint_to_binary(rdk_fp),
        'Molecular_Weight': mol_weight,
        'Num_Rotatable_Bonds': num_rotatable_bonds,
        'PMI1': pmi1,
        'PMI2': pmi2,
        'PMI3': pmi3,
        'NPR1': npr1,
        'NPR2': npr2,
        'Radius_of_Gyration': rg,
        'Inertial_Shape_Factor': inertial_shape_factor,
        'Eccentricity': eccentricity,
        'Asphericity': asphericity,
        'Spherocity_Index': spherocity_index,
        'Partial_Charges': partial_charges,
        'LogD': logd,
        'Volume': volume,
    }

    return new_fields

# Write the computed descriptors back: one $set update per document, matched on _id
for molecule in tqdm(molecules):
    new_properties = compute_properties(molecule)
    collection.update_one(
        filter={'_id': molecule['_id']},
        update={'$set': new_properties},
    )

100%|██████████| 2110/2110 [01:03<00:00, 33.07it/s]
