In [13]:
# Importing the require modules
import numpy as np
import pandas as pd

import sys
import os
import re
from pprint import pprint

from openbabel import pybel
from openbabel import openbabel

In [3]:
def filterAtom(atom: str) -> str:
    '''Removes any number from the string e.g. C3 becomes C'''
    pattern_order = r'[0-9]'
    return re.sub(pattern_order, '', atom)

def atomType(mol, atomIdx):
    '''get the atomic type given an atom index'''
    return mol.OBMol.GetAtom(atomIdx).GetType()

In [10]:
def getBonds(mol):
    '''Iterate through all the bonds in a molecule'''
    
    bonds = {}
    
    for bond in ob.OBMolBondIter(mol.OBMol):
        begin = filterAtom(atomType(mol, bond.GetBeginAtomIdx()))
        end = filterAtom(atomType(mol, bond.GetEndAtomIdx()))
        
        # Swap them for lexographic order
        if (end < begin):
            begin, end = end, begin
        
        #Appending to the dictionary of list
        if (f"{begin} - {end}" in bonds.keys()):
            bonds[f"{begin} - {end}"].append(round(bond.GetLength(),4))
        else:
            bonds[f"{begin} - {end}"]=[round(bond.GetLength(),4)]
        
    return bonds

In [5]:
def getAngles(mol):
    '''Iterate through all the bond angles in a molecule'''
    
    angles = {}
    
    for angle in ob.OBMolAngleIter(mol.OBMol):
        a = angle[0] + 1
        b = mol.OBMol.GetAtom(angle[1] + 1)
        c = angle[2] + 1
        bond_angle = b.GetAngle(a, c)

        aType = filterAtom(atomType(mol, a))
        bType = filterAtom(b.GetType())
        cType = filterAtom(atomType(mol, c))
        
        # Swap them for lexographic order
        if (cType < aType):
            aType, cType = cType, aType
            
        #Appedning to the dictonary of lists if the angle b/t the elements already existse
        if(f"{aType} - {bType} - {cType}" in angles.keys()):
            angles[f"{aType} - {bType} - {cType}"].append(round(bond_angle ,3))
        else:
            angles[f"{aType} - {bType} - {cType}"] = [round(bond_angle ,3)]                                                                                      
                                                                                                     
    return angles

In [6]:
def getTorsions(mol):
    '''Iterate through all the torsions in a molecule'''
    
    torsions = {}
    
    for torsion in ob.OBMolTorsionIter(mol.OBMol):
        a = torsion[0] + 1
        b = torsion[1] + 1
        c = torsion[2] + 1
        d = torsion[3] + 1
        torsion_angle = mol.OBMol.GetTorsion(a, b, c, d)

        aType = filterAtom(atomType(mol, a))
        bType = filterAtom(atomType(mol, b))
        cType = filterAtom(atomType(mol, c))
        dType = filterAtom(atomType(mol, d))

        # Switch if not in lexographic order
        if(dType < aType):
            aType, dType = dType, aType
            
        #Appedning to the dictonary of lists if the torsion angle b/t the elements already existse
        if(f"{aType} - {bType} - {cType} -{dType}" in torsions.keys()):
            torsions[f"{aType} - {bType} - {cType} -{dType}"].append(round(torsion_angle, 3))
        
        # Checking for palindromic sequence
        elif(f"{dType} - {cType} - {bType} -{aType}" in torsions.keys()):
            torsions[f"{dType} - {cType} - {bType} -{aType}"].append(round(torsion_angle, 3))
            
        else:
            torsions[f"{aType} - {bType} - {cType} -{dType}"] = [round(torsion_angle, 3)]
    
    return torsions

In [37]:
def analyze_molecular_data(file: str):
    '''Fetches the bond lengths, angles and dihedrals from a given file'''
    
    # Load the File
    extension=file.split('.')[-1]
    mol = next(pybel.readfile(extension, file))

    # Get bond lengths
    bond_lengths = getBonds(mol)
    
    # Get bond angles
    bond_angles = getAngles(mol)
    
    # Get torsions
    dihedrals = getTorsions(mol)

    return (bond_lengths, bond_angles, dihedrals)


def mergeDictionaries(dict1 : dict, dict2) -> dict:
    '''Merges two Python dictionary by combining elements for a common key
    The function merges dict2 into dict 1'''
    
    for key, value in dict2:
        if key in dict1:
            dict1[key].extend(value)
        else:
            dict1[key] = value
    
    return dict1


# Create a pandas DataFrame to store the molecular data
# data = pd.DataFrame({
#     'Bond Length': bond_lengths,
#     'Bond Angle': bond_angles,
#     'Dihedral': dihedrals
# })

In [14]:
# Example usage
xyz_file = '21395061/pubchemqc/1/000025003.xyz'
#mol = next(pybel.readfile('xyz', xyz_file))
#pprint(getBonds(mol))
#pprint(getAngles(mol))
#pprint(getTorsions(mol))

molecular_data = analyze_molecular_data(xyz_file)
pprint(molecular_data)

({'C - C': [1.5216, 1.52, 1.5151],
  'C - Cl': [1.8124, 1.7853],
  'C - H': [1.1053, 1.0903, 1.0896, 1.0871, 1.088, 1.0868],
  'C - O': [1.193]},
 {'C - C - C': [111.77, 112.772, 113.7],
  'C - C - Cl': [101.188, 106.85, 109.628, 109.119],
  'C - C - H': [109.839, 109.054, 110.736, 111.398, 111.201, 112.922],
  'C - C - O': [124.6],
  'C - H - Cl': [107.231, 107.75],
  'C - H - H': [108.517, 109.413, 109.244, 109.987],
  'C - O - H': [122.405]},
 {'C - C - C -Cl': [-59.374, 69.713],
  'C - C - C -H': [-40.336,
                   -169.91,
                   58.827,
                   -172.086,
                   -178.096,
                   -49.009,
                   -47.071,
                   71.732,
                   -168.028,
                   -175.648,
                   -56.845,
                   63.394],
  'C - C - C -O': [142.7, 13.127],
  'Cl - C - C -Cl': [-169.206],
  'Cl - C - C -H': [73.065, -51.005, 72.072, 64.837, -176.36, -56.121],
  'Cl - C - C -O': [-103.899]})


In [47]:
#Collecting data from the Molecular Dataset

bond_lengths = {}
bond_angles = {}
dihedrals = {}

ext = ('sdf', 'xyz')

for root, dirs, files in os.walk('21395061/cod-crest/'):
    for file in files:
        extension=file.split('.')[-1]
        if(extension not in ext): continue
            
        filename = os.path.join(root,file)
        print(filename)

        length, angle, torsion = analyze_molecular_data(filename)
        print(length)
        bond_lengths = mergeDictionaries(bond_lengths, length)
        bond_angles = mergeDictionaries(bond_angles, angle)
        dihedrals = mergeDictionaries(dihedrals, torsion)

21395061/cod-crest/R/RLFWWDJHLFCNIJ-UHFFFAOYSA-N.xyz
{'C - H': [1.0957, 1.0882, 1.0876, 1.0905, 1.0866, 1.0891], 'Car - H': [1.0772, 1.0797, 1.0796, 1.0799, 1.0803], 'C - Nar': [1.4514], 'Car - Car': [1.382, 1.392, 1.3853, 1.3931, 1.4575, 1.356, 1.3847, 1.3823], 'Car - Nar': [1.3984, 1.3839, 1.3996], 'Nar - Nar': [1.4004], 'Car - O': [1.2189], 'C - Car': [1.4881], 'Car - Npl': [1.3865], 'H - Npl': [1.0121, 1.0097]}


ValueError: too many values to unpack (expected 2)

In [51]:
length, angle, torsion = analyze_molecular_data('21395061/cod-crest/R/RLFWWDJHLFCNIJ-UHFFFAOYSA-N.xyz')
pprint(length)

{'C - Car': [1.4881],
 'C - H': [1.0957, 1.0882, 1.0876, 1.0905, 1.0866, 1.0891],
 'C - Nar': [1.4514],
 'Car - Car': [1.382, 1.392, 1.3853, 1.3931, 1.4575, 1.356, 1.3847, 1.3823],
 'Car - H': [1.0772, 1.0797, 1.0796, 1.0799, 1.0803],
 'Car - Nar': [1.3984, 1.3839, 1.3996],
 'Car - Npl': [1.3865],
 'Car - O': [1.2189],
 'H - Npl': [1.0121, 1.0097],
 'Nar - Nar': [1.4004]}


In [52]:
pprint(angle)

{'C - Car - Car': [129.648],
 'C - Car - H': [111.08, 109.643, 110.643],
 'C - Car - Nar': [118.188],
 'C - H - H': [108.023, 108.386, 108.997, 108.839, 109.302, 108.977],
 'C - Nar - Car': [120.213],
 'C - Nar - H': [112.02, 109.167, 108.49],
 'C - Nar - Nar': [115.298],
 'Car - Car - Car': [119.902,
                     119.649,
                     120.551,
                     119.713,
                     120.367,
                     119.804,
                     108.098],
 'Car - Car - H': [119.397, 119.451, 120.211, 120.131, 119.779, 120.405],
 'Car - Car - Nar': [125.449],
 'Car - Car - Npl': [121.087, 130.766],
 'Car - H - Car': [120.941, 119.996, 120.073, 119.499],
 'Car - Nar - Car': [104.84, 119.424, 120.672, 110.117],
 'Car - Nar - Nar': [105.842],
 'Car - O - Car': [128.962],
 'Car - O - Nar': [126.153],
 'H - Car - Npl': [110.85, 115.252],
 'H - H - Npl': [112.833],
 'Nar - Car - Nar': [120.704, 110.802]}


In [53]:
pprint(torsion)

{'C - Car - Car -Car': [-179.382],
 'C - Car - Car -Npl': [-1.965],
 'C - Nar - Car -C': [45.651],
 'C - Nar - Car -Car': [-135.872],
 'C - Nar - Car -Nar': [176.65],
 'C - Nar - Nar -Car': [-60.369, 138.335],
 'Car - C - Nar -H': [55.531, 176.151, -65.213],
 'Car - Car - C -H': [106.339, -12.98, -133.233],
 'Car - Car - Car -Car': [0.772, 0.294, -1.058, -1.069, 0.272, 0.786],
 'Car - Car - Car -H': [179.481,
                        -179.661,
                        -178.435,
                        179.377,
                        179.444,
                        177.655,
                        -179.06,
                        179.77,
                        -179.877,
                        -177.93],
 'Car - Car - Car -Nar': [179.67, 179.562, 1.184, 2.328],
 'Car - Car - Car -O': [-176.452],
 'Car - Car - Npl -H': [-17.449, -147.148, 165.419, 35.72],
 'Car - Nar - Car -Car': [-44.752, 134.62, -164.504],
 'Car - Nar - Car -Nar': [156.84, -23.788, -4.288, -4.873],
 'Car - Nar - Car -O