In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import torch

from openbabel import pybel
from openbabel import openbabel as ob

import os
from contextlib import contextmanager
import pickle
import blosc

from pprint import pprint
import re

## Importing the required pickle files and merging them

In [2]:
def mergeDictionaries(dict1 : dict, dict2: dict):
    '''Merge dict2 into dict1 in place, concatenating the values of shared keys.

    Each value of dict2 is first converted with list(). If the key is already
    present in dict1, the converted items are appended to the existing entry
    with .extend(); otherwise the converted list becomes the new entry.

    Nothing is returned: dict1 itself is modified.
    '''
    for key, value in dict2.items():
        incoming = list(value)

        # New key: store the freshly built list and move on
        if key not in dict1:
            dict1[key] = incoming
            continue

        # Known key: grow the existing entry
        dict1[key].extend(incoming)

In [3]:
bond_angles = {}
folders = [
    '0', 'A', 'B', 'C', 'D', 'H', 'I', 'J', 'K', 'N', 'O', 'P', 'Q',
    'T', 'U', 'V', 'W', 'X', 'Z',
]

# Read the bond-angle pickle of every folder and fold it into bond_angles.
# NOTE(review): unpickling runs arbitrary code from the file -- only load trusted pickles.
for fold in folders:
    pkl_path = f"pkl/{fold}/cod-crest_bond_angles.pkl"
    with open(pkl_path, "rb") as pkl_file:
        payload = pkl_file.read()
    mergeDictionaries(bond_angles, pickle.loads(payload))

In [4]:
# Sanity check that the import worked: show the keys now present in bond_angles
# (each key is an 'A - B - C' angle-type string, as seen in the cell output)
bond_angles.keys()

dict_keys(['Cl - C - Npl', 'C - C - Cl', 'C - C - Npl', 'C - Npl - H', 'C - Npl - C', 'C - C - H', 'C - C - C', 'H - C - Npl', 'H - C - P', 'Npl - C - P', 'P - C - P', 'C - P - O', 'O - P - O', 'HO - O - P', 'C - O - HO', 'C - C - O', 'C - N - Car', 'H - C - N', 'C - C - N', 'Car - Car - N', 'Car - Car - Car', 'C - Car - Car', 'Car - Car - H', 'C - C - Car', 'Car - C - H', 'H - C - H', 'Car - C - Car', 'O - C - O', 'HO - O - HO', 'H - C - O', 'H - C - Nam', 'C - C - Nam', 'C - Nam - C', 'C - Nam - H', 'Nam - C - O', 'Car - C - O', 'Car - C - Nam', 'Car - Car - Cl', 'C - O - C', 'C - C - Nar', 'C - Nar - Car', 'Car - Nar - Nar', 'C - Nar - Nar', 'Nar - Car - O', 'Nar - Car - Nar', 'C - Car - Nar', 'Car - Nar - Car', 'Car - Car - Nar', 'C - O - Car', 'Car - Car - O', 'Car - Car - Ntr', 'Car - Ntr - O', 'O - Ntr - O', 'O.co - Cac - O.co', 'C - Cac - O.co', 'Cac - C - H', 'Cac - C - Npl', 'Npl - C - Npl', 'Car - S - Car', 'C - Car - S', 'Car - Car - S', 'H - Car - S', 'H - C - Nox', 'C - C

## Making a list of exceptions

In [5]:
def correctHybridization(bond_angle, key):
    '''Return the hybridization value to store for one bond-angle record.

    The central atom is the third space-separated token of key (the 'C' in
    'H - C - N'). When it is 'C', the result depends only on the length of
    bond_angle[-2]:

        2 entries -> 3 (sp3)
        1 entry   -> 2 (sp2)
        0 entries -> 1 (sp)

    In every other case the value already stored in bond_angle[7] is
    returned unchanged.
    '''
    centre = key.split(' ')[2]
    neighbour_count = len(bond_angle[-2])

    # len(bond_angle[-2]) -> hybridization assigned to a carbon centre
    carbon_lookup = {2: 3, 1: 2, 0: 1}

    if centre == 'C' and neighbour_count in carbon_lookup:
        return carbon_lookup[neighbour_count]

    # Non-carbon centre, or a count outside the table: keep the recorded value
    return bond_angle[7]

    # TODO (ideas noted for a more thorough correction):
    #   1. check the number of neighbour entries and their type
    #   2. derive the hybridization from bond orders
    #   3. sum up the bond orders

In [6]:
## Discarding outliers i.e. we don't really need them for predicting bond_angles
def discardOutlier(bond_angle, key):
    '''Flag a bond-angle record as a suspected outlier.

    A record is flagged only when the central atom (third space-separated
    token of key) is 'C' and the angle in bond_angle[0] exceeds a limit that
    depends on the length of bond_angle[-2]:

        2 entries -> angle > 165
        1 entry   -> angle > 170

    Returns 1 for a flagged record and 0 otherwise.
    '''
    centre = key.split(' ')[2]
    neighbour_count = len(bond_angle[-2])

    # len(bond_angle[-2]) -> angle above which a carbon-centred record is flagged
    angle_limits = {2: 165, 1: 170}
    limit = angle_limits.get(neighbour_count)

    if limit is None or centre != 'C':
        return 0

    return 1 if bond_angle[0] > limit else 0

In [7]:
## Some helper functions
def filterAtom(atom: str) -> str:
    '''Return atom with every digit 0-9 stripped out, e.g. 'C3' -> 'C'.'''
    digit_matcher = re.compile(r'[0-9]')
    return digit_matcher.sub('', atom)

In [8]:
## Checker function to see if that bond_angle actually exists in the sdf

@contextmanager
def temporary_directory_change(new_directory):
    '''Context manager that runs its body with new_directory as the working directory.

    The working directory in effect on entry is restored on exit, whether the
    body finishes normally, returns early, or raises (including a
    KeyboardInterrupt from interrupting the kernel). Without the try/finally
    an exception would leave the process in new_directory, so later relative
    paths would resolve against the wrong location.

    Raises whatever os.chdir raises if new_directory cannot be entered; in
    that case the working directory has not been changed.
    '''
    original_directory = os.getcwd()  # Save the current working directory
    os.chdir(new_directory)           # Change to the new directory
    try:
        yield                         # Provide control back to the caller
    finally:
        os.chdir(original_directory)  # Always revert to the original directory
    
    
def checkBondAngle(bondAngle, key, file):
    '''Check whether an angle of the given type and size exists in the molecule file.

    Parameters
    ----------
    bondAngle : number
        Angle value to look for; an angle in the file matches when it differs
        from this by at most 1.
    key : str
        Angle type written as 'A - B - C'; the tokens at positions 0, 2 and 4
        of key.split(' ') are compared with the digit-stripped atom types,
        with B being the vertex. Both end-atom orders are accepted.
    file : str
        Record file name. Its first two characters select the sub-folder of
        '21395061/cod-crest/' and are dropped from the name that is opened;
        everything from the first '.' onwards is replaced by '.sdf', falling
        back to '.xyz' if the .sdf cannot be read.

    Returns
    -------
    int
        0 if a matching angle was found, 1 otherwise (also when neither the
        .sdf nor the .xyz file could be read).
    '''
    # NOTE(review): a leading digit '0' is mapped to the folder named with the
    # letter 'O' -- confirm this is intended and not a typo for '0/'.
    if(file[0]=='0'): folder='O/'
    else: folder = file[:2]

    filename = file.split('.')[0]
    stem = filename[2:]  # file name without the folder prefix and extension

    with temporary_directory_change('21395061/cod-crest/' + folder):
        # Catch Exception rather than using a bare except: a bare except also
        # swallows KeyboardInterrupt/SystemExit, which would turn a kernel
        # interrupt into "angle not found" and let the caller delete the record.
        try:
            mol = next(pybel.readfile('sdf', stem + '.sdf'))

        except Exception:
            print("Sdf not opened able", filename)
            try:
                mol = next(pybel.readfile('xyz', stem + '.xyz'))
            except Exception:
                return 1

        # Split once; the key does not change while scanning the angles
        atoms = key.split(' ')

        for angle in ob.OBMolAngleIter(mol.OBMol):
            # angle[0] is used as the vertex (middle argument of GetAngle);
            # the +1 converts the iterator's indices to GetAtom's numbering
            b = mol.OBMol.GetAtom(angle[0] + 1)
            a = mol.OBMol.GetAtom(angle[1] + 1)
            c = mol.OBMol.GetAtom(angle[2] + 1)
            bond_angle = mol.OBMol.GetAngle(a, b, c)

            a_atomType = filterAtom(a.GetType())
            b_atomType = filterAtom(b.GetType())
            c_atomType = filterAtom(c.GetType())

            # Same vertex, end atoms in either order
            cond_1 = (atoms[0]==a_atomType) and (atoms[2]==b_atomType) and (atoms[4]==c_atomType)
            cond_2 = (atoms[4]==a_atomType) and (atoms[2]==b_atomType) and (atoms[0]==c_atomType)

            if(np.abs(bond_angle - bondAngle)<=1 and (cond_1 or cond_2)):
                return 0

        return 1

## Filtering out the erroneous data points

In [13]:
# Do a pre-filter over the entire dataset
for key in bond_angles:
    # Collect the records to keep instead of calling .pop() while enumerating:
    # popping during iteration shifts the list, so the record right after each
    # deleted one was skipped (neither corrected nor checked).
    kept = []

    for j in bond_angles[key]:

        # Correcting the hybridization based on no. of neighbouring atoms
        j[7] = correctHybridization(j, key)

        if(discardOutlier(j, key)==1):
            print(key, j[:-1])
            # A truthy result means the angle could not be confirmed in the file
            if(checkBondAngle(j[0], key, j[-1])):
                print("Deleted")
                continue

        kept.append(j)

    # Replace the contents in place so the dict itself is not modified mid-iteration
    bond_angles[key][:] = kept

C - C - H [170.2042362444216 'B/BRXOSZVUMUVRBG-ISUNJMEDSA-N.xyz'
 -0.32263784394910744 0.18529995639751465 0.19304586463069182
 0.9377989664042766 0.7288575098085266 3 0 0 0
 list([['H', 0.04831057871168722, 0.9297611203458154, 0], ['H', 0.12484791041185199, 0.8077118204805321, 0]])]
C - C - H [179.23252024775596 'C/CSXCKTBBSXTYDY-GJJXMTLWSA-M.xyz'
 -0.11975558488202243 -0.19916352673514995 0.22882232594066335
 0.7058783018401552 0.7019243667569639 3 0 0 0
 list([['C', -0.17156807316350886, 0.8922917180967372, 0], ['C', -0.22546619324490058, 1.0077025222855882, 0]])]
C - C - H [178.33111123550574 'C/CSXCKTBBSXTYDY-GJJXMTLWSA-M.xyz'
 -0.17156807316350886 -0.11975558488202243 0.16156100793968042
 0.8922917180967372 0.7769485622740788 3 0 0 0
 list([['H', 0.03725433104030093, 0.8607240046992728, 0], ['C', -0.048820303951156635, 0.9293458061351141, 0]])]
H - C - H [177.26962434919517 'H/HMHWMRRWHGJAIT-UHFFFAOYSA-M.xyz'
 -0.16340797212218594 0.10834161777363208 0.3566284608893144
 0.9641866

In [10]:
# Second pass, carbon-centred angle types only: drop wide angles that are not
# marked with hybridization 1 and cannot be confirmed in the molecule file.
for key in bond_angles:
    if(key.split(' ')[2]!='C'):
        continue

    # Build the list of survivors rather than popping while enumerating, which
    # skipped the record following every deleted one.
    kept = []

    for j in bond_angles[key]:
        if(j[0]>160 and j[7]!=1):
            # Deleting those molecules which had parsing error in file
            ret = checkBondAngle(j[0], key, j[-1])
            if(ret==1):
                continue

        kept.append(j)

    # In-place replacement keeps the dict unchanged while it is being iterated
    bond_angles[key][:] = kept

### Dumping into a corrected bond-angle pickle file

In [15]:
# Persist the filtered bond_angles dictionary as a pickle in the working directory
with open('corrected_BondAngles.pkl', 'wb') as out_handle:
    pickle.dump(bond_angles, out_handle)

KeyboardInterrupt: 