In [3]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

In [4]:
# Regex for a signed decimal number: optional minus sign, optional integer
# digits, at most ONE decimal point, then at least one digit.
# (Previously `\.*` allowed any number of dots, so text such as "1..5" was
# accepted as a number and would later fail in float().)
num_pattern = r'-?\d*\.?\d+'

In [10]:
# Quick check: the first number-like substring of the sample text is shown.
re.search(num_pattern, 'a.321bc')[0]

'.321'

In [12]:
# Atom row: three captured numbers, a captured alphabetic symbol and a trailing
# integer, all separated by whitespace.
atom_row_pattern = r'\s+'.join([f'({num_pattern})'] * 3 + ['([a-zA-Z]+)', r'\d+'])

In [34]:
# Bond row: three captured integers followed by four more (uncaptured)
# whitespace-separated integers.
bond_pattern = r'(\d+)\s+(\d+)\s+(\d+)' + r'\s+\d+' * 4

In [58]:
class Atom:
    """An atom described by three Cartesian components and a name.

    Adding or subtracting two atoms yields a NumPy array of the
    component-wise result rather than another ``Atom``.
    """

    def __init__(self, x, y, z, name):
        self.x, self.y, self.z = x, y, z
        self.name = name

    def __repr__(self):
        return f'[{self.name}] x: {self.x}, y: {self.y}, z: {self.z}'

    def __sub__(self, other):
        """Return ``self - other`` component-wise as a 3-element array."""
        return np.array([self.x - other.x, self.y - other.y, self.z - other.z])

    def __add__(self, other):
        """Return ``self + other`` component-wise as a 3-element array."""
        return np.array([self.x + other.x, self.y + other.y, self.z + other.z])

    def coord(self):
        """Return the position as a 3-element array ``[x, y, z]``."""
        return np.array([self.x, self.y, self.z])
        
        
class Bond:
    """A bond between two atoms, referenced by index, with a bond order."""

    def __init__(self, index1, index2, order=1):
        self.index1, self.index2 = index1, index2
        self.order = order

    def __repr__(self):
        return f'[{self.index1} - {self.index2}] order: {self.order}'

In [59]:
# Everything parsed from the MOL file ends up in these two lists.
molecule = dict(atoms=[], bonds=[])

filename = 'tempo_2006285.mol'
with open(filename, 'r', encoding='utf-8') as mol_file:
    for line in mol_file:
        # Atom rows are tried first; the pattern is searched anywhere in the line.
        atom_match = re.search(atom_row_pattern, line)
        if atom_match:
            x, y, z, name = atom_match.group(1, 2, 3, 4)
            molecule['atoms'].append(Atom(float(x), float(y), float(z), name))
            continue

        # Otherwise the stripped line must start with a bond row.
        bond_match = re.match(bond_pattern, line.strip())
        if not bond_match:
            continue

        idx1, idx2, order = (int(field) for field in bond_match.group(1, 2, 3))
        if order == 0:
            # matched wrong line
            continue
        molecule['bonds'].append(Bond(idx1, idx2, order))

In [60]:
# List every parsed atom, numbered from 1.
for number, atom in enumerate(molecule['atoms'], start=1):
    print(f'{number}: {atom}')

1: [O] x: -2.0904, y: -0.833, z: -0.084
2: [N] x: -0.8358, y: -0.3332, z: 0.1294
3: [C] x: 0.2799, y: -1.3339, z: 0.05
4: [C] x: -0.7143, y: 1.161, z: 0.0499
5: [C] x: 1.6979, y: -0.6901, z: 0.2443
6: [C] x: 0.758, y: 1.6686, z: 0.2441
7: [C] x: 1.843, y: 0.7345, z: -0.3354
8: [C] x: 0.0258, y: -2.3431, z: 1.2159
9: [C] x: 0.2189, y: -2.0966, z: -1.3123
10: [C] x: -1.5927, y: 1.7186, z: 1.2158
11: [C] x: -1.2831, y: 1.6724, z: -1.3123
12: [H] x: 1.9522, y: -0.6151, z: 1.3061
13: [H] x: 2.4817, y: -1.3187, z: -0.1889
14: [H] x: 0.9941, y: 1.7895, z: 1.3059
15: [H] x: 0.8946, y: 2.6641, z: -0.1893
16: [H] x: 2.8292, y: 1.1274, z: -0.0673
17: [H] x: 1.7927, y: 0.7143, z: -1.426
18: [H] x: -0.0093, y: -1.8209, z: 2.1759
19: [H] x: 0.8121, y: -3.0982, z: 1.2754
20: [H] x: -0.9251, y: -2.8691, z: 1.1062
21: [H] x: 0.37, y: -1.4167, z: -2.1523
22: [H] x: 0.9825, y: -2.8765, z: -1.3682
23: [H] x: -0.7431, y: -2.5923, z: -1.46
24: [H] x: -1.259, y: 1.3156, z: 2.1759
25: [H] x: -1.5413, y: 2.807

In [61]:
# Show the list of parsed Bond objects (rendered through Bond.__repr__).
molecule['bonds']

[[1 - 2] order: 1,
 [2 - 3] order: 1,
 [2 - 4] order: 1,
 [3 - 5] order: 1,
 [3 - 8] order: 1,
 [3 - 9] order: 1,
 [4 - 6] order: 1,
 [4 - 10] order: 1,
 [4 - 11] order: 1,
 [5 - 7] order: 1,
 [5 - 12] order: 1,
 [5 - 13] order: 1,
 [6 - 7] order: 1,
 [6 - 14] order: 1,
 [6 - 15] order: 1,
 [7 - 16] order: 1,
 [7 - 17] order: 1,
 [8 - 18] order: 1,
 [8 - 19] order: 1,
 [8 - 20] order: 1,
 [9 - 21] order: 1,
 [9 - 22] order: 1,
 [9 - 23] order: 1,
 [10 - 24] order: 1,
 [10 - 25] order: 1,
 [10 - 26] order: 1,
 [11 - 27] order: 1,
 [11 - 28] order: 1,
 [11 - 29] order: 1]

In [65]:
# atoms[0] is O and atoms[1] is N (0-based list positions), so this is O minus N.
vec_n_o = molecule['atoms'][0].coord() - molecule['atoms'][1].coord()
print(vec_n_o)

[-1.2546 -0.4998 -0.2134]


In [69]:
# Difference between the carbons at list positions 2 and 3 (listed as atoms 3 and 4).
vec_c3_c4 = molecule['atoms'][2].coord() - molecule['atoms'][3].coord()
print(vec_c3_c4)

[ 9.9420e-01 -2.4949e+00  1.0000e-04]


In [70]:
# Dot product of the two raw vectors; a value near zero means they are almost perpendicular.
np.dot(vec_n_o, vec_c3_c4)

-0.0003936399999999927

In [71]:
# Scale the N-O vector to unit length.
n_o_length = np.linalg.norm(vec_n_o)
vec_n_o_norm = vec_n_o / n_o_length
print(vec_n_o_norm)

[-0.91761123 -0.36555244 -0.15608021]


In [72]:
# Scale the C3-C4 vector to unit length.
c3_c4_length = np.linalg.norm(vec_c3_c4)
vec_c3_c4_norm = vec_c3_c4 / c3_c4_length
print(vec_c3_c4_norm)

[ 3.70183457e-01 -9.28958668e-01  3.72343047e-05]


In [73]:
# Cosine of the angle between the two unit vectors.
np.dot(vec_n_o_norm, vec_c3_c4_norm)

-0.00010720027770776481

In [76]:
# Gram-Schmidt step: subtract from the C3-C4 direction its component along
# the N-O direction, then rescale the remainder to unit length.
overlap = vec_n_o_norm.dot(vec_c3_c4_norm)
vec_y_raw = vec_c3_c4_norm - vec_n_o_norm * overlap
vec_y = vec_y_raw / np.linalg.norm(vec_y_raw)
print(vec_y)

[ 3.70085091e-01 -9.28997861e-01  2.05024624e-05]


In [77]:
# Orthogonality check: should be zero up to floating-point rounding.
np.dot(vec_y, vec_n_o_norm)

-2.930310481026252e-18

In [78]:
# Use the unit N-O vector as the x axis (this binds the same array, not a copy).
vec_x = vec_n_o_norm

In [80]:
# z axis: cross product of x and y, rescaled to unit length.
vec_z_raw = np.cross(vec_x, vec_y)
vec_z = vec_z_raw / np.linalg.norm(vec_z_raw)
print(vec_z)

[-0.14500568 -0.05774415  0.98774438]


In [81]:
# Orthogonality check between z and x: should be zero up to rounding.
np.dot(vec_z, vec_x)

-2.7755575615628914e-17

In [82]:
# Orthogonality check between z and y: should be zero up to rounding.
np.dot(vec_z, vec_y)

-3.3203691532368573e-18

In [83]:
def get_new_coord(coord, x, y, z):
    """Project ``coord`` onto the three axis vectors ``x``, ``y`` and ``z``.

    Returns an array holding the dot product of ``coord`` with each axis,
    in the order (x, y, z).
    """
    return np.array([coord.dot(axis) for axis in (x, y, z)])

In [86]:
origin = molecule['atoms'][1]    # N
# Coordinates of every hydrogen, taken relative to the origin atom and
# expressed in the (vec_x, vec_y, vec_z) frame.
H_coords = [
    get_new_coord(atom.coord() - origin.coord(), vec_x, vec_y, vec_z)
    for atom in molecule['atoms']
    if atom.name.upper() == 'H'
]

In [90]:
# Print each transformed hydrogen position with its distance from the origin.
for xyz in H_coords:
    print(f'{xyz}: {np.linalg.norm(xyz)}')

[-2.63891048  1.29370586  0.77428105]: 3.039249002632064
[-2.63424301  2.14327816 -0.73854852]: 3.475389386816965
[-2.63872334 -1.29474093  0.77416187]: 3.0394969238346
[-2.63376205 -2.14409658 -0.7387885 ]: 3.4755806910500584
[-3.86627010e+00 -5.36448537e-04 -8.10076240e-01]: 3.9502238227725774
[-2.55209015e+00 -3.88486116e-04 -1.97797204e+00]: 3.2288601177505347
[-0.53399148  1.6879874   1.98747765]: 2.6616753727680615
[-0.68024697  3.1785658   1.05266277]: 3.4167404657070457
[0.85648797 2.3228171  1.1242111 ]: 2.718989065811041
[-0.35425133  1.452771   -2.36601842]: 2.7989427968431224
[-0.50503725  3.03561528 -1.59604913]: 3.4666115357795717
[ 0.98883086  2.13297337 -1.45291315]: 2.763750795567502
[-0.53380795 -1.68830972  1.98757673]: 2.6619169652714567
[-0.67964126 -3.17896092  1.0527892 ]: 3.4170264660959244
[ 0.85694683 -2.32270849  1.12448128]: 2.719152583067011
[-0.35380222 -1.45323327 -2.36588714]: 2.799014987455408
[-0.50427711 -3.03618153 -1.59590422]: 3.4669300468858615
[ 