In [None]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
# Signed decimal number: optional minus sign, optional integer digits, at most
# one decimal point, then at least one digit (matches '12', '-0.5', '.321').
# A single optional point (\.?) is used instead of \.* so that a malformed
# token such as '1..5', which float() cannot parse, is not accepted as one number.
num_pattern = r'-?\d*\.?\d+'

In [None]:
# Sanity check: show the first number the pattern finds inside 'a.321bc'.
re.search(num_pattern, 'a.321bc')[0]

In [None]:
# Atom row: three captured numbers, a captured alphabetic name and one
# trailing integer field, each separated by whitespace (\s+).
atom_row_pattern = r'\s+'.join(
    [f'({num_pattern})'] * 3 + ['([a-zA-Z]+)', r'\d+']
)

In [None]:
# Bond row: three captured integers followed by four more integer fields that
# are required but not captured, each separated by whitespace (\s+).
bond_pattern = r'\s+'.join([r'(\d+)'] * 3 + [r'\d+'] * 4)

In [None]:
class Atom:
    """A named point with x, y and z coordinates."""

    def __init__(self, x, y, z, name):
        self.x, self.y, self.z = x, y, z
        self.name = name

    def __repr__(self):
        return f'[{self.name}] x: {self.x}, y: {self.y}, z: {self.z}'

    def __sub__(self, other):
        """Return the component-wise difference ``self - other`` as an array."""
        return np.array([self.x - other.x, self.y - other.y, self.z - other.z])

    def __add__(self, other):
        """Return the component-wise sum ``self + other`` as an array."""
        return np.array([self.x + other.x, self.y + other.y, self.z + other.z])

    def coord(self):
        """Return the coordinates as the array ``[x, y, z]``."""
        return np.array([self.x, self.y, self.z])
        
        
class Bond:
    """Two atom indices together with a bond order (1 unless specified)."""

    def __init__(self, index1, index2, order=1):
        self.index1, self.index2 = index1, index2
        self.order = order

    def __repr__(self):
        return f'[{self.index1} - {self.index2}] order: {self.order}'

In [None]:
# Read the file line by line: a line that matches the atom-row pattern becomes
# an Atom; otherwise a line that matches the bond pattern becomes a Bond.
molecule = {
    'atoms': [],
    'bonds': []
}

filename = 'tempo_2006285.mol'
with open(filename, 'r', encoding='utf-8') as f:
    for line in f:
        atom_match = re.search(atom_row_pattern, line)
        if atom_match:
            x, y, z, name = atom_match.groups()
            molecule['atoms'].append(Atom(float(x), float(y), float(z), name))
            continue

        bond_match = re.match(bond_pattern, line.strip())
        if not bond_match:
            continue

        idx1, idx2, order = (int(field) for field in bond_match.groups())
        if order == 0:
            # order 0: the pattern matched the wrong line, so skip it
            continue
        molecule['bonds'].append(Bond(idx1, idx2, order))

In [None]:
# List every parsed atom with a 1-based number in front of it.
for number, atom in enumerate(molecule['atoms'], start=1):
    print(f'{number}: {atom}')

In [None]:
# Display the list of parsed Bond objects (each rendered via Bond.__repr__).
molecule['bonds']

In [None]:
# Atom 1 minus atom 2 (1-based numbering, i.e. list indices 0 and 1);
# Atom.__sub__ returns the difference as an array.
atom_1 = molecule['atoms'][0]
atom_2 = molecule['atoms'][1]
vec_n_o = atom_1 - atom_2
print(vec_n_o)

In [None]:
# Atom 3 minus atom 4 (1-based numbering, i.e. list indices 2 and 3);
# Atom.__sub__ returns the difference as an array.
atom_3 = molecule['atoms'][2]
atom_4 = molecule['atoms'][3]
vec_c3_c4 = atom_3 - atom_4
print(vec_c3_c4)

In [None]:
# Dot product of the two raw (not yet normalised) difference vectors.
np.dot(vec_n_o, vec_c3_c4)

In [None]:
# Scale vec_n_o to unit length.
n_o_length = np.linalg.norm(vec_n_o)
vec_n_o_norm = vec_n_o / n_o_length
print(vec_n_o_norm)

In [None]:
# Scale vec_c3_c4 to unit length.
c3_c4_length = np.linalg.norm(vec_c3_c4)
vec_c3_c4_norm = vec_c3_c4 / c3_c4_length
print(vec_c3_c4_norm)

In [None]:
# Dot product of the two unit vectors.
np.dot(vec_n_o_norm, vec_c3_c4_norm)

In [None]:
# Remove from vec_c3_c4_norm its component along vec_n_o_norm, then rescale
# the remainder to unit length.
overlap = vec_n_o_norm.dot(vec_c3_c4_norm)
residual = vec_c3_c4_norm - vec_n_o_norm * overlap
vec_y = residual / np.linalg.norm(residual)
print(vec_y)

In [None]:
# Dot product of vec_y with vec_n_o_norm, shown as a check on the step above.
np.dot(vec_y, vec_n_o_norm)

In [None]:
# Use the normalised vec_n_o direction as the x axis (same array object, no copy).
vec_x = vec_n_o_norm

In [None]:
# Third axis: cross product of vec_x and vec_y, rescaled to unit length.
normal = np.cross(vec_x, vec_y)
vec_z = normal / np.linalg.norm(normal)
print(vec_z)

In [None]:
# Dot product of vec_z with vec_x, shown as a check on the cross product.
np.dot(vec_z, vec_x)

In [None]:
# Dot product of vec_z with vec_y, shown as a check on the cross product.
np.dot(vec_z, vec_y)

In [None]:
def get_new_coord(coord, x, y, z):
    """Return the dot products of ``coord`` with ``x``, ``y`` and ``z``.

    The three values are packed, in that order, into a new array.
    """
    return np.array([coord.dot(axis) for axis in (x, y, z)])

In [None]:
# For every atom named 'H' (case-insensitive), take its position relative to
# the origin atom and express it through get_new_coord on vec_x, vec_y, vec_z.
origin = molecule['atoms'][1]    # N
H_coords = [
    get_new_coord(atom.coord() - origin.coord(), vec_x, vec_y, vec_z)
    for atom in molecule['atoms']
    if atom.name.upper() == 'H'
]

In [None]:
# Print each transformed H coordinate followed by its Euclidean norm.
for h_xyz in H_coords:
    print(f'{h_xyz}: {np.linalg.norm(h_xyz)}')