add some features: SI, refactor mass functions

ekwan · Mar 2, 2021 · 75bf1e8 · 75bf1e8
1 parent 63a7eb5
commit 75bf1e8
Show file tree

Hide file tree

Showing 6 changed files with 162 additions and 46 deletions.
diff --git a/cctk/__init__.py b/cctk/__init__.py
@@ -13,3 +13,4 @@
 from .mae_file import MAEFile
 from .pdb_file import PDBFile
 
+from .si_file import SIFile
diff --git a/cctk/gaussian_file.py b/cctk/gaussian_file.py
@@ -389,7 +389,8 @@ def read_file(cls, filename, return_lines=False, extended_opt_info=False):
                     try:
                         corrected_free_energy = get_corrected_free_energy(gibbs_vals[0], frequencies,
                                                                         frequency_cutoff=100.0, temperature=temperature[0])
-                        properties[-1]["quasiharmonic_gibbs_free_energy"] = float(corrected_free_energy)
+                        properties[-1]["quasiharmonic_gibbs_free_energy"] = float(f"{float(corrected_free_energy):.6f}") # yes this is dumb
+                        print(properties[-1])
                     except:
                         pass
 

diff --git a/cctk/helper_functions.py b/cctk/helper_functions.py
@@ -620,3 +620,81 @@ def bytes_to_numpy(arr_bytes):
     load_bytes = BytesIO(arr_bytes)
     loaded_np = np.load(load_bytes, allow_pickle=True)
     return loaded_np
+
+def compute_mass_spectrum(formula_dict):
+    """
+    Computes the expected low-res mass spec ions for a given formula.
+
+    Args:
+        formula dict (dict): e.g. {"C": 6, "H": 6}
+
+    Returns:
+        list of m/z ions
+        list of relative weights (out of 1 total)
+    """
+    form_vec = np.zeros(shape=92, dtype=np.int8)
+    for z, n in formula_dict.items():
+        if isinstance(z, str):
+            z = get_number(z)
+        assert isinstance(z, int), "atomic number must be integer"
+        form_vec[z - 1] += n
+
+    masses, weights = _recurse_through_formula(form_vec, [0], [1], **kwargs)
+
+    new_masses, indices = np.unique(np.round(masses, decimals=1), return_inverse=True)
+    new_weights = np.zeros_like(new_masses)
+    for k in range(len(new_weights)):
+        new_weights[k] = np.sum(weights[np.nonzero(indices == k)])
+    new_weights = new_weights / np.max(new_weights)
+
+    return new_masses, new_weights
+
+def _recurse_through_formula(formula, masses, weights, cutoff=0.0000001, mass_precision=4, weight_precision=8):
+    """
+    Recurses through a formula and generates m/z isotopic pattern using tail recursion.
+
+    To prevent blowup of memory, fragments with very low abundance are ignored. Masses and weights are also rounded after every step.
+    To prevent error accumulation, internal precisions several orders of magnitude lower than the precision of interest should be employed.
+    The default values should work nicely for low-res MS applications.
+
+    Args:
+        formula (np.ndarray, dtype=np.int8): vector containing atoms left to incorporate
+        masses (np.ndarray): list of mass fragments at current iteration
+        weights (np.ndarray): relative weights at current iteration
+        cutoff (float): cutoff for similarity (masses within ``cutoff`` will be combined)
+        mass_precision (int): number of decimal places to store for mass
+        weight_precision (int): number of decimal places to store for weight
+
+    Returns:
+        masses
+        weights
+    """
+    # check how many elements we haven't recursed thru yet
+    if np.array_equal(formula, np.zeros(shape=92, dtype=np.int8)):
+        return masses[np.argsort(masses)], weights[np.argsort(masses)]
+
+    # get masses/weights for current element
+    current_e = np.nonzero(formula)[0][0]
+    e_masses, e_weights = get_isotopic_distribution(current_e)
+
+    # combinatorially add the new masses and weights to our current lists
+    new_masses = np.zeros(shape=(len(masses)*len(e_masses)))
+    new_weights = np.zeros(shape=(len(masses)*len(e_masses)))
+    for i in range(len(masses)):
+        for j in range(len(e_masses)):
+            new_masses[i*len(e_masses)+j] = masses[i] + e_masses[j]
+            new_weights[i*len(e_masses)+j] = weights[i] * e_weights[j]
+
+    # delete duplicates and adjust weights (complicated)
+    newer_masses, indices = np.unique(np.round(new_masses, decimals=mass_precision), return_inverse=True)
+    newer_weights = np.zeros_like(newer_masses)
+    for k in range(len(newer_weights)):
+        newer_weights[k] = np.sum(new_weights[np.nonzero(indices == k)])
+    newer_weights = np.round(newer_weights, decimals=weight_precision)
+
+    # prune the low-abundance masses/weights and move on to the next element
+    formula[current_e] += -1
+    above_cutoff = np.nonzero(newer_weights > cutoff)
+    return _recurse_through_formula(formula, newer_masses[above_cutoff], newer_weights[above_cutoff], cutoff, mass_precision, weight_precision)
+
+
diff --git a/cctk/molecule.py b/cctk/molecule.py
@@ -20,6 +20,7 @@
     compute_chirality,
     numpy_to_bytes,
     bytes_to_numpy,
+    _recurse_through_formula,
 )
 import cctk.topology as top
 
@@ -733,60 +734,17 @@ def rotate_molecule(self, axis, degrees):
 
         return self
 
-    def _recurse_through_formula(self, formula, masses, weights, cutoff=0.0000001, mass_precision=4, weight_precision=8):
-        """
-        Recurses through a formula and generates m/z isotopic pattern using tail recursion.
-
-        To prevent blowup of memory, fragments with very low abundance are ignored. Masses and weights are also rounded after every step.
-        To prevent error accumulation, internal precisions several orders of magnitude lower than the precision of interest should be employed.
-        The default values should work nicely for low-res MS applications.
-
-        Args:
-            formula (np.ndarray, dtype=np.int8): vector containing atoms left to incorporate
-            masses (np.ndarray): list of mass fragments at current iteration
-            weights (np.ndarray): relative weights at current iteration
-            cutoff (float): cutoff for similarity (masses within ``cutoff`` will be combined)
-            mass_precision (int): number of decimal places to store for mass
-            weight_precision (int): number of decimal places to store for weight
-
-        Returns:
-            masses
-            weights
-        """
-        if np.array_equal(formula, np.zeros(shape=92, dtype=np.int8)):
-            return masses[np.argsort(masses)], weights[np.argsort(masses)]
-
-        current_e = np.nonzero(formula)[0][0]
-        e_masses, e_weights = get_isotopic_distribution(current_e)
-
-        new_masses = np.zeros(shape=(len(masses)*len(e_masses)))
-        new_weights = np.zeros(shape=(len(masses)*len(e_masses)))
-        for i in range(len(masses)):
-            for j in range(len(e_masses)):
-                new_masses[i*len(e_masses)+j] = masses[i] + e_masses[j]
-                new_weights[i*len(e_masses)+j] = weights[i] * e_weights[j]
-
-        newer_masses, indices = np.unique(np.round(new_masses, decimals=mass_precision), return_inverse=True)
-        newer_weights = np.zeros_like(newer_masses)
-        for k in range(len(newer_weights)):
-            newer_weights[k] = np.sum(new_weights[np.nonzero(indices == k)])
-        newer_weights = np.round(newer_weights, decimals=weight_precision)
-
-        formula[current_e] += -1
-        above_cutoff = np.nonzero(newer_weights > cutoff)
-        return self._recurse_through_formula(formula, newer_masses[above_cutoff], newer_weights[above_cutoff], cutoff, mass_precision, weight_precision)
-
     def calculate_mass_spectrum(self, **kwargs):
         """
-        Generates list of m/z values based on formula string (e.g. "C10H12")
+        Generates list of m/z values.
 
         Final weights rounded to one decimal point (because of low-res MS).
         """
         form_vec = np.zeros(shape=92, dtype=np.int8)
         for z in self.atomic_numbers:
             form_vec[z - 1] += 1
 
-        masses, weights = self._recurse_through_formula(form_vec, [0], [1], **kwargs)
+        masses, weights = _recurse_through_formula(form_vec, [0], [1], **kwargs)
 
         new_masses, indices = np.unique(np.round(masses, decimals=1), return_inverse=True)
         new_weights = np.zeros_like(new_masses)

diff --git a/cctk/si_file.py b/cctk/si_file.py
@@ -0,0 +1,72 @@
+import numpy as np
+
+import cctk
+from cctk.helper_functions import get_symbol, get_number
+
+
+class SIFile(cctk.File):
+    """
+    Class representing Supporting Information files.
+
+    Attributes:
+        titles (list of str): title of each molecule
+        ensemble (cctk.Ensemble): ``cctk.Ensemble`` of molecules to print
+    """
+
+    def __init__(self, ensemble, titles):
+        if ensemble and isinstance(ensemble, cctk.Ensemble):
+            self.ensemble = ensemble
+        else:
+            raise ValueError(f"invalid ensemble {ensemble}!")
+
+        assert len(titles) == len(ensemble)
+        self.titles = titles
+
+    def write_file(self, filename, append=False):
+        """
+        Write an SI file.
+
+        Args:
+            filename (str): path to the new file
+            append (Bool): whether or not to append to file
+        """
+        for title, (molecule, properties) in zip(self.titles, self.ensemble.items()):
+            assert isinstance(molecule, cctk.Molecule), "molecule is not a valid Molecule object!"
+
+            text = f"{title}\n"
+            for key, value in generate_info(molecule, properties).items():
+                text += f"{key}:\t{value}\n"
+
+            for index, Z in enumerate(molecule.atomic_numbers, start=1):
+                line = molecule.get_vector(index)
+                text += f"{get_symbol(Z):>2}       {line[0]:>13.8f} {line[1]:>13.8f} {line[2]:>13.8f}\n"
+
+            if append:
+                text += "\n"
+                super().append_to_file(filename, text)
+            else:
+                super().write_file(filename, text)
+
+
+def generate_info(molecule, properties):
+    info = {
+        "Number of Atoms": molecule.num_atoms(),
+        "Stoichiometry": molecule.formula(),
+        "Charge": molecule.charge,
+        "Multiplicity": molecule.multiplicity,
+    }
+
+    if "route_card" in properties:
+        info["Route Card"] = properties["route_card"]
+
+    if "energy" in properties:
+        info["Energy"] = properties["energy"]
+    if "enthalpy" in properties:
+        info["Enthalpy"] = properties["enthalpy"]
+    if "gibbs_free_energy" in properties:
+        info["Gibbs Free Energy"] = properties["gibbs_free_energy"]
+    if "quasiharmonic_gibbs_free_energy" in properties:
+        info["Gibbs Free Energy (Quasiharmonic Correction)"] = properties["quasiharmonic_gibbs_free_energy"]
+
+    return info
+
diff --git a/scripts/generate_si.py b/scripts/generate_si.py
@@ -0,0 +1,6 @@
+import cctk
+
+ensemble = cctk.GaussianFile.read_file("test/static/gaussian_file.out").ensemble
+si_file = cctk.SIFile(ensemble, ["title"] * len(ensemble))
+
+si_file.write_file("si.txt")