Skip to content

Commit

Permalink
add some features: SI, refactor mass functions
Browse files Browse the repository at this point in the history
  • Loading branch information
corinwagen committed Mar 2, 2021
1 parent 63a7eb5 commit 75bf1e8
Show file tree
Hide file tree
Showing 6 changed files with 162 additions and 46 deletions.
1 change: 1 addition & 0 deletions cctk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@
from .mae_file import MAEFile
from .pdb_file import PDBFile

from .si_file import SIFile
3 changes: 2 additions & 1 deletion cctk/gaussian_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,8 @@ def read_file(cls, filename, return_lines=False, extended_opt_info=False):
try:
corrected_free_energy = get_corrected_free_energy(gibbs_vals[0], frequencies,
frequency_cutoff=100.0, temperature=temperature[0])
properties[-1]["quasiharmonic_gibbs_free_energy"] = float(corrected_free_energy)
properties[-1]["quasiharmonic_gibbs_free_energy"] = float(f"{float(corrected_free_energy):.6f}") # yes this is dumb
print(properties[-1])
except:
pass

Expand Down
78 changes: 78 additions & 0 deletions cctk/helper_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,3 +620,81 @@ def bytes_to_numpy(arr_bytes):
load_bytes = BytesIO(arr_bytes)
loaded_np = np.load(load_bytes, allow_pickle=True)
return loaded_np

def compute_mass_spectrum(formula_dict):
"""
Computes the expected low-res mass spec ions for a given formula.
Args:
formula dict (dict): e.g. {"C": 6, "H": 6}
Returns:
list of m/z ions
list of relative weights (out of 1 total)
"""
form_vec = np.zeros(shape=92, dtype=np.int8)
for z, n in formula_dict.items():
if isinstance(z, str):
z = get_number(z)
assert isinstance(z, int), "atomic number must be integer"
form_vec[z - 1] += n

masses, weights = _recurse_through_formula(form_vec, [0], [1], **kwargs)

new_masses, indices = np.unique(np.round(masses, decimals=1), return_inverse=True)
new_weights = np.zeros_like(new_masses)
for k in range(len(new_weights)):
new_weights[k] = np.sum(weights[np.nonzero(indices == k)])
new_weights = new_weights / np.max(new_weights)

return new_masses, new_weights

def _recurse_through_formula(formula, masses, weights, cutoff=0.0000001, mass_precision=4, weight_precision=8):
"""
Recurses through a formula and generates m/z isotopic pattern using tail recursion.
To prevent blowup of memory, fragments with very low abundance are ignored. Masses and weights are also rounded after every step.
To prevent error accumulation, internal precisions several orders of magnitude lower than the precision of interest should be employed.
The default values should work nicely for low-res MS applications.
Args:
formula (np.ndarray, dtype=np.int8): vector containing atoms left to incorporate
masses (np.ndarray): list of mass fragments at current iteration
weights (np.ndarray): relative weights at current iteration
cutoff (float): cutoff for similarity (masses within ``cutoff`` will be combined)
mass_precision (int): number of decimal places to store for mass
weight_precision (int): number of decimal places to store for weight
Returns:
masses
weights
"""
# check how many elements we haven't recursed thru yet
if np.array_equal(formula, np.zeros(shape=92, dtype=np.int8)):
return masses[np.argsort(masses)], weights[np.argsort(masses)]

# get masses/weights for current element
current_e = np.nonzero(formula)[0][0]
e_masses, e_weights = get_isotopic_distribution(current_e)

# combinatorially add the new masses and weights to our current lists
new_masses = np.zeros(shape=(len(masses)*len(e_masses)))
new_weights = np.zeros(shape=(len(masses)*len(e_masses)))
for i in range(len(masses)):
for j in range(len(e_masses)):
new_masses[i*len(e_masses)+j] = masses[i] + e_masses[j]
new_weights[i*len(e_masses)+j] = weights[i] * e_weights[j]

# delete duplicates and adjust weights (complicated)
newer_masses, indices = np.unique(np.round(new_masses, decimals=mass_precision), return_inverse=True)
newer_weights = np.zeros_like(newer_masses)
for k in range(len(newer_weights)):
newer_weights[k] = np.sum(new_weights[np.nonzero(indices == k)])
newer_weights = np.round(newer_weights, decimals=weight_precision)

# prune the low-abundance masses/weights and move on to the next element
formula[current_e] += -1
above_cutoff = np.nonzero(newer_weights > cutoff)
return _recurse_through_formula(formula, newer_masses[above_cutoff], newer_weights[above_cutoff], cutoff, mass_precision, weight_precision)


48 changes: 3 additions & 45 deletions cctk/molecule.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
compute_chirality,
numpy_to_bytes,
bytes_to_numpy,
_recurse_through_formula,
)
import cctk.topology as top

Expand Down Expand Up @@ -733,60 +734,17 @@ def rotate_molecule(self, axis, degrees):

return self

def _recurse_through_formula(self, formula, masses, weights, cutoff=0.0000001, mass_precision=4, weight_precision=8):
"""
Recurses through a formula and generates m/z isotopic pattern using tail recursion.
To prevent blowup of memory, fragments with very low abundance are ignored. Masses and weights are also rounded after every step.
To prevent error accumulation, internal precisions several orders of magnitude lower than the precision of interest should be employed.
The default values should work nicely for low-res MS applications.
Args:
formula (np.ndarray, dtype=np.int8): vector containing atoms left to incorporate
masses (np.ndarray): list of mass fragments at current iteration
weights (np.ndarray): relative weights at current iteration
cutoff (float): cutoff for similarity (masses within ``cutoff`` will be combined)
mass_precision (int): number of decimal places to store for mass
weight_precision (int): number of decimal places to store for weight
Returns:
masses
weights
"""
if np.array_equal(formula, np.zeros(shape=92, dtype=np.int8)):
return masses[np.argsort(masses)], weights[np.argsort(masses)]

current_e = np.nonzero(formula)[0][0]
e_masses, e_weights = get_isotopic_distribution(current_e)

new_masses = np.zeros(shape=(len(masses)*len(e_masses)))
new_weights = np.zeros(shape=(len(masses)*len(e_masses)))
for i in range(len(masses)):
for j in range(len(e_masses)):
new_masses[i*len(e_masses)+j] = masses[i] + e_masses[j]
new_weights[i*len(e_masses)+j] = weights[i] * e_weights[j]

newer_masses, indices = np.unique(np.round(new_masses, decimals=mass_precision), return_inverse=True)
newer_weights = np.zeros_like(newer_masses)
for k in range(len(newer_weights)):
newer_weights[k] = np.sum(new_weights[np.nonzero(indices == k)])
newer_weights = np.round(newer_weights, decimals=weight_precision)

formula[current_e] += -1
above_cutoff = np.nonzero(newer_weights > cutoff)
return self._recurse_through_formula(formula, newer_masses[above_cutoff], newer_weights[above_cutoff], cutoff, mass_precision, weight_precision)

def calculate_mass_spectrum(self, **kwargs):
"""
Generates list of m/z values based on formula string (e.g. "C10H12")
Generates list of m/z values.
Final weights rounded to one decimal point (because of low-res MS).
"""
form_vec = np.zeros(shape=92, dtype=np.int8)
for z in self.atomic_numbers:
form_vec[z - 1] += 1

masses, weights = self._recurse_through_formula(form_vec, [0], [1], **kwargs)
masses, weights = _recurse_through_formula(form_vec, [0], [1], **kwargs)

new_masses, indices = np.unique(np.round(masses, decimals=1), return_inverse=True)
new_weights = np.zeros_like(new_masses)
Expand Down
72 changes: 72 additions & 0 deletions cctk/si_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import numpy as np

import cctk
from cctk.helper_functions import get_symbol, get_number


class SIFile(cctk.File):
"""
Class representing Supporting Information files.
Attributes:
titles (list of str): title of each molecule
ensemble (cctk.Ensemble): ``cctk.Ensemble`` of molecules to print
"""

def __init__(self, ensemble, titles):
if ensemble and isinstance(ensemble, cctk.Ensemble):
self.ensemble = ensemble
else:
raise ValueError(f"invalid ensemble {ensemble}!")

assert len(titles) == len(ensemble)
self.titles = titles

def write_file(self, filename, append=False):
"""
Write an SI file.
Args:
filename (str): path to the new file
append (Bool): whether or not to append to file
"""
for title, (molecule, properties) in zip(self.titles, self.ensemble.items()):
assert isinstance(molecule, cctk.Molecule), "molecule is not a valid Molecule object!"

text = f"{title}\n"
for key, value in generate_info(molecule, properties).items():
text += f"{key}:\t{value}\n"

for index, Z in enumerate(molecule.atomic_numbers, start=1):
line = molecule.get_vector(index)
text += f"{get_symbol(Z):>2} {line[0]:>13.8f} {line[1]:>13.8f} {line[2]:>13.8f}\n"

if append:
text += "\n"
super().append_to_file(filename, text)
else:
super().write_file(filename, text)


def generate_info(molecule, properties):
info = {
"Number of Atoms": molecule.num_atoms(),
"Stoichiometry": molecule.formula(),
"Charge": molecule.charge,
"Multiplicity": molecule.multiplicity,
}

if "route_card" in properties:
info["Route Card"] = properties["route_card"]

if "energy" in properties:
info["Energy"] = properties["energy"]
if "enthalpy" in properties:
info["Enthalpy"] = properties["enthalpy"]
if "gibbs_free_energy" in properties:
info["Gibbs Free Energy"] = properties["gibbs_free_energy"]
if "quasiharmonic_gibbs_free_energy" in properties:
info["Gibbs Free Energy (Quasiharmonic Correction)"] = properties["quasiharmonic_gibbs_free_energy"]

return info

6 changes: 6 additions & 0 deletions scripts/generate_si.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import cctk

ensemble = cctk.GaussianFile.read_file("test/static/gaussian_file.out").ensemble
si_file = cctk.SIFile(ensemble, ["title"] * len(ensemble))

si_file.write_file("si.txt")

0 comments on commit 75bf1e8

Please sign in to comment.