In [2]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
# Number of sample points on each spectral axis.
N_POINTS_NMR_1D = 10000  # 1D 1H and 13C NMR spectra
N_POINTS_HSQC = 512      # each axis of the 2D HSQC spectrum
N_POINTS_IR = 1800       # IR spectrum

# Each axis is an evenly spaced grid with both endpoints included, converted
# to a plain Python list so it can be serialized with json.
# The NMR axes run from high to low ppm; the IR axis runs from low to high.

# 1D 1H NMR axis: 10 ppm to -2 ppm in 10000 steps
hnmr_ppm = np.linspace(10, -2, N_POINTS_NMR_1D).tolist()

# 1D 13C NMR axis: 230 ppm to -20 ppm in 10000 steps
cnmr_ppm = np.linspace(230, -20, N_POINTS_NMR_1D).tolist()

# IR axis: 400 cm^{-1} to 4000 cm^{-1} in 1800 steps
ir_cm = np.linspace(400, 4000, N_POINTS_IR).tolist()

# HSQC 1H axis: 10 ppm to -2 ppm in 512 steps
two_d_h_nmr = np.linspace(10, -2, N_POINTS_HSQC).tolist()
# HSQC 13C axis: 230 ppm to -20 ppm in 512 steps
two_d_c_nmr = np.linspace(230, -20, N_POINTS_HSQC).tolist()

In [4]:
# Axis description for every gridded spectrum column.
# "range" is given as [min, max], whereas the NMR "dimensions" lists themselves
# run from the high value down to the low value. "points" is derived from the
# axis list so it can never disagree with the grid that is actually stored.
spectrum_dimensions = {
    "h_nmr_spectra": {"range": [-2, 10], "points": len(hnmr_ppm), "unit": "ppm", "dimensions": hnmr_ppm},
    "c_nmr_spectra": {"range": [-20, 230], "points": len(cnmr_ppm), "unit": "ppm", "dimensions": cnmr_ppm},
    "hsqc_nmr_spectrum_h": {"range": [-2, 10], "points": len(two_d_h_nmr), "unit": "ppm", "dimensions": two_d_h_nmr},
    "hsqc_nmr_spectrum_c": {"range": [-20, 230], "points": len(two_d_c_nmr), "unit": "ppm", "dimensions": two_d_c_nmr},
    "ir_spectra": {"range": [400, 4000], "points": len(ir_cm), "unit": "cm^{-1}", "dimensions": ir_cm},
}

# Per-column description of the dataset: storage format, units, field names
# and a short human-readable note. The gridded spectra embed their full axis
# list under "dimensions".
meta_data_dict = {
    # --- Molecule identifiers ---
    "smiles": {
        "format": "string",
        "unit": "SMILES",
        "info": "Canonical SMILES string generated using RDKit",
        "example": "CC(C)CCNc1ncc(F)cc1C(=O)O"
    },
    "molecular_formula": {
        "format": "string",
        "unit": "Molecular formula",
        "info": "Molecular formula determined by RDKit",
        "example": "C11H15FN2O2"
    },
    # --- 1D NMR spectra and their extracted peaks ---
    "h_nmr_spectra": {
        "format": "np.array(float)",
        "dimensions": hnmr_ppm,
        "info": "1D proton NMR spectrum intensity values",
        "unit": "ppm"
    },
    "c_nmr_spectra": {
        "format": "np.array(float)",
        "dimensions": cnmr_ppm,
        "info": "1D carbon-13 NMR spectrum intensity values",
        "unit": "ppm"
    },
    "h_nmr_peaks": {
        "format": "np.array(dict)",
        "fields": {
            "category": "str (multiplet type, e.g., 's', 'd', 't', 'm')",
            "centroid": "float (peak center in ppm)",
            "delta": "float (chemical shift in ppm)",
            "j_values": "str (coupling constants in Hz, separated by '_')",
            "nH": "int (number of protons)",
            "rangeMax": "float (maximum shift of peak range)",
            "rangeMin": "float (minimum shift of peak range)"
        },
        "info": "Array of peak information from 1H NMR spectrum automatically extracted"
    },
    "c_nmr_peaks": {
        "format": "np.array(dict)",
        "fields": {
            "delta (ppm)": "float (chemical shift)",
            "integral": "float (peak area)",
            "intensity": "float (peak height)",
            "width (ppm)": "float (peak width)"
        },
        "info": "Array of peak information from 13C NMR spectrum automatically extracted",
    },
    # --- 2D HSQC NMR spectrum and its correlation peaks ---
    "hsqc_nmr_spectrum": {
        "format": "np.array(np.array(float))",
        "dimensions": {
            "h": two_d_h_nmr,
            "c": two_d_c_nmr
        },
        "unit": "ppm",
        "info": "2D HSQC NMR spectrum intensity matrix. Use np.stack to make a 2D array out of it"
    },
    "hsqc_nmr_peaks": {
        "format": "np.array(dict)",
        "fields": {
            "13C_centroid": "float (13C chemical shift)",
            "13C_max": "float (maximum 13C shift)",
            "13C_min": "float (minimum 13C shift)",
            "1H_centroid": "float (1H chemical shift)",
            "1H_max": "float (maximum 1H shift)",
            "1H_min": "float (minimum 1H shift)",
            "nH": "float (number of protons)"
        },
        "info": "Array of correlation peaks from HSQC spectrum"
    },
    # --- IR spectrum ---
    "ir_spectra": {
        "format": "np.array(float)",
        "dimensions": ir_cm,
        "info": "IR absorption spectrum intensity values, from 400cm^{-1} to 4000cm^{-1} using 1800points",
        "unit": "cm^{-1}"
    },
    # --- MS/MS, CFM-ID, positive ion mode ---
    "msms_cfmid_positive_10ev": {
        "format": "np.array(np.array([float, float]))",
        "fields": ["m/z", "intensity"],
        "normalization": "0-100",
        "info": "MS/MS spectrum at 10eV collision energy in positive ion mode"
    },
    "msms_cfmid_positive_20ev": {
        "format": "np.array(np.array([float, float]))",
        "fields": ["m/z", "intensity"],
        "normalization": "0-100",
        "info": "MS/MS spectrum at 20eV collision energy in positive ion mode"
    },
    "msms_cfmid_positive_40ev": {
        "format": "np.array(np.array([float, float]))",
        "fields": ["m/z", "intensity"],
        "normalization": "0-100",
        "info": "MS/MS spectrum at 40eV collision energy in positive ion mode"
    },
    "msms_cfmid_fragments_positive": {
        "format": "np.array(np.array([str, str]))",
        "fields": ["m/z", "formula"],
        "notation": "Including charge",
        "info": "Predicted fragments with SMILES/molecular formulas in positive ion mode"
    },
    # --- MS/MS, CFM-ID, negative ion mode ---
    "msms_cfmid_negative_10ev": {
        "format": "np.array(np.array([float, float]))",
        "fields": ["m/z", "intensity"],
        "normalization": "0-100",
        "info": "MS/MS spectrum at 10eV collision energy in negative ion mode"
    },
    "msms_cfmid_negative_20ev": {
        "format": "np.array(np.array([float, float]))",
        "fields": ["m/z", "intensity"],
        "normalization": "0-100",
        "info": "MS/MS spectrum at 20eV collision energy in negative ion mode"
    },
    "msms_cfmid_negative_40ev": {
        "format": "np.array(np.array([float, float]))",
        "fields": ["m/z", "intensity"],
        "normalization": "0-100",
        "info": "MS/MS spectrum at 40eV collision energy in negative ion mode"
    },
    "msms_cfmid_fragments_negative": {
        "format": "np.array(np.array([str, str]))",
        "fields": ["m/z", "formula"],
        "notation": "Including charge",
        "info": "Predicted fragments with SMILES/molecular formulas in negative ion mode"
    },
    # --- MS/MS, ICEBERG, positive ion mode ---
    "msms_iceberg_positive": {
        "format": "np.array(np.array([float, float]))",
        "fields": ["m/z", "intensity"],
        "normalization": "0-1",
        "info": "MS/MS spectrum with normalized intensities using ICEBERG (positive ion mode)"
    },
    "msms_iceberg_fragments_positive": {
        "format": "np.array(np.array([str, str]))",
        "fields": ["m/z", "formula"],
        "info": "Predicted fragments with molecular formulas using ICEBERG (positive ion mode)"
    },
    # --- MS/MS, SCARF, positive ion mode ---
    "msms_scarf_positive": {
        "format": "np.array(np.array([float, float]))",
        "fields": ["m/z", "intensity"],
        "normalization": "0-1",
        "info": "MS/MS spectrum with normalized intensities using SCARF (positive ion mode)"
    },
    "msms_scarf_fragments_positive": {
        "format": "np.array(np.array([str, str]))",
        "fields": ["m/z", "formula"],
        "info": "Predicted fragments with molecular formulas using SCARF (positive ion mode)"
    }
}


# Write both dictionaries as JSON next to the data. The target directory is
# created first so the cell also works on a fresh checkout where it does not
# exist yet (open(..., "w") would otherwise raise FileNotFoundError).
output_dir = Path("../data/meta_data")
output_dir.mkdir(parents=True, exist_ok=True)

for filename, payload in (
    ("meta_data_dict.json", meta_data_dict),
    ("spectrum_dimensions.json", spectrum_dimensions),
):
    with open(output_dir / filename, "w") as outfile:
        json.dump(payload, outfile)
