In [1]:
from convertors.convert_to_json import *

In [2]:
# Case-insensitive pattern used to spot predicted (in-silico) spectra.
# The unescaped "." matches any single character, so "in silico", "in-silico"
# and "in_silico" (and likewise "Annotation level 3") are all accepted.
# NOTE: no `global` statement is needed here; a name assigned at module level
# is already global, and `global` is only meaningful inside a function body.
In_Silico_pattern = re.compile(r"in.silico|insilico|predicted|theoretical|Annotation.level.3", flags=re.IGNORECASE)

In [3]:
def in_filename(filename):
    """
    Tell whether a file name marks its content as in-silico / predicted.

    :param filename: The file name to inspect.
    :return: False when the name contains "MSMS_Public", regardless of anything else. Otherwise True if In_Silico_pattern matches somewhere in the name, False if it does not.
    """
    # Names containing "MSMS_Public" are never flagged, even when the rest of
    # the name would match the pattern.
    if "MSMS_Public" in filename:
        return False

    return In_Silico_pattern.search(filename) is not None

In [4]:
def normalize_predicted(metadata_dict):
    """
    Rewrite the "PREDICTED" entry of a metadata dictionary as "true" or "false".

    The entry becomes "true" when any of the following holds, checked in this order:
    the "COMMENT" value matches In_Silico_pattern, the current "PREDICTED" value is
    exactly the string "true", or in_filename() returns True for the "FILENAME" value.
    In every other case it becomes "false".

    :param metadata_dict: A dictionary holding the "COMMENT", "PREDICTED" and "FILENAME" keys. It is modified in place.
    :return: The same dictionary object, with its "PREDICTED" entry normalized.
    """
    # Read all three fields up front so a missing key fails before any update.
    comment = metadata_dict["COMMENT"]
    current_flag = metadata_dict["PREDICTED"]
    source_file = metadata_dict["FILENAME"]

    is_predicted = bool(
        In_Silico_pattern.search(comment)
        or current_flag == "true"
        or in_filename(source_file)
    )

    metadata_dict["PREDICTED"] = "true" if is_predicted else "false"
    return metadata_dict

In [5]:
def convert_keys(dict_list):
    """
    Rename the keys of every metadata dictionary using keys_dict and keys_list.

    For each input dictionary, a key is kept only if its lowercase form is a key of
    keys_dict and the name keys_dict maps it to is present in keys_list; the value is
    then stored under that mapped name. Every name of keys_list that is still missing
    afterwards is added with an empty string as value.

    :param dict_list: An iterable of metadata dictionaries whose keys are strings.
    :return: A new list with one converted dictionary per input dictionary, in the same order.
    """
    results = []

    for source in dict_list:
        renamed = {}
        for raw_key, value in source.items():
            lowered = raw_key.lower()
            if lowered not in keys_dict:
                continue
            target = keys_dict[lowered]
            if target in keys_list:
                renamed[target] = value

        # Guarantee that every expected key exists, defaulting to "".
        for expected in keys_list:
            renamed.setdefault(expected, "")

        results.append(renamed)

    return results

In [6]:
# Directory handed to convert_to_json() in the next cell.
# NOTE(review): machine-specific absolute Windows path; adjust before running elsewhere.
original_db_path = r"C:\Users\Axel\Documents\MSP_DB\ORIGINALS_msp_DB\DB Janvier 2024\DB_publi"

In [None]:
# Run the conversion on original_db_path and unpack its five return values.
# NOTE(review): presumably one result per input format (MSP, XML, CSV, JSON, MGF) — confirm against convert_to_json.
FINAL_MSP, FINAL_XML, FINAL_CSV, FINAL_JSON, FINAL_MGF = convert_to_json(original_db_path)

                                         -- CONVERTING JSON TO JSON --


                                                          Loading file: 3.47GB [00:42, 81.1MB/s]                            215MB/s] 
                                             converting JSON spectrums: 100%|[32m██████████[0m| 217259/217259 [00:59<00:00, 3628.05 spectrums/s] 


                                          -- CONVERTING MSP TO JSON --


                                           loading [MassBank_NIST.msp]:   6%|[32m▌         [0m| 5.50M/99.8M [00:02<00:37, 2.53MB/s]
                                          loading [MassBank_RIKEN.msp]:   4%|[32m▍         [0m| 5.19M/120M [00:02<00:49, 2.32MB/s]
                     loading [MSMS_Public_ExpBioInsilico_NEG_VS17.msp]:   6%|[32m▌         [0m| 3.20M/53.9M [00:01<00:19, 2.59MB/s]
                     loading [MSMS_Public_ExpBioInsilico_Pos_VS17.msp]:   6%|[32m▌         [0m| 15.3M/272M [00:06<01:48, 2.36MB/s]
                                              converting MSP spectrums:  25%|[32m██▌       [0m| 145000/572810 [00:24<01:16, 5593.45 spectrums/s]

In [None]:
# Show the five conversion results, one per line, in MSP, XML, CSV, JSON, MGF order.
print(FINAL_MSP, FINAL_XML, FINAL_CSV, FINAL_JSON, FINAL_MGF, sep="\n")