In [1]:
import pandas as pd

from convertors.convert_to_json import *

In [2]:
# Case-insensitive pattern for the markers this notebook treats as "in-silico":
# "in?silico" with any single separator character, "insilico", "predicted",
# "theoretical", or "Annotation?level?3" (again any single separator character).
# It is a plain module-level name, so no `global` statement is needed here.
# NOTE(review): `re` is not imported explicitly in this notebook; it is presumably
# re-exported by the star import in the first cell — confirm, or import it there.
In_Silico_pattern = re.compile(r"in.silico|insilico|predicted|theoretical|Annotation.level.3", flags=re.IGNORECASE)

In [3]:
def in_filename(filename):
    """
    Tell whether a file name alone marks its content as in-silico.

    :param filename: Name of the file to inspect.
    :return: True when the name does not contain "MSMS_Public" and In_Silico_pattern matches somewhere in it. False otherwise.
    """
    # Names containing "MSMS_Public" are never flagged from the file name,
    # even when the pattern would match them.
    if "MSMS_Public" in filename:
        return False

    return re.search(In_Silico_pattern, filename) is not None

In [4]:
def normalize_predicted(metadata_dict):
    """
    Normalize the PREDICTED field of a metadata dictionary to "true" or "false".

    The record is flagged "true" when at least one of these holds:
    the PREDICTED field is already the string "true", the COMMENT field matches
    In_Silico_pattern, or in_filename() accepts the FILENAME field.
    A missing or None COMMENT / FILENAME is treated as an empty string, and a
    missing PREDICTED as "not predicted", instead of raising.

    :param metadata_dict: A dictionary containing metadata information. It is modified in place.
    :return: The same dictionary object, with PREDICTED set to "true" or "false".
    """
    # `or ""` covers both an absent key and an explicit None value, which
    # re.search() and the substring test in in_filename() cannot handle.
    comment_field = metadata_dict.get("COMMENT") or ""
    predicted = metadata_dict.get("PREDICTED")
    filename = metadata_dict.get("FILENAME") or ""

    # Cheapest check first; the three conditions are independent of each other.
    is_predicted = (
        predicted == "true"
        or re.search(In_Silico_pattern, comment_field) is not None
        or in_filename(filename)
    )

    metadata_dict["PREDICTED"] = "true" if is_predicted else "false"
    return metadata_dict

In [5]:
def convert_keys(dict_list):
    """
    Rename the keys of every metadata dictionary and pad the missing ones.

    For each dictionary, a key is kept only when its lower-cased form is found in
    keys_dict and the name it maps to is listed in keys_list; the value is stored
    under that mapped name. Every name of keys_list that is still absent afterwards
    is added with an empty string as value. The input dictionaries are not modified.

    keys_dict and keys_list are not defined in this notebook; they are presumably
    provided by the star import in the first cell.

    :param dict_list: An iterable of metadata dictionaries.
    :return: A list with one converted dictionary per input dictionary, in the same order.
    """
    converted_records = []

    for record in dict_list:
        renamed = {}
        for raw_key, value in record.items():
            lookup = raw_key.lower()
            if lookup not in keys_dict:
                continue
            target = keys_dict[lookup]
            if target in keys_list:
                renamed[target] = value

        # Pad with the expected names that the record did not provide.
        for expected in keys_list:
            if expected not in renamed:
                renamed[expected] = ""

        converted_records.append(renamed)

    return converted_records

In [6]:
# Folder handed to convert_to_json() in the next cell.
# NOTE(review): hard-coded absolute Windows path — the notebook only runs on a
# machine that has this folder; consider making it configurable.
original_db_path = r"D:\Axel\DB Janvier 2024\DB_publi_graphs\original"

In [7]:
# Run the project's convert_to_json() on the database folder and unpack its five
# results, named after the source formats (MSP, XML, CSV, JSON, MGF).
# Presumably each result is a list of metadata dicts, since they are later passed
# to convert_keys() and measured with len() — TODO confirm against convert_to_json.
FINAL_MSP, FINAL_XML, FINAL_CSV, FINAL_JSON, FINAL_MGF = convert_to_json(original_db_path)

                                         -- CONVERTING JSON TO JSON --


                                                          Loading file: 3.47GB [00:38, 90.5MB/s]                            226MB/s] 
                                             converting JSON spectrums: 100%|[32m██████████[0m| 217259/217259 [00:54<00:00, 3962.21 spectrums/s] 


                                          -- CONVERTING MSP TO JSON --


                                           loading [MassBank_NIST.msp]:   6%|[32m▌         [0m| 5.50M/99.8M [00:01<00:30, 3.09MB/s]
                                          loading [MassBank_RIKEN.msp]:   4%|[32m▍         [0m| 5.19M/120M [00:01<00:40, 2.83MB/s]
                     loading [MSMS_Public_ExpBioInsilico_NEG_VS17.msp]:   6%|[32m▌         [0m| 3.20M/53.9M [00:01<00:16, 3.04MB/s]
                     loading [MSMS_Public_ExpBioInsilico_Pos_VS17.msp]:   6%|[32m▌         [0m| 15.3M/272M [00:05<01:30, 2.84MB/s]
                                              converting MSP spectrums: 100%|[32m██████████[0m| 572810/572810 [01:35<00:00, 6010.65 spectrums/s] 


                                          -- CONVERTING MGF TO JSON --


                                            loading [BERKELEY-LAB.mgf]:   5%|[32m▍         [0m| 1.49M/32.5M [00:00<00:09, 3.13MB/s]
                                               loading [BILELIB19.mgf]:   6%|[32m▌         [0m| 2.40M/41.9M [00:00<00:09, 4.22MB/s]
                         loading [DEREPLICATOR_IDENTIFIED_LIBRARY.mgf]:   5%|[32m▌         [0m| 98.8k/1.92M [00:00<00:00, 3.45MB/s]
                                  loading [DRUGS-OF-ABUSE-LIBRARY.mgf]:   4%|[32m▍         [0m| 65.6k/1.51M [00:00<00:00, 3.02MB/s]
                          loading [ECG-ACYL-AMIDES-C4-C24-LIBRARY.mgf]:   5%|[32m▌         [0m| 353k/6.76M [00:00<00:01, 3.29MB/s]
                          loading [ECG-ACYL-ESTERS-C4-C24-LIBRARY.mgf]:   5%|[32m▍         [0m| 37.7k/774k [00:00<00:00, 3.08MB/s]
                                   loading [GNPS-COLLECTIONS-MISC.mgf]:   6%|[32m▌         [0m| 6.92k/121k [00:00<00:00, 3.36MB/s]
                    loading [GNPS-COLLECTIONS-PESTICIDES-NEGATIVE

In [8]:
# Rename and pad the keys of the MSP records with convert_keys().
# The name is rebound to the converted list, so re-running this cell feeds
# already-converted records back into convert_keys().
FINAL_MSP = convert_keys(FINAL_MSP)

In [9]:
# Rename and pad the keys of the XML records with convert_keys().
# The name is rebound to the converted list, so re-running this cell feeds
# already-converted records back into convert_keys().
FINAL_XML = convert_keys(FINAL_XML)

In [10]:
# Rename and pad the keys of the CSV records with convert_keys().
# The name is rebound to the converted list, so re-running this cell feeds
# already-converted records back into convert_keys().
FINAL_CSV = convert_keys(FINAL_CSV)

In [11]:
# Rename and pad the keys of the JSON records with convert_keys().
# The name is rebound to the converted list, so re-running this cell feeds
# already-converted records back into convert_keys().
FINAL_JSON = convert_keys(FINAL_JSON)

In [12]:
# Rename and pad the keys of the MGF records with convert_keys().
# The name is rebound to the converted list, so re-running this cell feeds
# already-converted records back into convert_keys().
FINAL_MGF = convert_keys(FINAL_MGF)

In [13]:
# Total number of records across the five converted collections.
# The tuple is built inline so no extra name keeps the lists alive afterwards.
print(sum(map(len, (FINAL_MSP, FINAL_XML, FINAL_CSV, FINAL_JSON, FINAL_MGF))))

919852


In [14]:
# Recompute the PREDICTED flag of every MSP record with normalize_predicted().
# NOTE(review): PREDICTED is blanked right before the call, so the
# `predicted == "true"` check inside normalize_predicted() cannot fire here;
# only COMMENT and FILENAME decide the flag — confirm this is intended.
for position, record in enumerate(FINAL_MSP):
    record["PREDICTED"] = ""
    FINAL_MSP[position] = normalize_predicted(record)

In [15]:
# Recompute the PREDICTED flag of every XML record with normalize_predicted().
# NOTE(review): PREDICTED is blanked right before the call, so the
# `predicted == "true"` check inside normalize_predicted() cannot fire here;
# only COMMENT and FILENAME decide the flag — confirm this is intended.
for position, record in enumerate(FINAL_XML):
    record["PREDICTED"] = ""
    FINAL_XML[position] = normalize_predicted(record)

In [16]:
# Recompute the PREDICTED flag of every CSV record with normalize_predicted().
# NOTE(review): PREDICTED is blanked right before the call, so the
# `predicted == "true"` check inside normalize_predicted() cannot fire here;
# only COMMENT and FILENAME decide the flag — confirm this is intended.
for position, record in enumerate(FINAL_CSV):
    record["PREDICTED"] = ""
    FINAL_CSV[position] = normalize_predicted(record)

In [17]:
# Recompute the PREDICTED flag of every JSON record with normalize_predicted().
# NOTE(review): PREDICTED is blanked right before the call, so the
# `predicted == "true"` check inside normalize_predicted() cannot fire here;
# only COMMENT and FILENAME decide the flag — confirm this is intended.
for position, record in enumerate(FINAL_JSON):
    record["PREDICTED"] = ""
    FINAL_JSON[position] = normalize_predicted(record)

In [18]:
# Recompute the PREDICTED flag of every MGF record with normalize_predicted().
# NOTE(review): PREDICTED is blanked right before the call, so the
# `predicted == "true"` check inside normalize_predicted() cannot fire here;
# only COMMENT and FILENAME decide the flag — confirm this is intended.
for position, record in enumerate(FINAL_MGF):
    record["PREDICTED"] = ""
    FINAL_MGF[position] = normalize_predicted(record)

In [19]:
# Build a DataFrame from the MSP records, then drop the list's name so the list
# can be reclaimed once nothing else references it.
msp_df = pd.DataFrame(FINAL_MSP)
del FINAL_MSP

In [20]:
# Build a DataFrame from the XML records, then drop the list's name so the list
# can be reclaimed once nothing else references it.
xml_df = pd.DataFrame(FINAL_XML)
del FINAL_XML

In [21]:
# Build a DataFrame from the CSV records, then drop the list's name so the list
# can be reclaimed once nothing else references it.
csv_df = pd.DataFrame(FINAL_CSV)
del FINAL_CSV

In [22]:
# Build a DataFrame from the JSON records, then drop the list's name so the list
# can be reclaimed once nothing else references it.
json_df = pd.DataFrame(FINAL_JSON)
del FINAL_JSON

In [23]:
# Build a DataFrame from the MGF records, then drop the list's name so the list
# can be reclaimed once nothing else references it.
mgf_df = pd.DataFrame(FINAL_MGF)
del FINAL_MGF

In [24]:
# Stack the five per-format frames into a single table.
# ignore_index is not set, so each source frame keeps its own row labels and
# labels repeat in DF; use positional access or reset the index if unique
# labels are needed.
DF = pd.concat([msp_df, xml_df, csv_df, json_df, mgf_df])
# Drop every per-format frame now that DF holds the data; mgf_df is released
# together with the other four.
del msp_df, xml_df, csv_df, json_df, mgf_df

In [25]:
# Count the distinct values of the INSTRUMENT column (nunique() skips NaN by default).
# NOTE(review): convert_keys() fills absent keys with "", so if some records had no
# instrument the empty string is presumably counted as one value — confirm.
unique_instruments = DF['INSTRUMENT'].nunique()

# Message fixed: the stray apostrophe after "instruments" was removed.
print(f"Nombre d'instruments différents : {unique_instruments}")

Nombre d'instruments' différents : 487


In [26]:
# unique_IONMODE = DF['IONMODE'].nunique()
# 
# print(f"Nombre de types de formats d'IONMODE différents : {unique_IONMODE}")