In [1]:
from convertors.convert_to_json import *
from mols_calculation import *
import pandas as pd
import os
import re

In [2]:
# Case-insensitive pattern used to spot predicted (in-silico) spectra in comments
# and file names. A `global` statement is a no-op at module/cell level, so the
# name is simply assigned here.
In_Silico_pattern = re.compile(r"in.silico|insilico|predicted|theoretical|Annotation.level.3", flags=re.IGNORECASE)

In [3]:
def in_filename(filename):
    """
    Tell whether a file name marks its content as in-silico.

    :param filename: The file name to inspect.
    :return: True when the name does not contain "MSMS_Public" and In_Silico_pattern matches somewhere in it; False otherwise.
    """
    # Names containing "MSMS_Public" are never flagged, whatever else they contain.
    if "MSMS_Public" in filename:
        return False

    return re.search(In_Silico_pattern, filename) is not None

In [4]:
def normalize_predicted(metadata_dict):
    """
    Set the "PREDICTED" field of a metadata dictionary to "true" or "false".

    The record is flagged "true" when In_Silico_pattern matches its "COMMENT",
    when its "PREDICTED" value is already "true", or when in_filename() accepts
    its "FILENAME". The dictionary is modified in place.

    :param metadata_dict: A dictionary holding the "COMMENT", "PREDICTED" and "FILENAME" keys.
    :return: The same dictionary, with "PREDICTED" normalized.
    """
    comment_text = metadata_dict["COMMENT"]
    declared_flag = metadata_dict["PREDICTED"]
    source_file = metadata_dict["FILENAME"]

    flagged = (
        re.search(In_Silico_pattern, comment_text)
        or declared_flag == "true"
        or in_filename(source_file)
    )

    metadata_dict["PREDICTED"] = "true" if flagged else "false"
    return metadata_dict

In [5]:
def convert_keys(dict_list):
    """
    Rename the keys of every record and pad the fields that are missing.

    For each record, a key is kept only when its lower-cased form is present in
    keys_dict and the name it maps to is listed in keys_list; the value is stored
    under that mapped name. Every name of keys_list still absent afterwards is
    added with an empty string. keys_dict and keys_list are not defined in this
    notebook; they are expected to come from the star imports at the top.

    :param dict_list: An iterable of metadata dictionaries.
    :return: A new list with one converted dictionary per input record, in the same order. The input dictionaries are not modified.
    """
    output = []
    for metadata_dict in dict_list:
        converted = {}
        for key, val in metadata_dict.items():
            lowered = key.lower()  # computed once per key instead of three times
            if lowered not in keys_dict:
                continue
            target = keys_dict[lowered]
            if target in keys_list:
                converted[target] = val

        # Pad with empty strings so every record exposes the same set of fields.
        for key in keys_list:
            converted.setdefault(key, "")
        output.append(converted)

    return output

In [6]:
# Input folder passed to convert_to_json. Set the FRAGHUB_INPUT_DIR environment
# variable to run the notebook on another machine; the historical hard-coded
# location remains the default.
original_db_path = os.environ.get("FRAGHUB_INPUT_DIR", r"C:\Users\Axel\Documents\PYTHON\FragHub\INPUT")

In [7]:
# convert_to_json comes from the star import of convertors.convert_to_json; its five
# return values are bound per source format, in the order MSP, XML, CSV, JSON, MGF.
# Unpacked directly, with no intermediate name, so the `del FINAL_*` statements
# further down drop the only name bound to each collection.
FINAL_MSP, FINAL_XML, FINAL_CSV, FINAL_JSON, FINAL_MGF = convert_to_json(original_db_path)

                                         -- CONVERTING JSON TO JSON --


                                                          Loading file: 3.47GB [00:44, 78.3MB/s]                            170MB/s] 
                                             converting JSON spectrums: 100%|[32m██████████[0m| 217259/217259 [01:09<00:00, 3144.49 spectrums/s] 


                                          -- CONVERTING MSP TO JSON --


                                           loading [MassBank_NIST.msp]:   6%|[32m▌         [0m| 5.50M/99.8M [00:02<00:40, 2.33MB/s]
                                          loading [MassBank_RIKEN.msp]:   4%|[32m▍         [0m| 5.19M/120M [00:02<00:51, 2.23MB/s]
                     loading [MSMS_Public_ExpBioInsilico_NEG_VS17.msp]:   6%|[32m▌         [0m| 3.20M/53.9M [00:01<00:20, 2.50MB/s]
                     loading [MSMS_Public_ExpBioInsilico_Pos_VS17.msp]:   6%|[32m▌         [0m| 15.3M/272M [00:06<01:52, 2.28MB/s]
                                              converting MSP spectrums: 100%|[32m██████████[0m| 572810/572810 [01:53<00:00, 5025.11 spectrums/s] 


                                          -- CONVERTING MGF TO JSON --


                                            loading [BERKELEY-LAB.mgf]:   5%|[32m▍         [0m| 1.49M/32.5M [00:00<00:12, 2.47MB/s]
                                               loading [BILELIB19.mgf]:   6%|[32m▌         [0m| 2.40M/41.9M [00:00<00:11, 3.38MB/s]
                         loading [DEREPLICATOR_IDENTIFIED_LIBRARY.mgf]:   5%|[32m▌         [0m| 98.8k/1.92M [00:00<00:00, 2.63MB/s]
                                  loading [DRUGS-OF-ABUSE-LIBRARY.mgf]:   4%|[32m▍         [0m| 65.6k/1.51M [00:00<00:00, 2.72MB/s]
                          loading [ECG-ACYL-AMIDES-C4-C24-LIBRARY.mgf]:   5%|[32m▌         [0m| 353k/6.76M [00:00<00:02, 2.68MB/s]
                          loading [ECG-ACYL-ESTERS-C4-C24-LIBRARY.mgf]:   5%|[32m▍         [0m| 37.7k/774k [00:00<00:00, 2.27MB/s]
                                   loading [GNPS-COLLECTIONS-MISC.mgf]:   6%|[32m▌         [0m| 6.92k/121k [00:00<00:00, 2.07MB/s]
                    loading [GNPS-COLLECTIONS-PESTICIDES-NEGATIVE

Normaliser les clés

In [8]:
# Replace the MSP records with their convert_keys() versions (renamed keys, padded fields).
# NOTE(review): not idempotent — re-running this cell feeds already-converted records
# back into convert_keys; confirm keys_dict also maps the converted names.
FINAL_MSP = convert_keys(FINAL_MSP)

In [9]:
# Replace the XML records with their convert_keys() versions (renamed keys, padded fields).
# NOTE(review): not idempotent — re-running this cell feeds already-converted records
# back into convert_keys; confirm keys_dict also maps the converted names.
FINAL_XML = convert_keys(FINAL_XML)

In [10]:
# Replace the CSV records with their convert_keys() versions (renamed keys, padded fields).
# NOTE(review): not idempotent — re-running this cell feeds already-converted records
# back into convert_keys; confirm keys_dict also maps the converted names.
FINAL_CSV = convert_keys(FINAL_CSV)

In [11]:
# Replace the JSON records with their convert_keys() versions (renamed keys, padded fields).
# NOTE(review): not idempotent — re-running this cell feeds already-converted records
# back into convert_keys; confirm keys_dict also maps the converted names.
FINAL_JSON = convert_keys(FINAL_JSON)

In [12]:
# Replace the MGF records with their convert_keys() versions (renamed keys, padded fields).
# NOTE(review): not idempotent — re-running this cell feeds already-converted records
# back into convert_keys; confirm keys_dict also maps the converted names.
FINAL_MGF = convert_keys(FINAL_MGF)

In [13]:
# Total number of records over the five converted collections. The tuple is a
# temporary, so no extra name keeps the lists alive.
print(sum(map(len, (FINAL_MSP, FINAL_XML, FINAL_CSV, FINAL_JSON, FINAL_MGF))))

919852


Ajouter la colonne 'PREDICTED' et la peupler

In [14]:
compteur =  0  # not read in this cell; kept unchanged
for i, record in enumerate(FINAL_MSP):
    # Create the field only when it is absent: an unconditional reset to "" erases
    # any PREDICTED value already carried by the record, so the `predicted == "true"`
    # test inside normalize_predicted could never succeed.
    record.setdefault("PREDICTED", "")
    FINAL_MSP[i] = normalize_predicted(record)

In [15]:
compteur =  0  # not read in this cell; kept unchanged
for i, record in enumerate(FINAL_XML):
    # Create the field only when it is absent: an unconditional reset to "" erases
    # any PREDICTED value already carried by the record, so the `predicted == "true"`
    # test inside normalize_predicted could never succeed.
    record.setdefault("PREDICTED", "")
    FINAL_XML[i] = normalize_predicted(record)

In [16]:
compteur =  0  # not read in this cell; kept unchanged
for i, record in enumerate(FINAL_CSV):
    # Create the field only when it is absent: an unconditional reset to "" erases
    # any PREDICTED value already carried by the record, so the `predicted == "true"`
    # test inside normalize_predicted could never succeed.
    record.setdefault("PREDICTED", "")
    FINAL_CSV[i] = normalize_predicted(record)

In [17]:
compteur =  0  # not read in this cell; kept unchanged
for i, record in enumerate(FINAL_JSON):
    # Create the field only when it is absent: an unconditional reset to "" erases
    # any PREDICTED value already carried by the record, so the `predicted == "true"`
    # test inside normalize_predicted could never succeed.
    record.setdefault("PREDICTED", "")
    FINAL_JSON[i] = normalize_predicted(record)

In [18]:
compteur =  0  # not read in this cell; kept unchanged
for i, record in enumerate(FINAL_MGF):
    # Create the field only when it is absent: an unconditional reset to "" erases
    # any PREDICTED value already carried by the record, so the `predicted == "true"`
    # test inside normalize_predicted could never succeed.
    record.setdefault("PREDICTED", "")
    FINAL_MGF[i] = normalize_predicted(record)

In [19]:
# Build the MSP table from the list of record dictionaries, then unbind the list
# name (presumably to let the raw records be freed). After the `del`, this cell
# cannot be re-run without re-running the conversion cells above.
msp_df = pd.DataFrame(FINAL_MSP)
del FINAL_MSP

In [20]:
# Build the XML table from the list of record dictionaries, then unbind the list
# name (presumably to let the raw records be freed). After the `del`, this cell
# cannot be re-run without re-running the conversion cells above.
xml_df = pd.DataFrame(FINAL_XML)
del FINAL_XML

In [21]:
# Build the CSV table from the list of record dictionaries, then unbind the list
# name (presumably to let the raw records be freed). After the `del`, this cell
# cannot be re-run without re-running the conversion cells above.
csv_df = pd.DataFrame(FINAL_CSV)
del FINAL_CSV

In [22]:
# Build the JSON table from the list of record dictionaries, then unbind the list
# name (presumably to let the raw records be freed). After the `del`, this cell
# cannot be re-run without re-running the conversion cells above.
json_df = pd.DataFrame(FINAL_JSON)
del FINAL_JSON

In [23]:
# Build the MGF table from the list of record dictionaries, then unbind the list
# name (presumably to let the raw records be freed). After the `del`, this cell
# cannot be re-run without re-running the conversion cells above.
mgf_df = pd.DataFrame(FINAL_MGF)
del FINAL_MGF

On ne garde que les spectres LC expérimentaux

In [24]:
if not msp_df.empty:
    # na=False: a missing or non-string instrument type makes .str.contains yield NaN,
    # which cannot be negated with `~`; such rows are treated as "not GC/EI" and kept.
    is_gc_or_ei = msp_df['INSTRUMENTTYPE'].str.contains('GC|EI', case=False, na=False)
    is_experimental = msp_df['PREDICTED'] == "false"
    # Keep rows that are neither GC/EI nor flagged as predicted, in a single pass.
    msp_df = msp_df[~is_gc_or_ei & is_experimental]

In [25]:
if not xml_df.empty:
    # na=False: a missing or non-string instrument type makes .str.contains yield NaN,
    # which cannot be negated with `~`; such rows are treated as "not GC/EI" and kept.
    is_gc_or_ei = xml_df['INSTRUMENTTYPE'].str.contains('GC|EI', case=False, na=False)
    is_experimental = xml_df['PREDICTED'] == "false"
    # Keep rows that are neither GC/EI nor flagged as predicted, in a single pass.
    xml_df = xml_df[~is_gc_or_ei & is_experimental]

In [26]:
if not csv_df.empty:
    # na=False: a missing or non-string instrument type makes .str.contains yield NaN,
    # which cannot be negated with `~`; such rows are treated as "not GC/EI" and kept.
    is_gc_or_ei = csv_df['INSTRUMENTTYPE'].str.contains('GC|EI', case=False, na=False)
    is_experimental = csv_df['PREDICTED'] == "false"
    # Keep rows that are neither GC/EI nor flagged as predicted, in a single pass.
    csv_df = csv_df[~is_gc_or_ei & is_experimental]

In [27]:
if not json_df.empty:
    # na=False: a missing or non-string instrument type makes .str.contains yield NaN,
    # which cannot be negated with `~`; such rows are treated as "not GC/EI" and kept.
    is_gc_or_ei = json_df['INSTRUMENTTYPE'].str.contains('GC|EI', case=False, na=False)
    is_experimental = json_df['PREDICTED'] == "false"
    # Keep rows that are neither GC/EI nor flagged as predicted, in a single pass.
    json_df = json_df[~is_gc_or_ei & is_experimental]

In [28]:
if not mgf_df.empty:
    # na=False: a missing or non-string instrument type makes .str.contains yield NaN,
    # which cannot be negated with `~`; such rows are treated as "not GC/EI" and kept.
    is_gc_or_ei = mgf_df['INSTRUMENTTYPE'].str.contains('GC|EI', case=False, na=False)
    is_experimental = mgf_df['PREDICTED'] == "false"
    # Keep rows that are neither GC/EI nor flagged as predicted, in a single pass.
    mgf_df = mgf_df[~is_gc_or_ei & is_experimental]

Passer RDKit sur chacun des spectres

In [29]:
# Empty tables are skipped; otherwise the MSP table is replaced by the result of
# mols_derivation_and_calculation (not defined here — presumably star-imported from
# mols_calculation). The notebook describes this step as running RDKit on each
# spectrum; the columns it adds or changes are not visible from this file.
if not msp_df.empty:
    msp_df = mols_derivation_and_calculation(msp_df)

                                            derivation and calculation: 100%|[32m██████████[0m| 44225/44225 [00:55<00:00, 796.41 rows/s] 
                                                    updating dataframe: 100%|[32m██████████[0m| 543284/543284 [00:20<00:00, 26166.16 rows/s]


In [30]:
# Empty tables are skipped; otherwise the XML table is replaced by the result of
# mols_derivation_and_calculation (not defined here — presumably star-imported from
# mols_calculation). The notebook describes this step as running RDKit on each
# spectrum; the columns it adds or changes are not visible from this file.
if not xml_df.empty:
    xml_df = mols_derivation_and_calculation(xml_df)

In [31]:
# Empty tables are skipped; otherwise the CSV table is replaced by the result of
# mols_derivation_and_calculation (not defined here — presumably star-imported from
# mols_calculation). The notebook describes this step as running RDKit on each
# spectrum; the columns it adds or changes are not visible from this file.
if not csv_df.empty:
    csv_df = mols_derivation_and_calculation(csv_df)

In [32]:
# Empty tables are skipped; otherwise the JSON table is replaced by the result of
# mols_derivation_and_calculation (not defined here — presumably star-imported from
# mols_calculation). The notebook describes this step as running RDKit on each
# spectrum; the columns it adds or changes are not visible from this file.
if not json_df.empty:
    json_df = mols_derivation_and_calculation(json_df)

                                            derivation and calculation: 100%|[32m██████████[0m| 46930/46930 [00:56<00:00, 830.05 rows/s] 
                                                    updating dataframe: 100%|[32m██████████[0m| 199546/199546 [00:07<00:00, 27592.47 rows/s]


In [33]:
# Empty tables are skipped; otherwise the MGF table is replaced by the result of
# mols_derivation_and_calculation (not defined here — presumably star-imported from
# mols_calculation). The notebook describes this step as running RDKit on each
# spectrum; the columns it adds or changes are not visible from this file.
if not mgf_df.empty:
    mgf_df = mols_derivation_and_calculation(mgf_df)

                                            derivation and calculation: 100%|[32m██████████[0m| 39508/39508 [00:44<00:00, 891.80 rows/s] 
                                                    updating dataframe: 100%|[32m██████████[0m| 129950/129950 [00:03<00:00, 35089.31 rows/s]


Écrire les fichiers de sortie

In [34]:
# Export the MSP table as a semicolon-separated UTF-8 file, without the index column.
msp_df.to_csv(
    r"C:\Users\Axel\Documents\PYTHON\FragHub\TOOLS\Generate_csv_from_original_files\output\msp_df.csv",
    sep=";",
    quotechar='"',
    index=False,
    encoding="UTF-8",
)

In [35]:
# Export the XML table as a semicolon-separated UTF-8 file, without the index column.
xml_df.to_csv(
    r"C:\Users\Axel\Documents\PYTHON\FragHub\TOOLS\Generate_csv_from_original_files\output\xml_df.csv",
    sep=";",
    quotechar='"',
    index=False,
    encoding="UTF-8",
)

In [36]:
# Export the CSV table as a semicolon-separated UTF-8 file, without the index column.
csv_df.to_csv(
    r"C:\Users\Axel\Documents\PYTHON\FragHub\TOOLS\Generate_csv_from_original_files\output\csv_df.csv",
    sep=";",
    quotechar='"',
    index=False,
    encoding="UTF-8",
)

In [37]:
# Export the JSON table as a semicolon-separated UTF-8 file, without the index column.
json_df.to_csv(
    r"C:\Users\Axel\Documents\PYTHON\FragHub\TOOLS\Generate_csv_from_original_files\output\json_df.csv",
    sep=";",
    quotechar='"',
    index=False,
    encoding="UTF-8",
)

In [38]:
# Export the MGF table as a semicolon-separated UTF-8 file, without the index column.
mgf_df.to_csv(
    r"C:\Users\Axel\Documents\PYTHON\FragHub\TOOLS\Generate_csv_from_original_files\output\mgf_df.csv",
    sep=";",
    quotechar='"',
    index=False,
    encoding="UTF-8",
)

Calculer le nombre de spectres LC expérimentaux

In [39]:
# Stack the five filtered tables into one frame with a fresh 0..n-1 index,
# then report how many rows remain.
DF = pd.concat(
    objs=[msp_df, xml_df, json_df, csv_df, mgf_df],
    sort=False,
    ignore_index=True,
)
print(DF.shape[0])

872780


Calculer le nombre de composés uniques

In [42]:
# Add a 'Short_InChIKey' column holding the first 14 characters of each InChIKey
# (the first hyphen-separated block of a standard InChIKey); adjust the length if needed.
DF['Short_InChIKey'] = DF['INCHIKEY'].str[:14]

# Rows without an InChIKey (missing value or blank string) must not be counted as a
# compound: drop_duplicates() would otherwise keep "" / NaN as one extra "unique" key.
short_keys = DF['Short_InChIKey']
has_key = short_keys.notna() & (short_keys.str.strip() != "")

# Distinct short InChIKeys among the rows that actually carry one.
unique_short_inchikeys = short_keys[has_key].drop_duplicates()

# Number of distinct short InChIKeys.
nombre_inchikeys_courts_uniques = len(unique_short_inchikeys)

print(nombre_inchikeys_courts_uniques)

35674
