In [None]:
import pandas as pd
import pandas_utils as pu

from rdkit_mol_identifiers import clean_structure_add_mol_id_columns

## Read SDF or MSP files to extract SMILES, then run the structure cleanup on them

In [None]:

def read_sdf(filename, progress_bar=False):
  """Collect every SMILES string found in an SDF-style text file.

  The file is scanned line by line (trailing whitespace stripped) and three
  layouts are recognized:

  - a line equal to ``> <SMILES>``: the SMILES is the next line
  - a line starting with ``computed SMILES=`` (MoNA export): the SMILES is
    the remainder of that same line
  - a line containing ``<compound_smiles>`` (NPAtlas): the SMILES is the
    next line

  Empty values are skipped.

  :param filename: path of the file to read; it is opened as UTF-8 text
  :param progress_bar: if True, wrap the line iteration in a tqdm progress bar
  :return: DataFrame with a single "SMILES" column, one row per SMILES found,
    in file order
  """
  if progress_bar:
    # Imported lazily: tqdm is not imported at the top of the notebook, so the
    # bare reference raised NameError whenever a progress bar was requested.
    from tqdm.auto import tqdm

  mona_prefix = "computed SMILES="
  smiles_list = []
  with open(filename, 'r', encoding='UTF-8') as file:
    # One shared iterator: the look-ahead reads below pull from the same source
    # as the loop, so consumed value lines are never visited twice.
    lines = iter(tqdm(file) if progress_bar else file)
    for line in lines:
      line = line.rstrip()
      if line == "> <SMILES>":
        # value sits on the following line; "" at end of file
        smi = next(lines, "").rstrip()
        if smi:
          smiles_list.append(smi)

      # MoNA sdf computed SMILES=
      if line.startswith(mona_prefix):
        smi = line[len(mona_prefix):]
        if smi:
          smiles_list.append(smi)

      # NPAtlas SMILES format
      if '<compound_smiles>' in line:
        smi = next(lines, "").rstrip()
        if smi:
          smiles_list.append(smi)

  return pd.DataFrame({
    "SMILES": smiles_list
  })


def read_msp_fast(filename, progress_bar=False):
  """Collect the SMILES values from the ``key: value`` lines of an MSP file.

  Only lines whose key (text before the first colon, case-insensitive,
  surrounding whitespace ignored) is ``smiles`` are used; all other lines,
  including peak lists, are skipped. Empty values are skipped.

  :param filename: path of the file to read; it is opened as UTF-8 text
  :param progress_bar: if True, wrap the line iteration in a tqdm progress bar
  :return: DataFrame with a single "SMILES" column, one row per SMILES found,
    in file order
  """
  if progress_bar:
    # Imported lazily: tqdm is not imported at the top of the notebook, so the
    # bare reference raised NameError whenever a progress bar was requested.
    from tqdm.auto import tqdm

  smiles_list = []
  with open(filename, 'r', encoding='UTF-8') as file:
    for line in tqdm(file) if progress_bar else file:
      # Split on the FIRST colon only: a SMILES may itself contain ':' (atom
      # class labels such as [CH3:1], explicit aromatic bonds), and splitting
      # on every colon silently truncated those values.
      key, separator, value = line.partition(":")
      if not separator or key.strip().lower() != "smiles":
        continue
      smi = value.strip()
      if smi:
        smiles_list.append(smi)

  return pd.DataFrame({
    "SMILES": smiles_list
  })



In [None]:
# Extract the SMILES strings from the MoNA LC-MS/MS SDF export and from the
# MassBank/NIST MSP file; the "SMILES" column is renamed to lowercase "smiles".
# NOTE(review): absolute, machine-specific Windows paths — adjust before running elsewhere.
df_mona_msms = read_sdf(
    r"C:\git\small_mol_database_review\databases\20231208\MoNA-export-LC-MS-MS_Spectra.sdf").rename(columns={"SMILES": "smiles"})
df_massbank_eu = read_msp_fast(
    r"C:\git\small_mol_database_review\databases\20231208\MassBank_NIST.msp").rename(columns={"SMILES": "smiles"})


In [None]:
# Load the GNPS library table and pass it through the structure clean-up helper.
# NOTE(review): drop_mol=True presumably discards an intermediate molecule
# column — confirm in rdkit_mol_identifiers.
df_gnps = pu.read_dataframe(r"C:\git\msn_library\data\gnpslib\20240229_ALL_GNPS_NO_PROPOGATED.tsv")
df_gnps = clean_structure_add_mol_id_columns(df_gnps, drop_mol=True)

In [None]:
# Load the NIST23 tandem library statistics CSV and display it.
df_nist23 = pu.read_dataframe(r"C:\git\msn_library\data\public_library\NIST23tandemLib_statistics.csv")
df_nist23

In [None]:
# Run the structure clean-up helper on the NIST23 table and display the result.
# NOTE(review): df_nist23 is overwritten, so re-running this cell feeds the
# already-processed frame back into the helper.
df_nist23 = clean_structure_add_mol_id_columns(df_nist23, drop_mol=True)
df_nist23

In [None]:
# Write the processed NIST23 table to a separate "_standardized" CSV.
pu.save_dataframe(df_nist23, r"C:\git\msn_library\data\public_library\NIST23tandemLib_statistics_standardized.csv")

In [None]:
# Load the NIST20 "_standardized" TSV and display it (no clean-up is run here).
df_nist20 = pu.read_dataframe(r"C:\git\msn_library\data\public_library\NIST20_standardized.tsv")
df_nist20

In [None]:
# Write the processed GNPS table to a "_standardized" TSV.
pu.save_dataframe(df_gnps, r"C:\git\msn_library\data\gnpslib\20240229_ALL_GNPS_NO_PROPOGATED_standardized.tsv")

In [None]:
# Reload the GNPS table from the "_standardized" TSV and display it.
df_gnps = pu.read_dataframe(r"C:\git\msn_library\data\gnpslib\20240229_ALL_GNPS_NO_PROPOGATED_standardized.tsv")
df_gnps

In [None]:
# Number of distinct values in the GNPS "split_inchikey" column.
df_gnps["split_inchikey"].nunique()

In [None]:
# Run the structure clean-up helper on the MoNA SMILES table.
# NOTE(review): df_mona_msms is overwritten, so re-running this cell feeds the
# already-processed frame back into the helper.
df_mona_msms = clean_structure_add_mol_id_columns(df_mona_msms, drop_mol=True)

In [None]:
# Number of distinct values in the MoNA "split_inchikey" column.
df_mona_msms["split_inchikey"].nunique()

In [None]:
# Write the processed MoNA table to TSV.
pu.save_dataframe(df_mona_msms, r"C:\git\small_mol_database_review\databases\20231208\MoNA_LC_MSMS.tsv")

In [None]:
# Run the structure clean-up helper on the MassBank SMILES table and display it.
# NOTE(review): df_massbank_eu is overwritten, so re-running this cell feeds the
# already-processed frame back into the helper.
df_massbank_eu = clean_structure_add_mol_id_columns(df_massbank_eu, drop_mol=True) 
df_massbank_eu

In [None]:
# Write the processed MassBank table to TSV.
pu.save_dataframe(df_massbank_eu, r"C:\git\small_mol_database_review\databases\20231208\MassBank_nist.tsv")

In [None]:
# Number of distinct values in the MassBank "inchikey" column.
df_massbank_eu["inchikey"].nunique()

In [None]:
# Write the MoNA table to TSV.
# NOTE(review): same frame and same target path as an earlier save cell in this
# notebook — this looks like a redundant duplicate; confirm and remove if so.
pu.save_dataframe(df_mona_msms, r"C:\git\small_mol_database_review\databases\20231208\MoNA_LC_MSMS.tsv")

In [None]:
# Reload the MoNA and MassBank tables from their TSV files, so the cells below
# can run without repeating the parsing and clean-up steps.
df_mona_msms = pu.read_dataframe(r"C:\git\small_mol_database_review\databases\20231208\MoNA_LC_MSMS.tsv")
df_massbank_eu = pu.read_dataframe(r"C:\git\small_mol_database_review\databases\20231208\MassBank_nist.tsv")

In [None]:
# Display the MoNA table.
df_mona_msms

In [None]:
# Number of distinct values in the MoNA "inchikey" column.
df_mona_msms["inchikey"].nunique()

In [None]:
# Number of distinct values in the MoNA "split_inchikey" column.
df_mona_msms["split_inchikey"].nunique()

In [None]:
# Display the MassBank table.
df_massbank_eu

In [None]:
# Number of distinct values in the MassBank "inchikey" column.
df_massbank_eu["inchikey"].nunique()


In [None]:
# Number of distinct values in the MassBank "split_inchikey" column.
df_massbank_eu["split_inchikey"].nunique()

In [None]:
# Extract the SMILES strings from the Weizmann/NIH overlap MS2 MSP library;
# the "SMILES" column is renamed to lowercase "smiles".
weizmann = read_msp_fast(
    r"C:\git\msn_library\library\20240306_weizmann_nih_overlap_ms2.msp").rename(columns={"SMILES": "smiles"})

In [None]:
# Distinct-value count per column (here: the number of distinct raw SMILES strings).
weizmann.nunique()