# params/user.yml

# parameters
general:
  # Path to the raw data (or input data) folder.
  # required: True; type: str
  raw_data_path: ../data/input
  # Path to the treated data (or output data) folder.
  # required: True; type: str
  treated_data_path: ../data/output/individual_analysis
  # Ionization mode to treat: pos, neg or both.
  # required: True; type: str
  polarity: pos

data-download:
  # Zenodo record id of the dataset to download.
  # required: False; type: int
  record_id: 10018590
  # Name of the dataset to download.
  # required: False; type: str
  record_name: enpkg_toy_dataset.zip

data-organization:
  # Path to the folder containing the mass spec data.
  # required: True; type: str
  source_path: ../data/input/enpkg_toy_dataset/msdata/processed
  # Path to the folder containing the metadata.
  # required: True; type: str
  source_metadata_path: ../data/input/enpkg_toy_dataset/metadata
  # Name of the sample metadata file.
  # required: True; type: str
  sample_metadata_filename: metadata.tsv
  # Name of the LCMS method parameters file.
  # required: True; type: str
  lcms_method_params_filename: lcms_method_params.txt
  # Name of the LCMS processing parameters file.
  # required: True; type: str
  lcms_processing_params_filename: lcms_processing_params.xml

massive-id-addition:
  # MassIVE dataset id.
  # required: True; type: str
  massive_id: MSV000087728

taxo-info-fetching:
  # Recompute taxonomical information for samples with results already done.
  # required: False; type: bool; default: True
  recompute: True

isdb:
  adducts-formatter:
    # Originally described as "Path to your spectral library file".
    # NOTE(review): the key name and the file name suggest a metadata file
    # rather than a spectral library — confirm. The same key under `paths`
    # below carries the same file without the leading "./".
    taxo_db_metadata_path: ./db_metadata/230106_frozen_metadata.csv.gz

  general_params:
    # Recompute for samples with results already done.
    recompute: True

  paths:
    # Originally described as "Path to your spectral library file".
    # NOTE(review): see the remark on the same key under `adducts-formatter`.
    taxo_db_metadata_path: db_metadata/230106_frozen_metadata.csv.gz
    # Path to the metadata of the spectral file in PI mode.
    spectral_db_pos_path: db_spectra/isdb_pos_cleaned.pkl
    # Path to the metadata of the spectral file in NI mode.
    # NOTE(review): this is an .mgf file while the PI entry is a .pkl file —
    # confirm that both formats are accepted by the consumer.
    spectral_db_neg_path: db_spectra/isdb_neg.mgf
    # Path to the adducts file in pos mode.
    adducts_pos_path: data_loc/230106_frozen_metadata/230106_frozen_metadata_adducts_pos.tsv.gz
    # Path to the adducts file in neg mode.
    adducts_neg_path: data_loc/230106_frozen_metadata/230106_frozen_metadata_adducts_neg.tsv.gz

  spectral_match_params:
    # Parent mass tolerance to use for spectral matching, in Da (if cosine).
    parent_mz_tol: 0.01
    # MS/MS mass tolerance to use for spectral matching, in Da (if cosine).
    msms_mz_tol: 0.01
    # Minimal cosine to use for spectral matching (if cosine).
    min_score: 0.2
    # Minimal number of matching peaks to use for spectral matching (if cosine).
    min_peaks: 6

  networking_params:
    # MS/MS mass tolerance to use for spectral matching, in Da.
    mn_msms_mz_tol: 0.01
    # Minimal modified cosine score for edge creation.
    mn_score_cutoff: 0.7
    # Maximum number of links to add per node.
    # NOTE(review): the original file attached this sentence to mn_top_n and
    # the "top_n" sentence to mn_max_links; the two descriptions looked
    # swapped and are exchanged here — confirm against the networking code.
    mn_max_links: 10
    # Consider an edge between spectrumA and spectrumB if the score falls into
    # top_n for spectrumA and spectrumB.
    mn_top_n: 15

  reweighting_params:
    # Number of candidate structures to output for each feature.
    top_to_output: 1
    # Tolerance for MS1 matching (adducts).
    ppm_tol_ms1: 2
    # Use cluster chemical consistency after taxonomical reweighting
    # (True or False).
    use_post_taxo: True
    # Top N candidates to consider for cluster chemical consistency.
    top_N_chemical_consistency: 15
    # Minimal taxonomical score for MS1-only candidates
    # (6: family, 7: genus, 8: species).
    min_score_taxo_ms1: 8
    # Minimum chemical consistency score:
    #   1 = NPClassifier pathway level consistency
    #   2 = NPClassifier superclass level consistency
    #   3 = NPClassifier class level consistency
    min_score_chemo_ms1: 2
    # Weight attributed to the spectral score.
    msms_weight: 4
    # Weight attributed to the taxonomical score.
    taxo_weight: 1
    # Weight attributed to the chemical consistency score.
    chemo_weight: 0.5

sirius:
  paths:
    # Path to the sirius executable (default is 'sirius').
    # Other locations previously noted on this line, kept as examples:
    #   /Users/pma/Downloads/sirius/sirius.app/sirius/bin
    #   /Applications/sirius.app/Contents/MacOS/sirius
    #   /prog/sirius/bin/sirius
    # NOTE(review): the value below is a machine-specific absolute path.
    path_to_sirius: /Users/pma/02_tmp/sirius/sirius.app/Contents/MacOS/sirius
  options:
    # Sirius version: 4 or 5.
    sirius_version: 5
    # Ionization mode to process: pos or neg.
    ionization: 'pos'
    # Arguments of the sirius command.
    # Don't change the input and output parameters (`-i {file}` and
    # `--output {output_name}`).
    # Written as a folded block scalar (`>-`): the lines below are joined with
    # single spaces into one string without a trailing newline. This is the
    # same value as the earlier backslash-continued double-quoted string, but
    # it does not depend on continuation lines that were indented less than
    # the key itself, which strict YAML parsers reject.
    # Keep every line at the same indentation and free of trailing spaces,
    # otherwise the folded value changes.
    sirius_command_arg: >-
      -i {file} --output {output_name} --maxmz 800 config --IsotopeSettings.filter=true --FormulaSearchDB=BIO --Timeout.secondsPerTree=0
      --FormulaSettings.enforced=HCNOP --Timeout.secondsPerInstance=0 --AdductSettings.detectable=[[M+Na]+,[M+H3N+H]+,[M-H4O2+H]+,[M+K]+,[M+H]+,[M-H2O+H]+]
      --UseHeuristic.mzToUseHeuristicOnly=650 --AlgorithmProfile=qtof --IsotopeMs2Settings=IGNORE --MS2MassDeviation.allowedMassDeviation=10.0ppm
      --NumberOfCandidatesPerIon=1 --UseHeuristic.mzToUseHeuristic=300 --FormulaSettings.detectable=B,Cl,Br,Se,S --NumberOfCandidates=10
      --ZodiacNumberOfConsideredCandidatesAt300Mz=10 --ZodiacRunInTwoSteps=true --ZodiacEdgeFilterThresholds.minLocalConnections=10
      --ZodiacEdgeFilterThresholds.thresholdFilter=0.95 --ZodiacEpochs.burnInPeriod=2000 --ZodiacEpochs.numberOfMarkovChains=10 --ZodiacNumberOfConsideredCandidatesAt800Mz=50
      --ZodiacEpochs.iterations=20000 --AdductSettings.enforced=, --AdductSettings.fallback=[[M+Na]+,[M-H+K+K]+,[M+K]+,[M+H]+,[M-H2O+H]+]
      --FormulaResultThreshold=true --InjectElGordoCompounds=true --StructureSearchDB=BIO --RecomputeResults=false
      formula zodiac fingerprint structure canopus write-summaries --output {output_name}
    # Recompute files for which the output directory (with the same prefix)
    # already exists: True or False (no quotes).
    recompute: False
    # Zip individual features outputs to save disk space: True or False
    # (no quotes).
    zip_output: True
    # NOTE(review): presumably the names of the environment variables holding
    # the Sirius account user name and password — confirm against the consumer.
    sirius_user_env: SIRIUS_USERNAME
    sirius_password_env: SIRIUS_PASSWORD

chemo-info-fetching:
  # File name of the SQL database.
  sql_db_name: structures_metadata.db
  # Folder of the SQL database.
  sql_db_path: ../sql_db/
  # GNPS id. Only fill if you have been launching a global GNPS id job;
  # leave as null otherwise.
  gnps_id: null

memo:
  # Output name to use for the generated MEMO matrix.
  # required: True; type: str
  output: memo_matrix
  # Output path to use for the generated MEMO matrix.
  # type: str; default: '../data/output/memo_matrix'
  output_path: ../data/output/memo_matrix
  # Minimal relative intensity to keep a peak.
  # type: float; default: 0.01
  min_relative_intensity: 0.01
  # Maximal relative intensity to keep a peak.
  # type: float; default: 1.0
  max_relative_intensity: 1
  # Minimum number of peaks to keep a spectrum.
  # type: int; default: 10
  min_peaks_required: 10
  # Minimal m/z value for losses.
  # type: int; default: 10
  losses_from: 10
  # Maximal m/z value for losses.
  # type: int; default: 200
  losses_to: 200
  # Number of decimals when translating peaks/losses into words.
  # type: int; default: 2
  n_decimals: 2
  # Remove blanks samples from the MEMO matrix.
  # type: bool; default: False
  filter_blanks: False
  # Set filter_blanks to True to use. If a word is present in more than n
  # blanks, the word is removed from the MEMO matrix.
  # type: int; default: -1 (all words kept)
  word_max_occ_blanks: -1

graph-builder:
  # URI of the knowledge graph, e.g. https://dbgi.vital-it.ch/
  # required: True; type: str
  kg_uri: 'https://enpkg.commons-lab.org/kg/'
  # Prefix of the knowledge graph, e.g. dbgi, emikg, enpkg.
  # required: True; type: str
  prefix: enpkg
  # URI of the module.
  # required: False; type: str
  module_uri: 'https://enpkg.commons-lab.org/module/'
  # Prefix of the module.
  # required: False; type: str
  prefix_module: enpkgmodule
  # URL of the target in ChEMBL.
  # required: False; type: str
  target_chembl_url: 'https://www.ebi.ac.uk/chembl/target_report_card/'

  # Namespace of Wikidata.
  # required: True; type: str
  wd_namespace: 'http://www.wikidata.org/entity/'

  # Path to the structures SQL database,
  # e.g. /home/allardpm/ENPKG/data/structures_db/structures_metadata.db
  # required: True; type: str
  structures_db_path: ../sql_db/structures_metadata.db
  # Prefix of the GNPS dashboard.
  # required: True; type: str
  gnps_dashboard_prefix: 'https://dashboard.gnps2.org/?usi=mzspec:'
  # Prefix of the GNPS TIC/PIC.
  # required: True; type: str
  gnps_tic_pic_prefix: 'https://dashboard.gnps2.org/mspreview?usi=mzspec:'
  # Prefix of the MassIVE dataset.
  # required: True; type: str
  massive_prefix: 'https://massive.ucsd.edu/ProteoSAFe/dataset.jsp?accession='
  # Prefix of the spectrum dashboard.
  # required: True; type: str
  spectrum_dashboard_prefix: 'https://metabolomics-usi.gnps2.org/dashinterface?usi1='
  # Prefix of the spectrum png.
  # required: True; type: str
  spectrum_png_prefix: 'https://metabolomics-usi.gnps2.org/png/?usi1='
  # Prefix of the GNPS fast search.
  # required: True; type: str
  gnps_fast_search_prefix: 'https://fasst.gnps2.org/fastsearch/?usi1='

  # Header of the source taxon, e.g. organism_species.
  # required: True; type: str
  source_taxon_header: source_taxon
  # Header of the source id, e.g. sample_substance_name.
  # required: True; type: str
  source_id_header: source_id

  peak_loss_params:
    # Number of decimals of m/z to use for spectral binning.
    n_decimals: 1

  # Format of the graph: default is .ttl format (ttl), but can be .nt (nt).
  # required: True; type: str
  graph_format: ttl
  # Output folder of the graphs. Other locations previously noted here:
  #   /home/allardpm/ENPKG/data/pf1600_sub/standalone_graphs
  #   ../tests/data/output/standalone_graphs
  graph_output_dir_path: ../data/output/standalone_graphs
  # Compress outputs.
  # required: False; type: bool; default: False
  compress_outputs: False