-
Notifications
You must be signed in to change notification settings - Fork 1
/
user.yml
123 lines (105 loc) · 9.83 KB
/
user.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# parameters
general:
  # Path to the raw (input) data folder. Required, str.
  raw_data_path: ../data/input
  # Path to the treated (output) data folder. Required, str.
  treated_data_path: ../data/output/individual_analysis
  # Ionization mode to treat: pos, neg or both. Required, str.
  polarity: pos
data-download:
  # Zenodo record id of the dataset to download. Optional, int.
  record_id: 10018590
  # Name of the dataset to download. Optional, str.
  record_name: enpkg_toy_dataset.zip
data-organization:
  # Path to the folder containing the mass spec data. Required, str.
  source_path: ../data/input/enpkg_toy_dataset/msdata/processed
  # Path to the folder containing the metadata. Required, str.
  source_metadata_path: ../data/input/enpkg_toy_dataset/metadata
  # Name of the sample metadata file. Required, str.
  sample_metadata_filename: metadata.tsv
  # Name of the LCMS method parameters file. Required, str.
  lcms_method_params_filename: lcms_method_params.txt
  # Name of the LCMS processing parameters file. Required, str.
  lcms_processing_params_filename: lcms_processing_params.xml
massive-id-addition:
  # MassIVE dataset id. Required, str.
  massive_id: MSV000087728
taxo-info-fetching:
  # Recompute taxonomical information for samples with results already done.
  # Optional, bool, default true.
  recompute: true
isdb:
  adducts-formatter:
    # NOTE(review): originally described as "Path to your spectral library
    # file", although the key and file name point to a metadata file — confirm.
    taxo_db_metadata_path: ./db_metadata/230106_frozen_metadata.csv.gz
  general_params:
    # Recompute for samples with results already done.
    recompute: true
  paths:
    # NOTE(review): same file as adducts-formatter.taxo_db_metadata_path but
    # written without the leading "./"; also originally described as "Path to
    # your spectral library file" — confirm which description is intended.
    taxo_db_metadata_path: db_metadata/230106_frozen_metadata.csv.gz
    # Path to the metadata of the spectral file in PI mode.
    spectral_db_pos_path: db_spectra/isdb_pos_cleaned.pkl
    # Path to the metadata of the spectral file in NI mode.
    # NOTE(review): this is an .mgf file while the PI entry is a .pkl — confirm
    # that the consumer accepts both formats.
    spectral_db_neg_path: db_spectra/isdb_neg.mgf
    # Path to the adducts file in pos mode.
    adducts_pos_path: data_loc/230106_frozen_metadata/230106_frozen_metadata_adducts_pos.tsv.gz
    # Path to the adducts file in neg mode.
    adducts_neg_path: data_loc/230106_frozen_metadata/230106_frozen_metadata_adducts_neg.tsv.gz
  spectral_match_params:
    # Parent mass tolerance to use for spectral matching (in Da) (if cosine).
    parent_mz_tol: 0.01
    # MS/MS mass tolerance to use for spectral matching (in Da) (if cosine).
    msms_mz_tol: 0.01
    # Minimal cosine to use for spectral matching (if cosine).
    min_score: 0.2
    # Minimal number of matching peaks to use for spectral matching (if cosine).
    min_peaks: 6
  networking_params:
    # MS/MS mass tolerance to use for spectral matching (in Da).
    mn_msms_mz_tol: 0.01
    # Minimal modified cosine score for edge creation.
    mn_score_cutoff: 0.7
    # Maximum number of links to add per node.
    mn_max_links: 10
    # Consider an edge between spectrumA and spectrumB if the score falls into
    # top_n for spectrumA and spectrumB.
    mn_top_n: 15
  reweighting_params:
    # Number of candidate structures to output for each feature.
    top_to_output: 1
    # Tolerance for MS1 matching (adducts).
    ppm_tol_ms1: 2
    # Use cluster chemical consistency after taxonomical reweighting
    # (true or false).
    use_post_taxo: true
    # Top N candidates to consider for cluster chemical consistency.
    top_N_chemical_consistency: 15
    # Minimal taxonomical score for MS1-only candidates
    # (6: family, 7: genus, 8: species).
    min_score_taxo_ms1: 8
    # Minimum chemical consistency score
    # (1 = NPClassifier pathway level consistency,
    #  2 = NPClassifier superclass level consistency,
    #  3 = NPClassifier class level consistency).
    min_score_chemo_ms1: 2
    # Weight attributed to the spectral score.
    msms_weight: 4
    # Weight attributed to the taxonomical score.
    taxo_weight: 1
    # Weight attributed to the chemical consistency score.
    chemo_weight: 0.5
sirius:
  paths:
    # Path to the SIRIUS executable (default is 'sirius').
    # NOTE(review): the value below is a machine-specific path; adjust it to
    # the local installation. Other locations previously noted here:
    #   /Users/pma/Downloads/sirius/sirius.app/sirius/bin
    #   /Applications/sirius.app/Contents/MacOS/sirius
    #   /prog/sirius/bin/sirius
    path_to_sirius: /Users/pma/02_tmp/sirius/sirius.app/Contents/MacOS/sirius
  options:
    # SIRIUS version: 4 or 5.
    sirius_version: 5
    # Ionization mode to process: pos or neg.
    ionization: 'pos'
    # Command-line arguments handed to SIRIUS. Don't change the input and
    # output parameters (presumably `-i {file}` and `--output {output_name}`).
    # Folded scalar: the lines below are joined with single spaces into one
    # command string, exactly as the former backslash-continued quoted string.
    sirius_command_arg: >-
      -i {file}
      --output {output_name}
      --maxmz 800
      config
      --IsotopeSettings.filter=true
      --FormulaSearchDB=BIO
      --Timeout.secondsPerTree=0
      --FormulaSettings.enforced=HCNOP
      --Timeout.secondsPerInstance=0
      --AdductSettings.detectable=[[M+Na]+,[M+H3N+H]+,[M-H4O2+H]+,[M+K]+,[M+H]+,[M-H2O+H]+]
      --UseHeuristic.mzToUseHeuristicOnly=650
      --AlgorithmProfile=qtof
      --IsotopeMs2Settings=IGNORE
      --MS2MassDeviation.allowedMassDeviation=10.0ppm
      --NumberOfCandidatesPerIon=1
      --UseHeuristic.mzToUseHeuristic=300
      --FormulaSettings.detectable=B,Cl,Br,Se,S
      --NumberOfCandidates=10
      --ZodiacNumberOfConsideredCandidatesAt300Mz=10
      --ZodiacRunInTwoSteps=true
      --ZodiacEdgeFilterThresholds.minLocalConnections=10
      --ZodiacEdgeFilterThresholds.thresholdFilter=0.95
      --ZodiacEpochs.burnInPeriod=2000
      --ZodiacEpochs.numberOfMarkovChains=10
      --ZodiacNumberOfConsideredCandidatesAt800Mz=50
      --ZodiacEpochs.iterations=20000
      --AdductSettings.enforced=,
      --AdductSettings.fallback=[[M+Na]+,[M-H+K+K]+,[M+K]+,[M+H]+,[M-H2O+H]+]
      --FormulaResultThreshold=true
      --InjectElGordoCompounds=true
      --StructureSearchDB=BIO
      --RecomputeResults=false
      formula
      zodiac
      fingerprint
      structure
      canopus
      write-summaries
      --output {output_name}
    # NOTE(review): the four keys below are assumed to belong to `options`;
    # confirm against the code that reads this section.
    # Recompute files for which an output directory (with the same prefix)
    # already exists: true or false (no quotes).
    recompute: false
    # Zip individual features outputs to save disk space: true or false
    # (no quotes).
    zip_output: true
    # NOTE(review): presumably the names of the environment variables that hold
    # the SIRIUS account user name and password — confirm.
    sirius_user_env: SIRIUS_USERNAME
    sirius_password_env: SIRIUS_PASSWORD
chemo-info-fetching:
  # NOTE(review): presumably the file name and folder of the structures SQL
  # database; joined they give ../sql_db/structures_metadata.db — confirm.
  sql_db_name: structures_metadata.db
  sql_db_path: ../sql_db/
  # GNPS id. Only fill this in if you have launched a global GNPS job;
  # leave as null otherwise.
  gnps_id: null
memo:
  # Output name to use for the generated MEMO matrix. Required, str.
  output: memo_matrix
  # Output path to use for the generated MEMO matrix.
  # str, default '../data/output/memo_matrix'.
  output_path: ../data/output/memo_matrix
  # Minimal relative intensity to keep a peak. float, default 0.01.
  min_relative_intensity: 0.01
  # Maximal relative intensity to keep a peak. float, default 1.0.
  max_relative_intensity: 1
  # Minimum number of peaks to keep a spectrum. int, default 10.
  min_peaks_required: 10
  # Minimal m/z value for losses. int, default 10.
  losses_from: 10
  # Maximal m/z value for losses. int, default 200.
  losses_to: 200
  # Number of decimals when translating peaks/losses into words.
  # int, default 2.
  n_decimals: 2
  # Remove blank samples from the MEMO matrix. bool, default false.
  filter_blanks: false
  # Only used when filter_blanks is true. If a word is present in more than
  # n blanks, it is removed from the MEMO matrix. int, default -1 (all words
  # kept).
  word_max_occ_blanks: -1
graph-builder:
  # URI of the knowledge graph, e.g. https://dbgi.vital-it.ch/. Required, str.
  kg_uri: https://enpkg.commons-lab.org/kg/
  # Prefix of the knowledge graph, e.g. dbgi, emikg, enpkg. Required, str.
  prefix: enpkg
  # URI of the module. Optional, str.
  module_uri: https://enpkg.commons-lab.org/module/
  # Prefix of the module. Optional, str.
  prefix_module: enpkgmodule
  # URL of the target in ChEMBL. Optional, str.
  target_chembl_url: https://www.ebi.ac.uk/chembl/target_report_card/
  # Namespace of Wikidata. Required, str.
  wd_namespace: http://www.wikidata.org/entity/
  # Path to the structures SQL database. Required, str.
  structures_db_path: ../sql_db/structures_metadata.db
  # Prefix of the GNPS dashboard. Required, str.
  gnps_dashboard_prefix: 'https://dashboard.gnps2.org/?usi=mzspec:'
  # Prefix of the GNPS TIC/PIC. Required, str.
  gnps_tic_pic_prefix: 'https://dashboard.gnps2.org/mspreview?usi=mzspec:'
  # Prefix of the MassIVE dataset. Required, str.
  massive_prefix: 'https://massive.ucsd.edu/ProteoSAFe/dataset.jsp?accession='
  # Prefix of the spectrum dashboard. Required, str.
  spectrum_dashboard_prefix: 'https://metabolomics-usi.gnps2.org/dashinterface?usi1='
  # Prefix of the spectrum png. Required, str.
  spectrum_png_prefix: 'https://metabolomics-usi.gnps2.org/png/?usi1='
  # Prefix of the GNPS fast search. Required, str.
  gnps_fast_search_prefix: 'https://fasst.gnps2.org/fastsearch/?usi1='
  # Header of the source taxon, e.g. organism_species. Required, str.
  source_taxon_header: source_taxon
  # Header of the source id, e.g. sample_substance_name. Required, str.
  source_id_header: source_id
  # NOTE(review): peak_loss_params is assumed to contain only n_decimals, with
  # the keys after it being direct children of graph-builder — confirm.
  peak_loss_params:
    # Number of decimals of m/z to use for spectral binning.
    n_decimals: 1
  # Format of the graph: ttl (.ttl, the default) or nt (.nt). Required, str.
  graph_format: ttl
  # NOTE(review): presumably the directory where standalone graphs are written
  # (an alternative noted previously: ../tests/data/output/standalone_graphs).
  graph_output_dir_path: ../data/output/standalone_graphs
  # Compress outputs. Optional, bool, default false.
  compress_outputs: false