# 0. Preliminaries

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import platform
import itertools
import re

In [3]:
# Location of the local ViMMS checkout. Set the VIMMS_DIR environment variable to
# point at a different checkout; when it is unset the original location is used.
#user_vimms = "vimms"
user_vimms = os.environ.get(
    "VIMMS_DIR",
    os.path.join("C:\\", "Users", "mcbrider5002", "Desktop", "Workspace", "phd", "peak_picking", "vimms"),
)

# Make the checkout importable. The membership guard keeps sys.path free of
# duplicate entries when this cell is executed more than once in the same kernel.
if user_vimms not in sys.path:
    sys.path.append(user_vimms)

# Paths of the R scripts inside the checkout; they are handed to the peak-picking
# parameters (xcms_r_script) and to the DsDA parameters (dsda_loc) further down.
xcms_r_script = os.path.join(user_vimms, "vimms", "scripts", "xcms_script.R")
dsda_path = os.path.join(user_vimms, "vimms", "scripts", "dsda_script.R")

In [4]:
from vimms.Common import (
    POSITIVE, ROI_TYPE_SMART, ROI_EXCLUSION_WEIGHTED_DEW,
    set_log_level_warning,
)

from vimms.Roi import RoiBuilderParams, SmartRoiParams
from vimms.Controller import TopNController
from vimms.PeakPicking import XCMSScriptParams
from vimms.Experiment import ExperimentCase, Experiment



In [5]:
# Flag named for a progress bar; it is not referenced again in the visible cells.
pbar = False
# Polarity passed to the TopN controller settings and to run_experiment below.
ionisation_mode = POSITIVE
# Kept as the last expression so the cell's displayed output stays the same
# (presumably lowers log verbosity to warnings — TODO confirm in vimms.Common).
set_log_level_warning()

1

### Get seed data

In [6]:
def match_files(data_dir, regex):
    """Return the paths of files in ``data_dir`` whose names match ``regex``, ordered by ID.

    Args:
        data_dir: Directory whose entries (as returned by ``os.listdir``) are examined.
        regex: Pattern applied with ``re.match``, i.e. anchored at the start of the
            file name only. Its first capturing group must capture an integer,
            which is used as the sort key.

    Returns:
        A list of ``os.path.join(data_dir, name)`` paths sorted by ascending
        integer ID; names sharing an ID are ordered alphabetically so the result
        does not depend on the order in which the OS lists the directory.

    Raises:
        ValueError: If group 1 of a matching name is not a valid integer.
        IndexError: If ``regex`` has no capturing group.
    """
    pattern = re.compile(regex)

    matched = []
    for fname in os.listdir(data_dir):
        # Match once and reuse the result instead of running the regex twice.
        hit = pattern.match(fname)
        if hit is not None:
            matched.append((int(hit.group(1)), fname))

    # Tuple ordering sorts by ID first and falls back to the file name on ties.
    matched.sort()

    return [os.path.join(data_dir, fname) for _, fname in matched]

In [7]:
# Directory holding the seed fullscan .mzML files.
# NOTE(review): the folder is named "Negative" while ionisation_mode is set to
# POSITIVE earlier in the notebook — confirm this is the intended data set.
data_dir = os.path.join("C:\\", "Users", "mcbrider5002", "Desktop", "Workspace", "phd", "data", "CLMS", "new_matching_seeds", "Negative")

# match_files applies the pattern with re.match (start-anchored only), so the dot is
# escaped and the end is anchored with "$": otherwise names such as "..._ID3.mzML.bak"
# or "..._ID3_mzML" would be accepted as well. Group 1 is the numeric ID used for sorting.
all_fullscans = match_files(data_dir, r"Vinny_Beers_[0-9]_[0-9]+_ID([0-9]+)\.mzML$")

# The first 12 files in ID order feed the same-beer search, the rest the
# repeated-different-beer search.
same_beer_fullscans = all_fullscans[:12]
rep_diff_beer_fullscans = all_fullscans[12:]

# Number of same-beer files; used later to build the ID range in the aligned-file label.
id_offset = len(same_beer_fullscans)

### Specify parameters

In [8]:
# Worker count passed to run_experiment, evaluate and summarise in the cells below.
num_workers = 8 #can't always use number of physical cores because of memory constraints

In [9]:
# Retention-time window shared by the DsDA settings and the run_experiment calls.
min_rt, max_rt = 0, 1440

# Scan durations keyed by 1 and 2 (presumably the MS level — TODO confirm).
scan_duration_dict = {1: 0.59, 2: 0.19}

In [10]:
# Peak-picking settings reused twice: unpacked into the XCMS script parameters
# here and merged into the DsDA parameters in a later cell.
centwave_params = dict(
    ppm=15,
    pwlower=15,
    pwupper=80,
    snthresh=5,
    noise=1000,
    prefilterlower=3,
    prefilterupper=500,
    mzdiff=0.001,
)

# Parameters for the XCMS R script used by evaluate(); the four grouping-related
# options are passed explicitly as None.
pp_params = XCMSScriptParams(
    xcms_r_script=xcms_r_script,
    **centwave_params,
    mzvsrtbalance=None,
    absmz=None,
    absrt=None,
    kNN=None,
)

In [11]:
# Keyword arguments common to every TopNController built for the grid;
# only N is varied per grid point.
topN_shared = dict(
    ionisation_mode=ionisation_mode,
    isolation_width=1,
    min_ms1_intensity=5000,
    mz_tol=10,
    rt_tol=60,
)

In [12]:
# Entries copied unchanged into every parameter set of the grid.
dsda_shared = dict(
    dsda_loc=dsda_path,
    min_rt=min_rt,
    max_rt=max_rt,
    scan_duration_dict=scan_duration_dict,
    rscript_loc="RScript",
)

# Peak-picking settings plus the DsDA-specific options; "maxdepth" is added
# per grid point when the final parameter sets are assembled.
dsda_params = dict(
    centwave_params,
    repeatmsmsfill=4,
    ms2ltprecursor=True,
    minabsintensity=None,
    minrelintensity=0.001,
)

In [13]:
# The grid is nested: N belongs to the base TopN controller while maxdepth belongs
# to the DsDA parameter dict, so the sets are assembled in two stages.
search_params = dict(
    N=[1, 5, 10, 20],
    maxdepth=[None, 1, 2, 3, 4],
)

search_names = list(search_params)
# Cartesian product of the value lists, in the same order as search_names.
param_combinations = list(itertools.product(*search_params.values()))

# Every combination gets its own port (7011, 7012, ...) because of the
# socket-based hack used to communicate between python and R.
dsda_param_sets = []
for offset, comb in enumerate(param_combinations):
    param_set = dict(zip(search_names, comb))
    param_set["port"] = 7011 + offset
    dsda_param_sets.append(param_set)

# Print the index -> parameters mapping so case names like "dsda_(13)" can be decoded.
for i, params in enumerate(dsda_param_sets):
    description = ','.join(f'{k}={v}' for k, v in params.items())
    print(f"DsDA {i}: {description}")

# Full parameter dict per grid point: shared entries, a TopN base controller
# with this point's N, the DsDA options with this point's maxdepth, and the port.
final_param_sets = [
    dict(
        dsda_shared,
        base_controller=TopNController(**topN_shared, N=params["N"]),
        dsda_params=dict(dsda_params, maxdepth=params["maxdepth"]),
        port=params["port"],
    )
    for params in dsda_param_sets
]

DsDA 0: N=1,maxdepth=None,port=7011
DsDA 1: N=1,maxdepth=1,port=7012
DsDA 2: N=1,maxdepth=2,port=7013
DsDA 3: N=1,maxdepth=3,port=7014
DsDA 4: N=1,maxdepth=4,port=7015
DsDA 5: N=5,maxdepth=None,port=7016
DsDA 6: N=5,maxdepth=1,port=7017
DsDA 7: N=5,maxdepth=2,port=7018
DsDA 8: N=5,maxdepth=3,port=7019
DsDA 9: N=5,maxdepth=4,port=7020
DsDA 10: N=10,maxdepth=None,port=7021
DsDA 11: N=10,maxdepth=1,port=7022
DsDA 12: N=10,maxdepth=2,port=7023
DsDA 13: N=10,maxdepth=3,port=7024
DsDA 14: N=10,maxdepth=4,port=7025
DsDA 15: N=20,maxdepth=None,port=7026
DsDA 16: N=20,maxdepth=1,port=7027
DsDA 17: N=20,maxdepth=2,port=7028
DsDA 18: N=20,maxdepth=3,port=7029
DsDA 19: N=20,maxdepth=4,port=7030


# Same Beer Grid Search

In [14]:
# A single seed fullscan is selected with a one-element slice and reused `repeat`
# times, so every simulated injection in this search is built from the same file.
# NOTE(review): the slice start repeat+1 is a 0-based index into the ID-sorted list,
# while the evaluation cell labels the aligned output "stefanbeersID{repeat+1}" —
# confirm that both refer to the same file.
repeat = 6
out_dir = "dsda_same_grid_search"
fullscans = same_beer_fullscans[repeat+1:repeat+2] * repeat




In [15]:
# One "dsda" case per grid point; the case name carries the grid index so results
# can be traced back to the printed "DsDA <i>: ..." parameter listing.
same_beer_exp = Experiment()
same_beer_cases = (
    ExperimentCase("dsda", fullscans, case_params, name=f"dsda_({idx})", pickle_env=False)
    for idx, case_params in enumerate(final_param_sets)
)
same_beer_exp.add_cases(same_beer_cases)

# Options for the run; results are written under out_dir.
same_beer_run_options = dict(
    min_rt=min_rt,
    max_rt=max_rt,
    ionisation_mode=ionisation_mode,
    scan_duration_dict=scan_duration_dict,
    overwrite_keyfile=False,
    point_noise_threshold=0.0,
    chem_noise_threshold=topN_shared["min_ms1_intensity"] * 0.5, #filter low intensity signal for memory
    num_workers=num_workers,
)
same_beer_exp.run_experiment(out_dir, **same_beer_run_options)

Creating Chemicals...

Running Experiment of 20 cases...


In [16]:
# Rebuild the experiment from "keyfile.json" in out_dir (presumably the keyfile
# written by run_experiment — TODO confirm), pointing it at the seed fullscans.
same_beer_load_options = dict(
    file_dir=out_dir,
    file_name="keyfile.json",
    out_dir=out_dir,
    fullscan_dir=data_dir,
    amend_result_path=True,
)
same_beer_exp = Experiment.load_from_json(**same_beer_load_options)

In [17]:
# Evaluate every case against the XCMS peak-picking output, then print the summary
# ranked by "cumulative_intensity_proportion".
isolation_width = 1.0
same_beer_eval_options = dict(
    pp_params=pp_params,
    num_workers=num_workers,
    isolation_widths=isolation_width,
    aligned_names=f"stefanbeersID{repeat+1}",
    force_peak_picking=True,
)
same_beer_exp.evaluate(**same_beer_eval_options)

same_beer_exp.summarise(num_workers=num_workers, rank_key="cumulative_intensity_proportion")

Running XCMS for dsda_same_grid_search\stefanbeersID7_xcms_aligned.csv
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file
5589 aligned boxes contained in file

dsda_(13)
Number of chems above min intensity: 4716
Number of fragmentations: [5780, 5760, 5760, 5760, 5760, 5760]
Cumulative coverage: [2175, 3065, 3464, 3605, 3725, 3795]
Cumulative cove

# Repeated Different Beer Grid Search

In [18]:
# The first `bio_repeat` files of the different-beer list are taken as a group and
# the whole group is repeated `tech_repeat` times, giving the order A,B,C,D,A,B,C,D,...
# Note that out_dir and fullscans rebind the names used by the same-beer section,
# so the cells of this section must be run after that section has finished.
bio_repeat = 4
tech_repeat = 3
out_dir = "dsda_repeated_different_grid_search"
fullscans = rep_diff_beer_fullscans[:bio_repeat] * tech_repeat




In [19]:
# One "dsda" case per grid point; the case name carries the grid index so results
# can be traced back to the printed "DsDA <i>: ..." parameter listing.
rep_diff_beer_exp = Experiment()
rep_diff_beer_cases = (
    ExperimentCase("dsda", fullscans, case_params, name=f"dsda_({idx})", pickle_env=False)
    for idx, case_params in enumerate(final_param_sets)
)
rep_diff_beer_exp.add_cases(rep_diff_beer_cases)

# Options for the run; results are written under out_dir.
rep_diff_beer_run_options = dict(
    min_rt=min_rt,
    max_rt=max_rt,
    ionisation_mode=ionisation_mode,
    scan_duration_dict=scan_duration_dict,
    overwrite_keyfile=False,
    point_noise_threshold=0.0,
    chem_noise_threshold=topN_shared["min_ms1_intensity"] * 0.5, #filter low intensity signal for memory
    num_workers=num_workers,
)
rep_diff_beer_exp.run_experiment(out_dir, **rep_diff_beer_run_options)

Creating Chemicals...

Running Experiment of 20 cases...


In [20]:
# Rebuild the experiment from "keyfile.json" in out_dir (presumably the keyfile
# written by run_experiment — TODO confirm), pointing it at the seed fullscans.
rep_diff_beer_load_options = dict(
    file_dir=out_dir,
    file_name="keyfile.json",
    out_dir=out_dir,
    fullscan_dir=data_dir,
    amend_result_path=True,
)
rep_diff_beer_exp = Experiment.load_from_json(**rep_diff_beer_load_options)

In [21]:
# Evaluate every case against the XCMS peak-picking output, then print the summary
# ranked by "cumulative_intensity_proportion". The aligned-file label spans the IDs
# id_offset+1 .. id_offset+bio_repeat.
isolation_width = 1.0
rep_diff_beer_eval_options = dict(
    pp_params=pp_params,
    num_workers=num_workers,
    isolation_widths=isolation_width,
    aligned_names=f"stefanbeersID{id_offset+1}-{id_offset+bio_repeat}",
    force_peak_picking=True,
)
rep_diff_beer_exp.evaluate(**rep_diff_beer_eval_options)

rep_diff_beer_exp.summarise(num_workers=num_workers, rank_key="cumulative_intensity_proportion")

Running XCMS for dsda_repeated_different_grid_search\stefanbeersID13-16_xcms_aligned.csv
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file
11804 aligned boxes contained in file

dsda_(15)
Number of chems above min intensity: 9747
Number of fragmentations: [6558, 6534, 6534, 6534, 6534, 6534, 6534, 6534, 6534, 6534, 6534, 6534]