# 0. Preliminaries

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import platform
import re

In [3]:
# Root of the local ViMMS checkout. Set the VIMMS_DIR environment variable to
# point somewhere else; when it is unset the original hard-coded location is used.
user_vimms = os.environ.get(
    "VIMMS_DIR",
    os.path.join("C:\\", "Users", "mcbrider5002", "Desktop", "Workspace", "phd", "peak_picking", "vimms"),
)
#user_vimms = "vimms"
# Only extend sys.path once so re-running this cell does not pile up duplicates.
if user_vimms not in sys.path:
    sys.path.append(user_vimms)

# Pick the batch template matching the MZmine version in use ("_old" suffix for old MZmine).
old_mzmine = False
template_suffix = "_old" if old_mzmine else ""
mzmine_template = os.path.join(user_vimms, "batch_files", f"multi_sample_peak_pick{template_suffix}.xml")

# R scripts shipped inside the checkout; used by the XCMS and DsDA parameter cells.
xcms_r_script = os.path.join(user_vimms, "vimms", "scripts", "xcms_script.R")
dsda_path = os.path.join(user_vimms, "vimms", "scripts", "dsda_script.R")

In [4]:
# Location of the MZmine 2.53 Windows launcher script; passed to MZMineParams as mzmine_exe in a later cell.
mzmine_path = os.path.join("C:\\", "Users", "mcbrider5002", "Desktop", "Workspace", "phd", "peak_picking", "MZmine-2.53-Windows", "startMZmine-Windows.bat")

In [5]:
from vimms.Common import (
    POSITIVE, ROI_TYPE_SMART, ROI_EXCLUSION_WEIGHTED_DEW,
    set_log_level_warning,
)

from vimms.Roi import RoiBuilderParams, SmartRoiParams
from vimms.Experiment import ExperimentCase, Experiment
from vimms.Controller import TopNController
from vimms.PeakPicking import MZMineParams, XCMSScriptParams
from vimms.Matching import MatchingScan, Matching
from vimms.Controller.misc import TaskFilter



In [6]:
# Ionisation mode reused by the controller, matching and experiment-run settings in later cells.
ionisation_mode = POSITIVE
# NOTE(review): pbar is not referenced again in the visible cells — presumably a progress-bar toggle; confirm it is still needed.
pbar = False
# Helper imported from vimms.Common; the name suggests it restricts logging to warnings — TODO confirm.
set_log_level_warning()

1

### Get seed data

In [7]:
def match_files(data_dir, regex):
    """Return the paths of files in ``data_dir`` whose names match ``regex``.

    ``regex`` is applied with ``re.match`` (anchored at the start of the
    filename) and its first capture group must be an integer. The returned
    paths are ordered by that integer, ascending.

    Args:
        data_dir: Directory to list.
        regex: Pattern with at least one capture group holding a number.

    Returns:
        A list of ``os.path.join(data_dir, filename)`` strings.
    """
    matcher = re.compile(regex)

    numbered = []
    for entry in os.listdir(data_dir):
        hit = matcher.match(entry)
        if hit is None:
            continue
        numbered.append((int(hit.group(1)), entry))

    # Sort on the numeric id only, so ties keep their listing order.
    numbered.sort(key=lambda pair: pair[0])
    return [os.path.join(data_dir, entry) for _, entry in numbered]

In [8]:
# Directory holding the seed fullscan .mzML files.
# NOTE(review): this points at the "Negative" folder while ionisation_mode is POSITIVE — confirm this is intended.
data_dir = os.path.join("C:\\", "Users", "mcbrider5002", "Desktop", "Workspace", "phd", "data", "CLMS", "new_matching_seeds", "Negative")

# The dot is escaped and the pattern is anchored at the end so that only real
# ".mzML" files are picked up (an unescaped "." matches any character, and an
# unanchored pattern also accepts names such as "..._ID3.mzML.bak").
all_fullscans = match_files(data_dir, r"Vinny_Beers_[0-9]_[0-9]+_ID([0-9]+)\.mzML$")
# The first 12 files (by ID) form the "same beer" set; the remainder go to the other set.
same_beer_fullscans = all_fullscans[:12]
rep_diff_beer_fullscans = all_fullscans[12:]
id_offset = len(same_beer_fullscans)

### Specify parameters

In [9]:
# Worker count handed to Experiment.run_experiment(num_workers=...) in a later cell.
num_workers = 8 # can't always use the number of physical cores because of memory constraints

In [10]:
# Parameter sets for each controller family, looked up by key when the cases are assembled in later cells.
experiment_params = {
    # Base settings used for every controller; also unpacked into the TopNController that DsDA uses.
    "topN_params": {
        "ionisation_mode" : ionisation_mode,
        "N" : 20,
        "isolation_width" : 1,
        "min_ms1_intensity" : 5000,
        "mz_tol" : 10,
        "rt_tol" : 60
    },
    
    # Extra settings merged on top of topN_params for the RoI-based controllers.
    "roi_params" : {
        "min_roi_length_for_fragmentation" : 0,
        "roi_params" : RoiBuilderParams(
                            min_roi_intensity=0,
                            min_roi_length=3,
                       )
    },
    
    # Intentionally empty: the non-overlap cases currently add nothing to roi_params.
    "non_overlap_params": {
    },
    
    # Overrides merged in when building the "<name>_smartroi" variant of a case.
    "smartroi_params": {
        "rt_tol" : 15,
        "smartroi_params" : SmartRoiParams(
                                reset_length_seconds=1E6,
                                intensity_increase_factor=3,
                                drop_perc=0.001
                            )  
    },
    
    # Overrides merged in when building the "<name>_weighteddew" variant of a case.
    "weighteddew_params": {
        "rt_tol": 60,
        "exclusion_method": ROI_EXCLUSION_WEIGHTED_DEW,
        "exclusion_t_0": 1
    }    
}

# Retention-time window passed to run_experiment and the DsDA parameters.
min_rt = 0
max_rt = 1440
# Nominal scan durations; keys are presumably MS levels (1 and 2) — TODO confirm units.
scan_duration_dict = {
    1: 0.59,
    2: 0.19
}
# Passed to run_experiment as point_noise_threshold.
point_noise_threshold = 0

In [11]:
# Peak-picking settings shared by the XCMS script parameters here and by the DsDA parameters in a later cell.
centwave_params = {
    "ppm" : 15,
    "pwlower" : 15,
    "pwupper" : 80,
    "snthresh" : 5,
    "noise" : 1000,
    "prefilterlower" : 3,
    "prefilterupper" : 500,
    "mzdiff" : 0.001
}

# MZmine peak-picking configuration (built regardless, but only used if pp_params is switched below).
mzmine_object = MZMineParams(
    mzmine_template = mzmine_template,
    mzmine_exe = mzmine_path
)

# XCMS peak-picking configuration; the four grouping-related options are explicitly left as None.
xcms_object = XCMSScriptParams(
    xcms_r_script = xcms_r_script,
    **centwave_params,
    mzvsrtbalance = None,
    absmz = None,
    absrt = None,
    kNN = None
)

# Select the peak picker used for aligned peak picking and matching; swap the comment to use MZmine instead.
#pp_params = mzmine_object
pp_params = xcms_object

In [12]:
# Parameters for the "dsda" controller cases.
dsda_params = {
    "dsda_loc" : dsda_path, 
    # TopN controller built from topN_params with N overridden to 10.
    "base_controller" : TopNController(**{**experiment_params["topN_params"], "N" : 10}),
    "min_rt" : min_rt,
    "max_rt" : max_rt, 
    "scan_duration_dict" : scan_duration_dict,
    # The "dsda_resynced" case added later uses this port + 1.
    "port" : 7011,
    # presumably the Rscript executable, resolved from PATH — TODO confirm
    "rscript_loc" : "RScript",
    # Same peak-picking settings as XCMS, plus a DsDA-specific maxdepth.
    "dsda_params" : {
        **centwave_params,
        "maxdepth" : 3
    }
}

In [13]:
# Take the MS1/MS2 lengths from scan_duration_dict instead of repeating the
# literals, so the task filter cannot drift from the nominal scan durations.
task_filter = TaskFilter(
    ms1_length = scan_duration_dict[1], 
    ms2_length = scan_duration_dict[2], 
    skip_margin=0.5, 
    add_margin=1.2
)

# Settings shared by every matching case; isolation width and intensity
# threshold are taken from topN_params so matching and TopN stay comparable.
matching_params = {
    "aligned_reader" : pp_params,
    "ionisation_mode" : ionisation_mode,
    "isolation_width" : experiment_params["topN_params"]["isolation_width"],
    "intensity_threshold" : experiment_params["topN_params"]["min_ms1_intensity"],
}

# Same as matching_params, plus the task filter used by the "resynced" cases.
matching_params_with_resync = {
    **matching_params,
    "task_filter" : task_filter
}

### Specify controllers to run

In [14]:
# Parameter sets build on one another: topN -> RoI -> non-overlap.
topN_params = experiment_params["topN_params"]

roi_params = dict(topN_params)
roi_params.update(experiment_params["roi_params"])

non_overlap_params = dict(roi_params)
non_overlap_params.update(experiment_params["non_overlap_params"])

# (controller_type, params) pairs to run; commented-out entries are disabled.
cases = [
    ("topN", topN_params),
    ("dsda", dsda_params),
    #("topN_RoI", roi_params),
    ("topN_exclusion", topN_params),
    #("topNEx", non_overlap_params),
    #("hard_roi_exclusion", non_overlap_params),
    #("intensity_roi_exclusion", non_overlap_params),
    #("non_overlap", non_overlap_params),
    ("intensity_non_overlap", non_overlap_params)
]

# Controllers that never receive SmartROI / WeightedDEW variants.
no_smartroi = ["topN", "topN_RoI", "topN_exclusion", "dsda"]
variant_capable = [case_name for case_name, _ in cases if case_name not in no_smartroi]
#run_smartroi = []
run_smartroi = list(variant_capable)
#run_weighteddew = []
run_weighteddew = list(variant_capable)

In [15]:
# Expand each (controller_type, params) pair into (controller_type, name, params)
# triples, adding a "_smartroi" and/or "_weighteddew" variant where requested.
variant_specs = (
    ("_smartroi", run_smartroi, "smartroi_params"),
    ("_weighteddew", run_weighteddew, "weighteddew_params"),
)

new_cases = []
for controller_type, params in cases:
    # The plain case is always kept, named after its controller type.
    new_cases.append((controller_type, controller_type, params))

    for suffix, enabled_for, override_key in variant_specs:
        if controller_type not in enabled_for:
            continue
        new_name = controller_type + suffix
        new_params = {**params, **experiment_params[override_key]}
        new_cases.append((controller_type, new_name, new_params))

# Extra DsDA run with the task filter enabled, on the next port up.
dsda_resynced_params = {**dsda_params, "port" : dsda_params["port"] + 1, "task_filter" : task_filter}
new_cases.append(("dsda", "dsda_resynced", dsda_resynced_params))

cases = new_cases

# 1. Same Beer Repeated Multiple Times

In [16]:
# Simulate the same beer `repeat` times by reusing the first fullscan file.
repeat = 4
out_dir = "new_same_beer"
fullscans = same_beer_fullscans[:1] * repeat
# Base name handed to the peak picker; kept under its own name so that
# aligned_file only ever refers to what pick_aligned_peaks returns.
aligned_name = "stefanbeersID1" + template_suffix

aligned_file = pp_params.pick_aligned_peaks(
    fullscans,
    out_dir,
    aligned_name,
    force=True
)
print()

# One schedule of scan times per injection.
times_list = [
    list(MatchingScan.topN_times(N, max_rt, scan_duration_dict))
    #for N in [20, 10, 5, 3, 2, 1] #+ [1] * 14
    for N in [20] * repeat
]

# (case name, base parameter set, full_assignment_strategy) for each matching
# variant; everything else is identical between them.
matching_variants = [
    ("two_step_matching", matching_params, 0),
    ("two_step_matching_with_recursive_assignment", matching_params, 1),
    ("two_step_matching_resynced", matching_params_with_resync, 0),
    ("two_step_matching_with_recursive_assignment_resynced", matching_params_with_resync, 1),
]

same_cases = cases + [
    (
        "matching",
        variant_name,
        {
            **base_params,
            "aligned_file" : aligned_file,
            "times_list" : times_list,
            "weighted" : Matching.TWOSTEP,
            "full_assignment_strategy" : strategy
        }
    )
    for variant_name, base_params, strategy in matching_variants
]

# Summarise what will be run; matching parameters are skipped to keep the output short.
for controller_type, name, params in same_cases:
    print(f"NAME: {name}")
    print(f"CONTROLLER TYPE: {controller_type}")
    if "matching" not in name:
        print(f"PARAMS: {params}")
    print()

Running XCMS for new_same_beer\stefanbeersID1_xcms_aligned.csv
5979 aligned boxes contained in file

NAME: topN
CONTROLLER TYPE: topN
PARAMS: {'ionisation_mode': 'Positive', 'N': 20, 'isolation_width': 1, 'min_ms1_intensity': 5000, 'mz_tol': 10, 'rt_tol': 60}

NAME: dsda
CONTROLLER TYPE: dsda
PARAMS: {'dsda_loc': 'C:\\Users\\mcbrider5002\\Desktop\\Workspace\\phd\\peak_picking\\vimms\\vimms\\scripts\\dsda_script.R', 'base_controller': <class 'vimms.Controller.topN.TopNController'>(advanced_params=<vimms.Controller.base.AdvancedParams object at 0x000001A6AF4F9180>,scans=defaultdict(<class 'list'>, {}),scan_to_process=None,environment=None,next_processed_scan_id=100000,initial_scan_id=100000,current_task_id=100000,processing_times=[],last_ms1_rt=0.0,ionisation_mode=Positive,N=10,isolation_width=1,mz_tol=10,rt_tol=60,min_ms1_intensity=5000,ms1_shift=0,force_N=False,exclusion=<vimms.Exclusion.TopNExclusion object at 0x000001A6AF4F8B80>,deisotope=False,charge_range=(2, 3),min_fit_score=80,pe

In [17]:
# Re-run every case with the simulated scan durations scaled from 0.5x to 1.5x
# of the nominal values, writing each sweep point to its own output directory.
# same_beer_exp is rebound on every iteration, so after the loop it refers to
# the experiment of the last (largest) desync factor.
for desync_factor in [1 + i / 10 for i in range(-5, 6)]:
    new_times = {level: time * desync_factor for level, time in scan_duration_dict.items()}
    print(new_times)

    same_beer_exp = Experiment()
    same_beer_exp.add_cases(
        ExperimentCase(controller_type, fullscans, params, name=name, pickle_env=False)
        for controller_type, name, params in same_cases
    )

    # round() rather than int(): a float product can land just below the
    # intended integer (e.g. 100 * 0.57 == 56.99999999999999), and truncating
    # would then mislabel the output directory.
    desync_percent = round(100 * desync_factor)

    same_beer_exp.run_experiment(
        out_dir + f"_desync{desync_percent}",
        min_rt=min_rt,
        max_rt=max_rt,
        ionisation_mode=ionisation_mode,
        scan_duration_dict=new_times,
        overwrite_keyfile=False,
        point_noise_threshold=point_noise_threshold,
        chem_noise_threshold=experiment_params["topN_params"]["min_ms1_intensity"] * 0.5, #filter low intensity signal for memory
        num_workers=num_workers
    )

{1: 0.295, 2: 0.095}
Creating Chemicals...

Running Experiment of 11 cases...
{1: 0.354, 2: 0.11399999999999999}
Creating Chemicals...

Running Experiment of 11 cases...
{1: 0.413, 2: 0.13299999999999998}
Creating Chemicals...

Running Experiment of 11 cases...
{1: 0.472, 2: 0.15200000000000002}
Creating Chemicals...

Running Experiment of 11 cases...
{1: 0.531, 2: 0.171}
Creating Chemicals...

Running Experiment of 11 cases...
{1: 0.59, 2: 0.19}
Creating Chemicals...

Running Experiment of 11 cases...
{1: 0.649, 2: 0.20900000000000002}
Creating Chemicals...

Running Experiment of 11 cases...
{1: 0.708, 2: 0.22799999999999998}
Creating Chemicals...

Running Experiment of 11 cases...
{1: 0.767, 2: 0.24700000000000003}
Creating Chemicals...

Running Experiment of 11 cases...
{1: 0.826, 2: 0.26599999999999996}
Creating Chemicals...

Running Experiment of 11 cases...
{1: 0.885, 2: 0.28500000000000003}
Creating Chemicals...

Running Experiment of 11 cases...


In [18]:
# Re-initialise the shared matching object of the two non-resynced matching cases and print its size.
# NOTE(review): same_beer_exp is whatever the last iteration of the desync loop left behind.
for case_name in ["two_step_matching", "two_step_matching_with_recursive_assignment"]:
    i = same_beer_exp.case_names.index(case_name)
    case = same_beer_exp.cases[i]
    # NOTE(review): the recorded output prints "Matching is None" for both cases, contradicting the "shouldn't be" expectation — investigate.
    if(case.shared.shared is None): print("Matching is None") #shouldn't be
    # NOTE(review): this passes Matching.UNWEIGHTED, while the cases themselves were declared with Matching.TWOSTEP — confirm this is intended.
    case.shared.init_shareable(
        {**matching_params, "aligned_file" : aligned_file, "times_list" : times_list, "weighted" : Matching.UNWEIGHTED},
        "",
        fullscans
    )
    matching = case.shared.shared
    print(case.shared.name)
    print(f"Size of matching for {case_name}: {len(matching)}") # previously noted as 1897; the recorded output below shows 4740

Matching is None
matching
Size of matching for two_step_matching: 4740
Matching is None
matching
Size of matching for two_step_matching_with_recursive_assignment: 4740
