**This code example works with raw MS Data downloaded from the MetaboLights repository (~ 1 GiB).  The corresponding data is currently under the curation process of MetaboLights. These notebooks will be updated when the data can be publicly accessed. The running time is ~ 1 min (excluding the time required to download the data) using a Personal Computer with an 8th generation Intel i5 processor and 8 GiB memory.**

# Application 1: System suitability check, and signal drift evaluation

This notebook introduces the analysis of the application #1 published in (add DOI). It shows how to work with raw data using as an example a System Suitability Check conducted in a metabolomics experiment: System Suitability Samples (SSS) were prepared using five known chemical standards:

* Alogliptin
* Phe-Phe
* Tryptophan
* LPC 18:0
* Leu-Enk

Ten SSS samples (addressed as SSS1) were run consecutively and used to build acceptance criteria, which were then compared against the values obtained from SSS samples analyzed before (SSS2) and after (SSS3) the study samples. This analysis is displayed in Figure 3.

A similar analysis was conducted using QC samples that were spiked with the same compounds and with Leu-13C used as internal standard, but in this case, no acceptance criteria were defined. These results are displayed in Figure S1.

In [None]:
import tidyms as ms
import numpy as np
import bokeh as bk
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from download_from_metabolights import get_application1_data
# `import bokeh` alone does not load the `bokeh.plotting` submodule: the
# attribute `bk.plotting` only exists if some other package imported it first.
# Import it explicitly so this cell does not rely on that side effect.
import bokeh.plotting
# render bokeh figures inline in the notebook
bk.plotting.output_notebook()
# show floats in DataFrames with 4 digits of precision
pd.set_option("display.precision", 4)
# seaborn style used by the figures below
sns.set_context("paper", font_scale=1.5)

## Loading data from MetaboLights

In [None]:
# Local folder where the raw data and the sample list are stored.
data_path = "data"

# Downloads the data from MetaboLights when it is not available locally.
get_application1_data(data_path)

# Locations inside the data folder: the directory handed to feature detection
# and the CSV file with the sample list.
centroid_data_path = os.path.join(data_path, "cent")
sample_list_path = os.path.join(data_path, "sample_list.csv")

# Sample metadata; the first column of the CSV file is used as the index.
sample_list = pd.read_csv(sample_list_path, index_col=0)

In [None]:
# Compounds used for the SSS check, one record per compound:
# (name, expected retention time, m/z)
standards = [
    ("Leu-13C", 75, 133.1056),
    ("Trp", 129, 205.0977),
    ("Phe-Phe", 320, 313.1552),
    ("Alogliptin", 291, 340.1773),
    ("LPC 18:0", 775, 524.3716),
    ("Leu-Enk", 372, 556.2771),
]
# column-oriented view of the records, used to build the DataFrame
d = {"Compound": [name for name, _, _ in standards],
     "rt": [rt for _, rt, _ in standards],
     "mz": [mz for _, _, mz in standards]}
df = pd.DataFrame(data=d)

## Feature Detection

In [None]:
# Feature detection in all samples, targeted at the m/z values of the
# compounds used for SSS (listed in `df`).
roi_params = dict(targeted_mz=df["mz"].values, tolerance=0.015)
# `feature_table` contains the detected features and `roi` is a dictionary
# where each key is a sample name and each value is the list of ROI detected
# in that sample
roi, feature_table = ms.detect_features(centroid_data_path, roi_params=roi_params)

In [None]:
# first rows of the table of detected features
feature_table.head(n=5)

In [None]:
# Plot the ROI where one of the detected features was found.
# `feature_row` is the label of the feature in the index of `feature_table`;
# change it to inspect a different feature.
feature_row = 40
roi_sample = feature_table.loc[feature_row, "sample"]
roi_index = feature_table.loc[feature_row, "roi index"]
# `roi` is indexed first by sample name and then by ROI index
roi[roi_sample][roi_index].plot()

## Feature correspondence

In [None]:
# Feature correspondence: features detected in different samples are grouped
# and each group is identified by a label. Later cells read these labels from
# the "cluster" column of the feature table, one label per ionic species
# detected across samples.
# NOTE(review): the positional arguments 0.01 and 5 are presumably the m/z and
# retention time tolerances used for grouping - confirm against the tidyms
# documentation.
cluster = ms.feature_correspondence(feature_table, 0.01, 5)
# make_data_container organizes the feature table into
# a data matrix, feature metadata and sample metadata
data = ms.make_data_container(feature_table, cluster, sample_list)

In [None]:
# feature table after correspondence; only the first rows are displayed
# instead of dumping the complete table
feature_table.head()

## Remove unwanted features using the Retention times from the standards

In [None]:
# Find the feature that corresponds to each compound used in the SSS, searching
# the data container by the m/z and the expected rt of the compound.
# `ft_to_compound` maps feature label -> compound name.
ft_to_compound = dict()
for compound, mz, rt in zip(df["Compound"], df["mz"], df["rt"]):
    ft = data.select_features(mz, rt)
    if len(ft) == 0:
        # fail with an explicit message instead of an IndexError on ft[0]
        msg = "No feature found for {} (m/z = {}, rt = {})".format(compound, mz, rt)
        raise ValueError(msg)
    # when more than one feature matches the query, the first one is used
    ft_to_compound[ft[0]] = compound

In [None]:
# Keep only the features assigned to a chemical standard: every other column
# of the data matrix is removed from the data container.
standard_features = ft_to_compound.keys()
detected_features = data.data_matrix.columns
rm_features = detected_features.difference(standard_features)
data.remove(rm_features, "features")

In [None]:
# Annotate each row of the feature table:
# - compound name, looked up from the cluster label of the feature
# - class of the sample, looked up from the sample list
compound_by_row = feature_table["cluster"].map(ft_to_compound)
class_by_row = feature_table["sample"].map(sample_list["class"])
feature_table["Compound"] = compound_by_row
feature_table["class"] = class_by_row

In [None]:
# first rows of the feature table, now with the Compound and class columns
feature_table.head(n=5)

## Figure 3: m/z, Rt and area dispersion for SSS samples

In [None]:
# FIGURE 3: m/z, Rt and area dispersion of each compound in the SSS samples
sss_mask = feature_table["class"].isin(["SSS1", "SSS2", "SSS3"])
sss_data = feature_table[sss_mask].copy()

# Mean centered m/z and rt: the mean of each cluster (computed on the SSS
# samples only) is subtracted from every value of the cluster.
# `transform` returns a Series aligned with the rows of `sss_data`, so the
# result can be stored as a column. A transform-like `groupby(...).apply(...)`
# prepends the group keys to the index on recent pandas versions and the
# column assignment no longer aligns.
sss_cluster = sss_data["cluster"]
sss_data["mean mz"] = \
    sss_data["mz"] - sss_data["mz"].groupby(sss_cluster).transform("mean")
sss_data["mean rt"] = \
    sss_data["rt"] - sss_data["rt"].groupby(sss_cluster).transform("mean")

# one panel per variable, compounds on the y axis, one marker per SSS class
xvars = ["mean mz", "mean rt", "area"]
g = sns.PairGrid(data=sss_data,
                 y_vars=["Compound"],
                 x_vars=xvars,
                 hue="class",
                 hue_kws={"marker": [".", "X", "D"], "size": [8, 8, 8]},
                 height=4)
g.map(sns.stripplot)

# setting plot properties
g.axes[0, 0].set_xlim(-0.01, 0.01)
# area ticks are labelled in units of 1e5 to keep the labels short
area_ticks = np.linspace(0, 2e5, 5)
g.axes[0, 2].set_xticks(area_ticks)
g.axes[0, 2].set_xticklabels([str(x / 100000) for x in area_ticks])
g.axes[0, 0].set_xlabel("Mean centered m/z")
g.axes[0, 1].set_xlabel("Mean centered Rt [s]")
g.axes[0, 2].set_xlabel("Area / $10^{5}$ [au]");
# g.savefig("metabolomics-2020-sss.png", dpi=300)

## Figure S2: m/z, Rt and area dispersion for QC samples

In [None]:
# FIGURE S1: m/z, Rt and area dispersion of each compound in the QC samples

# also remove LPC 18:0 because the area has much higher values
qc_mask = (feature_table["class"].isin(["QC"])
           & (feature_table["Compound"] != "LPC 18:0"))
qc_data = feature_table[qc_mask].copy()

# Mean centered m/z and rt: the mean of each cluster (computed on the QC
# samples only) is subtracted from every value of the cluster.
# `transform` returns a Series aligned with the rows of `qc_data`, so the
# result can be stored as a column. A transform-like `groupby(...).apply(...)`
# prepends the group keys to the index on recent pandas versions and the
# column assignment no longer aligns.
qc_cluster = qc_data["cluster"]
qc_data["mean_mz"] = \
    qc_data["mz"] - qc_data["mz"].groupby(qc_cluster).transform("mean")
qc_data["mean_rt"] = \
    qc_data["rt"] - qc_data["rt"].groupby(qc_cluster).transform("mean")

# one panel per variable, compounds on the y axis
xvars = ["mean_mz", "mean_rt", "area"]
g = sns.PairGrid(data=qc_data,
                 y_vars=["Compound"],
                 x_vars=xvars,
                 hue="class",
                 height=4)
g.map(sns.stripplot)

# setting plot properties
g.axes[0, 0].set_xlim(-0.01, 0.01)
g.axes[0, 1].set_xlim(-1, 1)
# area ticks are labelled in units of 1e5 to keep the labels short
# (if LPC 18:0 is kept in the plot, np.linspace(2e6, 7e6, 6) is a better range)
area_ticks = np.linspace(0, 3e5, 6)
g.axes[0, 2].set_xticks(area_ticks)
g.axes[0, 2].set_xticklabels([str(x / 100000) for x in area_ticks])
g.axes[0, 0].set_xlabel("Mean centered m/z")
g.axes[0, 1].set_xlabel("Mean centered Rt [s]")
g.axes[0, 2].set_xlabel("Area / $10^{5}$ [au]");
# g.savefig("rt_mz_area_qc_lpc.png", dpi=300)

### Figure S1: overlapped ROI for each feature

In [None]:
# an optional figure to show feature correspondence results
def plot_clustered_roi(feature_table, roi, ft, classes, cmapper):
    """
    Overlay the ROI traces of the features grouped in one cluster.

    For every row of the cluster whose class is in `classes`, the complete
    ROI trace is drawn in gray and the region of the detected peak is filled
    with the color assigned to the class.

    Parameters
    ----------
    feature_table : DataFrame
        Must have the columns "cluster", "class", "sample", "roi index" and
        "peak index".
    roi : dict
        Indexed by sample name and then by ROI index. Each ROI must expose
        the attributes `rt`, `spint` and `peaks`.
    ft : cluster label
        Value of the "cluster" column that selects the rows to plot.
    classes : container of class names
        Rows with a class that is not in this container are skipped.
    cmapper : dict
        Maps a class name to the fill color of its peaks.

    Returns
    -------
    fig, ax : the matplotlib figure and axes with the plot.

    """
    fig, ax = plt.subplots(figsize=(12, 8))
    members = feature_table.groupby("cluster").get_group(ft)
    for row_label in members.index:
        sample_class = members.loc[row_label, "class"]
        if sample_class not in classes:
            continue
        sample_name = members.loc[row_label, "sample"]
        roi_index = members.loc[row_label, "roi index"]
        peak_index = members.loc[row_label, "peak index"]
        trace = roi[sample_name][roi_index]
        peak = roi[sample_name][roi_index].peaks[peak_index]
        # the filled region includes the point at index `peak.end`
        peak_region = slice(peak.start, peak.end + 1)
        ax.plot(trace.rt, trace.spint, color="gray", alpha=0.5)
        ax.fill_between(trace.rt[peak_region], trace.spint[peak_region],
                        alpha=0.5, color=cmapper[sample_class])
    return fig, ax

# Cluster to plot. The title and the retention time window are looked up from
# the cluster label, so that the title, the traces and the x axis range always
# describe the same compound.
kft = "FT04"
title_str = ft_to_compound[kft]
expected_rt = df.loc[df["Compound"] == title_str, "rt"].iloc[0]

# Classes to overlay and fill color used for each class.
# To plot the system suitability samples instead, use:
# roi_classes = ["SSS1", "SSS2", "SSS3"]
# roi_colors = {"SSS1": "lightskyblue",
#               "SSS2": "darkorange",
#               "SSS3": "green"}
roi_classes = ["QC"]
roi_colors = {"QC": "lightskyblue"}

fig, ax = plot_clustered_roi(feature_table, roi, kft, roi_classes, roi_colors)
ax.set_title(title_str)
# 100 s window that starts 25 s before the expected retention time
ax.set_xlim(expected_rt - 25, expected_rt + 75)
ax.set_xlabel("Rt [s]");
# plt.savefig(kft + ".png", dpi=300)