In [1]:
import numpy as np
import pandas as pd
import ms_feature_validation as mfv
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

## Reading Metabolomics data

Metabolomics data is stored in a DataContainer object. A DataContainer can be built from pandas DataFrames or read directly from the output files of common tools.

In [2]:
# Reading data from a Progenesis csv file
# fname is a relative path, so the csv file must be in the working directory.
# The object returned by read_progenesis is kept in `data` and used by the
# following cells.
fname = "progenesis_data_matrix_20190918.csv"
data = mfv.filter.read_progenesis(fname)

The data container stores information in three different DataFrames:

1. Data Matrix: contains feature values for each sample. Each sample is a row and each feature is a column.
2. Sample Metadata: contains sample information, such as class, run order, batch, sample id, etc... Each row is a sample.
3. Feature Metadata: contains feature information. In the case of LC-MS data it contains retention time, exact mass, etc... Each row is a feature.

Indices are shared between data matrix rows and sample metadata rows, and between data matrix columns and feature metadata rows.

In [3]:
# First rows of the data matrix (samples as rows, features as columns).
data.data_matrix.head()

feature,7.07_663.1141m/z,7.07_514.1940m/z,7.07_677.3182m/z,7.07_849.5030m/z,7.07_337.1398m/z,7.07_125.0234m/z,7.07_1189.5734m/z,7.07_273.2985m/z,7.07_400.0327m/z,7.07_919.4429n,...,7.53_185.1176m/z,14.22_246.2576n,15.74_342.2756n,14.76_162.9161m/z,16.36_293.1729m/z,14.99_180.8400m/z,14.64_162.9211m/z,15.38_901.5212m/z,14.04_302.2425n,16.89_365.3399m/z
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20190917_001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,165.462393,5021.043288,1616.486733,163.441998,22.859482,627.574816,88.691954,403.700768,860.758958,89.170041
20190917_003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.382744,1862.391176,777.170579,137.955794,17.045837,429.084467,83.938129,52.466431,342.585379,8.4785
20190917_007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.894827,1976.196198,900.537921,114.785809,17.334182,409.479664,78.111128,0.0,428.797598,26.856258
20190917_019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,78.956374,2839.857975,2273.082794,181.982139,28.629302,559.047145,97.55706,300.179417,612.079054,98.117394
20190917_031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,98.241624,2913.377054,2484.757258,191.633181,30.200875,608.195486,109.875468,345.911712,700.41316,122.964165


In [4]:
# First rows of the sample metadata (one row per sample).
data.sample_metadata.head()

Unnamed: 0_level_0,class
sample,Unnamed: 1_level_1
20190917_001,Zero
20190917_003,Solvent
20190917_007,Solvent
20190917_019,Solvent
20190917_031,Solvent


In [5]:
# First rows of the feature metadata (one row per feature).
data.feature_metadata.head()

Unnamed: 0_level_0,Neutral mass (Da),mz,Charge,rt,Chromatographic peak width (min),Identifications,Max Fold Change,Highest Mean,Lowest Mean,Isotope Distribution,Maximum Abundance,Minimum CV%
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
7.07_663.1141m/z,,663.114121,1,423.948,0.101483,0,inf,QC d1 v2,Zero,100 - 19.2,20.885235,79.47
7.07_514.1940m/z,,514.193963,1,423.948,0.050733,0,inf,QC d1 v1,Zero,100 - 69.2,21.56875,24.7
7.07_677.3182m/z,,677.318163,1,423.948,0.084567,0,inf,QC d1 v1,Zero,100 - 27,38.728473,11.85
7.07_849.5030m/z,,849.503049,1,423.948,0.06765,0,inf,QC d1 v2,Zero,100 - 12.4,23.914494,20.65
7.07_337.1398m/z,,337.139812,1,423.948,0.1184,0,inf,QC d2 v1,Zero,100,82.486405,6.46


Some common fields, such as class, run order and batch number, are accessible as DataContainer attributes. Accessing run order or batch raises an exception if they are not defined.

In [6]:
# First entries of the class of each sample, indexed by sample name.
data.classes.head()

sample
20190917_001       Zero
20190917_003    Solvent
20190917_007    Solvent
20190917_019    Solvent
20190917_031    Solvent
Name: class, dtype: object

### Setting run order information and batch information

Order and batch can be set as attributes of the DataContainer.

In [None]:
# In this example, the sample name contains batch and order information.
# This code extracts that info from the sample name and sets the batch and
# order attributes of the DataContainer.
#
# The index is expected to have the format: name_date_project_run_order
# The date is converted to a batch number and the last field is the run order.
# NOTE(review): the sample names displayed earlier in this notebook look like
# "20190917_001" (date_order). With that format the date is field 0, not
# field 1 -- confirm the name format and adjust DATE_FIELD before running.
DATE_FIELD = 1    # position of the date after splitting the name on "_"
ORDER_FIELD = -1  # position of the run order (last field)

# split every sample name once and reuse the fields for both batch and order
name_fields = pd.Series(data=data.sample_metadata.index.str.split("_"),
                        index=data.data_matrix.index)

# extracting batch data: each distinct date, taken in sorted order, becomes
# batch number 1, 2, 3, ...
batch = name_fields.apply(lambda fields: fields[DATE_FIELD])
days = np.sort(batch.unique())
batch_map = dict(zip(days, np.arange(1, days.size + 1)))
batch = batch.map(batch_map)

# extracting order data: the last field of the name converted to an integer
order = name_fields.apply(lambda fields: fields[ORDER_FIELD]).astype(int)

data.order = order
data.batch = batch

## Data curation

Data curation is implemented through a series of Process objects that perform transformations on the data matrix or remove features/samples according to a criterion. Data curation is strongly based on concepts defined in [this paper](https://doi.org/10.1007).

Although the filters are highly customizable, the easiest way to perform data curation is to first define a mapping.
A mapping is a dictionary that maps sample types to sample classes. Using the information provided by a mapping, a Processor knows which samples to use to correct a data set and which classes are to be corrected.

Once created, a filter is used with the method process.

In [None]:
# In this example the quality control samples are the samples of class "QC",
# blank samples are the samples of class "SV" and study samples are the
# samples of classes "EI", "EII", "EIII", "EIV" and "CS".
# NOTE(review): the classes displayed earlier in this notebook ("Zero",
# "Solvent") do not appear in this mapping -- confirm that the class names
# match the data set that is actually loaded.
mapping = {"blank": ["SV"],
           "qc": ["QC"],
           "sample": ["EI", "EII", "EIII", "EIV", "CS"]}
data.mapping = mapping

## Getting common metrics from DataContainer objects

Some common metrics associated with metabolomics data can be obtained using the metrics object:

1. CV for each feature
2. D-Ratio for each feature
3. Detection rate for each feature
4. PCA loadings, scores and cumulative variance

In [None]:
# cv for each class
# The result is kept in `cv`; only its first rows are displayed.
cv = data.metrics.cv()
cv.head()

In [None]:
# PCA restricted to two components; the three returned objects are unpacked
# as score, loading and variance.
score, loading, variance = data.metrics.pca(n_components=2)

In [None]:
# Scatter plot of the PCA scores (PC1 vs PC2), colored by sample class.
fig, axes = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=score, x="PC1", y="PC2", hue=data.classes, ax=axes)
# A title lets the figure stand alone; the trailing ";" hides the text repr
# that would otherwise be shown as the cell output.
axes.set_title("PCA scores");

In [None]:
# Each filter object is created and then applied to `data` through its
# process method. The br, pf and vf objects are reused by the pipeline cell
# further down.

# blank correction
br = mfv.filter.BlankCorrector(mode="lod")
br.process(data)

# prevalence filter
pf = mfv.filter.PrevalenceFilter()
pf.process(data)

# variation filter
vf = mfv.filter.VariationFilter()
vf.process(data)

Several filters can be applied using the Pipeline object

In [None]:
# revert filter effects
data.reset()

# process data with several filters using a pipeline
# The pipeline is built from the br, pf and vf filter objects created in the
# previous cell, listed in the same order in which they were applied there.
pipe = mfv.filter.Pipeline([br, pf, vf])
pipe.process(data)

## Analyzing raw LC-MS data

Raw MS data in the mzML format can be read using the pyopenms module. Several functions are incorporated in the MSData object to read and process MS data.

In [None]:
# NOTE: mfv is already imported in the first cell; this repeated import is
# redundant.
import ms_feature_validation as mfv
# Build an MSData object from the mzML file (relative path, so the file must
# be in the working directory).
lcms_data = mfv.fileio.MSData("20190918_039.mzML")

In [None]:
# making EIC for a list of mz
# get_eic is called with the three m/z values; its result is unpacked as
# rt and eic.
mz_list = [203.0821, 508.3403, 285.2066]
rt, eic = lcms_data.get_eic(mz_list)

In [None]:
# Plot row 1 of eic against rt.
# NOTE(review): presumably row 1 corresponds to mz_list[1] (508.3403) -- confirm.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(rt, eic[1, :])

In [None]:
# Run pick_cwt on the same trace that was plotted (row 1 of eic), with
# min_width=1.
mfv.peaks.pick_cwt(rt, eic[1, :], min_width=1)

In [None]:
# Smallest difference between consecutive rt values.
np.diff(rt).min()