### 1. Process all the files in the "Data/LS_MS NL" folder: extract all peak's isotope ratios and put this information in a pandas DataFrame. Use hierarchical indexing to be able to attribute different information to the acquired data (e.g. aminoacid name, sample name, dilution). Try to select the most convinient way to store all the information about the peaks in one DataFrame.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [38]:
def filter_labels(labels, chosen_labels):
    filtered_labels = list(filter(lambda label: label.endswith(tuple(chosen_labels)), labels))
    return filtered_labels


def drop_zero_rows(df):
    return df.loc[(df != 0).any(1)]


def get_aminoacid_name(label):
    return label.split()[0].split("_")[-1].split(".")[0]


def get_peak_name(label):
    return " ".join(label.split()[1:])


def get_aminoacids(labels):
    return set(get_aminoacid_name(label) for label in labels)


def modify_index(df):
    df_copy = df.copy()
    df_copy.index = df_copy.index.map(get_peak_name)
    return df_copy


def make_one_ratios(df, chosen_label):
    rows_indexes = filter_labels(df.index, [chosen_label])
    df_rows = df.loc[rows_indexes]
    df_sum = df_rows.sum(axis=1)
    df_ratio = df_rows.div(df_sum, axis=0)
    return df_ratio


def make_all_ratios(df, chosen_peaks):
    # todo suppose that: chosen_peaks = ["peak area", "peak height", "peak m/z", "peak retention time"]
    df_area = make_one_ratios(df, chosen_peaks[0])
    df_height = make_one_ratios(df, chosen_peaks[1])
    df_mass = df.loc[filter_labels(df.index, [chosen_peaks[2]])]
    df_time = df.loc[filter_labels(df.index, [chosen_peaks[3]])]
    return pd.concat([df_area, df_height, df_time, df_mass])


def process_df(df, chosen_peaks):
    filtered_indexes = filter_labels(df.index, chosen_peaks)
    return df.loc[filtered_indexes], filtered_indexes


def make_df_with_hier_aminoacid(df, chosen_peaks):
    filtered_df, filtered_labels = process_df(df, chosen_peaks)
    filtered_df = make_all_ratios(filtered_df, chosen_peaks)
    aminoacids = get_aminoacids(filtered_labels)
    hier_by_aminoacid = {}
    for aminoacid in aminoacids:
        filt_df_by_acid = filtered_df[filtered_df.index.map(lambda x: get_aminoacid_name(x) == aminoacid)]
        hier_by_aminoacid[aminoacid] = modify_index(filt_df_by_acid)
    return pd.concat(hier_by_aminoacid)


def make_df_with_hier_dilution(dfs, dilution_names, chosen_peaks):
    hier_by_dilution = {}
    for df, name in zip(dfs, dilution_names):
        hier_by_dilution[name] = make_df_with_hier_aminoacid(df, chosen_peaks)
    return pd.concat(hier_by_dilution)

In [39]:
directory = "../Data/LS_MS NL"
path_NL = "/NL.csv"
path_NL_5 = "/NL_5.csv"
path_NL_20 = "/NL_20.csv"
df_NL = pd.read_csv(directory + path_NL)
df_NL_5 = pd.read_csv(directory + path_NL_5)
df_NL_20 = pd.read_csv(directory + path_NL_20)

In [40]:
df_NL = df_NL.T
df_NL_5 = df_NL_5.T
df_NL_20 = df_NL_20.T

df_NL = drop_zero_rows(df_NL)
df_NL_5 = drop_zero_rows(df_NL_5)
df_NL_20 = drop_zero_rows(df_NL_20)

In [41]:
chosen_peaks = ["peak area", "peak height", "peak m/z", "peak retention time"]
dfs = [df_NL, df_NL_5, df_NL_20]
dilution_names = ["NL", "NL_5", "NL_20"]

In [42]:
df = make_df_with_hier_dilution(dfs, dilution_names, chosen_peaks)

In [43]:
df.fillna(0, inplace=True)
df.sort_index(inplace=True)

### 2. Use pandas functionality to find out how istope fraction's values change from one sample to another for a certain aminoacid and given retention time. Calculate the max and min values and the mean value.

Misunderstanding:
It seems that retention times are not equal for different samples. Then task doesn't really make sence as there will always be one value for given retention time.

### 3. Output all peak retention times for a given amino acid.

In [44]:
def point_3(df, aminoacid):
    res = df.loc(axis=0)[:,aminoacid,'peak retention time']
    res.index = res.index.droplevel(level=(1, 2))  # drop useless indexes
    res = res.loc[(res != 0).any(axis=1), (res != 0).any(axis=0)]
    print(res)

In [45]:
# Example
aminoacid = 'GLY'
point_3(df, aminoacid)

             18        19        20        37      38
NL     2.350917  2.331783  2.331783  0.000000  0.0000
NL_20  0.000000  0.000000  0.000000  2.255333  2.2793
NL_5   2.305617  2.298467  2.329567  0.000000  0.0000


### 4. For a given retention time output all the samples names, which have a peak detected at specified retention time.

In [46]:
def almost_equal(x, y, eps=0.01):
    return abs(x - y) < eps


def point_4(df, ret_time):
    res = df.loc(axis=0)[:,:,'peak retention time']
    res = res.loc[(almost_equal(res, ret_time)).any(axis=1)]
    print(list(res.index.droplevel(level=(1, 2))))

In [47]:
# Example
ret_time = 2.350917
point_4(df, ret_time)

['NL']


### 5. Group all the corresponding peaks of certain aminoacid detected in different samples by getting the mean value for every mass across all the corresponding samples. You might have to do something about missing values.

In [48]:
def point_5(df, aminoacid):
    res = df.loc(axis=0)[:,aminoacid,'peak m/z']
    res.index = res.index.droplevel(level=(1, 2))  # drop useless indexes
    res_mean = res.mean(axis=0)
    print(res_mean.loc[np.nonzero(res_mean)])

In [49]:
# Example
point_5(df, aminoacid)

18    164.053335
19    164.720001
20    165.386668
37     82.026667
38     82.360001
dtype: float64
