Jupyter Notebook for Selecting Lipid MRMs and OzESIs after a Neutral Loss Scan

In [1]:
#Import all the necessary libraries
import pymzml
import csv
import os
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
import re
import plotly.express as px
from IPython.display import Image

No module named 'ms_deisotope._c.averagine' averagine
No module named 'ms_deisotope._c.scoring'
No module named 'ms_deisotope._c.deconvoluter_base'
No module named 'ms_deisotope._c.deconvoluter_base'
No module named 'ms_deisotope._c.deconvoluter_base'


First perform a neutral loss scan and obtain pecursor ion m/z values and the neutral loss. Upload an excel spreadsheet with these values listed in two columns.

![title](Figures/search_list.png)

A database of known lipid MRMs will be parsed and matched. And the rule-based method for OzESI will be used. 


![title](Figures/OzESI_results.png)

Load MRM databases 1: SUPPLE_2.XLS and 2:ListMRMs.csv, and then save as a pandas dataframe. 

In [2]:
def read_mrm_list(filename,remove_std = True):
    mrm_list_new = pd.read_excel(filename, sheet_name=None)
    mrm_list_new = pd.concat(mrm_list_new, ignore_index=True)
    mrm_list_offical = mrm_list_new[['Compound Name', 'Parent Ion', 'Product Ion', 'Class']]
    # Add underscore to middle of columns names
    mrm_list_offical.columns = mrm_list_offical.columns.str.replace(' ', '_')
    # Round Parent Ion and Product Ion to 1 decimal place
    mrm_list_offical['Parent_Ion'] = np.round(mrm_list_offical['Parent_Ion'],1)
    mrm_list_offical['Product_Ion'] = np.round(mrm_list_offical['Product_Ion'],1)
    # Create transition column by combining Parent Ion and Product Ion with arrow between numbers
    mrm_list_offical['Transition'] = mrm_list_offical['Parent_Ion'].astype(str) + ' -> ' + mrm_list_offical['Product_Ion'].astype(str)
    # Change column compound name to lipid
    mrm_list_offical = mrm_list_offical.rename(columns={'Compound_Name': 'Lipid'})
    # Make a column called Class match lipid column to lipid types
    if remove_std == True:
        lipid_class = mrm_list_offical['Class'].unique()
        lipid_class_to_keep = ['PS','PG','CE','PC', 'DAG', 'PE', 'TAG', 'FA', 'Cer', 'CAR', 'PI','SM']
        mrm_list_offical = mrm_list_offical[mrm_list_offical['Class'].isin(lipid_class_to_keep)]
    return mrm_list_offical


## old code
# #loop through all sheets in SUPPLE_2.XLS and make a df of Compound Name, Parent Ion, and Product Ion
mrm_list_new = pd.read_excel('lipid_database/Lipid_Database.xlsx', sheet_name = None)
mrm_list_new = pd.concat(mrm_list_new, ignore_index=True)
mrm_list_offical = mrm_list_new[['Compound Name', 'Parent Ion', 'Product Ion']]
#mrm_list_official = mrm_list_new.loc[:, ['Compound Name', 'Parent Ion', 'Product Ion']]
#Add underscore to middle of columns names
mrm_list_offical.columns = mrm_list_offical.columns.str.replace(' ', '_')
#round the Parent Ion and Product Ion to 1 decimal place
mrm_list_offical.loc[:, 'Parent_Ion'] = mrm_list_offical['Parent_Ion'].round(0)
mrm_list_offical.loc[:, 'Product_Ion'] = mrm_list_offical['Product_Ion'].round(0)
# Create transition column by combining Parent Ion and Product Ion with arrow between numbers
mrm_list_offical.loc[:, 'Transition'] = mrm_list_offical['Parent_Ion'].astype(str) + ' -> ' + mrm_list_offical['Product_Ion'].astype(str)
# Change column compound name to lipid
mrm_list_offical = mrm_list_offical.rename(columns={'Compound_Name': 'Lipid'})




print(mrm_list_offical.head(25))
# print(df_mrm.head(25))

                                Lipid  Parent_Ion  Product_Ion      Transition
0                            LPC(2:0)       300.0        184.0  300.0 -> 184.0
1                            LPC(3:1)       312.0        184.0  312.0 -> 184.0
2                  LPC(3:0),PC(O-3:0)       314.0        184.0  314.0 -> 184.0
3        LPC(4:0),PC(O-4:0),PC(O-5:0)       328.0        184.0  328.0 -> 184.0
4                    PC(4:0),LPC(5:0)       342.0        184.0  342.0 -> 184.0
5                            LPC(6:0)       356.0        184.0  356.0 -> 184.0
6                    PC(6:0),LPC(7:0)       370.0        184.0  370.0 -> 184.0
7                            LPC(8:0)       384.0        184.0  384.0 -> 184.0
8                         LPC(O-10:1)       396.0        184.0  396.0 -> 184.0
9                    PC(8:0),LPC(9:0)       398.0        184.0  398.0 -> 184.0
10               LPC(10:0),PC(O-10:0)       412.0        184.0  412.0 -> 184.0
11                 PC(10:0),LPC(11:0)       426.0   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mrm_list_offical.loc[:, 'Parent_Ion'] = mrm_list_offical['Parent_Ion'].round(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mrm_list_offical.loc[:, 'Product_Ion'] = mrm_list_offical['Product_Ion'].round(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mrm_list_offical.loc[:, 'Transition'] = mrm

Input search list excel sheet as discussed in the introduction. Save the data into a dataframe and then search the databases for matches and output them into a found_list dataframe. 

In [3]:
search_list = pd.read_excel('./data_OzESI/search/Practice_LiverLD_MUFAs_2.xlsx')
found_list = pd.DataFrame(columns=['Lipid','Parent_Ion','Product_Ion','Transition'])
print(search_list.head(10))
search_list['Product_Ion'] = search_list['Parent_Ion'] - search_list['neutral_loss']
#Round search list columns to 0 deciaml places
search_list['Parent_Ion'] = search_list['Parent_Ion'].round(0)
search_list['Product_Ion'] = search_list['Product_Ion'].round(0)

print(search_list.head())

#Search for matches between mrm_list_offical and search_list for Parent Ion and Product Ion
for i in range(len(mrm_list_offical)):
    for j in range(len(search_list)):
        if mrm_list_offical.loc[i,'Parent_Ion'] == search_list.loc[j,'Parent_Ion'] and mrm_list_offical.loc[i,'Product_Ion'] == search_list.loc[j,'Product_Ion']:
            found_list = found_list.append(mrm_list_offical.loc[i,:])



# found_list.to_excel('./data_OzESI/data_csv/found_list.xlsx', index=False)      
found_list.head(25)


   Parent_Ion  neutral_loss
0       846.8         299.2
1       848.8         299.2
2       850.9         299.2
3       870.8         299.2
4       872.8         299.2
5       874.8         299.2
6       876.8         299.2
7       878.9         299.2
8       896.9         299.2
9       898.9         299.2
   Parent_Ion  neutral_loss  Product_Ion
0       847.0         299.2        548.0
1       849.0         299.2        550.0
2       851.0         299.2        552.0
3       871.0         299.2        572.0
4       873.0         299.2        574.0


  found_list = found_list.append(mrm_list_offical.loc[i,:])
  found_list = found_list.append(mrm_list_offical.loc[i,:])
  found_list = found_list.append(mrm_list_offical.loc[i,:])
  found_list = found_list.append(mrm_list_offical.loc[i,:])
  found_list = found_list.append(mrm_list_offical.loc[i,:])
  found_list = found_list.append(mrm_list_offical.loc[i,:])


Unnamed: 0,Lipid,Parent_Ion,Product_Ion,Transition
1360,"[TG(53:8),TG(52:1)]_FA18:1",879.0,580.0,879.0 -> 580.0
1361,"[TG(53:9),TG(52:2)]_FA18:1",877.0,578.0,877.0 -> 578.0
1369,"[TG(57:10),TG(56:3)]_FA18:1",931.0,632.0,931.0 -> 632.0
1444,"[TG(55:10),TG(54:3)]_FA18:1",903.0,604.0,903.0 -> 604.0
1445,"[TG(55:11),TG(54:4)]_FA18:1",901.0,602.0,901.0 -> 602.0
1447,"[TG(55:9),TG(54:2)]_FA18:1",905.0,606.0,905.0 -> 606.0


Create a dataframe of DB_position and aldehyde ion based off lipid rules. Save in dataframe df_OzESI

In [4]:
df_OzESI = pd.DataFrame(columns=['DB_Position','Aldehyde_Ion'])

for i in range(3,21):
    df_OzESI.loc[i,'DB_Position'] = i
    df_OzESI.loc[i,'Aldehyde_Ion'] = 26 + (14 * (i-3))

df_OzESI.head(25)


Unnamed: 0,DB_Position,Aldehyde_Ion
3,3,26
4,4,40
5,5,54
6,6,68
7,7,82
8,8,96
9,9,110
10,10,124
11,11,138
12,12,152


Input the requested OzESI n# in the OzESI_list. Based off the previously found data in the found_list, the script will provide the correct m/z value with each n# for each lipid in the found list dataframe

In [8]:
OzESI_list = [7,9,11]
#create columns in found list called n-# based on the numbers in OzESI_list
for i in OzESI_list:
    found_list['n-' + str(i)] = ''
print(found_list.head(25))

# Parent_Ion - Aldehyde_Ion *

# Subtract the aldehyde ion from the parent ion with the corresponding DB position and add the result to the found_list dataframe
for i in range(3,len(df_OzESI)):
    for j in range(len(found_list)):
        #if df_OzESI.loc[i,'DB_Position']:
        for k in range(4,7):
            if df_OzESI.loc[i,'DB_Position'] == OzESI_list[k-4]:
                found_list.iloc[j,k] = found_list.iloc[j,1] - df_OzESI.loc[i,'Aldehyde_Ion'] + 1

        
found_list.head(None)

                            Lipid  Parent_Ion  Product_Ion      Transition  \
1360   [TG(53:8),TG(52:1)]_FA18:1       879.0        580.0  879.0 -> 580.0   
1361   [TG(53:9),TG(52:2)]_FA18:1       877.0        578.0  877.0 -> 578.0   
1369  [TG(57:10),TG(56:3)]_FA18:1       931.0        632.0  931.0 -> 632.0   
1444  [TG(55:10),TG(54:3)]_FA18:1       903.0        604.0  903.0 -> 604.0   
1445  [TG(55:11),TG(54:4)]_FA18:1       901.0        602.0  901.0 -> 602.0   
1447   [TG(55:9),TG(54:2)]_FA18:1       905.0        606.0  905.0 -> 606.0   

     n-7 n-9 n-11  
1360               
1361               
1369               
1444               
1445               
1447               


Unnamed: 0,Lipid,Parent_Ion,Product_Ion,Transition,n-7,n-9,n-11
1360,"[TG(53:8),TG(52:1)]_FA18:1",879.0,580.0,879.0 -> 580.0,798.0,770.0,742.0
1361,"[TG(53:9),TG(52:2)]_FA18:1",877.0,578.0,877.0 -> 578.0,796.0,768.0,740.0
1369,"[TG(57:10),TG(56:3)]_FA18:1",931.0,632.0,931.0 -> 632.0,850.0,822.0,794.0
1444,"[TG(55:10),TG(54:3)]_FA18:1",903.0,604.0,903.0 -> 604.0,822.0,794.0,766.0
1445,"[TG(55:11),TG(54:4)]_FA18:1",901.0,602.0,901.0 -> 602.0,820.0,792.0,764.0
1447,"[TG(55:9),TG(54:2)]_FA18:1",905.0,606.0,905.0 -> 606.0,824.0,796.0,768.0


Save the dataframe as an excel sheet that can be downloaded and given back to the mass spec chemist

In [6]:
from datetime import date
i = 0
while i < 100:
    if not os.path.exists('./data_OzESI/data_excel/OzESI_Selections_{}_.xlsx'.format(date.today())):
        found_list.to_excel('./data_OzESI/data_excel/OzESI_Selections_{}_.xlsx'.format(date.today()), index=False)
        break
    elif not os.path.exists('./data_OzESI/data_excel/OzESI_Selections_{}_'.format(date.today()) + str(i) + '.xlsx'):
        found_list.to_excel('./data_OzESI/data_excel/OzESI_Selections_{}_'.format(date.today()) + str(i) + '.xlsx', index=False)
        break
    else:
        i += 1


# Ignore code below

In [7]:
matching_list = pd.DataFrame(columns=['Lipid','Parent_Ion','Product_Ion','Transition'])

#if n-7 and product ion are the same for different lipids, add them to the matching list
for i in range(len(found_list)):
    for j in range(len(found_list)):
        if found_list.iloc[i,4] == found_list.iloc[j,3] and i != j:
            matching_list = matching_list.append(found_list.iloc[i,:])

matching_list.head(25)

Unnamed: 0,Lipid,Parent_Ion,Product_Ion,Transition


In [17]:
# Group the DataFrame by 'Product_Ion' and 'n-7'
grouped = found_list.groupby(['Product_Ion', 'n-7'])

# Create an empty dictionary to store the matching lipids
matching_lipids = {}

# Iterate over the groups
for group_name, group_df in grouped:
    # Get the list of lipids in the group
    lipids = list(group_df['Lipid'])
    
    # Check if there are multiple lipids in the group
    if len(lipids) > 1:
        # Add the lipids to the matching_lipids dictionary
        matching_lipids[group_name] = lipids

# Print the matching lipids
print(matching_lipids)


{(578.0, 796.0): ['[TG(53:9),TG(52:2)]_FA18:1', 'TAG(52:2)_FA 18:1'], (580.0, 798.0): ['[TG(53:8),TG(52:1)]_FA18:1', 'TAG(52:1)_FA 18:1'], (602.0, 820.0): ['[TG(55:11),TG(54:4)]_FA18:1', 'TAG(54:4)_FA 18:1'], (604.0, 822.0): ['[TG(55:10),TG(54:3)]_FA18:1', 'TAG(54:3)_FA 18:1'], (606.0, 824.0): ['[TG(55:9),TG(54:2)]_FA18:1', 'TAG(54:2)_FA 18:1'], (632.0, 850.0): ['[TG(57:10),TG(56:3)]_FA18:1', 'TAG(56:3)_FA 18:1']}


In [18]:
# Group the DataFrame by 'Product_Ion' and 'n-7'
grouped = found_list.groupby(['Product_Ion', 'n-11'])

# Create an empty dictionary to store the matching lipids
matching_lipids = {}

# Iterate over the groups
for group_name, group_df in grouped:
    # Get the list of lipids in the group
    lipids = list(group_df['Lipid'])
    
    # Check if there are multiple lipids in the group
    if len(lipids) > 1:
        # Add the lipids to the matching_lipids dictionary
        matching_lipids[group_name] = lipids

# Print the matching lipids
print(matching_lipids)

{(578.0, 740.0): ['[TG(53:9),TG(52:2)]_FA18:1', 'TAG(52:2)_FA 18:1'], (580.0, 742.0): ['[TG(53:8),TG(52:1)]_FA18:1', 'TAG(52:1)_FA 18:1'], (602.0, 764.0): ['[TG(55:11),TG(54:4)]_FA18:1', 'TAG(54:4)_FA 18:1'], (604.0, 766.0): ['[TG(55:10),TG(54:3)]_FA18:1', 'TAG(54:3)_FA 18:1'], (606.0, 768.0): ['[TG(55:9),TG(54:2)]_FA18:1', 'TAG(54:2)_FA 18:1'], (632.0, 794.0): ['[TG(57:10),TG(56:3)]_FA18:1', 'TAG(56:3)_FA 18:1']}


In [23]:
# Group the DataFrame by 'Product_Ion' and 'n-7'
grouped = found_list.groupby(['Product_Ion', 'n-7'])

# Create an empty DataFrame to store the matching lipids
matching_lipids_df = pd.DataFrame(columns=['Product_Ion', 'n-7', 'Matching_Lipids'])

# Iterate over the groups
for group_name, group_df in grouped:
    # Get the list of lipids in the group
    lipids = list(group_df['Lipid'])
    
    # Check if there are multiple lipids in the group
    if len(lipids) > 1:
        # Add the group to the matching_lipids_df DataFrame
        matching_lipids_df = matching_lipids_df.append({
            'Product_Ion': group_name[0],
            'n-7': group_name[1],
            'Matching_Lipids': lipids
        }, ignore_index=True)

# Print the matching lipids DataFrame
matching_lipids_df.head(25)
matching_lipids_df.to_excel('./data_OzESI/data_excel/FA 18:1.xlsx', index=False)


  matching_lipids_df = matching_lipids_df.append({
  matching_lipids_df = matching_lipids_df.append({
  matching_lipids_df = matching_lipids_df.append({
  matching_lipids_df = matching_lipids_df.append({
  matching_lipids_df = matching_lipids_df.append({
  matching_lipids_df = matching_lipids_df.append({


In [24]:
# Group the DataFrame by 'Product_Ion' and 'n-7'
grouped = found_list.groupby(['Product_Ion', 'n-11'])

# Create an empty DataFrame to store the matching lipids
matching_lipids_df = pd.DataFrame(columns=['Product_Ion', 'n-11', 'Matching_Lipids'])

# Iterate over the groups
for group_name, group_df in grouped:
    # Get the list of lipids in the group
    lipids = list(group_df['Lipid'])
    
    # Check if there are multiple lipids in the group
    if len(lipids) > 1:
        # Add the group to the matching_lipids_df DataFrame
        matching_lipids_df = matching_lipids_df.append({
            'Product_Ion': group_name[0],
            'n-11': group_name[1],
            'Matching_Lipids': lipids
        }, ignore_index=True)

# Print the matching lipids DataFrame
matching_lipids_df.head(25)
matching_lipids_df.to_excel('./data_OzESI/data_excel/FA 20:1.xlsx', index=False)


  matching_lipids_df = matching_lipids_df.append({
  matching_lipids_df = matching_lipids_df.append({
  matching_lipids_df = matching_lipids_df.append({
  matching_lipids_df = matching_lipids_df.append({
  matching_lipids_df = matching_lipids_df.append({
  matching_lipids_df = matching_lipids_df.append({
