# Automated Multiple Reaction Monitoring (MRM)-profiling and Ozone Electrospray Ionizaton (OzESI)-MRM Informatics Platform for High-throughput Lipidomics


In this jupyter notebook you will automate the data analysis of the lipidome. This is a challenging problem to perform manually due to the diverse nature of lipids and the many potential isomers. In this notebook you will analyze mzML files containing data from lipid MRMs, with ozone off and ozone on. The goal is to identify possible double-bond locations in a lipid, in this case a TAG (triacylglycerols).

In [121]:
from IPython.display import Image

![title](Figures/agilent_lcms.png)

The examples shown here were run on an Agilent 6495C Triple Quadrupole LC/MS (example shown above) that has been connected to an ozone line (not shown in picture) for ozoneolysis of lipids.

![title](Figures/TAG_example.png)
Here is an example of a TAG. Notice how many possibilities there are for locations of one double-bond there could be and how convoluted the analysis can become! This image is obtained from LipidMaps.org

Import all necessary libraries

In [122]:
#Import all the necessary libraries
import pymzml
import csv
import os
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
import re
import plotly.express as px
from collections import defaultdict



In [123]:
###Importing Variables for all functions

data_base_name_location = 'lipid_database/Lipid_Database.xlsx'####Lipid database with Standard Carnitines
mzml_folder = './data_mzml/04-29-2023_mzml/test_oz_1/'
tolerance = 0.3
remove_std = True

# Example usage:
folder_name_to_save = 'TEST_04-29-2023_validation_test_1'
file_name_to_save = 'TEST_04-29-2023_validation_test_1'
save_data= True





Lipid MRM Parsing and Matching Functions

In [124]:

###All functions

#Function to read in MRM database
#Option to remove STDs from database##Not finished need option to use another database with no qualitative ACs


def read_mrm_list(filename,remove_std = True):
    mrm_list_new = pd.read_excel(filename, sheet_name=None)
    mrm_list_new = pd.concat(mrm_list_new, ignore_index=True)
    mrm_list_offical = mrm_list_new[['Compound Name', 'Parent Ion', 'Product Ion', 'Class']]
    # Add underscore to middle of columns names
    mrm_list_offical.columns = mrm_list_offical.columns.str.replace(' ', '_')
    # Round Parent Ion and Product Ion to 1 decimal place
    mrm_list_offical['Parent_Ion'] = np.round(mrm_list_offical['Parent_Ion'],1)
    mrm_list_offical['Product_Ion'] = np.round(mrm_list_offical['Product_Ion'],1)
    # Create transition column by combining Parent Ion and Product Ion with arrow between numbers
    mrm_list_offical['Transition'] = mrm_list_offical['Parent_Ion'].astype(str) + ' -> ' + mrm_list_offical['Product_Ion'].astype(str)
    # Change column compound name to lipid
    mrm_list_offical = mrm_list_offical.rename(columns={'Compound_Name': 'Lipid'})
    # Make a column called Class match lipid column to lipid types
    if remove_std == True:
        lipid_class = mrm_list_offical['Class'].unique()
        lipid_class_to_keep = ['PS','PG','CE','PC', 'DAG', 'PE', 'TAG', 'FA', 'Cer', 'CAR', 'PI','SM']
        mrm_list_offical = mrm_list_offical[mrm_list_offical['Class'].isin(lipid_class_to_keep)]
    return mrm_list_offical

#OzESI
OzESI_time = {}
def mzml_parser(file_name):
    df = pd.DataFrame(columns=['Lipid','Parent_Ion','Product_Ion','Intensity','Transition','Class','Sample_ID'])
    data_folder = os.listdir(file_name) #Path to the mzml files
    data_folder.sort()
    path_to_mzml_files = file_name

    
    

    for file in data_folder:
            if file.endswith('.mzML'):

                    run = pymzml.run.Reader(path_to_mzml_files+file, skip_chromatogram=False) #Load the mzml file into the run object



                    df_all = pd.DataFrame(columns=['Lipid','Parent_Ion','Product_Ion','Intensity','Transition','Class','Sample_ID']) #Create empty pandas dataframe to store the data

                    #create pandas dataframe to store the data with the columns Parent Ion, Product Ion, Intensity, Transition Lipid and Class
                   
                    q1_mz = 0 #Create empty variables to store the Q1 and Q3 m/z values
                    q3_mz = 0
                    count = 0 #Create a counter to keep track of the number of transitions
                    for spectrum in run:
                        if isinstance(spectrum, pymzml.spec.Chromatogram):
                            for time, intensity in spectrum.peaks():
                                OzESI_time[time] = np.round(intensity), q1_mz, q3_mz

                            for element in spectrum.ID.split(' '):
                                    intensity_store = np.array([])
                                    if 'Q1' in element:
                                            q1 = element.split('=')
                                            q1_mz= np.round((float(q1[1])),1)

                                    if 'Q3' in element:
                                
                                            q3 = element.split('=')
  
                                            q3_mz=np.round(float(q3[1]),1)


                                            for mz,intensity in spectrum.peaks(): #Get the m/z and intensity values from the spectrum
                                                    intensity_store = np.append(intensity_store,intensity) #Store the intensity values in an array



                                    if 'Q3' in element:
                                            # print(intensity_sum)
                                            intensity_sum = np.sum(intensity_store) #Sum the intensity values
                                            df_all.loc[count,'Parent_Ion'] = q1_mz #Store the Q1 and Q3 m/z values in the pandas dataframe
                                            df_all.loc[count,'Product_Ion'] = q3_mz
                                            #round the Q1 and Q3 m/z values to 1 decimal places
                                            df_all.loc[count,'Parent_Ion'] = np.round(df_all.loc[count,'Parent_Ion'],1)
                                            df_all.loc[count,'Product_Ion'] = np.round(df_all.loc[count,'Product_Ion'],1)
                                            df_all.loc[count,'Intensity'] = intensity_sum #Store the intensity values in the pandas dataframe
                                            df_all.loc[count,'Transition'] = str(q1_mz)+ ' -> '+ str(q3_mz) #Store the transition values in the pandas dataframe
                                            #add file name to Sample_ID column without the mzmL extension
                                            df_all.loc[count,'Sample_ID'] = file[:-5]
                                            count+=1

            #append df_all to df
            df = df.append(df_all, ignore_index=True)
    return df

# Function to create an ion dictionary from an MRM database DataFrame
def create_ion_dict(mrm_database):
    ion_dict = defaultdict(list)
    # Iterate through the rows of the MRM database DataFrame
    for index, row in mrm_database.iterrows():
        # Add a tuple with Lipid and Class to the ion dictionary using Parent_Ion and Product_Ion as the key
        ion_dict[(row['Parent_Ion'], row['Product_Ion'])].append((row['Lipid'], row['Class']))
    return ion_dict

# Function to check if the absolute difference between two values is within a given tolerance
def within_tolerance(a, b, tolerance=0.1):
    return abs(a - b) <= tolerance

# Function to match the ions in a DataFrame row with the ions in an ion dictionary
def match_ions(row, ion_dict, tolerance=0.1):
    ions = (row['Parent_Ion'], row['Product_Ion'])
    matched_lipids = []
    matched_classes = []

    # Iterate through the ion dictionary
    for key, value in ion_dict.items():
        # Check if both the Parent_Ion and Product_Ion values are within the specified tolerance
        if within_tolerance(ions[0], key[0], tolerance) and within_tolerance(ions[1], key[1], tolerance):
            # If within tolerance, extend the matched_lipids and matched_classes lists with the corresponding values
            matched_lipids.extend([match[0] for match in value])
            matched_classes.extend([match[1] for match in value])

    # If any matches were found, update the Lipid and Class columns in the row
    if matched_lipids and matched_classes:
        row['Lipid'] = ' | '.join(matched_lipids)
        row['Class'] = ' | '.join(matched_classes)

    return row

####Combined functions for Matching

def match_lipids_parser(mrm_database,df, tolerance=0.3):
    ion_dict = create_ion_dict(mrm_database)
    # Assuming you have the df DataFrame to apply the match_ions function
    df_matched = df.apply(lambda row: match_ions(row, ion_dict=ion_dict, tolerance=tolerance), axis=1)


    # df_matched = df_matched.dropna()
    
    return df_matched


def save_dataframe(df, folder_name, file_name, max_attempts=5):
    folder_path = f'data_results/data/data_matching/{folder_name}'
    os.makedirs(folder_path, exist_ok=True)

    for i in range(max_attempts):
        file_path = f'{folder_path}/{file_name}.csv'
        if not os.path.isfile(file_path):
            df.to_csv(file_path, index=False)
            print(f"Saved DataFrame to {file_path}")
            break
    else:
        print(f"Failed to save DataFrame after {max_attempts} attempts.")
        return None


def full_parse(data_base_name_location,mzml_folder, folder_name_to_save, file_name_to_save,tolerance,remove_std = True,
               save_data=False):
    mrm_database = read_mrm_list(data_base_name_location,remove_std=remove_std)
    df = mzml_parser(mzml_folder)
    df_matched = match_lipids_parser(mrm_database,df, tolerance=tolerance)
    
    if save_data == True:
        
        save_dataframe(df_matched, folder_name_to_save, file_name_to_save)

    return df_matched





In [188]:
#print first 10 values in OzESI_time
print(list(OzESI_time.items())[:10])
#convert OzESI_time to a pandas dataframe
df_OzESI_time = pd.DataFrame(list(OzESI_time.items()), columns=['Time', 'Intensity'])
#convert intensity column to a string
df_OzESI_time['Intensity'] = df_OzESI_time['Intensity'].astype(str)
# Split the 'Intensity' column into three new columns
df_OzESI_time[['Intensity_1', 'Parent_Ion', 'Product_Ion']] = df_OzESI_time['Intensity'].str.strip('()').str.split(',', expand=True).astype(float)
#Convert Intensity_1, Parent_Ion and Product_Ion to float
df_OzESI_time['Intensity_1'] = df_OzESI_time['Intensity_1'].astype(float)

#convert Product_Ion and Parent_Ion to string with -> in between
df_OzESI_time['Product_Ion'] = df_OzESI_time['Product_Ion'].astype(str)
df_OzESI_time['Parent_Ion'] = df_OzESI_time['Parent_Ion'].astype(str)
df_OzESI_time['Transition'] = df_OzESI_time['Parent_Ion'].str.cat(df_OzESI_time['Product_Ion'],sep=' -> ')
df_OzESI_time['Parent_Ion'] = df_OzESI_time['Parent_Ion'].astype(float)
df_OzESI_time['Product_Ion'] = df_OzESI_time['Product_Ion'].astype(float)

#sort the dataframe by transition
df_OzESI_time.sort_values(by=['Transition'], inplace=True)
#Drop the Intensity column
df_OzESI_time.drop('Intensity', axis=1, inplace=True)
# # Drop the original 'Intensity' column
# df_OzESI_time.drop('Intensity', axis=1, inplace=True)

# #Drop intensities below 1000
df_OzESI_time = df_OzESI_time[df_OzESI_time['Intensity_1'] > 1000]
#drop time below 10 seconds
df_OzESI_time = df_OzESI_time[df_OzESI_time['Time'] > 10]
#drop time above 16.5 seconds
df_OzESI_time = df_OzESI_time[df_OzESI_time['Time'] < 16.5]


#split up the Intensity column into two columns, one for the intensity and one for the m/z values
df_OzESI_time.tail(10)

[(0.00011666666666666667, (151.0, 900.8, 601.6)), (0.0035, (150.0, 898.8, 599.6)), (0.006883333333333333, (244.0, 876.8, 577.6)), (0.010283333333333334, (185.0, 874.8, 575.6)), (0.013666666666666667, (165.0, 872.8, 573.6)), (0.01705, (319.0, 850.8, 551.6)), (0.020433333333333335, (311.0, 848.8, 549.6)), (0.023833333333333335, (312.0, 846.8, 547.6)), (0.027216666666666667, (276.0, 820.8, 603.6)), (0.030600000000000002, (196.0, 818.8, 601.6))]


Unnamed: 0,Time,Intensity_1,Parent_Ion,Product_Ion,Transition
29148,10.01468,38935501.0,902.8,603.6,902.8 -> 603.6
29149,10.015096,38935501.0,902.8,603.6,902.8 -> 603.6
29150,10.015512,38935501.0,902.8,603.6,902.8 -> 603.6
29151,10.015928,38932501.0,902.8,603.6,902.8 -> 603.6
29152,10.016344,38916000.0,902.8,603.6,902.8 -> 603.6
29153,10.01676,38916000.0,902.8,603.6,902.8 -> 603.6
29154,10.017177,38922501.0,902.8,603.6,902.8 -> 603.6
29141,10.011766,38946500.0,902.8,603.6,902.8 -> 603.6
29188,10.031326,38936499.0,902.8,603.6,902.8 -> 603.6
29701,10.244817,39326999.0,902.8,603.6,902.8 -> 603.6


In [189]:
import numpy as np
from scipy.signal import find_peaks

# Extract the intensity values from the DataFrame
intensity_values = df_OzESI_time['Intensity_1'].values

# Find peaks in the intensity values
peaks, _ = find_peaks(intensity_values)

# Get the peak values and their corresponding time points
peak_values = intensity_values[peaks]
time_points = df_OzESI_time['Time'].values[peaks]

# Print the peaks and their corresponding time points
for peak, time in zip(peak_values, time_points):
    print(f"Peak value: {peak}, Time: {time}")


Peak value: 3903.0, Time: 11.692066666666667
Peak value: 1227.0, Time: 12.179366666666667
Peak value: 1386.0, Time: 11.935716666666668
Peak value: 4516.0, Time: 12.663300000000001
Peak value: 3462.0, Time: 13.272433333333334
Peak value: 3561.0, Time: 13.1506
Peak value: 18586.0, Time: 12.90695
Peak value: 9281.0, Time: 12.785116666666667
Peak value: 3431.0, Time: 11.688683333333334
Peak value: 3211.0, Time: 14.003383333333334
Peak value: 1581.0, Time: 14.734366666666668
Peak value: 1898.0, Time: 14.978016666666667
Peak value: 2627.0, Time: 13.881566666666668
Peak value: 2091.0, Time: 11.932333333333334
Peak value: 4198.0, Time: 13.394266666666667
Peak value: 3581.0, Time: 13.516083333333334
Peak value: 2579.0, Time: 14.125216666666667
Peak value: 18288.0, Time: 10.710683333333334
Peak value: 21314.0, Time: 14.487316666666667
Peak value: 21121.0, Time: 14.609150000000001
Peak value: 20745.0, Time: 14.3655
Peak value: 24331.0, Time: 13.878183333333334
Peak value: 21054.0, Time: 15.09645


In [191]:
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn.mixture import GaussianMixture

# Get the unique transitions in the DataFrame
transitions = df_OzESI_time['Transition'].unique()

peak_dict = {}

# Iterate over each transition
for transition in transitions:
    # Filter the DataFrame for the current transition
    transition_df = df_OzESI_time[df_OzESI_time['Transition'] == transition]

    # Extract the intensity values for the current transition
    intensity_values = transition_df['Intensity_1'].values.reshape(-1, 1)

    # Fit Gaussian Mixture Model to the intensity values
    gmm = GaussianMixture(n_components=1)
    gmm.fit(intensity_values)

    # Get the mean and standard deviation of the fitted Gaussian component
    mean = gmm.means_[0][0]
    std = np.sqrt(gmm.covariances_[0][0][0])

    # Set the peak threshold as mean + 2 standard deviations (adjust as needed)
    threshold = mean + 1 * std

    # Find the indices of the intensity values that exceed the threshold
    peak_indices = np.where(intensity_values > threshold)[0]

    # Get the peak values and their corresponding time points
    peak_values = intensity_values[peak_indices]
    peak_time_points = transition_df['Time'].values[peak_indices]

    # add the peak values and their corresponding time points to the dictionary
    peak_dict[transition] = (peak_values, peak_time_points)


    # # #Create a plot for the current transition
    # plt.figure()
    # plt.scatter(transition_df['Time'], transition_df['Intensity_1'], label='Intensity')
    # plt.scatter(peak_time_points, peak_values, color='red', label='Peaks')
    # plt.xlabel('Time')
    # plt.ylabel('Intensity')
    # plt.title(f'Peaks for Transition: {transition}')
    # plt.legend()
    # plt.show()

# print(peak_dict)

[]


Run all Lipid MRM functions

In [126]:
df_matched = full_parse(data_base_name_location,mzml_folder, folder_name_to_save, 
                        file_name_to_save,tolerance, remove_std = remove_std,save_data=save_data)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mrm_list_offical['Parent_Ion'] = np.round(mrm_list_offical['Parent_Ion'],1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mrm_list_offical['Product_Ion'] = np.round(mrm_list_offical['Product_Ion'],1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mrm_list_offical['Transition'] = mrm_list_offical['

Failed to save DataFrame after 5 attempts.


  df = df.append(df_all, ignore_index=True)


LC OzESI Lipid Parsing and Retention Time Functions

In [195]:


def process_chromatogram(OzESI_time):
    # Create dataframe from OzESI_time dictionary
    OzESI_rt_df = pd.DataFrame(list(OzESI_time.items()), columns=['Retention_Time', 'intensity'])
    
    # Split intensity column into three columns intensity, Parent_Ion and Product_Ion
    OzESI_rt_df[['intensity','Parent_Ion','Product_Ion']] = pd.DataFrame(OzESI_rt_df['intensity'].tolist(), index=OzESI_rt_df.index)
    
    # Round retention Retention_Time to 1 decimal place
    OzESI_rt_df['Retention_Time'] = round(OzESI_rt_df['Retention_Time'], 2)
    
    
    # Create a column called Transition with the Parent_Ion and Product_Ion
    OzESI_rt_df['Transition'] = OzESI_rt_df['Parent_Ion'].astype(str) + ' -> ' + OzESI_rt_df['Product_Ion'].astype(str)
    
    ########### HARDCODED TO DROP RETENTION TIMES BELOW 7 SECONDS ############
    #drop Rention_Time below 10.5 seconds and above 15.5 seconds
    OzESI_rt_df = OzESI_rt_df[OzESI_rt_df['Retention_Time'] > 10.5]
    OzESI_rt_df = OzESI_rt_df[OzESI_rt_df['Retention_Time'] < 15.5]

####################################
###########################################
###################################
    # # Sort dataframe  and take the average retention time for each transition
    # OzESI_rt_df['Retention_Time_Average'] = OzESI_rt_df.groupby('Transition')['Retention_Time'].transform('mean')
    # print('OzESI_rt_df: \n', OzESI_rt_df.head(10))


    # Sort dataframe by intensity, drop duplicates, and find the 10 largest values
   
    

    #OzESI_rt_df_sorted = OzESI_rt_df.sort_values(['intensity'], ascending=False).drop_duplicates(['Parent_Ion','Product_Ion']).sort_index()

    # Sort the DataFrame by intensity in descending order
    OzESI_rt_df_sorted = OzESI_rt_df.sort_values(['intensity'], ascending=False)
    # Group the DataFrame by 'Parent_Ion' and 'Product_Ion'
    grouped_df = OzESI_rt_df_sorted.groupby(['Parent_Ion', 'Product_Ion'])

    # Create an empty DataFrame to store the top 5 values
    top_5_df = pd.DataFrame(columns=OzESI_rt_df.columns)

    # Iterate over each group
    for _, group in grouped_df:
        # Get the top 5 rows with the highest intensity
        top_5_rows = group.head(5)
        # Append the top 5 rows to the 'top_5_df'
        top_5_df = top_5_df.append(top_5_rows)

    # Sort the 'top_5_df' by the original index
    top_5_df.sort_index(inplace=True)

# Print the DataFrame with the top 5 values for each 'Parent_Ion' and 'Product_Ion'
    print(top_5_df)
    OzESI_rt_df_top = top_5_df.nlargest(1000, 'intensity')
    #OzESI_rt_df_top = OzESI_rt_df_sorted.nlargest(1000, 'intensity')

    print('OzESI 10 largest: \n', OzESI_rt_df_top)
    
    # # Find peaks in the intensity column and plot the chromatogram
    # peaks, _ = find_peaks(OzESI_rt_df['intensity'], height=0.5e5,distance=1000)
    # plt.plot(OzESI_rt_df['Retention_Time'] ,OzESI_rt_df['intensity'])
    # plt.plot(OzESI_rt_df.iloc[peaks]['Retention_Time'], OzESI_rt_df.iloc[peaks]['intensity'], "x")
    # plt.ylabel('Intensity')
    # plt.xlabel('Retention Time')
    # plt.title('OzESI LC Chromatogram')
    # plt.show()
    
    return OzESI_rt_df_top

def add_rt_intensity(df, OzESI_rt_df_top):
    # Create a dictionary to map transitions to retention times and intensities
    transitions_to_rt = dict(zip(OzESI_rt_df_top['Transition'], OzESI_rt_df_top['Retention_Time']))
    transitions_to_intensity = dict(zip(OzESI_rt_df_top['Transition'], OzESI_rt_df_top['intensity']))
    
    # Use the map function to add retention times and intensities to the dataframe
    df['Retention_Time'] = df['Transition'].map(transitions_to_rt)
    df['Intensity_OzESI'] = df['Transition'].map(transitions_to_intensity)
    
    return df

def create_aldehyde_ion_dataframe():
    # Create a pandas dataframe with columns for DB_Position and Aldehyde_Ion
    df_OzESI = pd.DataFrame(columns=['DB_Position','Aldehyde_Ion'])

    # Loop over the range of DB_Position values and calculate the corresponding Aldehyde_Ion values
    for i in range(3, 21):
        df_OzESI.loc[i,'DB_Position'] = i
        df_OzESI.loc[i,'Aldehyde_Ion'] = 26 + (14 * (i-3)) 

    # Print the first 25 rows of the dataframe
    # print(df_OzESI.head(25))

    # Return the dataframe
    return df_OzESI

#OzESI_list = [3,5,7,9,11]
OzESI_list = [7,9,12]
#def calculate_n_minus_values(df_matched, df_OzESI, OzESI_list=[3,5,7,9,11], starting_column=9, last_column=14):
def calculate_n_minus_values(df_matched, df_OzESI, OzESI_list=[7,9,12], starting_column=9, last_column=14):
    """
    Given a pandas dataframe df and a dataframe df_OzESI containing DB_Position and Aldehyde_Ion values,
    calculates the n-i values for each i in OzESI_list by subtracting the corresponding Aldehyde_Ion value
    from the Parent_Ion column in df and storing the result in new columns named 'n-i' in df.
    The starting_column and last_column parameters specify the range of columns in which the n-i values should be stored.
    """
    # Create new columns in df for n-i values
    for i in OzESI_list:
        df_matched[f"n-{i}"] = df_matched["Parent_Ion"] - df_OzESI.loc[df_OzESI["DB_Position"] == i, "Aldehyde_Ion"].values[0]
    
    # Print the shape of the dataframe and return it
    # print(df_matched.shape)
    return df_matched


# OzESI_rt_df_top = process_chromatogram(OzESI_time)


Pipeline to run all LC OzESI functions

In [197]:
#OzESI_list = [3,5,7,9,11]
#def df_OzESI_pipeline(df, OzESI_time, OzESI_list=[3,5,7,9,12]):
def df_OzESI_pipeline(df, OzESI_time, OzESI_list=[7,9,12]):
    # Process chromatogram
    OzESI_rt_df_top = process_chromatogram(OzESI_time)
    
    df_OzESI = create_aldehyde_ion_dataframe()
    # Add retention time and intensity to dataframe
    df_processed = add_rt_intensity(df, OzESI_rt_df_top)
    # Calculate n-minus values
    df_processed = calculate_n_minus_values(df_processed, df_OzESI, OzESI_list, starting_column=9, last_column=14)
    
    return df_processed
df_OzESI_processed = df_OzESI_pipeline(df_matched, OzESI_time)
df_OzESI_processed.tail(10)

       Retention_Time   intensity  Parent_Ion  Product_Ion      Transition
3016            10.81       485.0       740.8        551.6  740.8 -> 551.6
3130            11.21    100034.0       876.8        577.6  876.8 -> 577.6
3133            11.23     86088.0       850.8        551.6  850.8 -> 551.6
3164            11.34    193831.0       876.8        577.6  876.8 -> 577.6
3170            11.36     37467.0       820.8        603.6  820.8 -> 603.6
3172            11.36      2994.0       816.8        599.6  816.8 -> 599.6
3186            11.42       496.0       740.8        551.6  740.8 -> 551.6
3201            11.47     70972.0       850.8        551.6  850.8 -> 551.6
3210            11.50      5377.0       790.8        601.6  790.8 -> 601.6
3229            11.57      1849.0         0.0          0.0      0.0 -> 0.0
3263            11.69      3903.0         0.0          0.0      0.0 -> 0.0
3287            11.78       985.0       746.8        599.6  746.8 -> 599.6
3297            11.81    

  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top_5_df.append(top_5_rows)
  top_5_df = top

Unnamed: 0,Lipid,Parent_Ion,Product_Ion,Intensity,Transition,Class,Sample_ID,Retention_Time,Intensity_OzESI,n-7,n-9,n-12
26,,820.8,603.6,401419.2,820.8 -> 603.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.82,22476.0,738.8,710.8,668.8
27,[TG(50:3)]_FA18:1,846.8,547.6,686913.8,846.8 -> 547.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,15.01,213888.0,764.8,736.8,694.8
28,"[TG(51:9),TG(50:2)]_FA18:1",848.8,549.6,3262575.0,848.8 -> 549.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.91,246212.0,766.8,738.8,696.8
29,"[TG(51:8),TG(50:1)]_FA18:1",850.8,551.6,8906404.0,850.8 -> 551.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.69,36204.0,768.8,740.8,698.8
30,[TG(52:4)]_FA18:1,872.8,573.6,721947.5,872.8 -> 573.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,14.88,148100.0,790.8,762.8,720.8
31,"[TG(53:10),TG(52:3)]_FA18:1",874.8,575.6,4364672.0,874.8 -> 575.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.65,608530.0,792.8,764.8,722.8
32,"[TG(53:9),TG(52:2)]_FA18:1",876.8,577.6,18056850.0,876.8 -> 577.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.65,76176.0,794.8,766.8,724.8
33,[TG(54:5)]_FA18:1,898.8,599.6,1673118.0,898.8 -> 599.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.55,116106.0,816.8,788.8,746.8
34,"[TG(55:11),TG(54:4)]_FA18:1",900.8,601.6,3305303.0,900.8 -> 601.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.64,360401.0,818.8,790.8,748.8
35,"[TG(55:10),TG(54:3)]_FA18:1",902.8,603.6,9339795.0,902.8 -> 603.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,15.49,49842001.0,820.8,792.8,750.8


In [198]:
df_OzESI_processed.head(None)

Unnamed: 0,Lipid,Parent_Ion,Product_Ion,Intensity,Transition,Class,Sample_ID,Retention_Time,Intensity_OzESI,n-7,n-9,n-12
0,,694.6,547.6,256799.6,694.6 -> 547.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.39,4198.0,612.6,584.6,542.6
1,,696.6,549.6,218514.4,696.6 -> 549.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.27,23284.0,614.6,586.6,544.6
2,,698.7,551.6,6608002.0,698.7 -> 551.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,14.24,545.0,616.7,588.7,546.7
3,,720.6,573.6,73306.55,720.6 -> 573.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,11.92,1720.0,638.6,610.6,568.6
4,,722.6,575.6,142839.1,722.6 -> 575.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.14,6684.0,640.6,612.6,570.6
5,,724.7,577.6,581595.5,724.7 -> 577.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,15.08,4650.0,642.7,614.7,572.7
6,,736.7,547.6,661318.4,736.7 -> 547.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.64,8576.0,654.7,626.7,584.7
7,,738.7,549.6,458210.3,738.7 -> 549.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.4,21974.0,656.7,628.7,586.7
8,,740.8,551.6,936681.4,740.8 -> 551.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.27,482.0,658.8,630.8,588.8
9,,746.8,599.6,197613.6,746.8 -> 599.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.88,512.0,664.8,636.8,594.8


OzESI n-# lipid matching

In [66]:
# ### FIRST VERSION
# #Supposed to add n-# and lipid name to lipid column, see github for previous working version
# #OzESI_list = [3,5,7,9,12]
# OzESI_list = [7,9,12]
# def add_lipid_info(df_OzESI_processed, OzESI_list):
#     # Create a copy of the original dataframe
#     df_test = df_matched.copy()
    
#     # Convert Parent_Ion column to float
#     df_test['Parent_Ion'] = df_test['Parent_Ion'].astype(float)
    
#     # Convert all n-# columns to float
#     for i in OzESI_list:
#         df_test['n-' + str(i)] = df_test['n-' + str(i)].astype(float)
    
#     # Search for n-# in Parent_Ion and add the corresponding lipid to the Lipid column
#     for i in range(len(df_test)):
#         for j in range(len(df_test)):
#             if pd.isna(df_test.loc[i,'Lipid']):
#                 # print(i,j)
#                 parent_ion = df_test.loc[i,'Parent_Ion']
#                 # if parent_ion == df_test.loc[j,'n-3'] and isinstance(df_test.loc[j,'Lipid'], str):
#                 #     df_test.loc[i,'Lipid'] = 'n-3 ' + (df_test.loc[j,'Lipid'])
#                 # elif parent_ion == df_test.loc[j,'n-5'] and isinstance(df_test.loc[j,'Lipid'], str):
#                 #     df_test.loc[i,'Lipid'] = 'n-5 ' + (df_test.loc[j,'Lipid'])
#                 if parent_ion == df_test.loc[j,'n-7'] and isinstance(df_test.loc[j,'Lipid'], str):
#                     df_test.loc[i,'Lipid'] = 'n-7 ' + (df_test.loc[j,'Lipid'])
#                 elif parent_ion == df_test.loc[j,'n-9'] and isinstance(df_test.loc[j,'Lipid'], str):
#                     print('n-9', i, j)
#                     print(df_test.loc[j,'Lipid'],parent_ion)
#                     print(df_test.loc[j,'Lipid'],df_test.loc[j,'Product_Ion'])
#                     #print retention time of that lipid
#                     print(df_test.loc[j,'Retention_Time'])
#                     df_test.loc[i,'Lipid'] = 'n-9 ' + (df_test.loc[j,'Lipid'])
#                 elif parent_ion == df_test.loc[j,'n-12'] and isinstance(df_test.loc[j,'Lipid'], str):
#                     df_test.loc[i,'Lipid'] = 'n-12 '+ (df_test.loc[j,'Lipid'])
#                 else:
#                     pass
#     df_test.dropna(subset=['Lipid'], inplace=True)
#     return df_test

# pd.set_option('display.max_rows', None)
# df_test = add_lipid_info(df_OzESI_processed, OzESI_list)
# df_test.head(None)

n-9 8 29
[TG(51:8),TG(50:1)]_FA18:1 740.8
[TG(51:8),TG(50:1)]_FA18:1 551.6
12.57
n-9 18 33
[TG(54:5)]_FA18:1 788.8
[TG(54:5)]_FA18:1 599.6
13.65


Unnamed: 0,Lipid,Parent_Ion,Product_Ion,Intensity,Transition,Class,Sample_ID,Retention_Time,Intensity_OzESI,n-7,n-9,n-12
8,"n-9 [TG(51:8),TG(50:1)]_FA18:1",740.8,551.6,936681.4,740.8 -> 551.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,5.94,18649.0,658.8,630.8,588.8
9,n-12 [TG(54:5)]_FA18:1,746.8,599.6,197613.6,746.8 -> 599.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,6.42,3948.0,664.8,636.8,594.8
10,"n-12 [TG(55:11),TG(54:4)]_FA18:1",748.8,601.6,112815.3,748.8 -> 601.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,7.51,49650.0,666.8,638.8,596.8
11,"n-12 [TG(55:10),TG(54:3)]_FA18:1",750.8,603.6,575742.4,750.8 -> 603.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.5,4476.0,668.8,640.8,598.8
17,"n-7 [TG(51:8),TG(50:1)]_FA18:1",768.8,551.6,772301.3,768.8 -> 551.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.36,8898.0,686.8,658.8,616.8
18,n-9 [TG(54:5)]_FA18:1,788.8,599.6,148250.6,788.8 -> 599.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,1.15,1688.0,706.8,678.8,636.8
20,n-7 [TG(52:4)]_FA18:1,790.8,601.6,195273.8,790.8 -> 601.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.69,18782.0,708.8,680.8,638.8
22,"n-7 [TG(53:10),TG(52:3)]_FA18:1",792.8,603.6,748905.4,792.8 -> 603.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,14.78,98857.0,710.8,682.8,640.8
24,n-7 [TG(54:5)]_FA18:1,816.8,599.6,73669.29,816.8 -> 599.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,7.1,5200.0,734.8,706.8,664.8
25,"n-7 [TG(55:11),TG(54:4)]_FA18:1",818.8,601.6,120095.3,818.8 -> 601.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,7.1,74176.0,736.8,708.8,666.8


In [98]:
#### SECOND VERSION (working better)
OzESI_list = [7, 9, 12]

def add_lipid_info(df, OzESI_list):
    df_test = df.copy()
    
    for i in OzESI_list:
        df_test['n-' + str(i)] = df_test['n-' + str(i)].astype(float)
    
    for i in range(len(df_test)):
        if pd.isna(df_test.loc[i, 'Lipid']):
            parent_ion = df_test.loc[i, 'Parent_Ion']
            
            for j in range(len(df_test)):
                if parent_ion == df_test.loc[j, 'n-7'] and isinstance(df_test.loc[j, 'Lipid'], str):
                    df_test.loc[i, 'Lipid'] = 'n-7 ' + df_test.loc[j, 'Lipid']
                elif parent_ion == df_test.loc[j, 'n-9'] and isinstance(df_test.loc[j, 'Lipid'], str):
                    df_test.loc[i, 'Lipid'] = 'n-9 ' + df_test.loc[j, 'Lipid']
                elif parent_ion == df_test.loc[j, 'n-12'] and isinstance(df_test.loc[j, 'Lipid'], str):
                    df_test.loc[i, 'Lipid'] = 'n-12 ' + df_test.loc[j, 'Lipid']
    
    df_test.dropna(subset=['Lipid'], inplace=True)
    return df_test

df_test = add_lipid_info(df_OzESI_processed, OzESI_list)
df_test.head(None)


Unnamed: 0,Lipid,Parent_Ion,Product_Ion,Intensity,Transition,Class,Sample_ID,Retention_Time,Intensity_OzESI,n-7,n-9,n-12
8,"n-9 [TG(51:8),TG(50:1)]_FA18:1",740.8,551.6,936681.4,740.8 -> 551.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,15.19,584.0,658.8,630.8,588.8
9,n-12 [TG(54:5)]_FA18:1,746.8,599.6,197613.6,746.8 -> 599.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,11.78,985.0,664.8,636.8,594.8
10,"n-12 [TG(55:11),TG(54:4)]_FA18:1",748.8,601.6,112815.3,748.8 -> 601.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.85,3845.0,666.8,638.8,596.8
11,"n-12 [TG(55:10),TG(54:3)]_FA18:1",750.8,603.6,575742.4,750.8 -> 603.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.5,4476.0,668.8,640.8,598.8
17,"n-7 [TG(51:8),TG(50:1)]_FA18:1",768.8,551.6,772301.3,768.8 -> 551.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.36,8898.0,686.8,658.8,616.8
18,n-9 [TG(54:5)]_FA18:1,788.8,599.6,148250.6,788.8 -> 599.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.48,1466.0,706.8,678.8,636.8
20,"n-9 [TG(55:11),TG(54:4)]_FA18:1",790.8,601.6,195273.8,790.8 -> 601.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.69,18782.0,708.8,680.8,638.8
22,"n-9 [TG(55:10),TG(54:3)]_FA18:1",792.8,603.6,748905.4,792.8 -> 603.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,14.78,98857.0,710.8,682.8,640.8
24,n-7 [TG(54:5)]_FA18:1,816.8,599.6,73669.29,816.8 -> 599.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.68,3524.0,734.8,706.8,664.8
25,"n-7 [TG(55:11),TG(54:4)]_FA18:1",818.8,601.6,120095.3,818.8 -> 601.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,14.65,25918.0,736.8,708.8,666.8


In [99]:
sorted_df = df_test.sort_values(by='Product_Ion')

sorted_df.head(None)



Unnamed: 0,Lipid,Parent_Ion,Product_Ion,Intensity,Transition,Class,Sample_ID,Retention_Time,Intensity_OzESI,n-7,n-9,n-12
27,[TG(50:3)]_FA18:1,846.8,547.6,686913.8,846.8 -> 547.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.79,409651.0,764.8,736.8,694.8
28,"[TG(51:9),TG(50:2)]_FA18:1",848.8,549.6,3262575.0,848.8 -> 549.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,15.13,2236988.0,766.8,738.8,696.8
8,"n-9 [TG(51:8),TG(50:1)]_FA18:1",740.8,551.6,936681.4,740.8 -> 551.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,15.19,584.0,658.8,630.8,588.8
17,"n-7 [TG(51:8),TG(50:1)]_FA18:1",768.8,551.6,772301.3,768.8 -> 551.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.36,8898.0,686.8,658.8,616.8
29,"[TG(51:8),TG(50:1)]_FA18:1",850.8,551.6,8906404.0,850.8 -> 551.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.57,159733.0,768.8,740.8,698.8
30,[TG(52:4)]_FA18:1,872.8,573.6,721947.5,872.8 -> 573.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.66,928619.0,790.8,762.8,720.8
31,"[TG(53:10),TG(52:3)]_FA18:1",874.8,575.6,4364672.0,874.8 -> 575.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,14.99,4376154.0,792.8,764.8,722.8
32,"[TG(53:9),TG(52:2)]_FA18:1",876.8,577.6,18056850.0,876.8 -> 577.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,12.56,494143.0,794.8,766.8,724.8
9,n-12 [TG(54:5)]_FA18:1,746.8,599.6,197613.6,746.8 -> 599.6,,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,11.78,985.0,664.8,636.8,594.8
33,[TG(54:5)]_FA18:1,898.8,599.6,1673118.0,898.8 -> 599.6,TAG,DOD93_F4-5xFAD-Cereb_TG18-1_o3on,13.65,797129.0,816.8,788.8,746.8


Plotting Functions

In [22]:
#import visualization libraries
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
#Plotting functions

def plot_transition_vs_intensity(df):
    fig = px.bar(df, x="Transition", y="Intensity", color="Lipid", hover_data=['Lipid', 'Class'])
    fig.show()

def plot_class_vs_intensity_bar(df):
    fig = px.bar(df, x="Class", y="Intensity", color="Class", hover_data=['Lipid', 'Class'])
    fig.show()

def plot_class_vs_intensity_pie(df):
    fig = px.pie(df, values='Intensity', names='Class', title='Lipid Class')
    fig.show()

def plot_intensity_heatmap(df):
    fig = go.Figure(data=go.Heatmap(
        z=df['Intensity'],
        x=df['Lipid'],
        y=df['Class'],
        colorscale='Viridis'))
    fig.show()

# Example usage:
# Assuming you have the df_matching DataFrame
plot_transition_vs_intensity(df_matched)
plot_class_vs_intensity_bar(df_matched)
plot_class_vs_intensity_pie(df_matched)
plot_intensity_heatmap(df_matched)
