# Make a database on general lipids based off another database

In [1]:
import pandas as pd

def generate_lipid_names(start_chain_length, end_chain_length, min_double_bonds, max_double_bonds):
    """ Generate all combinations of lipid names within the specified ranges. """
    lipid_names = []
    for chain_length in range(start_chain_length, end_chain_length + 1):
        for double_bonds in range(min_double_bonds, max_double_bonds + 1):
            lipid_name = f"{chain_length}:{double_bonds}"
            lipid_names.append(lipid_name)
    return lipid_names

def update_lipid_data(df, lipid_names):
    # Reference values: 18:2 corresponds to 447 parent ion
    base_name = "18:2"
    base_parent_ion = 447.3
    base_parts = [int(x) for x in base_name.split(':')]
    
    for compound_name in lipid_names:
        new_parts = [int(x) for x in compound_name.split(':')]
        parent_ion_change = ((new_parts[1] - base_parts[1]) * -2) + ((new_parts[0] - base_parts[0]) * 14)
        new_parent_ion = base_parent_ion + parent_ion_change
        product_ion = 183
        
        # Add new row to the DataFrame
        new_row = {'Compound Name': compound_name, 'Parent Ion': new_parent_ion, 'Product Ion': product_ion}
        df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
    
    return df

def order_by_compound_name(df):
    """Sort DataFrame by 'Compound Name' after converting to numeric for accurate sorting."""
    df[['Chain Length', 'Double Bonds']] = df['Compound Name'].str.split(':', expand=True)
    df[['Chain Length', 'Double Bonds']] = df[['Chain Length', 'Double Bonds']].astype(int)
    df = df.sort_values(by=['Chain Length', 'Double Bonds'])
    df.drop(columns=['Chain Length', 'Double Bonds'], inplace=True)
    return df

def remove_duplicates(df):
    # Remove duplicate rows, keeping the first occurrence
    return df.drop_duplicates('Compound Name', keep='first')

def main(start_chain_length, end_chain_length, min_double_bonds, max_double_bonds, new_file_name):
    # Path to the original Excel file
    original_file_path = 'AMP_Database3.xlsx'
    
    # Load the original Excel file into a DataFrame
    data = pd.read_excel(original_file_path)
    
    # Generate lipid names based on the user specifications
    lipid_names = generate_lipid_names(start_chain_length, end_chain_length, min_double_bonds, max_double_bonds)
    
    # Update the DataFrame with new lipids
    updated_data = update_lipid_data(data, lipid_names)
    
    # Optionally remove duplicates if needed
    updated_data = remove_duplicates(updated_data)
    
    # Order the DataFrame by 'Compound Name'
    ordered_data = order_by_compound_name(updated_data)
    
    # Save the updated DataFrame to a new Excel file (optional based on your need)
    new_file_path = f'{new_file_name}'
    ordered_data.to_excel(new_file_path, index=False)
    
    # Return the ordered DataFrame instead of printing a message
    return ordered_data


# Call the main function and store the result
final_df = main(5, 30, 0, 5, 'Updated_Lipid_Data.xlsx')

final_df



Unnamed: 0,Compound Group,Compound Name,ISTD?,Parent Ion,MS1 Res,Product Ion,MS2 Res,Dwell,Fragmentor,Collision Energy,Cell Accelerator Voltage,Polarity,Class
11,,5:0,,269.3,,183,,,,,,,
12,,5:1,,267.3,,183,,,,,,,
13,,5:2,,265.3,,183,,,,,,,
14,,5:3,,263.3,,183,,,,,,,
15,,5:4,,261.3,,183,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,,30:1,,617.3,,183,,,,,,,
163,,30:2,,615.3,,183,,,,,,,
164,,30:3,,613.3,,183,,,,,,,
165,,30:4,,611.3,,183,,,,,,,


# find Lipids for OzOFF with similar parent and product ions

In [4]:
import pandas as pd

# Existing functions as previously defined
def aggregate_compounds(df):
    aggregated_data = df.groupby(['Parent Ion', 'Product Ion']).agg({
        'Compound Name': lambda x: ','.join(x.unique()),  # Join unique compound names
        'Class': 'first'  # Take the first class entry
    }).reset_index()
    aggregated_data.rename(columns={'Compound Name': 'Aggregated Compound Names'}, inplace=True)
    return aggregated_data

def view_overlaps(df):
    overlap_rows = df[df['Aggregated Compound Names'].str.contains(',')]
    return overlap_rows

# Example usage
# Assume 'final_df' is your DataFrame after all processing
OzOFF_overlap = aggregate_compounds(final_df)
overlapped_compounds = view_overlaps(OzOFF_overlap)

# Display the results
print(overlapped_compounds)


Empty DataFrame
Columns: [Parent Ion, Product Ion, Aggregated Compound Names, Class]
Index: []


In [2]:
# 3. Initialize df_DB_aldehyde
df_DB_aldehyde = pd.DataFrame(columns=['DB_Position', 'Aldehyde_Ion'])
for position in range(3, 21):
    df_DB_aldehyde.loc[position, 'DB_Position'] = position
    df_DB_aldehyde.loc[position, 'Aldehyde_Ion'] = 26 + (14 * (position - 3))

df_DB_aldehyde

Unnamed: 0,DB_Position,Aldehyde_Ion
3,3,26
4,4,40
5,5,54
6,6,68
7,7,82
8,8,96
9,9,110
10,10,124
11,11,138
12,12,152


# Caitlin excel spreadsheets

In [4]:
import pandas as pd
import itertools

# Define the lipids
lipids = ['22:4', '20:5', '20:4', '20:3', '22:6', '22:5', '18:2', '18:3', '18:4']

# Generate all possible combinations of lipids
combinations = list(itertools.product(lipids, repeat=2))

# Create a DataFrame with the combinations
df = pd.DataFrame(combinations, columns=['Lipid 1', 'Lipid 2'])

# Calculate FA mass and [FA + AMP]+ m/z for each lipid
def calculate_fa_mass(lipid):
    carbon, double_bond = map(int, lipid.split(':'))
    fa_mass = carbon * 12 + (2 * carbon - 2 * double_bond + 2) * 1.00782503223 + 31.972071
    return round(fa_mass, 2)

def calculate_fa_amp_mz(fa_mass):
    return round(fa_mass + 167, 2)

df['FA mass 1'] = df['Lipid 1'].apply(calculate_fa_mass)
df['FA mass 2'] = df['Lipid 2'].apply(calculate_fa_mass)
df['[FA + AMP]+ m/z 1'] = df['FA mass 1'].apply(calculate_fa_amp_mz)
df['[FA + AMP]+ m/z 2'] = df['FA mass 2'].apply(calculate_fa_amp_mz)

df

Unnamed: 0,Lipid 1,Lipid 2,FA mass 1,FA mass 2,[FA + AMP]+ m/z 1,[FA + AMP]+ m/z 2
0,22:4,22:4,334.27,334.27,501.27,501.27
1,22:4,20:5,334.27,304.22,501.27,471.22
2,22:4,20:4,334.27,306.24,501.27,473.24
3,22:4,20:3,334.27,308.25,501.27,475.25
4,22:4,22:6,334.27,330.24,501.27,497.24
...,...,...,...,...,...,...
76,18:4,22:6,278.21,330.24,445.21,497.24
77,18:4,22:5,278.21,332.25,445.21,499.25
78,18:4,18:2,278.21,282.24,445.21,449.24
79,18:4,18:3,278.21,280.22,445.21,447.22
