# GENERATE LIPID DATABASE

In [10]:
import os
import pandas as pd

def expand_lipid_data(result_df: pd.DataFrame, chain_length: int = 30) -> pd.DataFrame:
    """
    Expand each lipid row into one row per populated ``n-<i>`` column.

    For every row of ``result_df`` and every ``i`` from 2 to ``chain_length``
    (inclusive), a new row is produced when the column ``n-<i>`` exists and
    holds a non-missing value. The value of that column becomes the new row's
    'Parent Ion', and the new 'Lipid' label is
    ``<Lipid>_<DB Location>_n-<i>``. The remaining descriptive columns are
    copied unchanged from the source row.

    Args:
    result_df (pd.DataFrame): The original DataFrame containing lipid data.
        Must provide the columns 'Lipid', 'Chain length', 'DB Number',
        'DB Location', 'FA mass' and '[FA+AMP]+'; ``n-<i>`` columns are
        optional and are skipped when absent.
    chain_length (int): The largest ``i`` for which an ``n-<i>`` column is
        inspected (default is 30).

    Returns:
    pd.DataFrame: A new DataFrame with the columns 'Lipid', 'Parent Ion',
        'Chain length', 'DB Number', 'DB Location', 'FA mass' and
        '[FA+AMP]+', one row per populated ``n-<i>`` cell, with a fresh
        0..N-1 index. If nothing is populated, the frame is empty but still
        carries these columns.
    """
    output_columns = ['Lipid', 'Parent Ion', 'Chain length', 'DB Number',
                      'DB Location', 'FA mass', '[FA+AMP]+']
    copied_columns = ['Chain length', 'DB Number', 'DB Location', 'FA mass', '[FA+AMP]+']

    # Resolve once which n-<i> columns are actually present instead of probing
    # every candidate name on every row.
    position_columns = [
        name
        for name in (f'n-{i}' for i in range(2, chain_length + 1))
        if name in result_df.columns
    ]

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0, and appending row by row
    # re-copies the whole frame on every iteration.
    expanded_rows = []
    for _, row in result_df.iterrows():
        lipid = row['Lipid']
        db_location = row['DB Location']

        for column_name in position_columns:
            parent_ion = row[column_name]
            if pd.isna(parent_ion):
                continue

            new_row = {
                'Lipid': f'{lipid}_{db_location}_{column_name}',
                'Parent Ion': parent_ion,
            }
            new_row.update({column: row[column] for column in copied_columns})
            expanded_rows.append(new_row)

    # Passing `columns` fixes the column order and keeps the header even when
    # no rows were produced.
    return pd.DataFrame(expanded_rows, columns=output_columns)

# Directory and sorting code
directory = 'lipid_database/OzON_FA_Database_whole/'

# os.listdir returns names in arbitrary order; sorting them keeps the order in
# which the files are concatenated reproducible across runs and platforms.
csv_filenames = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))

# Fail with a message that names the directory instead of the generic error
# pd.concat raises when handed an empty list.
if not csv_filenames:
    raise ValueError(f'No .csv files found in {directory!r}')

# Initialize an empty list to store the dataframes
dataframes = []

# Read every CSV file in the directory into its own dataframe
for filename in csv_filenames:
    file_path = os.path.join(directory, filename)
    # Read the csv file into a dataframe
    df = pd.read_csv(file_path)
    # Append the dataframe to the list
    dataframes.append(df)

# Concatenate all dataframes in the list into a single dataframe
combined_df = pd.concat(dataframes, ignore_index=True)

# Sort the combined dataframe by "Chain length" and then by "DB Number"
sorted_df = combined_df.sort_values(by=["Chain length", "DB Number"])

# Expand the sorted dataframe (one output row per populated n-<i> column)
expanded_df = expand_lipid_data(sorted_df)

# Display the expanded dataframe (last expression of the notebook cell)
expanded_df


Unnamed: 0,Lipid,Parent Ion,Chain length,DB Number,DB Location,FA mass,[FA+AMP]+
0,FA(6:1)_<>_n-2,269.2,6,1,<>,114.1,281.2
1,FA(6:1)_<>_n-3,255.2,6,1,<>,114.1,281.2
2,FA(6:1)_<>_n-4,241.2,6,1,<>,114.1,281.2
3,FA(6:2)_<B>_n-2,267.2,6,2,<B>,112.1,279.2
4,FA(6:2)_<B>_n-3,253.2,6,2,<B>,112.1,279.2
...,...,...,...,...,...,...,...
6504,FA(30:6)_<FFFFF>_n-24,297.5,30,6,<FFFFF>,440.4,607.5
6505,FA(30:6)_<FFFFF>_n-25,283.5,30,6,<FFFFF>,440.4,607.5
6506,FA(30:6)_<FFFFF>_n-26,269.5,30,6,<FFFFF>,440.4,607.5
6507,FA(30:6)_<FFFFF>_n-27,255.5,30,6,<FFFFF>,440.4,607.5


In [13]:
# Last five rows of the expanded table.
# NOTE(review): this is not the final expression of the cell, so the notebook
# presumably does not display it (the recorded output below shows only the
# value counts) — wrap in print()/display() if the tail is wanted.
expanded_df.tail(5)
# Count how many expanded rows share each 'Parent Ion' value.
expanded_df['Parent Ion'].value_counts()


241.2    45
269.4    42
325.3    42
311.3    42
395.4    42
         ..
559.4     2
553.4     1
455.3     1
595.5     1
357.2     1
Name: Parent Ion, Length: 408, dtype: int64

# save new df

In [12]:
# Write the expanded table to CSV; index=False omits the row-index column.
expanded_df.to_csv('lipid_database/OzON_databases/OzON_Possible.csv', index=False)