# GENERATE LIPID DATABASE

# n-1 and now save databases

In [3]:
import os
import pandas as pd

def generate_db_locations(db_number):
    length = db_number - 1  # Adjust the length to be one less than the number of double bonds

    # Recursive helper function to generate patterns
    def generate_patterns(n, pattern=""):
        if n == 0:
            return [pattern]
        patterns = []
        if not pattern or pattern[-1] == 'B':  # Ensure 'B' is always on the left of 'F'
            patterns += generate_patterns(n-1, pattern + 'B')
        patterns += generate_patterns(n-1, pattern + 'F')
        return patterns

    # Wrap each generated pattern with angle brackets
    return ['<' + p + '>' for p in generate_patterns(length)]

def calculate_n_values(chain_length, db_number, fa_mass, AMP):
    db_locations = generate_db_locations(db_number)  # Generate possible double bond location patterns
    dbs_in_front = list(range(len(db_locations)))  # Create a list of indices for double bonds in front

    # Create an empty DataFrame with specified columns
    df = pd.DataFrame(columns=['Lipid', 'Chain length', 'DB Number', 'DB Location', 'DBs in front', 'FA mass', '[FA+AMP]+'] + [f'n-{i}' for i in range(1, chain_length - 2)])

    fa_amp = fa_mass + AMP  # Calculate the [FA+AMP]+ value

    for i, db_location in enumerate(db_locations):
        # Initialize a row dictionary with lipid properties
        row = {
            'Lipid': f'FA({chain_length}:{db_number})',
            'Chain length': chain_length,
            'DB Number': db_number,
            'DB Location': db_location,
            'DBs in front': dbs_in_front[i],
            'FA mass': fa_mass,
            '[FA+AMP]+': fa_amp
        }

        # Count the number of 'B's and 'F's in the pattern
        b_count = db_location.count('B')
        f_count = db_location.count('F')
        max_n = chain_length - db_number + f_count  # Determine the maximum n-value

        # Calculate n-values and add them to the row
        for j in range(1, max_n ):
            if j == 1:
                row[f'n-{j}'] = fa_mass + AMP + 2  # Special case for n-1
            else:
                row[f'n-{j}'] = fa_mass + AMP - 12 - 14 * (j - 2) + 2 * f_count

        # Append the row to the DataFrame
        df = df.append(row, ignore_index=True)

    return df

def generate_all_values(chain_length, starting_db_number, starting_fa_mass, AMP):
    all_dfs = []
    fa_mass = starting_fa_mass

    # Loop through each DB number from starting_db_number to 1
    for db_number in range(starting_db_number, 0, -1):
        df = calculate_n_values(chain_length, db_number, fa_mass, AMP)  # Calculate n-values for the current DB number
        all_dfs.append(df)  # Append the resulting DataFrame to the list
        fa_mass += 2  # Increment FA mass for the next iteration

    # Concatenate all DataFrames into a single DataFrame
    result_df = pd.concat(all_dfs, ignore_index=True)

    return result_df

def calculate_min_n_values(df, chain_length, db_number):
    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        # Check if the current row's DB Number and Chain length match the provided values
        if row['DB Number'] == db_number and row['Chain length'] == chain_length:
            # Get the double bond location pattern for the current row
            db_location = row['DB Location']
            
            # Count the number of 'B' characters in the pattern, indicating the double bonds at the back
            b_count = db_location.count('F')
            
            # Iterate over the possible n-values from the end of the chain to n-1
            for j in range(1, b_count + 1):
                df.at[index, f'n-{j}'] = pd.NA  # pd.NA is used to represent missing values

    # Return the updated DataFrame
    return df

def replace_nan_with_na(df):
    return df.mask(pd.isna(df), pd.NA)

def save_fa_database(chain_length, starting_db_number, starting_fa_mass, AMP, directory):
    # Generate the DataFrame
    result_df = generate_all_values(chain_length, starting_db_number, starting_fa_mass, AMP)

    # Update the DataFrame with minimum n-values
    for db_number in range(1, starting_db_number + 1):
        result_df = calculate_min_n_values(result_df, chain_length, db_number)

    # Replace NaN values with pd.NA
    result_df = replace_nan_with_na(result_df)

    # Ensure the directory exists
    if not os.path.exists(directory):
        try:
            os.makedirs(directory)
        except PermissionError:
            print(f"Permission denied: Unable to create directory {directory}")
            raise

    # Generate the filename
    filename = f"FA({chain_length}_x)_OzON_Database.csv"

    # Save the DataFrame to a CSV file
    result_df.to_csv(os.path.join(directory, filename), index=False)
    print(f"DataFrame successfully saved to: {os.path.join(directory, filename)}")

# Example usage
chain_length = 20
starting_db_number = 6
starting_fa_mass = 300.2
AMP = 167.1
directory = 'lipid_database/OzON_FA_Database/'

save_fa_database(chain_length, starting_db_number, starting_fa_mass, AMP, directory)


DataFrame successfully saved to: lipid_database/OzON_FA_Database/FA(20_x)_OzON_Database.csv


### AUTOMATE ALL DB AT ONCE

In [2]:
import os
import pandas as pd

def generate_db_locations(db_number):
    """
    Generate all possible double bond locations for a given number of double bonds.
    
    Parameters:
    db_number (int): Number of double bonds.
    
    Returns:
    list: List of patterns representing double bond locations.
    """
    length = db_number - 1

    def generate_patterns(n, pattern=""):
        if n == 0:
            return [pattern]
        patterns = []
        if not pattern or pattern[-1] == 'B':
            patterns += generate_patterns(n-1, pattern + 'B')
        patterns += generate_patterns(n-1, pattern + 'F')
        return patterns

    return ['<' + p + '>' for p in generate_patterns(length)]

def calculate_n_values(chain_length, db_number, fa_mass, AMP):
    """
    Calculate the values for n positions based on chain length, double bond number, and FA mass.
    
    Parameters:
    chain_length (int): Length of the fatty acid chain.
    db_number (int): Number of double bonds.
    fa_mass (float): Mass of the fatty acid.
    AMP (float): AMP constant to add to the FA mass.
    
    Returns:
    DataFrame: DataFrame containing the calculated values.
    """
    db_locations = generate_db_locations(db_number)
    dbs_in_front = list(range(len(db_locations)))

    # Initialize DataFrame with required columns
    df = pd.DataFrame(columns=[
        'Lipid', 'Chain length', 'DB Number', 'DB Location', 
        'DBs in front', 'FA mass', '[FA+AMP]+'] + [f'n-{i}' for i in range(1, chain_length - 2)]
    )

    fa_amp = fa_mass + AMP

    for i, db_location in enumerate(db_locations):
        row = {
            'Lipid': f'FA({chain_length}:{db_number})',
            'Chain length': chain_length,
            'DB Number': db_number,
            'DB Location': db_location,
            'DBs in front': dbs_in_front[i],
            'FA mass': fa_mass,
            '[FA+AMP]+': fa_amp
        }

        b_count = db_location.count('B')
        f_count = db_location.count('F')
        max_n = chain_length - db_number + f_count

        for j in range(1, max_n):
            if j == 1:
                row[f'n-{j}'] = fa_mass + AMP + 2
            else:
                row[f'n-{j}'] = fa_mass + AMP - 12 - 14 * (j - 2) + 2 * f_count

        df = df.append(row, ignore_index=True)

    return df

def generate_all_values(chain_length, starting_db_number, starting_fa_mass, AMP):
    """
    Generate all values for fatty acids by iterating through possible double bond numbers.
    
    Parameters:
    chain_length (int): Length of the fatty acid chain.
    starting_db_number (int): Starting number of double bonds.
    starting_fa_mass (float): Starting mass of the fatty acid.
    AMP (float): AMP constant to add to the FA mass.
    
    Returns:
    DataFrame: DataFrame containing all generated values.
    """
    all_dfs = []
    fa_mass = starting_fa_mass

    for db_number in range(starting_db_number, 0, -1):
        df = calculate_n_values(chain_length, db_number, fa_mass, AMP)
        all_dfs.append(df)
        fa_mass += 2

    result_df = pd.concat(all_dfs, ignore_index=True)

    return result_df

def calculate_min_n_values(df, chain_length, db_number):
    """
    Calculate minimum n values for specific chain length and double bond number.
    
    Parameters:
    df (DataFrame): DataFrame containing the data.
    chain_length (int): Length of the fatty acid chain.
    db_number (int): Number of double bonds.
    
    Returns:
    DataFrame: DataFrame with minimum n values calculated.
    """
    for index, row in df.iterrows():
        if row['DB Number'] == db_number and row['Chain length'] == chain_length:
            db_location = row['DB Location']
            b_count = db_location.count('F')
            for j in range(1, b_count + 1):
                df.at[index, f'n-{j}'] = pd.NA

    return df

def replace_nan_with_na(df):
    """
    Replace NaN values in the DataFrame with NA.
    
    Parameters:
    df (DataFrame): DataFrame containing the data.
    
    Returns:
    DataFrame: DataFrame with NaN values replaced by NA.
    """
    return df.mask(pd.isna(df), pd.NA)

def save_fa_database(chain_length, starting_db_number, starting_fa_mass, AMP, directory):
    """
    Generate and save the fatty acid database to a CSV file.
    
    Parameters:
    chain_length (int): Length of the fatty acid chain.
    starting_db_number (int): Starting number of double bonds.
    starting_fa_mass (float): Starting mass of the fatty acid.
    AMP (float): AMP constant to add to the FA mass.
    directory (str): Directory to save the CSV file.
    """
    result_df = generate_all_values(chain_length, starting_db_number, starting_fa_mass, AMP)
    for db_number in range(1, starting_db_number + 1):
        result_df = calculate_min_n_values(result_df, chain_length, db_number)
    result_df = replace_nan_with_na(result_df)

    if not os.path.exists(directory):
        try:
            os.makedirs(directory)
        except PermissionError:
            print(f"Permission denied: Unable to create directory {directory}")
            raise

    filename = f"FA({chain_length}_x)_OzON_Database.csv"
    result_df.to_csv(os.path.join(directory, filename), index=False)
    print(f"DataFrame successfully saved to: {os.path.join(directory, filename)}")

def process_fas(fa_data, AMP, directory):
    """
    Process fatty acid data and save each to a database.
    
    Parameters:
    fa_data (list): List of dictionaries containing FA data.
    AMP (float): AMP constant to add to the FA mass.
    directory (str): Directory to save the CSV files.
    """
    for fa in fa_data:
        chain_length = int(fa['Lipid'].split('(')[1].split(':')[0])
        starting_fa_mass = fa['Values'][-1]
        starting_db_number = 6  # Assuming starting DB number as 6 for all FAs

        save_fa_database(chain_length, starting_db_number, starting_fa_mass, AMP, directory)

# Example FA data extracted from the table image
fa_data = [
    {'Lipid': 'FA(6:x)', 'Values': [114.1, 112.1, 110.1, 108.1, 106.1, 104.1]},
    {'Lipid': 'FA(7:x)', 'Values': [128.1, 126.1, 124.1, 122.1, 120.1, 118.1]},
    {'Lipid': 'FA(8:x)', 'Values': [142.1, 140.1, 138.1, 136.1, 134.1, 132.1]},
    # Add the remaining FA data here
    {'Lipid': 'FA(9:x)', 'Values': [156.1, 154.1, 152.1, 150.1, 148.1, 146.1]},
    {'Lipid': 'FA(10:x)', 'Values': [170.1, 168.1, 166.1, 164.1, 162.1, 160.1]},
    {'Lipid': 'FA(11:x)', 'Values': [184.1, 182.1, 180.1, 178.1, 176.1, 174.1]},
    {'Lipid': 'FA(12:x)', 'Values': [198.1, 196.1, 194.1, 192.1, 190.1, 188.1]},
    {'Lipid': 'FA(13:x)', 'Values': [212.1, 210.1, 208.1, 206.1, 204.1, 202.1]},
    {'Lipid': 'FA(14:x)', 'Values': [226.2, 224.2, 222.2, 220.2, 218.2, 216.2]},
    {'Lipid': 'FA(15:x)', 'Values': [240.2, 238.2, 236.2, 234.2, 232.2, 230.2]},
    {'Lipid': 'FA(16:x)', 'Values': [254.2, 252.2, 250.2, 248.2, 246.2, 244.2]},
    {'Lipid': 'FA(17:x)', 'Values': [268.2, 266.2, 264.2, 262.2, 260.2, 258.2]},
    {'Lipid': 'FA(18:x)', 'Values': [282.2, 280.2, 278.2, 276.2, 274.2, 272.2]},
    {'Lipid': 'FA(19:x)', 'Values': [296.2, 294.2, 292.2, 290.2, 288.2, 286.2]},
    {'Lipid': 'FA(20:x)', 'Values': [310.2, 308.2, 306.2, 304.2, 302.2, 300.2]},
    {'Lipid': 'FA(21:x)', 'Values': [324.3, 322.3, 320.3, 318.3, 316.3, 314.3]},
    {'Lipid': 'FA(22:x)', 'Values': [338.3, 336.3, 334.3, 332.3, 330.3, 328.3]},
    {'Lipid': 'FA(23:x)', 'Values': [352.3, 350.3, 348.3, 346.3, 344.3, 342.3]},
    {'Lipid': 'FA(24:x)', 'Values': [366.3, 364.3, 362.3, 360.3, 358.3, 356.3]},
    {'Lipid': 'FA(25:x)', 'Values': [380.3, 378.3, 376.3, 374.3, 372.3, 370.3]},
    {'Lipid': 'FA(26:x)', 'Values': [394.3, 392.3, 390.3, 388.3, 386.3, 384.3]},
    {'Lipid': 'FA(27:x)', 'Values': [408.3, 406.3, 404.3, 402.3, 400.3, 398.3]},
    {'Lipid': 'FA(28:x)', 'Values': [422.4, 420.4, 418.4, 416.4, 414.4, 412.4]},
    {'Lipid': 'FA(29:x)', 'Values': [436.4, 434.4, 432.4, 430.4, 428.4, 426.4]},
    {'Lipid': 'FA(30:x)', 'Values': [450.4, 448.4, 446.4, 444.4, 442.4, 440.4]}
]

AMP = 167.1
directory = 'lipid_database/OzON_FA_Database/'

process_fas(fa_data, AMP, directory)


DataFrame successfully saved to: lipid_database/OzON_FA_Database/FA(6_x)_OzON_Database.csv
DataFrame successfully saved to: lipid_database/OzON_FA_Database/FA(7_x)_OzON_Database.csv
DataFrame successfully saved to: lipid_database/OzON_FA_Database/FA(8_x)_OzON_Database.csv
DataFrame successfully saved to: lipid_database/OzON_FA_Database/FA(9_x)_OzON_Database.csv
DataFrame successfully saved to: lipid_database/OzON_FA_Database/FA(10_x)_OzON_Database.csv
DataFrame successfully saved to: lipid_database/OzON_FA_Database/FA(11_x)_OzON_Database.csv
DataFrame successfully saved to: lipid_database/OzON_FA_Database/FA(12_x)_OzON_Database.csv
DataFrame successfully saved to: lipid_database/OzON_FA_Database/FA(13_x)_OzON_Database.csv
DataFrame successfully saved to: lipid_database/OzON_FA_Database/FA(14_x)_OzON_Database.csv
DataFrame successfully saved to: lipid_database/OzON_FA_Database/FA(15_x)_OzON_Database.csv
DataFrame successfully saved to: lipid_database/OzON_FA_Database/FA(16_x)_OzON_Datab

# SPLIT INTO LIPID DB BASED ON DBs

In [3]:
import os
import pandas as pd

def split_lipid_databases(directory):
    """
    Splits each lipid database file into multiple files based on the unique Lipid values.
    
    Parameters:
    directory (str): Directory containing the lipid database files.
    """
    if not os.path.exists(directory):
        raise FileNotFoundError(f"Directory {directory} does not exist")

    for filename in os.listdir(directory):
        if filename.endswith("_OzON_Database.csv"):
            filepath = os.path.join(directory, filename)
            df = pd.read_csv(filepath)

            # Get unique lipids from the 'Lipid' column
            unique_lipids = df['Lipid'].unique()

            # Create new files for each unique lipid
            for lipid in unique_lipids:
                lipid_df = df[df['Lipid'] == lipid]
                
                # Extract chain length and db_number from the lipid string
                chain_length, db_number = lipid.split('(')[1].split(')')[0].split(':')
                
                # Generate the new filename based on the lipid information
                new_filename = f"FA({chain_length}_{db_number})_OzON_Database.csv"
                new_filepath = os.path.join(directory, new_filename)
                
                # Save the new DataFrame to a CSV file
                lipid_df.to_csv(new_filepath, index=False)
                print(f"DataFrame for {lipid} successfully saved to: {new_filepath}")

# Example usage
directory = 'lipid_database/OzON_FA_Database/'
split_lipid_databases(directory)


DataFrame for FA(20:6) successfully saved to: lipid_database/OzON_FA_Database/FA(20_6)_OzON_Database.csv
DataFrame for FA(20:5) successfully saved to: lipid_database/OzON_FA_Database/FA(20_5)_OzON_Database.csv
DataFrame for FA(20:4) successfully saved to: lipid_database/OzON_FA_Database/FA(20_4)_OzON_Database.csv
DataFrame for FA(20:3) successfully saved to: lipid_database/OzON_FA_Database/FA(20_3)_OzON_Database.csv
DataFrame for FA(20:2) successfully saved to: lipid_database/OzON_FA_Database/FA(20_2)_OzON_Database.csv
DataFrame for FA(20:1) successfully saved to: lipid_database/OzON_FA_Database/FA(20_1)_OzON_Database.csv
DataFrame for FA(9:6) successfully saved to: lipid_database/OzON_FA_Database/FA(9_6)_OzON_Database.csv
DataFrame for FA(9:5) successfully saved to: lipid_database/OzON_FA_Database/FA(9_5)_OzON_Database.csv
DataFrame for FA(9:4) successfully saved to: lipid_database/OzON_FA_Database/FA(9_4)_OzON_Database.csv
DataFrame for FA(9:3) successfully saved to: lipid_database/O

# Convert all values from df into 1 long df with the name including all the relevant info

In [5]:
# import pandas as pd

# # Assuming results_df already has all the data it needs
# # Create a new DataFrame
# new_df = pd.DataFrame(columns=['Lipid', 'Parent Ion', 'Chain length', 'DB Number', 'DB Location', 'FA mass', '[FA+AMP]+'])

# # Iterate over the rows of the original DataFrame
# for _, row in result_df.iterrows():
#     lipid = row['Lipid']
#     db_location = row['DB Location']
    
#     for i in range(2, chain_length +1):  # Assuming chain length of 20 for the given example
#         column_name = f'n-{i}'
#         if pd.notna(row[column_name]):
#             new_row = {
#                 'Lipid': f'{lipid}_{db_location}_{column_name}',
#                 'Parent Ion': row[column_name],
#                 'Chain length': row['Chain length'],
#                 'DB Number': row['DB Number'],
#                 'DB Location': row['DB Location'],
#                 'FA mass': row['FA mass'],
#                 '[FA+AMP]+': row['[FA+AMP]+']
#             }
#             new_df = new_df.append(new_row, ignore_index=True)

# new_df
#### make a function
import pandas as pd

def expand_lipid_data(result_df: pd.DataFrame, chain_length: int = 20) -> pd.DataFrame:
    """
    Expands the lipid data in the input DataFrame by iterating over a specified chain length range
    and creating new rows for each chain length found in the input data.

    Args:
    result_df (pd.DataFrame): The original DataFrame containing lipid data.
    chain_length (int): The maximum chain length to iterate over (default is 20).

    Returns:
    pd.DataFrame: A new DataFrame with expanded lipid data.
    """
    new_df = pd.DataFrame(columns=['Lipid', 'Parent Ion', 'Chain length', 'DB Number', 'DB Location', 'FA mass', '[FA+AMP]+'])
    
    for _, row in result_df.iterrows():
        lipid = row['Lipid']
        db_location = row['DB Location']
        
        for i in range(2, chain_length + 1):
            column_name = f'n-{i}'
            if pd.notna(row.get(column_name)):  # Using row.get() to avoid KeyError
                new_row = {
                    'Lipid': f'{lipid}_{db_location}_{column_name}',
                    'Parent Ion': row[column_name],
                    'Chain length': row['Chain length'],
                    'DB Number': row['DB Number'],
                    'DB Location': row['DB Location'],
                    'FA mass': row['FA mass'],
                    '[FA+AMP]+': row['[FA+AMP]+']
                }
                new_df = new_df.append(new_row, ignore_index=True)
    
    return new_df

# Example usage
new_df = expand_lipid_data(result_df)
new_df


Unnamed: 0,Lipid,Parent Ion,Chain length,DB Number,DB Location,FA mass,[FA+AMP]+
0,FA(20:6)_<BBBBB>_n-2,455.3,20,6,<BBBBB>,300.2,467.3
1,FA(20:6)_<BBBBB>_n-3,441.3,20,6,<BBBBB>,300.2,467.3
2,FA(20:6)_<BBBBB>_n-4,427.3,20,6,<BBBBB>,300.2,467.3
3,FA(20:6)_<BBBBB>_n-5,413.3,20,6,<BBBBB>,300.2,467.3
4,FA(20:6)_<BBBBB>_n-6,399.3,20,6,<BBBBB>,300.2,467.3
...,...,...,...,...,...,...,...
297,FA(20:1)_<>_n-14,297.3,20,1,<>,310.2,477.3
298,FA(20:1)_<>_n-15,283.3,20,1,<>,310.2,477.3
299,FA(20:1)_<>_n-16,269.3,20,1,<>,310.2,477.3
300,FA(20:1)_<>_n-17,255.3,20,1,<>,310.2,477.3


# seperate into own dfs

In [6]:
# import pandas as pd

# # Assuming result_df already has all the data it needs
# # Create a new DataFrame
# new_df = pd.DataFrame(columns=['Lipid', 'Parent Ion', 'Chain length', 'DB Number', 'DB Location', 'FA mass', '[FA+AMP]+'])

# # Iterate over the rows of the original DataFrame
# chain_length = 20  # Assuming chain length of 20 for the given example

# for _, row in result_df.iterrows():
#     lipid = row['Lipid']
#     db_location = row['DB Location']
    
#     for i in range(2, chain_length + 1):  # Iterating from 2 to chain_length
#         column_name = f'n-{i}'
#         if pd.notna(row[column_name]):
#             new_row = {
#                 'Lipid': f'{lipid}_{db_location}_{column_name}',
#                 'Parent Ion': row[column_name],
#                 'Chain length': row['Chain length'],
#                 'DB Number': row['DB Number'],
#                 'DB Location': row['DB Location'],
#                 'FA mass': row['FA mass'],
#                 '[FA+AMP]+': row['[FA+AMP]+']
#             }
#             new_df = new_df.append(new_row, ignore_index=True)

# # Create a dictionary to store the separate DataFrames
# dfs_dict = {}

# # Get unique combinations of Chain length and DB Number
# unique_combinations = new_df[['Chain length', 'DB Number']].drop_duplicates()

# for _, combo in unique_combinations.iterrows():
#     chain_length = combo['Chain length']
#     db_number = combo['DB Number']
    
#     # Filter the DataFrame based on the current combination
#     filtered_df = new_df[(new_df['Chain length'] == chain_length) & (new_df['DB Number'] == db_number)]
    
#     # Store the filtered DataFrame in the dictionary
#     dfs_dict[f'Chain_length_{chain_length}_DB_Number_{db_number}'] = filtered_df

# # Displaying the dictionary keys to verify the split
# dfs_dict.keys()

### make int oa function
import pandas as pd

def split_lipid_dataframe(result_df, chain_length=20):
    """
    This function takes in a DataFrame, iterates over its rows, and creates a new DataFrame
    with additional columns. It then splits the new DataFrame into separate DataFrames based 
    on unique combinations of Chain length and DB Number.

    Parameters:
    - result_df (pd.DataFrame): The original DataFrame with lipid data.
    - chain_length (int): The maximum chain length to iterate over (default is 20).

    Returns:
    - dict: A dictionary containing separate DataFrames for each unique combination of 
            Chain length and DB Number.
    """
    
    # Create a new DataFrame with the desired columns
    new_df = pd.DataFrame(columns=['Lipid', 'Parent Ion', 'Chain length', 'DB Number', 'DB Location', 'FA mass', '[FA+AMP]+'])

    # Iterate over the rows of the original DataFrame
    for _, row in result_df.iterrows():
        lipid = row['Lipid']
        db_location = row['DB Location']

        for i in range(2, chain_length + 1):  # Iterating from 2 to chain_length
            column_name = f'n-{i}'
            if pd.notna(row[column_name]):
                new_row = {
                    'Lipid': f'{lipid}_{db_location}_{column_name}',
                    'Parent Ion': row[column_name],
                    'Chain length': row['Chain length'],
                    'DB Number': row['DB Number'],
                    'DB Location': row['DB Location'],
                    'FA mass': row['FA mass'],
                    '[FA+AMP]+': row['[FA+AMP]+']
                }
                new_df = new_df.append(new_row, ignore_index=True)

    # Create a dictionary to store the separate DataFrames
    dfs_dict = {}

    # Get unique combinations of Chain length and DB Number
    unique_combinations = new_df[['Chain length', 'DB Number']].drop_duplicates()

    for _, combo in unique_combinations.iterrows():
        chain_length_val = combo['Chain length']
        db_number = combo['DB Number']

        # Filter the DataFrame based on the current combination
        filtered_df = new_df[(new_df['Chain length'] == chain_length_val) & (new_df['DB Number'] == db_number)]

        # Store the filtered DataFrame in the dictionary
        dfs_dict[f'Chain_length_{chain_length_val}_DB_Number_{db_number}'] = filtered_df

    return dfs_dict

# Example usage
dfs_dict = split_lipid_dataframe(result_df)
#Displaying the dictionary keys to verify the split
print(dfs_dict.keys())



KeyError: 'n-19'

# fixed for NaN values

In [None]:
import pandas as pd

def split_lipid_dataframe(result_df, chain_length=20):
    """
    This function takes in a DataFrame, iterates over its rows, and creates a new DataFrame
    with additional columns. It then splits the new DataFrame into separate DataFrames based 
    on unique combinations of Chain length and DB Number.

    Parameters:
    - result_df (pd.DataFrame): The original DataFrame with lipid data.
    - chain_length (int): The maximum chain length to iterate over (default is 20).

    Returns:
    - dict: A dictionary containing separate DataFrames for each unique combination of 
            Chain length and DB Number.
    """
    
    # Create a new DataFrame with the desired columns
    new_df = pd.DataFrame(columns=['Lipid', 'Parent Ion', 'Chain length', 'DB Number', 'DB Location', 'FA mass', '[FA+AMP]+'])

    # Iterate over the rows of the original DataFrame
    for _, row in result_df.iterrows():
        lipid = row['Lipid']
        db_location = row['DB Location']

        for i in range(2, chain_length + 1):  # Iterating from 2 to chain_length
            column_name = f'n-{i}'
            if column_name in row and pd.notna(row[column_name]):
                new_row = {
                    'Lipid': f'{lipid}_{db_location}_{column_name}',
                    'Parent Ion': row[column_name],
                    'Chain length': row['Chain length'],
                    'DB Number': row['DB Number'],
                    'DB Location': row['DB Location'],
                    'FA mass': row['FA mass'],
                    '[FA+AMP]+': row['[FA+AMP]+']
                }
                new_df = new_df.append(new_row, ignore_index=True)

    # Create a dictionary to store the separate DataFrames
    dfs_dict = {}

    # Get unique combinations of Chain length and DB Number
    unique_combinations = new_df[['Chain length', 'DB Number']].drop_duplicates()

    for _, combo in unique_combinations.iterrows():
        chain_length_val = combo['Chain length']
        db_number = combo['DB Number']

        # Filter the DataFrame based on the current combination
        filtered_df = new_df[(new_df['Chain length'] == chain_length_val) & (new_df['DB Number'] == db_number)]

        # Store the filtered DataFrame in the dictionary
        dfs_dict[f'Chain_length_{chain_length_val}_DB_Number_{db_number}'] = filtered_df

    return dfs_dict


# Run the function
dfs_dict = split_lipid_dataframe(result_df)

# Display the dictionary keys to verify the split
print(dfs_dict.keys())

# # Display the first few rows of one of the resulting DataFrames to check the contents
# for key, df in dfs_dict.items():
#     print(f"DataFrame for {key}:")
#     print(df.head(), "\n")

dict_keys(['Chain_length_20_DB_Number_4', 'Chain_length_20_DB_Number_3', 'Chain_length_20_DB_Number_2', 'Chain_length_20_DB_Number_1'])


In [None]:
key = 'Chain_length_20_DB_Number_2'

dfs_dict[key]

Unnamed: 0,Lipid,Parent Ion,Chain length,DB Number,DB Location,FA mass,[FA+AMP]+
101,FA(20:2)_<B>_n-2,463.3,20,2,<B>,308.2,475.3
102,FA(20:2)_<B>_n-3,449.3,20,2,<B>,308.2,475.3
103,FA(20:2)_<B>_n-4,435.3,20,2,<B>,308.2,475.3
104,FA(20:2)_<B>_n-5,421.3,20,2,<B>,308.2,475.3
105,FA(20:2)_<B>_n-6,407.3,20,2,<B>,308.2,475.3
106,FA(20:2)_<B>_n-7,393.3,20,2,<B>,308.2,475.3
107,FA(20:2)_<B>_n-8,379.3,20,2,<B>,308.2,475.3
108,FA(20:2)_<B>_n-9,365.3,20,2,<B>,308.2,475.3
109,FA(20:2)_<B>_n-10,351.3,20,2,<B>,308.2,475.3
110,FA(20:2)_<B>_n-11,337.3,20,2,<B>,308.2,475.3
