In [9]:
import pandas as pd

def calculate_n_values(chain_length, db_number, fa_mass, AMP, db_locations, dbs_in_front):
    # Create a DataFrame to store the results
    df = pd.DataFrame(columns=['Lipid', 'Chain length', 'DB Number', 'DB Location', 'DBs in front', 'FA mass', '[FA+AMP]+'] + [f'n-{i}' for i in range(2, chain_length)])

    # Calculate the [FA+AMP]+ value (assuming it is a fixed increment from FA mass)
    fa_amp = fa_mass + AMP  # Example increment to get [FA+AMP]+

    for i, db_location in enumerate(db_locations):
        row = {
            'Lipid': f'FA({chain_length}:{db_number})',
            'Chain length': chain_length,
            'DB Number': db_number,
            'DB Location': db_location,
            'DBs in front': dbs_in_front[i],
            'FA mass': fa_mass,
            '[FA+AMP]+': fa_amp
        }
        
        for j in range(2, chain_length):
            if db_location == '<BBB>':
                row[f'n-{j}'] = fa_mass + AMP - 12 - 14 * (j - 2)
            elif db_location == '<BBF>':
                row[f'n-{j}'] = fa_mass + AMP - 12 - 14 * (j - 2) + 2 * 1
            elif db_location == '<BFF>':
                row[f'n-{j}'] = fa_mass + AMP - 12 - 14 * (j - 2) + 2 * 2
            elif db_location == '<FFF>':
                row[f'n-{j}'] = fa_mass + AMP - 12 - 14 * (j - 2) + 2 * 3

        df = df.append(row, ignore_index=True)

    return df

# Example usage
chain_length = 20
db_number = 4
fa_mass = 304.2
AMP = 167.1
db_locations = ['<BBB>', '<BBF>', '<BFF>', '<FFF>']
dbs_in_front = [0, 1, 2, 3]

result_df = calculate_n_values(chain_length, db_number, fa_mass, AMP, db_locations, dbs_in_front)
result_df


Unnamed: 0,Lipid,Chain length,DB Number,DB Location,DBs in front,FA mass,[FA+AMP]+,n-2,n-3,n-4,...,n-10,n-11,n-12,n-13,n-14,n-15,n-16,n-17,n-18,n-19
0,FA(20:4),20,4,<BBB>,0,304.2,471.3,459.3,445.3,431.3,...,347.3,333.3,319.3,305.3,291.3,277.3,263.3,249.3,235.3,221.3
1,FA(20:4),20,4,<BBF>,1,304.2,471.3,461.3,447.3,433.3,...,349.3,335.3,321.3,307.3,293.3,279.3,265.3,251.3,237.3,223.3
2,FA(20:4),20,4,<BFF>,2,304.2,471.3,463.3,449.3,435.3,...,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3,239.3,225.3
3,FA(20:4),20,4,<FFF>,3,304.2,471.3,465.3,451.3,437.3,...,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3,227.3


# trying to automatic db locatiosn

In [29]:
import pandas as pd

def generate_db_locations(db_number):
    # Generate patterns where 'F' is always on the right of 'B'
    length = db_number - 1
    def generate_patterns(n, pattern=""):
        if n == 0:
            return [pattern]
        patterns = []
        if not pattern or pattern[-1] == 'B':
            patterns += generate_patterns(n-1, pattern + 'B')
        patterns += generate_patterns(n-1, pattern + 'F')
        return patterns
    
    return ['<' + p + '>' for p in generate_patterns(length)]

def calculate_n_values(chain_length, db_number, fa_mass, AMP):
    # Generate DB locations based on the DB number
    db_locations = generate_db_locations(db_number)
    dbs_in_front = list(range(len(db_locations)))

    # Create a DataFrame to store the results
    df = pd.DataFrame(columns=['Lipid', 'Chain length', 'DB Number', 'DB Location', 'DBs in front', 'FA mass', '[FA+AMP]+'] + [f'n-{i}' for i in range(2, chain_length - 1)])

    # Calculate the [FA+AMP]+ value (assuming it is a fixed increment from FA mass)
    fa_amp = fa_mass + AMP  # Example increment to get [FA+AMP]+

    for i, db_location in enumerate(db_locations):
        row = {
            'Lipid': f'FA({chain_length}:{db_number})',
            'Chain length': chain_length,
            'DB Number': db_number,
            'DB Location': db_location,
            'DBs in front': dbs_in_front[i],
            'FA mass': fa_mass,
            '[FA+AMP]+': fa_amp
        }
        
        for j in range(2, chain_length - 1):
            b_count = db_location.count('B')
            f_count = db_location.count('F')
            row[f'n-{j}'] = fa_mass + AMP - 12 - 14 * (j - 2) + 2 * f_count

        df = df.append(row, ignore_index=True)

    return df

# Example usage
chain_length = 20
db_number = 4
fa_mass = 304.2
AMP = 167.1

result_df = calculate_n_values(chain_length, db_number, fa_mass, AMP)
result_df

# Print the generated DB locations for different DB numbers as a demonstration
for db_number in range(2, 7):
    print(f"DB Number {db_number}: {generate_db_locations(db_number)}")

result_df


DB Number 2: ['<B>', '<F>']
DB Number 3: ['<BB>', '<BF>', '<FF>']
DB Number 4: ['<BBB>', '<BBF>', '<BFF>', '<FFF>']
DB Number 5: ['<BBBB>', '<BBBF>', '<BBFF>', '<BFFF>', '<FFFF>']
DB Number 6: ['<BBBBB>', '<BBBBF>', '<BBBFF>', '<BBFFF>', '<BFFFF>', '<FFFFF>']


Unnamed: 0,Lipid,Chain length,DB Number,DB Location,DBs in front,FA mass,[FA+AMP]+,n-2,n-3,n-4,...,n-9,n-10,n-11,n-12,n-13,n-14,n-15,n-16,n-17,n-18
0,FA(20:4),20,4,<BBB>,0,304.2,471.3,459.3,445.3,431.3,...,361.3,347.3,333.3,319.3,305.3,291.3,277.3,263.3,249.3,235.3
1,FA(20:4),20,4,<BBF>,1,304.2,471.3,461.3,447.3,433.3,...,363.3,349.3,335.3,321.3,307.3,293.3,279.3,265.3,251.3,237.3
2,FA(20:4),20,4,<BFF>,2,304.2,471.3,463.3,449.3,435.3,...,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3,239.3
3,FA(20:4),20,4,<FFF>,3,304.2,471.3,465.3,451.3,437.3,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3


20 5

In [38]:
import pandas as pd

def generate_db_locations(db_number):
    # Generate patterns where 'F' is always on the right of 'B'
    length = db_number - 1
    def generate_patterns(n, pattern=""):
        if n == 0:
            return [pattern]
        patterns = []
        if not pattern or pattern[-1] == 'B':
            patterns += generate_patterns(n-1, pattern + 'B')
        patterns += generate_patterns(n-1, pattern + 'F')
        return patterns
    
    return ['<' + p + '>' for p in generate_patterns(length)]

def calculate_n_values(chain_length, db_number, fa_mass, AMP):
    # Generate DB locations based on the DB number
    db_locations = generate_db_locations(db_number)
    dbs_in_front = list(range(len(db_locations)))

    # Create a DataFrame to store the results
    df = pd.DataFrame(columns=['Lipid', 'Chain length', 'DB Number', 'DB Location', 'DBs in front', 'FA mass', '[FA+AMP]+'] + [f'n-{i}' for i in range(2, chain_length - 1)])

    # Calculate the [FA+AMP]+ value (assuming it is a fixed increment from FA mass)
    fa_amp = fa_mass + AMP  # Example increment to get [FA+AMP]+

    for i, db_location in enumerate(db_locations):
        row = {
            'Lipid': f'FA({chain_length}:{db_number})',
            'Chain length': chain_length,
            'DB Number': db_number,
            'DB Location': db_location,
            'DBs in front': dbs_in_front[i],
            'FA mass': fa_mass,
            '[FA+AMP]+': fa_amp
        }
        
        for j in range(2, chain_length - 1):
            b_count = db_location.count('B')
            f_count = db_location.count('F')
            row[f'n-{j}'] = fa_mass + AMP - 12 - 14 * (j - 2) + 2 * f_count

        df = df.append(row, ignore_index=True)

    return df

# Example usage
chain_length = 20
db_number = 4
fa_mass = 304.2
AMP = 167.1

result_df = calculate_n_values(chain_length, db_number, fa_mass, AMP)
result_df

# Print the generated DB locations for different DB numbers as a demonstration
for db_number in range(2, db_number +1):
    print(f"DB Number {db_number}: {generate_db_locations(db_number)}")

result_df


DB Number 2: ['<B>', '<F>']
DB Number 3: ['<BB>', '<BF>', '<FF>']
DB Number 4: ['<BBB>', '<BBF>', '<BFF>', '<FFF>']


Unnamed: 0,Lipid,Chain length,DB Number,DB Location,DBs in front,FA mass,[FA+AMP]+,n-2,n-3,n-4,...,n-9,n-10,n-11,n-12,n-13,n-14,n-15,n-16,n-17,n-18
0,FA(20:4),20,4,<BBB>,0,304.2,471.3,459.3,445.3,431.3,...,361.3,347.3,333.3,319.3,305.3,291.3,277.3,263.3,249.3,235.3
1,FA(20:4),20,4,<BBF>,1,304.2,471.3,461.3,447.3,433.3,...,363.3,349.3,335.3,321.3,307.3,293.3,279.3,265.3,251.3,237.3
2,FA(20:4),20,4,<BFF>,2,304.2,471.3,463.3,449.3,435.3,...,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3,239.3
3,FA(20:4),20,4,<FFF>,3,304.2,471.3,465.3,451.3,437.3,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3


do for all db lengths for 1 chain kind aowrking

In [59]:
import pandas as pd

def generate_db_locations(db_number):
    # Generate patterns where 'F' is always on the right of 'B'
    length = db_number - 1
    def generate_patterns(n, pattern=""):
        if n == 0:
            return [pattern]
        patterns = []
        if not pattern or pattern[-1] == 'B':
            patterns += generate_patterns(n-1, pattern + 'B')
        patterns += generate_patterns(n-1, pattern + 'F')
        return patterns
    
    return ['<' + p + '>' for p in generate_patterns(length)]

def calculate_n_values(chain_length, db_number, fa_mass, AMP):
    # Generate DB locations based on the DB number
    db_locations = generate_db_locations(db_number)
    dbs_in_front = list(range(len(db_locations)))

    # Create a DataFrame to store the results
    df = pd.DataFrame(columns=['Lipid', 'Chain length', 'DB Number', 'DB Location', 'DBs in front', 'FA mass', '[FA+AMP]+'] + [f'n-{i}' for i in range(2, chain_length - 2)])

    # Calculate the [FA+AMP]+ value (assuming it is a fixed increment from FA mass)
    fa_amp = fa_mass + AMP  # Example increment to get [FA+AMP]+

    for i, db_location in enumerate(db_locations):
        row = {
            'Lipid': f'FA({chain_length}:{db_number})',
            'Chain length': chain_length,
            'DB Number': db_number,
            'DB Location': db_location,
            'DBs in front': dbs_in_front[i],
            'FA mass': fa_mass,
            '[FA+AMP]+': fa_amp
        }
        
        b_count = db_location.count('B')
        f_count = db_location.count('F')
        max_n = chain_length - db_number + b_count

        for j in range(2, max_n + 2):
            row[f'n-{j}'] = fa_mass + AMP - 12 - 14 * (j - 2) + 2 * f_count

        df = df.append(row, ignore_index=True)

    return df

def generate_all_values(chain_length, starting_db_number, starting_fa_mass, AMP):
    all_dfs = []
    fa_mass = starting_fa_mass
    
    for db_number in range(starting_db_number, 0, -1):
        df = calculate_n_values(chain_length, db_number, fa_mass, AMP)
        all_dfs.append(df)
        fa_mass += 2
    
    # Concatenate all DataFrames into a single DataFrame
    result_df = pd.concat(all_dfs, ignore_index=True)
    
    return result_df

# Example usage
chain_length = 20
starting_db_number = 4
starting_fa_mass = 304.2
AMP = 167.1

result_df = generate_all_values(chain_length, starting_db_number, starting_fa_mass, AMP)
print(result_df)

# Print the generated DB locations for different DB numbers as a demonstration
for db_number in range(1, starting_db_number + 1):
    print(f"DB Number {db_number}: {generate_db_locations(db_number)}")

#result_df to excel 
result_df.to_excel('result_df.xlsx', index=False)
result_df

      Lipid Chain length DB Number DB Location DBs in front  FA mass  \
0  FA(20:4)           20         4       <BBB>            0    304.2   
1  FA(20:4)           20         4       <BBF>            1    304.2   
2  FA(20:4)           20         4       <BFF>            2    304.2   
3  FA(20:4)           20         4       <FFF>            3    304.2   
4  FA(20:3)           20         3        <BB>            0    306.2   
5  FA(20:3)           20         3        <BF>            1    306.2   
6  FA(20:3)           20         3        <FF>            2    306.2   
7  FA(20:2)           20         2         <B>            0    308.2   
8  FA(20:2)           20         2         <F>            1    308.2   
9  FA(20:1)           20         1          <>            0    310.2   

   [FA+AMP]+    n-2    n-3    n-4  ...   n-11   n-12   n-13   n-14   n-15  \
0      471.3  459.3  445.3  431.3  ...  333.3  319.3  305.3  291.3  277.3   
1      471.3  461.3  447.3  433.3  ...  335.3  321.3 

Unnamed: 0,Lipid,Chain length,DB Number,DB Location,DBs in front,FA mass,[FA+AMP]+,n-2,n-3,n-4,...,n-11,n-12,n-13,n-14,n-15,n-16,n-17,n-18,n-19,n-20
0,FA(20:4),20,4,<BBB>,0,304.2,471.3,459.3,445.3,431.3,...,333.3,319.3,305.3,291.3,277.3,263.3,249.3,235.3,221.3,207.3
1,FA(20:4),20,4,<BBF>,1,304.2,471.3,461.3,447.3,433.3,...,335.3,321.3,307.3,293.3,279.3,265.3,251.3,237.3,223.3,
2,FA(20:4),20,4,<BFF>,2,304.2,471.3,463.3,449.3,435.3,...,337.3,323.3,309.3,295.3,281.3,267.3,253.3,239.3,,
3,FA(20:4),20,4,<FFF>,3,304.2,471.3,465.3,451.3,437.3,...,339.3,325.3,311.3,297.3,283.3,269.3,255.3,,,
4,FA(20:3),20,3,<BB>,0,306.2,473.3,461.3,447.3,433.3,...,335.3,321.3,307.3,293.3,279.3,265.3,251.3,237.3,223.3,209.3
5,FA(20:3),20,3,<BF>,1,306.2,473.3,463.3,449.3,435.3,...,337.3,323.3,309.3,295.3,281.3,267.3,253.3,239.3,225.3,
6,FA(20:3),20,3,<FF>,2,306.2,473.3,465.3,451.3,437.3,...,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3,,
7,FA(20:2),20,2,<B>,0,308.2,475.3,463.3,449.3,435.3,...,337.3,323.3,309.3,295.3,281.3,267.3,253.3,239.3,225.3,211.3
8,FA(20:2),20,2,<F>,1,308.2,475.3,465.3,451.3,437.3,...,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3,227.3,
9,FA(20:1),20,1,<>,0,310.2,477.3,465.3,451.3,437.3,...,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3,227.3,213.3


# Adjust kinda wokring script

In [85]:
import pandas as pd

# Function to generate all possible double bond locations for a given DB number
def generate_db_locations(db_number):
    length = db_number - 1  # Adjust the length to be one less than the number of double bonds

    # Recursive helper function to generate patterns
    def generate_patterns(n, pattern=""):
        if n == 0:
            return [pattern]
        patterns = []
        if not pattern or pattern[-1] == 'B':  # Ensure 'B' is always on the left of 'F'
            patterns += generate_patterns(n-1, pattern + 'B')
        patterns += generate_patterns(n-1, pattern + 'F')
        return patterns

    # Wrap each generated pattern with angle brackets
    return ['<' + p + '>' for p in generate_patterns(length)]

# Function to calculate n-values and other properties for a given chain length and DB number
def calculate_n_values(chain_length, db_number, fa_mass, AMP):
    db_locations = generate_db_locations(db_number)  # Generate possible double bond location patterns
    dbs_in_front = list(range(len(db_locations)))  # Create a list of indices for double bonds in front

    # Create an empty DataFrame with specified columns
    df = pd.DataFrame(columns=['Lipid', 'Chain length', 'DB Number', 'DB Location', 'DBs in front', 'FA mass', '[FA+AMP]+'] + [f'n-{i}' for i in range(2, chain_length - 2)])

    fa_amp = fa_mass + AMP  # Calculate the [FA+AMP]+ value

    for i, db_location in enumerate(db_locations):
        # Initialize a row dictionary with lipid properties
        row = {
            'Lipid': f'FA({chain_length}:{db_number})',
            'Chain length': chain_length,
            'DB Number': db_number,
            'DB Location': db_location,
            'DBs in front': dbs_in_front[i],
            'FA mass': fa_mass,
            '[FA+AMP]+': fa_amp
        }

        # Count the number of 'B's and 'F's in the pattern
        b_count = db_location.count('B')
        f_count = db_location.count('F')
        max_n = chain_length - db_number + f_count -2  # Determine the maximum n-value

        # Calculate n-values and add them to the row
        for j in range(2, max_n + 2):
            row[f'n-{j}'] = fa_mass + AMP - 12 - 14 * (j - 2) + 2 * f_count

        # Append the row to the DataFrame
        df = df.append(row, ignore_index=True)

    return df

# Function to generate values for all DB numbers starting from a given number down to 1
def generate_all_values(chain_length, starting_db_number, starting_fa_mass, AMP):
    all_dfs = []
    fa_mass = starting_fa_mass

    # Loop through each DB number from starting_db_number to 1
    for db_number in range(starting_db_number, 0, -1):
        df = calculate_n_values(chain_length, db_number, fa_mass, AMP)  # Calculate n-values for the current DB number
        all_dfs.append(df)  # Append the resulting DataFrame to the list
        fa_mass += 2  # Increment FA mass for the next iteration

    # Concatenate all DataFrames into a single DataFrame
    result_df = pd.concat(all_dfs, ignore_index=True)

    return result_df

# Example usage
chain_length = 20
starting_db_number = 4
starting_fa_mass = 304.2
AMP = 167.1

# Generate the DataFrame and print it
result_df = generate_all_values(chain_length, starting_db_number, starting_fa_mass, AMP)
print(result_df)

# Print the generated DB locations for different DB numbers as a demonstration
for db_number in range(1, starting_db_number + 1):
    print(f"DB Number {db_number}: {generate_db_locations(db_number)}")

result_df


      Lipid Chain length DB Number DB Location DBs in front  FA mass  \
0  FA(20:4)           20         4       <BBB>            0    304.2   
1  FA(20:4)           20         4       <BBF>            1    304.2   
2  FA(20:4)           20         4       <BFF>            2    304.2   
3  FA(20:4)           20         4       <FFF>            3    304.2   
4  FA(20:3)           20         3        <BB>            0    306.2   
5  FA(20:3)           20         3        <BF>            1    306.2   
6  FA(20:3)           20         3        <FF>            2    306.2   
7  FA(20:2)           20         2         <B>            0    308.2   
8  FA(20:2)           20         2         <F>            1    308.2   
9  FA(20:1)           20         1          <>            0    310.2   

   [FA+AMP]+    n-2    n-3    n-4  ...    n-9   n-10   n-11   n-12   n-13  \
0      471.3  459.3  445.3  431.3  ...  361.3  347.3  333.3  319.3  305.3   
1      471.3  461.3  447.3  433.3  ...  363.3  349.3 

Unnamed: 0,Lipid,Chain length,DB Number,DB Location,DBs in front,FA mass,[FA+AMP]+,n-2,n-3,n-4,...,n-9,n-10,n-11,n-12,n-13,n-14,n-15,n-16,n-17,n-18
0,FA(20:4),20,4,<BBB>,0,304.2,471.3,459.3,445.3,431.3,...,361.3,347.3,333.3,319.3,305.3,291.3,277.3,,,
1,FA(20:4),20,4,<BBF>,1,304.2,471.3,461.3,447.3,433.3,...,363.3,349.3,335.3,321.3,307.3,293.3,279.3,265.3,,
2,FA(20:4),20,4,<BFF>,2,304.2,471.3,463.3,449.3,435.3,...,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3,
3,FA(20:4),20,4,<FFF>,3,304.2,471.3,465.3,451.3,437.3,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3
4,FA(20:3),20,3,<BB>,0,306.2,473.3,461.3,447.3,433.3,...,363.3,349.3,335.3,321.3,307.3,293.3,279.3,265.3,,
5,FA(20:3),20,3,<BF>,1,306.2,473.3,463.3,449.3,435.3,...,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3,
6,FA(20:3),20,3,<FF>,2,306.2,473.3,465.3,451.3,437.3,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3
7,FA(20:2),20,2,<B>,0,308.2,475.3,463.3,449.3,435.3,...,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3,
8,FA(20:2),20,2,<F>,1,308.2,475.3,465.3,451.3,437.3,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3
9,FA(20:1),20,1,<>,0,310.2,477.3,465.3,451.3,437.3,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3


## adjust for front now  based of <F> number so more F removethose starting at n-2

In [3]:
import pandas as pd
import numpy as np

# Function to generate all possible double bond locations for a given DB number
def generate_db_locations(db_number):
    length = db_number - 1  # Adjust the length to be one less than the number of double bonds

    # Recursive helper function to generate patterns
    def generate_patterns(n, pattern=""):
        if n == 0:
            return [pattern]
        patterns = []
        if not pattern or pattern[-1] == 'B':  # Ensure 'B' is always on the left of 'F'
            patterns += generate_patterns(n-1, pattern + 'B')
        patterns += generate_patterns(n-1, pattern + 'F')
        return patterns

    # Wrap each generated pattern with angle brackets
    return ['<' + p + '>' for p in generate_patterns(length)]

# Function to calculate n-values and other properties for a given chain length and DB number
def calculate_n_values(chain_length, db_number, fa_mass, AMP):
    db_locations = generate_db_locations(db_number)  # Generate possible double bond location patterns
    dbs_in_front = list(range(len(db_locations)))  # Create a list of indices for double bonds in front

    # Create an empty DataFrame with specified columns
    df = pd.DataFrame(columns=['Lipid', 'Chain length', 'DB Number', 'DB Location', 'DBs in front', 'FA mass', '[FA+AMP]+'] + [f'n-{i}' for i in range(2, chain_length - 2)])

    fa_amp = fa_mass + AMP  # Calculate the [FA+AMP]+ value

    for i, db_location in enumerate(db_locations):
        # Initialize a row dictionary with lipid properties
        row = {
            'Lipid': f'FA({chain_length}:{db_number})',
            'Chain length': chain_length,
            'DB Number': db_number,
            'DB Location': db_location,
            'DBs in front': dbs_in_front[i],
            'FA mass': fa_mass,
            '[FA+AMP]+': fa_amp
        }

        # Count the number of 'B's and 'F's in the pattern
        b_count = db_location.count('B')
        f_count = db_location.count('F')
        max_n = chain_length - db_number + f_count - 2 # Determine the maximum n-value
        min_n = f_count  # Determine the minimum n-value

        # Calculate n-values and add them to the row
        for j in range(2, chain_length - 2):
            if j < min_n + 2:
                row[f'n-{j}'] = pd.NA
            else:
                row[f'n-{j}'] = fa_mass + AMP - 12 - 14 * (j - 2) + 2 * f_count

        # Append the row to the DataFrame
        df = df.append(row, ignore_index=True)

    return df

# Function to generate values for all DB numbers starting from a given number down to 1
def generate_all_values(chain_length, starting_db_number, starting_fa_mass, AMP):
    all_dfs = []
    fa_mass = starting_fa_mass

    # Loop through each DB number from starting_db_number to 1
    for db_number in range(starting_db_number, 0, -1):
        df = calculate_n_values(chain_length, db_number, fa_mass, AMP)  # Calculate n-values for the current DB number
        all_dfs.append(df)  # Append the resulting DataFrame to the list
        fa_mass += 2  # Increment FA mass for the next iteration

    # Concatenate all DataFrames into a single DataFrame
    result_df = pd.concat(all_dfs, ignore_index=True)

    return result_df

# Function to calculate min n-values and other properties for a given chain length and DB number
def calculate_min_n_values(df, chain_length, db_number):
    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        # Check if the current row's DB Number and Chain length match the provided values
        if row['DB Number'] == db_number and row['Chain length'] == chain_length:
            # Get the double bond location pattern for the current row
            db_location = row['DB Location']
            
            # Count the number of 'B' characters in the pattern, indicating the double bonds at the back
            b_count = db_location.count('B')
            
            # Determine the end index for setting NA values based on b_count
            start_na_index = chain_length - b_count - 2
            
            # Iterate over the possible n-values from the end of the chain to n-2
            for j in range(chain_length - 1, 1, -1):
                # If the current n-value index is within the range to be set to NA, set it to NA
                if j > start_na_index:
                    df.at[index, f'n-{j}'] = pd.NA  # pd.NA is used to represent missing values

    # Return the updated DataFrame
    return df

# Example usage
chain_length = 20
starting_db_number = 4
starting_fa_mass = 304.2
AMP = 167.1

# Generate the DataFrame and print it
result_df = generate_all_values(chain_length, starting_db_number, starting_fa_mass, AMP)
print(result_df)

# Print the generated DB locations for different DB numbers as a demonstration
for db_number in range(1, starting_db_number + 1):
    print(f"DB Number {db_number}: {generate_db_locations(db_number)}")

result_df


# # Update the DataFrame with minimum n-values
# for db_number in range(1, starting_db_number + 1):
#     result_df = calculate_min_n_values(result_df, chain_length, db_number)

# result_df

      Lipid Chain length DB Number DB Location DBs in front  FA mass  \
0  FA(20:4)           20         4       <BBB>            0    304.2   
1  FA(20:4)           20         4       <BBF>            1    304.2   
2  FA(20:4)           20         4       <BFF>            2    304.2   
3  FA(20:4)           20         4       <FFF>            3    304.2   
4  FA(20:3)           20         3        <BB>            0    306.2   
5  FA(20:3)           20         3        <BF>            1    306.2   
6  FA(20:3)           20         3        <FF>            2    306.2   
7  FA(20:2)           20         2         <B>            0    308.2   
8  FA(20:2)           20         2         <F>            1    308.2   
9  FA(20:1)           20         1          <>            0    310.2   

   [FA+AMP]+    n-2    n-3    n-4  ...    n-8    n-9   n-10   n-11   n-12  \
0      471.3  459.3  445.3  431.3  ...  375.3  361.3  347.3  333.3  319.3   
1      471.3   <NA>  447.3  433.3  ...  377.3  363.3 

Unnamed: 0,Lipid,Chain length,DB Number,DB Location,DBs in front,FA mass,[FA+AMP]+,n-2,n-3,n-4,...,n-8,n-9,n-10,n-11,n-12,n-13,n-14,n-15,n-16,n-17
0,FA(20:4),20,4,<BBB>,0,304.2,471.3,459.3,445.3,431.3,...,375.3,361.3,347.3,333.3,319.3,305.3,291.3,277.3,263.3,249.3
1,FA(20:4),20,4,<BBF>,1,304.2,471.3,,447.3,433.3,...,377.3,363.3,349.3,335.3,321.3,307.3,293.3,279.3,265.3,251.3
2,FA(20:4),20,4,<BFF>,2,304.2,471.3,,,435.3,...,379.3,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3
3,FA(20:4),20,4,<FFF>,3,304.2,471.3,,,,...,381.3,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3
4,FA(20:3),20,3,<BB>,0,306.2,473.3,461.3,447.3,433.3,...,377.3,363.3,349.3,335.3,321.3,307.3,293.3,279.3,265.3,251.3
5,FA(20:3),20,3,<BF>,1,306.2,473.3,,449.3,435.3,...,379.3,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3
6,FA(20:3),20,3,<FF>,2,306.2,473.3,,,437.3,...,381.3,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3
7,FA(20:2),20,2,<B>,0,308.2,475.3,463.3,449.3,435.3,...,379.3,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3
8,FA(20:2),20,2,<F>,1,308.2,475.3,,451.3,437.3,...,381.3,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3
9,FA(20:1),20,1,<>,0,310.2,477.3,465.3,451.3,437.3,...,381.3,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3


# FRONT AND BACK n-# removal working

In [14]:
import pandas as pd

# Function to generate all possible double bond locations for a given DB number
def generate_db_locations(db_number):
    length = db_number - 1  # Adjust the length to be one less than the number of double bonds

    # Recursive helper function to generate patterns
    def generate_patterns(n, pattern=""):
        if n == 0:
            return [pattern]
        patterns = []
        if not pattern or pattern[-1] == 'B':  # Ensure 'B' is always on the left of 'F'
            patterns += generate_patterns(n-1, pattern + 'B')
        patterns += generate_patterns(n-1, pattern + 'F')
        return patterns

    # Wrap each generated pattern with angle brackets
    return ['<' + p + '>' for p in generate_patterns(length)]

# Function to calculate n-values and other properties for a given chain length and DB number
def calculate_n_values(chain_length, db_number, fa_mass, AMP):
    db_locations = generate_db_locations(db_number)  # Generate possible double bond location patterns
    dbs_in_front = list(range(len(db_locations)))  # Create a list of indices for double bonds in front

    # Create an empty DataFrame with specified columns
    df = pd.DataFrame(columns=['Lipid', 'Chain length', 'DB Number', 'DB Location', 'DBs in front', 'FA mass', '[FA+AMP]+'] + [f'n-{i}' for i in range(2, chain_length - 2)])

    fa_amp = fa_mass + AMP  # Calculate the [FA+AMP]+ value

    for i, db_location in enumerate(db_locations):
        # Initialize a row dictionary with lipid properties
        row = {
            'Lipid': f'FA({chain_length}:{db_number})',
            'Chain length': chain_length,
            'DB Number': db_number,
            'DB Location': db_location,
            'DBs in front': dbs_in_front[i],
            'FA mass': fa_mass,
            '[FA+AMP]+': fa_amp
        }

        # Count the number of 'B's and 'F's in the pattern
        b_count = db_location.count('B')
        f_count = db_location.count('F')
        max_n = chain_length - db_number + f_count -2  # Determine the maximum n-value

        # Calculate n-values and add them to the row
        for j in range(2, max_n + 2):
            row[f'n-{j}'] = fa_mass + AMP - 12 - 14 * (j - 2) + 2 * f_count

        # Append the row to the DataFrame
        df = df.append(row, ignore_index=True)

    return df

# Function to generate values for all DB numbers starting from a given number down to 1
def generate_all_values(chain_length, starting_db_number, starting_fa_mass, AMP):
    all_dfs = []
    fa_mass = starting_fa_mass

    # Loop through each DB number from starting_db_number to 1
    for db_number in range(starting_db_number, 0, -1):
        df = calculate_n_values(chain_length, db_number, fa_mass, AMP)  # Calculate n-values for the current DB number
        all_dfs.append(df)  # Append the resulting DataFrame to the list
        fa_mass += 2  # Increment FA mass for the next iteration

    # Concatenate all DataFrames into a single DataFrame
    result_df = pd.concat(all_dfs, ignore_index=True)

    return result_df

# Example usage
chain_length = 20
starting_db_number = 4
starting_fa_mass = 304.2
AMP = 167.1

# Generate the DataFrame and print it
result_df = generate_all_values(chain_length, starting_db_number, starting_fa_mass, AMP)
print(result_df)

# Print the generated DB locations for different DB numbers as a demonstration
for db_number in range(1, starting_db_number + 1):
    print(f"DB Number {db_number}: {generate_db_locations(db_number)}")

result_df


# Function to calculate min n-values and other properties for a given chain length and DB number
def calculate_min_n_values(df, chain_length, db_number):
    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        # Check if the current row's DB Number and Chain length match the provided values
        if row['DB Number'] == db_number and row['Chain length'] == chain_length:
            # Get the double bond location pattern for the current row
            db_location = row['DB Location']
            
            # Count the number of 'B' characters in the pattern, indicating the double bonds at the back
            b_count = db_location.count('F')
            
            
            # Iterate over the possible n-values from the end of the chain to n-2
            for j in range(2, b_count+2):
                df.at[index, f'n-{j}'] = pd.NA  # pd.NA is used to represent missing values

    # Return the updated DataFrame
    return df


# Update the DataFrame with minimum n-values
for db_number in range(1, starting_db_number + 1):
    result_df = calculate_min_n_values(result_df, chain_length, db_number)

result_df


      Lipid Chain length DB Number DB Location DBs in front  FA mass  \
0  FA(20:4)           20         4       <BBB>            0    304.2   
1  FA(20:4)           20         4       <BBF>            1    304.2   
2  FA(20:4)           20         4       <BFF>            2    304.2   
3  FA(20:4)           20         4       <FFF>            3    304.2   
4  FA(20:3)           20         3        <BB>            0    306.2   
5  FA(20:3)           20         3        <BF>            1    306.2   
6  FA(20:3)           20         3        <FF>            2    306.2   
7  FA(20:2)           20         2         <B>            0    308.2   
8  FA(20:2)           20         2         <F>            1    308.2   
9  FA(20:1)           20         1          <>            0    310.2   

   [FA+AMP]+    n-2    n-3    n-4  ...    n-9   n-10   n-11   n-12   n-13  \
0      471.3  459.3  445.3  431.3  ...  361.3  347.3  333.3  319.3  305.3   
1      471.3  461.3  447.3  433.3  ...  363.3  349.3 

Unnamed: 0,Lipid,Chain length,DB Number,DB Location,DBs in front,FA mass,[FA+AMP]+,n-2,n-3,n-4,...,n-9,n-10,n-11,n-12,n-13,n-14,n-15,n-16,n-17,n-18
0,FA(20:4),20,4,<BBB>,0,304.2,471.3,459.3,445.3,431.3,...,361.3,347.3,333.3,319.3,305.3,291.3,277.3,,,
1,FA(20:4),20,4,<BBF>,1,304.2,471.3,,447.3,433.3,...,363.3,349.3,335.3,321.3,307.3,293.3,279.3,265.3,,
2,FA(20:4),20,4,<BFF>,2,304.2,471.3,,,435.3,...,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3,
3,FA(20:4),20,4,<FFF>,3,304.2,471.3,,,,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3
4,FA(20:3),20,3,<BB>,0,306.2,473.3,461.3,447.3,433.3,...,363.3,349.3,335.3,321.3,307.3,293.3,279.3,265.3,,
5,FA(20:3),20,3,<BF>,1,306.2,473.3,,449.3,435.3,...,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3,
6,FA(20:3),20,3,<FF>,2,306.2,473.3,,,437.3,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3
7,FA(20:2),20,2,<B>,0,308.2,475.3,463.3,449.3,435.3,...,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3,
8,FA(20:2),20,2,<F>,1,308.2,475.3,,451.3,437.3,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3
9,FA(20:1),20,1,<>,0,310.2,477.3,465.3,451.3,437.3,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3


# clean up front and back removal ANNOTATE GOOD

In [22]:
import pandas as pd

def generate_double_bond_locations(db_number):
    """
    Generate all possible double bond locations for a given number of double bonds (db_number).
    Each double bond location pattern is wrapped with angle brackets.

    Parameters:
    db_number (int): Number of double bonds.

    Returns:
    list: List of double bond location patterns.
    """
    length = db_number - 1  # Length is one less than the number of double bonds

    def generate_patterns(n, pattern=""):
        if n == 0:
            return [pattern]
        patterns = []
        if not pattern or pattern[-1] == 'B':  # Ensure 'B' is always on the left of 'F'
            patterns += generate_patterns(n - 1, pattern + 'B')
        patterns += generate_patterns(n - 1, pattern + 'F')
        return patterns

    return ['<' + p + '>' for p in generate_patterns(length)]

def calculate_lipid_properties(chain_length, db_number, fa_mass, amp_mass):
    """
    Calculate lipid properties including n-values and other properties for a given chain length and double bond number.

    Parameters:
    chain_length (int): Length of the fatty acid chain.
    db_number (int): Number of double bonds.
    fa_mass (float): Mass of the fatty acid.
    amp_mass (float): Mass of AMP.

    Returns:
    pd.DataFrame: DataFrame containing lipid properties.
    """
    db_locations = generate_double_bond_locations(db_number)  # Generate double bond location patterns
    dbs_in_front = list(range(len(db_locations)))  # List of indices for double bonds in front

    # Initialize DataFrame with specified columns
    df = pd.DataFrame(columns=[
        'Lipid', 'Chain length', 'DB Number', 'DB Location', 'DBs in front', 'FA mass', '[FA+AMP]+'
    ] + [f'n-{i}' for i in range(2, chain_length - 2)])

    fa_amp_mass = fa_mass + amp_mass  # Calculate the [FA+AMP]+ value

    for i, db_location in enumerate(db_locations):
        # Initialize a row dictionary with lipid properties
        row = {
            'Lipid': f'FA({chain_length}:{db_number})',
            'Chain length': chain_length,
            'DB Number': db_number,
            'DB Location': db_location,
            'DBs in front': dbs_in_front[i],
            'FA mass': fa_mass,
            '[FA+AMP]+': fa_amp_mass
        }

        # Count the number of 'B's and 'F's in the pattern
        b_count = db_location.count('B')
        f_count = db_location.count('F')
        max_n_value = chain_length - db_number + f_count - 2  # Determine the maximum n-value

        # Calculate n-values and add them to the row
        for j in range(2, max_n_value + 2):
            row[f'n-{j}'] = fa_mass + amp_mass - 12 - 14 * (j - 2) + 2 * f_count

        df = df.append(row, ignore_index=True)

    return df

def generate_all_lipid_values(chain_length, starting_db_number, starting_fa_mass, amp_mass):
    """
    Generate values for all double bond numbers starting from a given number down to 1.

    Parameters:
    chain_length (int): Length of the fatty acid chain.
    starting_db_number (int): Starting number of double bonds.
    starting_fa_mass (float): Starting mass of the fatty acid.
    amp_mass (float): Mass of AMP.

    Returns:
    pd.DataFrame: DataFrame containing lipid properties for all double bond numbers.
    """
    all_dataframes = []
    fa_mass = starting_fa_mass

    for db_number in range(starting_db_number, 0, -1):
        df = calculate_lipid_properties(chain_length, db_number, fa_mass, amp_mass)
        all_dataframes.append(df)
        fa_mass += 2  # Increment FA mass for the next iteration

    result_df = pd.concat(all_dataframes, ignore_index=True)
    return result_df

def calculate_min_n_values(df, chain_length, db_number):
    """
    Calculate minimum n-values and update the DataFrame.

    Parameters:
    df (pd.DataFrame): DataFrame containing lipid properties.
    chain_length (int): Length of the fatty acid chain.
    db_number (int): Number of double bonds.

    Returns:
    pd.DataFrame: Updated DataFrame with minimum n-values.
    """
    for index, row in df.iterrows():
        if row['DB Number'] == db_number and row['Chain length'] == chain_length:
            db_location = row['DB Location']
            b_count = db_location.count('F')

            for j in range(2, b_count + 2):
                df.at[index, f'n-{j}'] = pd.NA  # Set minimum n-values to NA

    return df

def convert_nan_to_na(df):
    """
    Convert all NaN values in the DataFrame to pd.NA.

    Parameters:
    df (pd.DataFrame): Input DataFrame.

    Returns:
    pd.DataFrame: DataFrame with NaN values converted to pd.NA.
    """
    return df.applymap(lambda x: pd.NA if pd.isna(x) else x)


def main():
    # Example usage
    chain_length = 20
    starting_db_number = 4
    starting_fa_mass = 304.2
    amp_mass = 167.1

    # Generate the DataFrame containing lipid properties
    result_df = generate_all_lipid_values(chain_length, starting_db_number, starting_fa_mass, amp_mass)
    print(result_df)

    # Print the generated double bond locations for demonstration
    for db_number in range(1, starting_db_number + 1):
        print(f"DB Number {db_number}: {generate_double_bond_locations(db_number)}")

    # Update the DataFrame with minimum n-values
    for db_number in range(1, starting_db_number + 1):
        result_df = calculate_min_n_values(result_df, chain_length, db_number)

    print(result_df)

if __name__ == "__main__":
    main()


      Lipid Chain length DB Number DB Location DBs in front  FA mass  \
0  FA(20:4)           20         4       <BBB>            0    304.2   
1  FA(20:4)           20         4       <BBF>            1    304.2   
2  FA(20:4)           20         4       <BFF>            2    304.2   
3  FA(20:4)           20         4       <FFF>            3    304.2   
4  FA(20:3)           20         3        <BB>            0    306.2   
5  FA(20:3)           20         3        <BF>            1    306.2   
6  FA(20:3)           20         3        <FF>            2    306.2   
7  FA(20:2)           20         2         <B>            0    308.2   
8  FA(20:2)           20         2         <F>            1    308.2   
9  FA(20:1)           20         1          <>            0    310.2   

   [FA+AMP]+    n-2    n-3    n-4  ...    n-9   n-10   n-11   n-12   n-13  \
0      471.3  459.3  445.3  431.3  ...  361.3  347.3  333.3  319.3  305.3   
1      471.3  461.3  447.3  433.3  ...  363.3  349.3 

In [23]:
# Define parameters
chain_length = 20
starting_db_number = 4
starting_fa_mass = 304.2
amp_mass = 167.1

# Generate the DataFrame containing lipid properties
result_df = generate_all_lipid_values(chain_length, starting_db_number, starting_fa_mass, amp_mass)
print(result_df)

# Print the generated double bond locations for demonstration
for db_number in range(1, starting_db_number + 1):
    print(f"DB Number {db_number}: {generate_double_bond_locations(db_number)}")

# Update the DataFrame with minimum n-values
for db_number in range(1, starting_db_number + 1):
    result_df = calculate_min_n_values(result_df, chain_length, db_number)


# Convert all NaN values to pd.NA
result_df = convert_nan_to_na(result_df)

result_df

      Lipid Chain length DB Number DB Location DBs in front  FA mass  \
0  FA(20:4)           20         4       <BBB>            0    304.2   
1  FA(20:4)           20         4       <BBF>            1    304.2   
2  FA(20:4)           20         4       <BFF>            2    304.2   
3  FA(20:4)           20         4       <FFF>            3    304.2   
4  FA(20:3)           20         3        <BB>            0    306.2   
5  FA(20:3)           20         3        <BF>            1    306.2   
6  FA(20:3)           20         3        <FF>            2    306.2   
7  FA(20:2)           20         2         <B>            0    308.2   
8  FA(20:2)           20         2         <F>            1    308.2   
9  FA(20:1)           20         1          <>            0    310.2   

   [FA+AMP]+    n-2    n-3    n-4  ...    n-9   n-10   n-11   n-12   n-13  \
0      471.3  459.3  445.3  431.3  ...  361.3  347.3  333.3  319.3  305.3   
1      471.3  461.3  447.3  433.3  ...  363.3  349.3 

Unnamed: 0,Lipid,Chain length,DB Number,DB Location,DBs in front,FA mass,[FA+AMP]+,n-2,n-3,n-4,...,n-9,n-10,n-11,n-12,n-13,n-14,n-15,n-16,n-17,n-18
0,FA(20:4),20,4,<BBB>,0,304.2,471.3,459.3,445.3,431.3,...,361.3,347.3,333.3,319.3,305.3,291.3,277.3,,,
1,FA(20:4),20,4,<BBF>,1,304.2,471.3,,447.3,433.3,...,363.3,349.3,335.3,321.3,307.3,293.3,279.3,265.3,,
2,FA(20:4),20,4,<BFF>,2,304.2,471.3,,,435.3,...,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3,
3,FA(20:4),20,4,<FFF>,3,304.2,471.3,,,,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3
4,FA(20:3),20,3,<BB>,0,306.2,473.3,461.3,447.3,433.3,...,363.3,349.3,335.3,321.3,307.3,293.3,279.3,265.3,,
5,FA(20:3),20,3,<BF>,1,306.2,473.3,,449.3,435.3,...,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3,
6,FA(20:3),20,3,<FF>,2,306.2,473.3,,,437.3,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3
7,FA(20:2),20,2,<B>,0,308.2,475.3,463.3,449.3,435.3,...,365.3,351.3,337.3,323.3,309.3,295.3,281.3,267.3,253.3,
8,FA(20:2),20,2,<F>,1,308.2,475.3,,451.3,437.3,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3
9,FA(20:1),20,1,<>,0,310.2,477.3,465.3,451.3,437.3,...,367.3,353.3,339.3,325.3,311.3,297.3,283.3,269.3,255.3,241.3


# Convert all values from df into 1 long df with the name including all the relevant info

In [24]:
# import pandas as pd

# # Assuming results_df already has all the data it needs
# # Create a new DataFrame
# new_df = pd.DataFrame(columns=['Lipid', 'Parent Ion', 'Chain length', 'DB Number', 'DB Location', 'FA mass', '[FA+AMP]+'])

# # Iterate over the rows of the original DataFrame
# for _, row in result_df.iterrows():
#     lipid = row['Lipid']
#     db_location = row['DB Location']
    
#     for i in range(2, chain_length +1):  # Assuming chain length of 20 for the given example
#         column_name = f'n-{i}'
#         if pd.notna(row[column_name]):
#             new_row = {
#                 'Lipid': f'{lipid}_{db_location}_{column_name}',
#                 'Parent Ion': row[column_name],
#                 'Chain length': row['Chain length'],
#                 'DB Number': row['DB Number'],
#                 'DB Location': row['DB Location'],
#                 'FA mass': row['FA mass'],
#                 '[FA+AMP]+': row['[FA+AMP]+']
#             }
#             new_df = new_df.append(new_row, ignore_index=True)

# new_df
#### make a function
import pandas as pd

def expand_lipid_data(result_df: pd.DataFrame, chain_length: int = 20) -> pd.DataFrame:
    """
    Expands the lipid data in the input DataFrame by iterating over a specified chain length range
    and creating new rows for each chain length found in the input data.

    Args:
    result_df (pd.DataFrame): The original DataFrame containing lipid data.
    chain_length (int): The maximum chain length to iterate over (default is 20).

    Returns:
    pd.DataFrame: A new DataFrame with expanded lipid data.
    """
    new_df = pd.DataFrame(columns=['Lipid', 'Parent Ion', 'Chain length', 'DB Number', 'DB Location', 'FA mass', '[FA+AMP]+'])
    
    for _, row in result_df.iterrows():
        lipid = row['Lipid']
        db_location = row['DB Location']
        
        for i in range(2, chain_length + 1):
            column_name = f'n-{i}'
            if pd.notna(row.get(column_name)):  # Using row.get() to avoid KeyError
                new_row = {
                    'Lipid': f'{lipid}_{db_location}_{column_name}',
                    'Parent Ion': row[column_name],
                    'Chain length': row['Chain length'],
                    'DB Number': row['DB Number'],
                    'DB Location': row['DB Location'],
                    'FA mass': row['FA mass'],
                    '[FA+AMP]+': row['[FA+AMP]+']
                }
                new_df = new_df.append(new_row, ignore_index=True)
    
    return new_df

# Example usage
new_df = expand_lipid_data(result_df)
new_df


Unnamed: 0,Lipid,Parent Ion,Chain length,DB Number,DB Location,FA mass,[FA+AMP]+
0,FA(20:4)_<BBB>_n-2,459.3,20,4,<BBB>,304.2,471.3
1,FA(20:4)_<BBB>_n-3,445.3,20,4,<BBB>,304.2,471.3
2,FA(20:4)_<BBB>_n-4,431.3,20,4,<BBB>,304.2,471.3
3,FA(20:4)_<BBB>_n-5,417.3,20,4,<BBB>,304.2,471.3
4,FA(20:4)_<BBB>_n-6,403.3,20,4,<BBB>,304.2,471.3
...,...,...,...,...,...,...,...
145,FA(20:1)_<>_n-14,297.3,20,1,<>,310.2,477.3
146,FA(20:1)_<>_n-15,283.3,20,1,<>,310.2,477.3
147,FA(20:1)_<>_n-16,269.3,20,1,<>,310.2,477.3
148,FA(20:1)_<>_n-17,255.3,20,1,<>,310.2,477.3


# seperate into own dfs

In [25]:
# import pandas as pd

# # Assuming result_df already has all the data it needs
# # Create a new DataFrame
# new_df = pd.DataFrame(columns=['Lipid', 'Parent Ion', 'Chain length', 'DB Number', 'DB Location', 'FA mass', '[FA+AMP]+'])

# # Iterate over the rows of the original DataFrame
# chain_length = 20  # Assuming chain length of 20 for the given example

# for _, row in result_df.iterrows():
#     lipid = row['Lipid']
#     db_location = row['DB Location']
    
#     for i in range(2, chain_length + 1):  # Iterating from 2 to chain_length
#         column_name = f'n-{i}'
#         if pd.notna(row[column_name]):
#             new_row = {
#                 'Lipid': f'{lipid}_{db_location}_{column_name}',
#                 'Parent Ion': row[column_name],
#                 'Chain length': row['Chain length'],
#                 'DB Number': row['DB Number'],
#                 'DB Location': row['DB Location'],
#                 'FA mass': row['FA mass'],
#                 '[FA+AMP]+': row['[FA+AMP]+']
#             }
#             new_df = new_df.append(new_row, ignore_index=True)

# # Create a dictionary to store the separate DataFrames
# dfs_dict = {}

# # Get unique combinations of Chain length and DB Number
# unique_combinations = new_df[['Chain length', 'DB Number']].drop_duplicates()

# for _, combo in unique_combinations.iterrows():
#     chain_length = combo['Chain length']
#     db_number = combo['DB Number']
    
#     # Filter the DataFrame based on the current combination
#     filtered_df = new_df[(new_df['Chain length'] == chain_length) & (new_df['DB Number'] == db_number)]
    
#     # Store the filtered DataFrame in the dictionary
#     dfs_dict[f'Chain_length_{chain_length}_DB_Number_{db_number}'] = filtered_df

# # Displaying the dictionary keys to verify the split
# dfs_dict.keys()

### make int oa function
import pandas as pd

def split_lipid_dataframe(result_df, chain_length=20):
    """
    This function takes in a DataFrame, iterates over its rows, and creates a new DataFrame
    with additional columns. It then splits the new DataFrame into separate DataFrames based 
    on unique combinations of Chain length and DB Number.

    Parameters:
    - result_df (pd.DataFrame): The original DataFrame with lipid data.
    - chain_length (int): The maximum chain length to iterate over (default is 20).

    Returns:
    - dict: A dictionary containing separate DataFrames for each unique combination of 
            Chain length and DB Number.
    """
    
    # Create a new DataFrame with the desired columns
    new_df = pd.DataFrame(columns=['Lipid', 'Parent Ion', 'Chain length', 'DB Number', 'DB Location', 'FA mass', '[FA+AMP]+'])

    # Iterate over the rows of the original DataFrame
    for _, row in result_df.iterrows():
        lipid = row['Lipid']
        db_location = row['DB Location']

        for i in range(2, chain_length + 1):  # Iterating from 2 to chain_length
            column_name = f'n-{i}'
            if pd.notna(row[column_name]):
                new_row = {
                    'Lipid': f'{lipid}_{db_location}_{column_name}',
                    'Parent Ion': row[column_name],
                    'Chain length': row['Chain length'],
                    'DB Number': row['DB Number'],
                    'DB Location': row['DB Location'],
                    'FA mass': row['FA mass'],
                    '[FA+AMP]+': row['[FA+AMP]+']
                }
                new_df = new_df.append(new_row, ignore_index=True)

    # Create a dictionary to store the separate DataFrames
    dfs_dict = {}

    # Get unique combinations of Chain length and DB Number
    unique_combinations = new_df[['Chain length', 'DB Number']].drop_duplicates()

    for _, combo in unique_combinations.iterrows():
        chain_length_val = combo['Chain length']
        db_number = combo['DB Number']

        # Filter the DataFrame based on the current combination
        filtered_df = new_df[(new_df['Chain length'] == chain_length_val) & (new_df['DB Number'] == db_number)]

        # Store the filtered DataFrame in the dictionary
        dfs_dict[f'Chain_length_{chain_length_val}_DB_Number_{db_number}'] = filtered_df

    return dfs_dict

# Example usage
dfs_dict = split_lipid_dataframe(result_df)
#Displaying the dictionary keys to verify the split
print(dfs_dict.keys())



KeyError: 'n-19'

# fixed for NaN values

In [26]:
import pandas as pd

def split_lipid_dataframe(result_df, chain_length=20):
    """
    This function takes in a DataFrame, iterates over its rows, and creates a new DataFrame
    with additional columns. It then splits the new DataFrame into separate DataFrames based 
    on unique combinations of Chain length and DB Number.

    Parameters:
    - result_df (pd.DataFrame): The original DataFrame with lipid data.
    - chain_length (int): The maximum chain length to iterate over (default is 20).

    Returns:
    - dict: A dictionary containing separate DataFrames for each unique combination of 
            Chain length and DB Number.
    """
    
    # Create a new DataFrame with the desired columns
    new_df = pd.DataFrame(columns=['Lipid', 'Parent Ion', 'Chain length', 'DB Number', 'DB Location', 'FA mass', '[FA+AMP]+'])

    # Iterate over the rows of the original DataFrame
    for _, row in result_df.iterrows():
        lipid = row['Lipid']
        db_location = row['DB Location']

        for i in range(2, chain_length + 1):  # Iterating from 2 to chain_length
            column_name = f'n-{i}'
            if column_name in row and pd.notna(row[column_name]):
                new_row = {
                    'Lipid': f'{lipid}_{db_location}_{column_name}',
                    'Parent Ion': row[column_name],
                    'Chain length': row['Chain length'],
                    'DB Number': row['DB Number'],
                    'DB Location': row['DB Location'],
                    'FA mass': row['FA mass'],
                    '[FA+AMP]+': row['[FA+AMP]+']
                }
                new_df = new_df.append(new_row, ignore_index=True)

    # Create a dictionary to store the separate DataFrames
    dfs_dict = {}

    # Get unique combinations of Chain length and DB Number
    unique_combinations = new_df[['Chain length', 'DB Number']].drop_duplicates()

    for _, combo in unique_combinations.iterrows():
        chain_length_val = combo['Chain length']
        db_number = combo['DB Number']

        # Filter the DataFrame based on the current combination
        filtered_df = new_df[(new_df['Chain length'] == chain_length_val) & (new_df['DB Number'] == db_number)]

        # Store the filtered DataFrame in the dictionary
        dfs_dict[f'Chain_length_{chain_length_val}_DB_Number_{db_number}'] = filtered_df

    return dfs_dict


# Run the function
dfs_dict = split_lipid_dataframe(result_df)

# Display the dictionary keys to verify the split
print(dfs_dict.keys())

# # Display the first few rows of one of the resulting DataFrames to check the contents
# for key, df in dfs_dict.items():
#     print(f"DataFrame for {key}:")
#     print(df.head(), "\n")

dict_keys(['Chain_length_20_DB_Number_4', 'Chain_length_20_DB_Number_3', 'Chain_length_20_DB_Number_2', 'Chain_length_20_DB_Number_1'])


In [27]:
key = 'Chain_length_20_DB_Number_2'

dfs_dict[key]

Unnamed: 0,Lipid,Parent Ion,Chain length,DB Number,DB Location,FA mass,[FA+AMP]+
101,FA(20:2)_<B>_n-2,463.3,20,2,<B>,308.2,475.3
102,FA(20:2)_<B>_n-3,449.3,20,2,<B>,308.2,475.3
103,FA(20:2)_<B>_n-4,435.3,20,2,<B>,308.2,475.3
104,FA(20:2)_<B>_n-5,421.3,20,2,<B>,308.2,475.3
105,FA(20:2)_<B>_n-6,407.3,20,2,<B>,308.2,475.3
106,FA(20:2)_<B>_n-7,393.3,20,2,<B>,308.2,475.3
107,FA(20:2)_<B>_n-8,379.3,20,2,<B>,308.2,475.3
108,FA(20:2)_<B>_n-9,365.3,20,2,<B>,308.2,475.3
109,FA(20:2)_<B>_n-10,351.3,20,2,<B>,308.2,475.3
110,FA(20:2)_<B>_n-11,337.3,20,2,<B>,308.2,475.3
