In [40]:
import pandas as pd
import re
import numpy as np

# Read both input workbooks from the current working directory.
# `formula` is parsed further down through its 'formula' column; `infor` is
# read further down for its 'compound', 'MW' and element-count columns.
formula, infor = (pd.read_excel(path) for path in ('formula.xlsx', 'infor.xlsx'))

In [46]:
# Select the rows of `infor` whose 'compound' text contains 'MTEE'.
# na=False makes missing compound names count as non-matches, so the mask is
# purely boolean and safe to index with.
is_mtee = infor['compound'].str.contains('MTEE', na=False)
samples = infor[is_mtee]
print(samples)

Empty DataFrame
Columns: [id, compound, C, H, O, F, N, S, P, Li, B, Cl, As, Rb, Si, K, Mg, Cs, MW, cyclic, density (g/mL), Boiling, Vabc, Radius, SLogP, SP, apol, bpol, SMILES]
Index: []

[0 rows x 29 columns]


In [41]:
def split_general_formula(row):
    """Split a row's 'formula' string into per-component name/amount entries.

    The string is expected to look like ``"A-B-C (1-2.5-3 mol)"``: dash-separated
    component names followed by a parenthesised, dash-separated list of amounts
    ending in ``mol``.

    Args:
        row: Mapping-like object (for example a DataFrame row) that has a
            'formula' entry. The value is converted with ``str()`` first, so
            missing values simply fail to match.

    Returns:
        pd.Series: entries ``c1, m1, c2, m2, ...`` where ``c{i}`` is the i-th
        component name (str) and ``m{i}`` its amount (float). An empty Series
        is returned when the string does not match the expected layout, when
        an amount is not a valid float, or when the number of names differs
        from the number of amounts.

    Raises:
        KeyError: if ``row`` has no 'formula' entry. This is no longer
            swallowed, so a missing column is reported instead of silently
            producing an all-empty result.
    """
    formula_str = str(row['formula']).strip()
    match = re.match(r"([A-Za-z0-9\-]+(?:-[A-Za-z0-9\-]+)*)\s*\(([\d\.\-]+(?:-[\d\.\-]+)*)\s*mol\)", formula_str)
    if not match:
        # Explicit dtype keeps the empty result identical across pandas versions
        # and avoids the "default dtype for empty Series" warning.
        return pd.Series(dtype=object)

    compounds_part, mols_part = match.groups()
    compounds = compounds_part.split("-")
    try:
        mols = [float(m) for m in mols_part.split("-")]
    except ValueError:
        # The pattern admits tokens such as "1.2.3" or an empty piece between
        # two dashes; treat those rows as unparseable rather than failing.
        return pd.Series(dtype=object)

    if len(compounds) != len(mols):
        return pd.Series(dtype=object)

    data = {}
    for i, (compound, mol) in enumerate(zip(compounds, mols), start=1):
        data[f"c{i}"] = compound
        data[f"m{i}"] = mol

    return pd.Series(data)
# Parse every row's 'formula' string into c1/m1, c2/m2, ... columns.
df_result = formula.apply(split_general_formula, axis=1)
# Rows that could not be parsed come back entirely empty; they are dropped here
# and reappear as NaN when the frames are aligned on the index below.
df_result = df_result.dropna(how='all')
# Remove parsed columns left over from a previous run of this cell before
# attaching the fresh ones. Without this, re-running the cell appends a second
# set of identically named c*/m* columns, and a later `formula.loc[i, 'c1']`
# returns a Series instead of a scalar.
formula = pd.concat(
    [formula.drop(columns=df_result.columns, errors='ignore'), df_result],
    axis=1,
)


In [42]:
# Look-up table: compound label from `infor` -> its 'MW' value.
mw_dict = dict(zip(infor['compound'], infor['MW']))

# Only string labels can be prefix-matched. Non-string keys (for example NaN
# from blank 'compound' cells) have no .startswith() and would raise.
mw_string_labels = [label for label in mw_dict if isinstance(label, str)]

# Component name -> resolved `mw_dict` key (or None), so the linear prefix scan
# runs once per distinct name instead of once per table cell.
mw_label_cache = {}

# The set of c{j} columns does not change inside the loop, so check it once.
component_slots = [j for j in range(1, 9) if f'c{j}' in formula.columns]

# For each row and component slot j, write n{j} = MW(c{j}) * m{j}.
for i in formula.index:
    for j in component_slots:
        compound = formula.loc[i, f'c{j}']
        if pd.isna(compound):
            continue
        mol = formula.loc[i, f'm{j}']

        if compound not in mw_label_cache:
            if compound in mw_dict:
                # An exact label match takes priority.
                mw_label_cache[compound] = compound
            else:
                # Otherwise use the first label of the form "<name> (...)".
                prefix = compound + " ("
                mw_label_cache[compound] = next(
                    (label for label in mw_string_labels if label.startswith(prefix)),
                    None,
                )

        label = mw_label_cache[compound]
        if label is not None:
            formula.loc[i, f'n{j}'] = mw_dict[label] * mol

In [None]:
# Per-element look-up tables: compound label from `infor` -> value of that
# element's column.
o_dict = dict(zip(infor['compound'], infor['O']))
f_dict = dict(zip(infor['compound'], infor['F']))
c_dict = dict(zip(infor['compound'], infor['C']))
h_dict = dict(zip(infor['compound'], infor['H']))

element_dicts = {'C': c_dict, 'H': h_dict, 'O': o_dict, 'F': f_dict}

# Only string labels can be prefix-matched. Non-string keys (for example NaN
# from blank 'compound' cells) have no .startswith() and would raise.
element_string_labels = [label for label in o_dict if isinstance(label, str)]

# Component name -> resolved label (or None), so each name is scanned once.
element_label_cache = {}

# The set of c{j} columns does not change inside the loop, so check it once.
component_slots = [j for j in range(1, 9) if f'c{j}' in formula.columns]

for i in formula.index:
    for j in component_slots:
        compound = formula.loc[i, f'c{j}']
        if pd.isna(compound):
            continue

        # Names containing 'Li' get a{j}O / a{j}F columns; all other names get
        # s{j}C / s{j}H / s{j}O / s{j}F columns.
        if 'Li' in compound:
            column_prefix, elements = f'a{j}', ('O', 'F')
        else:
            column_prefix, elements = f's{j}', ('C', 'H', 'O', 'F')

        if compound not in element_label_cache:
            # First label of the form "<name> (...)" wins, as before.
            prefix = compound + " ("
            label = next(
                (k for k in element_string_labels if k.startswith(prefix)),
                None,
            )
            # Fallback: a compound stored under its bare name (no " (...)"
            # suffix) was previously never matched and got no element counts.
            if label is None and compound in o_dict:
                label = compound
            element_label_cache[compound] = label

        label = element_label_cache[compound]
        if label is None:
            continue

        for element in elements:
            count = element_dicts[element][label]
            # A missing count in `infor` is stored as 0.
            formula.loc[i, f'{column_prefix}{element}'] = count if pd.notna(count) else 0
