In [43]:
import pandas as pd
import re
import numpy as np

# Load the two worksheets this notebook works on: `formula` holds the
# formulation strings, `infor` holds the per-compound lookup table.
formula, infor = (
    pd.read_excel('sample.xlsx', sheet_name=sheet)
    for sheet in ('formula', 'infor')
)

In [44]:
def split_general_formula(row):
    """Split a formulation string into per-component name / amount fields.

    The ``'formula'`` entry of ``row`` is matched against the pattern
    ``"<A>-<B>-... (<x>-<y>-... mol)"``: hyphen-separated component names
    followed by a parenthesised, hyphen-separated list of amounts.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything supporting ``row['formula']``; the value is converted with
        ``str()`` and stripped before matching.

    Returns
    -------
    pandas.Series
        ``c1, n1, c2, n2, ...`` where ``c{i}`` is the i-th component name
        (str) and ``n{i}`` its amount (float). An empty Series is returned
        when ``row`` has no ``'formula'`` entry, the text does not match the
        pattern, an amount is not a valid float, or the number of names
        differs from the number of amounts.

    Notes
    -----
    Because ``-`` is the separator, component names that themselves contain
    a hyphen are split into several components.
    """
    try:
        formula_str = str(row['formula']).strip()
    except (KeyError, IndexError, TypeError):
        # Missing 'formula' entry or a row object that cannot be indexed by
        # label. Anything else (e.g. KeyboardInterrupt) must propagate.
        return pd.Series(dtype=object)

    match = re.match(r"([A-Za-z0-9\-]+(?:-[A-Za-z0-9\-]+)*)\s*\(([\d\.\-]+(?:-[\d\.\-]+)*)\s*mol\)", formula_str)
    if not match:
        return pd.Series(dtype=object)

    compounds_part, mols_part = match.groups()
    compounds = compounds_part.split("-")
    try:
        mols = [float(m) for m in mols_part.split("-")]
    except ValueError:
        # The pattern admits tokens such as "" (from "1--2") or "1..2" that
        # are not valid floats.
        return pd.Series(dtype=object)

    if len(compounds) != len(mols):
        return pd.Series(dtype=object)

    data = {}
    for i, (compound, mol_val) in enumerate(zip(compounds, mols), start=1):
        data[f"c{i}"] = compound
        data[f"n{i}"] = mol_val
    return pd.Series(data)

# Parse every row's formulation string into c1/n1, c2/n2, ... columns.
df_result = formula.apply(split_general_formula, axis=1)
# Rows that could not be parsed come back entirely empty; drop them here (the
# index-aligned concat below brings them back as NaN).
df_result = df_result.dropna(how='all')
# Drop any columns with the same labels left over from a previous run of this
# cell first. Without this, re-running the cell appends a second set of
# c*/n* columns, and duplicate labels make `formula.loc[i, 'c1']` return a
# Series instead of a scalar.
formula = pd.concat(
    [formula.drop(columns=df_result.columns, errors='ignore'), df_result],
    axis=1,
)


In [45]:
# Per-compound element counts, keyed by the compound name in `infor`.
o_dict = dict(zip(infor['compound'], infor['O']))
f_dict = dict(zip(infor['compound'], infor['F']))
c_dict = dict(zip(infor['compound'], infor['C']))
h_dict = dict(zip(infor['compound'], infor['H']))

for i in formula.index:
    s_count = 1  # next free slot for components without 'Li' in the name: s1*, s2*, ...
    a_count = 1  # next free slot for components with 'Li' in the name: a1*, a2*, ...
    Li = 0       # summed n of components whose name contains 'Li'
    sol = 0      # summed n of every other component

    # Only c1..c8 / n1..n8 are inspected.
    for j in range(1, 9):
        c_col = f'c{j}'
        n_col = f'n{j}'
        if c_col not in formula.columns or pd.isna(formula.loc[i, c_col]):
            continue
        compound = formula.loc[i, c_col]
        n_value = formula.loc[i, n_col]

        # Resolve the `infor` row for this component. An exact name match
        # wins; only when there is none do we fall back to the first key that
        # contains the name. A plain substring scan would resolve e.g. 'EC'
        # to 'FEC' or 'DEC' whenever that row happens to come first.
        # Non-string keys (e.g. NaN from a blank cell) are skipped.
        if compound in o_dict:
            k = compound
        else:
            k = next(
                (name for name in o_dict if isinstance(name, str) and compound in name),
                None,
            )

        if 'Li' in compound:
            Li += n_value
            if k is not None:
                formula.loc[i, f'a{a_count}O'] = o_dict[k] if pd.notna(o_dict[k]) else 0
                formula.loc[i, f'a{a_count}F'] = f_dict[k] if pd.notna(f_dict[k]) else 0
                a_count += 1
        else:
            sol += n_value
            if k is not None:
                formula.loc[i, f's{s_count}C'] = c_dict[k] if pd.notna(c_dict[k]) else 0
                formula.loc[i, f's{s_count}H'] = h_dict[k] if pd.notna(h_dict[k]) else 0
                formula.loc[i, f's{s_count}O'] = o_dict[k] if pd.notna(o_dict[k]) else 0
                formula.loc[i, f's{s_count}F'] = f_dict[k] if pd.notna(f_dict[k]) else 0
                s_count += 1

    formula.loc[i, 'Li'] = Li
    formula.loc[i, 'sol'] = sol
    # Ratio is reported as 0 when there is no non-Li component amount.
    formula.loc[i, 'Li/sol'] = round(Li/sol, 3) if sol != 0 else 0


In [46]:
# (column prefix, element suffix) combinations to total across the numbered
# slot columns, e.g. ('s', 'O') sums every three-character column 's?O'.
prefix_suffix_pairs = [
    ('a', 'O'), ('a', 'F'),
    ('s', 'C'), ('s', 'H'), ('s', 'O'), ('s', 'F'),
]

for prefix, suffix in prefix_suffix_pairs:
    cols = [
        col
        for col in formula.columns
        if col.startswith(prefix) and col.endswith(suffix) and len(col) == 3
    ]
    formula[prefix + suffix] = formula[cols].sum(axis=1, skipna=True)


def _guarded_ratio(x, top, bottom):
    """Return x[top] / x[bottom] rounded to 2 places, or 0 if undefined."""
    if pd.isna(x[top]) or pd.isna(x[bottom]) or x[bottom] == 0:
        return 0
    return round(x[top] / x[bottom], 2)


# Output column -> (numerator column, denominator column).
ratio_columns = {
    'sF/sC': ('sF', 'sC'),
    'sF/sO': ('sF', 'sO'),
    'sC/sH': ('sC', 'sH'),
    'sO/sC': ('sO', 'sC'),
}
for ratio_name, (top, bottom) in ratio_columns.items():
    formula[ratio_name] = formula.apply(_guarded_ratio, axis=1, args=(top, bottom))

In [47]:
# SMILES string per compound, keyed by the compound name in `infor`.
smiles_dict = dict(zip(infor['compound'], infor['SMILES']))
formula['dbO'] = 0
for i in formula.index:
    db_count = 0
    for j in range(1, 9):
        c_col = f'c{j}'
        if c_col not in formula.columns or pd.isna(formula.loc[i, c_col]):
            continue
        compound = formula.loc[i, c_col]

        # Candidate `infor` rows, best first: the exact name (if present),
        # then every key containing the name in table order. Trying the exact
        # key first stops e.g. 'EC' from picking up the SMILES of 'FEC'.
        # Non-string keys (e.g. NaN from a blank cell) are skipped.
        candidates = [compound] if compound in smiles_dict else []
        candidates += [
            name for name in smiles_dict
            if isinstance(name, str) and compound in name
        ]
        for k in candidates:
            if pd.notna(smiles_dict[k]):
                # Counts every '=' in the SMILES string, once per component
                # (not weighted by the component amount).
                # NOTE(review): this counts all double bonds, not only those
                # to oxygen, and includes components with 'Li' in the name,
                # while 'sO' is built from non-Li components only. Confirm.
                db_count += smiles_dict[k].count('=')
                break
    formula.loc[i, 'dbO'] = db_count
formula['sbO'] = formula['sO'] - formula['dbO']

In [48]:
# For every component slot present, add a '%n{i}' column holding that
# component's amount as a percentage of the row's 'sol' total. Slots that are
# empty or whose name contains 'Li' get NaN.
# NOTE(review): a row whose 'sol' total is 0 is not guarded here. Confirm that
# cannot occur for rows with a non-Li component.
for i in range(1, 9):
    c_col, n_col = f'c{i}', f'n{i}'
    if c_col not in formula.columns:
        continue
    pct_col = f'%n{i}'

    def _share_of_sol(row, c_col=c_col, n_col=n_col):
        """Percentage of 'sol' contributed by this slot, or NaN if excluded."""
        if pd.isna(row[c_col]) or 'Li' in str(row[c_col]):
            return np.nan
        return round(row[n_col] / row['sol'] * 100, 2)

    formula[pct_col] = formula.apply(_share_of_sol, axis=1)

In [49]:
# Per-compound 'MW' and 'Vabc' values, keyed by the compound name in `infor`.
mw_dict = dict(zip(infor['compound'], infor['MW']))
vabc_dict = dict(zip(infor['compound'], infor['Vabc']))

for i in formula.index:
    total_mw = 0
    total_vabc = 0
    for j in range(1, 9):
        c_col = f'c{j}'
        n_col = f'n{j}'
        if c_col not in formula.columns or pd.isna(formula.loc[i, c_col]):
            continue
        compound = formula.loc[i, c_col]
        n_value = formula.loc[i, n_col]
        if 'Li' in compound:
            # Components with 'Li' in the name are excluded from the totals.
            continue

        # Exact name match first; otherwise the first key containing the
        # name. A plain substring scan would resolve e.g. 'EC' to 'FEC'
        # whenever that row comes first in `infor`.
        if compound in mw_dict:
            k = compound
        else:
            k = next(
                (name for name in mw_dict if isinstance(name, str) and compound in name),
                None,
            )
        if k is None:
            continue

        # Missing table values contribute nothing to the respective total.
        total_mw += mw_dict[k] * n_value if pd.notna(mw_dict[k]) else 0
        total_vabc += vabc_dict[k] * n_value if pd.notna(vabc_dict[k]) else 0

    # Amount-weighted MW total over amount-weighted Vabc total; 0 when the
    # Vabc total is 0.
    formula.loc[i, 'density'] = round(total_mw/total_vabc, 3) if total_vabc != 0 else 0

In [50]:
# Per-compound 'Radius' value, keyed by the compound name in `infor`.
radius_dict = dict(zip(infor['compound'], infor['Radius']))

for i in formula.index:
    total_radius = 0  # sum of Radius * n over non-Li components
    minus_radius = 1  # product of Radius ** %n over non-Li components
    for j in range(1, 9):
        c_col = f'c{j}'
        n_col = f'n{j}'
        pct_col = f'%n{j}'
        if c_col not in formula.columns or pd.isna(formula.loc[i, c_col]):
            continue
        compound = formula.loc[i, c_col]
        n_value = formula.loc[i, n_col]
        n2_value = formula.loc[i, pct_col]
        if 'Li' in compound:
            continue

        # Exact name match first; otherwise the first key containing the
        # name. A plain substring scan would resolve e.g. 'EC' to 'FEC'
        # whenever that row comes first in `infor`.
        if compound in radius_dict:
            k = compound
        else:
            k = next(
                (name for name in radius_dict if isinstance(name, str) and compound in name),
                None,
            )
        if k is None:
            continue

        if pd.notna(radius_dict[k]):
            total_radius += radius_dict[k] * n_value
            # NOTE(review): the exponent is the '%n{j}' value, which this
            # notebook computes on a 0-100 scale. Confirm a 0-1 fraction is
            # not what is intended here.
            minus_radius *= radius_dict[k] ** n2_value

    formula.loc[i, 'Radius'] = round(total_radius, 3) if total_radius != 0 else 0
    formula.loc[i, 'Radius-'] = round(minus_radius, 3) if minus_radius != 1 else 1
# 'Radius-r' is the weighted sum divided by the row's 'sol' total (0 if that is 0).
formula['Radius-r'] = formula.apply(lambda x: round(x['Radius'] / x['sol'], 3) if x['sol'] != 0 else 0, axis=1)
formula['Radius+'] = formula.apply(lambda x: round((2 * x['Radius-r']) - x['Radius-'], 3), axis=1)


In [51]:
# Per-compound 'Vabc' value, keyed by the compound name in `infor`.
# The loop below reads this dict, so the cell does not depend on a
# differently-cased name defined in another cell.
Vabc_dict = dict(zip(infor['compound'], infor['Vabc']))

for i in formula.index:
    total_vabc = 0  # sum of Vabc * n over non-Li components
    minus_vabc = 1  # product of Vabc ** %n over non-Li components
    for j in range(1, 9):
        c_col = f'c{j}'
        n_col = f'n{j}'
        pct_col = f'%n{j}'
        if c_col not in formula.columns or pd.isna(formula.loc[i, c_col]):
            continue
        compound = formula.loc[i, c_col]
        n_value = formula.loc[i, n_col]
        n2_value = formula.loc[i, pct_col]
        if 'Li' in compound:
            continue

        # Exact name match first; otherwise the first key containing the
        # name. A plain substring scan would resolve e.g. 'EC' to 'FEC'
        # whenever that row comes first in `infor`.
        if compound in Vabc_dict:
            k = compound
        else:
            k = next(
                (name for name in Vabc_dict if isinstance(name, str) and compound in name),
                None,
            )
        if k is None:
            continue

        if pd.notna(Vabc_dict[k]):
            total_vabc += Vabc_dict[k] * n_value
            # NOTE(review): the exponent is the '%n{j}' value, which this
            # notebook computes on a 0-100 scale. Confirm a 0-1 fraction is
            # not what is intended here.
            minus_vabc *= Vabc_dict[k] ** n2_value
    formula.loc[i, 'Vabc'] = round(total_vabc, 3) if total_vabc != 0 else 0
    formula.loc[i, 'Vabc-'] = round(minus_vabc, 3) if minus_vabc != 1 else 1
# 'Vabc-r' is the weighted sum divided by the row's 'sol' total (0 if that is 0).
formula['Vabc-r'] = formula.apply(lambda x: round(x['Vabc'] / x['sol'], 3) if x['sol'] != 0 else 0, axis=1)
formula['Vabc+'] = formula.apply(lambda x: round((2 * x['Vabc-r']) - x['Vabc-'], 3), axis=1)

In [52]:
# Per-compound 'apol' value, keyed by the compound name in `infor`.
apol_dict = dict(zip(infor['compound'], infor['apol']))

for i in formula.index:
    total_apol = 0  # sum of apol * n over non-Li components
    minus_apol = 1  # product of apol ** %n over non-Li components
    for j in range(1, 9):
        c_col = f'c{j}'
        n_col = f'n{j}'
        pct_col = f'%n{j}'
        if c_col not in formula.columns or pd.isna(formula.loc[i, c_col]):
            continue
        compound = formula.loc[i, c_col]
        n_value = formula.loc[i, n_col]
        n2_value = formula.loc[i, pct_col]
        if 'Li' in compound:
            continue

        # Exact name match first; otherwise the first key containing the
        # name. A plain substring scan would resolve e.g. 'EC' to 'FEC'
        # whenever that row comes first in `infor`.
        if compound in apol_dict:
            k = compound
        else:
            k = next(
                (name for name in apol_dict if isinstance(name, str) and compound in name),
                None,
            )
        if k is None:
            continue

        if pd.notna(apol_dict[k]):
            total_apol += apol_dict[k] * n_value
            # NOTE(review): the exponent is the '%n{j}' value, which this
            # notebook computes on a 0-100 scale. Confirm a 0-1 fraction is
            # not what is intended here.
            minus_apol *= apol_dict[k] ** n2_value

    formula.loc[i, 'apol'] = round(total_apol, 3) if total_apol != 0 else 0
    formula.loc[i, 'apol-'] = round(minus_apol, 3) if minus_apol != 1 else 1
# 'apol-r' is the weighted sum divided by the row's 'sol' total (0 if that is 0).
formula['apol-r'] = formula.apply(lambda x: round(x['apol'] / x['sol'], 3) if x['sol'] != 0 else 0, axis=1)
formula['apol+'] = formula.apply(lambda x: round((2 * x['apol-r']) - x['apol-'], 3), axis=1)

In [53]:
# Per-compound 'bpol' value, keyed by the compound name in `infor`.
bpol_dict = dict(zip(infor['compound'], infor['bpol']))

for i in formula.index:
    total_bpol = 0  # sum of bpol * n over non-Li components
    minus_bpol = 1  # product of bpol ** %n over non-Li components
    for j in range(1, 9):
        c_col = f'c{j}'
        n_col = f'n{j}'
        pct_col = f'%n{j}'
        if c_col not in formula.columns or pd.isna(formula.loc[i, c_col]):
            continue
        compound = formula.loc[i, c_col]
        n_value = formula.loc[i, n_col]
        n2_value = formula.loc[i, pct_col]
        if 'Li' in compound:
            continue

        # Exact name match first; otherwise the first key containing the
        # name. A plain substring scan would resolve e.g. 'EC' to 'FEC'
        # whenever that row comes first in `infor`.
        if compound in bpol_dict:
            k = compound
        else:
            k = next(
                (name for name in bpol_dict if isinstance(name, str) and compound in name),
                None,
            )
        if k is None:
            continue

        if pd.notna(bpol_dict[k]):
            total_bpol += bpol_dict[k] * n_value
            # NOTE(review): the exponent is the '%n{j}' value, which this
            # notebook computes on a 0-100 scale. Confirm a 0-1 fraction is
            # not what is intended here.
            minus_bpol *= bpol_dict[k] ** n2_value

    formula.loc[i, 'bpol'] = round(total_bpol, 3) if total_bpol != 0 else 0
    formula.loc[i, 'bpol-'] = round(minus_bpol, 3) if minus_bpol != 1 else 1
# 'bpol-r' is the weighted sum divided by the row's 'sol' total (0 if that is 0).
formula['bpol-r'] = formula.apply(lambda x: round(x['bpol'] / x['sol'], 3) if x['sol'] != 0 else 0, axis=1)
formula['bpol+'] = formula.apply(lambda x: round((2 * x['bpol-r']) - x['bpol-'], 3), axis=1)

In [54]:
# Per-compound 'SLogP' value, keyed by the compound name in `infor`.
SLogP_dict = dict(zip(infor['compound'], infor['SLogP']))

for i in formula.index:
    total_slogp = 0  # sum of 10**SLogP * %n over non-Li components
    for j in range(1, 9):
        c_col = f'c{j}'
        pct_col = f'%n{j}'
        if c_col not in formula.columns or pd.isna(formula.loc[i, c_col]):
            continue
        compound = formula.loc[i, c_col]
        n2_value = formula.loc[i, pct_col]
        if 'Li' in compound:
            continue

        # Exact name match first; otherwise the first key containing the
        # name. A plain substring scan would resolve e.g. 'EC' to 'FEC'
        # whenever that row comes first in `infor`.
        if compound in SLogP_dict:
            k = compound
        else:
            k = next(
                (name for name in SLogP_dict if isinstance(name, str) and compound in name),
                None,
            )

        if k is not None and pd.notna(SLogP_dict[k]) and pd.notna(n2_value):
            # NOTE(review): the weight is the '%n{j}' value, which this
            # notebook computes on a 0-100 scale, so the log10 below carries
            # that scale factor. Confirm this is intended.
            total_slogp += 10**SLogP_dict[k] * n2_value

    # log10 of the weighted sum; 0 when nothing positive was accumulated.
    formula.loc[i, 'SLogP-r'] = round(np.log10(total_slogp), 3) if total_slogp > 0 else 0

In [55]:
# Persist both frames to one workbook, one sheet per frame, keeping the index
# as the first column of each sheet.
with pd.ExcelWriter('results.xlsx') as writer:
    for sheet_name, frame in (('formula', formula), ('infor', infor)):
        frame.to_excel(writer, sheet_name=sheet_name, index=True)