In [1]:
import pandas as pd
import json
import ndjson
import itertools
import re

In [2]:
# The three table exports live in one directory and differ only by their
# table index, so the paths are generated from that index.
input_files = [
    f'/home/jovyan/work/dufourspitze_16/data/data_from_table_{table_no}.ndjson'
    for table_no in (1, 2, 3)
]

In [3]:
def load_ndjson(file_name):
    """Read a newline-delimited JSON file into a DataFrame, one row per record.

    Parameters
    ----------
    file_name : str or path-like
        Path of the ``.ndjson`` file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record, one column per key found in the records.

    Notes
    -----
    Every line is parsed by ``ndjson.load``. A parsed element that is still a
    string (or bytes) is decoded a second time with ``json.loads``, which is
    what double-encoded records need. Elements that are already decoded
    objects are used as they are instead of raising ``TypeError``.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_name, 'r', encoding='utf-8') as f:
        parsed_lines = ndjson.load(f)

    records = [
        json.loads(item) if isinstance(item, (str, bytes, bytearray)) else item
        for item in parsed_lines
    ]

    return pd.DataFrame(records)

def load_data(file_name):
    """Load one table export and index it by its 'reaction number' column.

    When the same reaction number occurs more than once, only the first row
    is kept. Repeated values in any other column are deliberately left alone.

    Parameters
    ----------
    file_name : str or path-like
        Path handed to ``load_ndjson``.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated table with 'reaction number' as its index.
    """
    table = load_ndjson(file_name)

    # Flag every repeat of a reaction number after its first appearance.
    repeated = table.duplicated(subset='reaction number', keep='first')
    unique_rows = table[~repeated]

    return unique_rows.set_index('reaction number')

In [4]:
# Load every table export in the order given by input_files, announcing each
# file before it is read.
dfs_list = []
for ifile in input_files:
    print('processing file', ifile)
    dfs_list.append(load_data(ifile))

processing file /home/jovyan/work/dufourspitze_16/data/data_from_table_1.ndjson
processing file /home/jovyan/work/dufourspitze_16/data/data_from_table_2.ndjson
processing file /home/jovyan/work/dufourspitze_16/data/data_from_table_3.ndjson


In [5]:
# The tables are all indexed by 'reaction number', so a column-wise concat
# aligns their rows on that index; the index is then restored as a column.
df = pd.concat(objs=dfs_list, axis='columns').reset_index()
df.head()

Unnamed: 0,reaction number,reaction,reaction rate,description
0,1,[Blood-Lymph].CTLA4_mabB <-> Lymph_Node.CTLA4_mab,Kpa_LNB*S_LNB*VL*f_LN_CTLA4*([Blood-Lymph].CTL...,Distribution of Anti-CTLA-4 mAb between the ce...
1,2,[Blood-Lymph].CTLA4_mabB <-> Peripheral.CTLA4_...,0.67*Q_L*[Blood-Lymph].CTLA4_mabB*(1-Sigma1_CT...,Distribution of Anti-CTLA-4 mAb between the ce...
2,3,[Blood-Lymph].CTLA4_mabB <-> Peripheral.CTLA4_...,0.33*Q_L*[Blood-Lymph].CTLA4_mabB*(1-Sigma2_CT...,Distribution of Anti-CTLA-4 mAb between the ce...
3,4,[Blood-Lymph].CTLA4_mabB <-> Tumor.CTLA4_mabt,Kpa_TB*S_TB*Vt_avg_const*[Blood-Lymph].CTLA4_m...,Distribution of Anti-CTLA-4 mAb between the ce...
4,5,[Blood-Lymph].CTLA4_mabB -> null,Cl_CTLA4*[Blood-Lymph].CTLA4_mabB,Clearance of Anti-CTLA-4 mAb from the central ...


Now simplify each reaction by replacing the compartment and variable names with short placeholder literals.

In [6]:
# Table 4 is read with load_ndjson (not load_data), so it is neither
# de-duplicated nor re-indexed. Its 'compartment_location' and 'variable_name'
# columns are the inputs of the mapping cells that follow.
df4 = load_ndjson('/home/jovyan/work/dufourspitze_16/data/data_from_table_4.ndjson')

In [7]:
### compartment_location
# Placeholder for every compartment name. Insertion order is kept because the
# dictionary is iterated when the reactions are rewritten.
comp_mapping_dict = dict([
    ('Blood-Lymph', 'alpha'),
    ('Blood–Lymph', 'alpha'),  # same name spelled with a different dash character
    ('Lungs', 'beta'),
    ('Liv_Spln_GI', 'gamma'),
    ('Lymph_Node', 'delta'),
    ('Peripheral', 'epsilon'),
    ('Tumor', 'zeta'),
])

# Compartments that are not in the dictionary come out as NaN.
df4['compartment_location_mapped'] = df4['compartment_location'].map(comp_mapping_dict)

In [8]:
### variable_name
# Distinct variable names in sorted order. Missing names are dropped so that
# NaN never becomes a mapping key (it cannot be used as a regex or a name).
var_unique = df4['variable_name'].dropna().sort_values().unique()

# Two-letter placeholders 'AA', 'AB', ..., 'ZZ' (26 * 26 = 676 of them).
literals = [''.join(pair) for pair in itertools.product('ABCDEFGHIJKLMNOPQRSTUVWXYZ', repeat=2)]

# zip() stops at the shorter input, so with too many variables the surplus
# would silently get no placeholder at all; fail loudly instead.
if len(var_unique) > len(literals):
    raise ValueError(
        f'{len(var_unique)} distinct variable names but only '
        f'{len(literals)} two-letter literals are available'
    )

var_mapping_dict = dict(zip(var_unique, literals))

In [9]:
def replace_terms_in_equation(row, comp_dict, var_dict):
    """Rewrite ``row['reaction']`` with compartment and variable placeholders.

    Parameters
    ----------
    row : mapping
        Anything indexable with ``'reaction'`` (a DataFrame row, a dict, ...).
    comp_dict : dict
        Compartment name -> placeholder. A compartment is matched either
        wrapped in square brackets (the brackets are consumed) or as a
        standalone word.
    var_dict : dict
        Variable name -> placeholder, matched as a standalone word.

    Returns
    -------
    str
        The reaction with every known name replaced. A reaction that is not a
        string (for example a missing value) is returned unchanged.

    Notes
    -----
    All names are substituted in ONE pass over the text. Running one
    ``re.sub`` per name lets a later pass rewrite a placeholder that an
    earlier pass inserted (a variable literally called ``zeta`` or ``AA``
    would be hit), which corrupts the result. Keys that are not non-empty
    strings are ignored. When a name is both a compartment and a variable,
    the compartment placeholder is used.
    """
    r = row['reaction']
    if not isinstance(r, str):
        return r

    compartments = {k: v for k, v in comp_dict.items() if isinstance(k, str) and k}
    lookup = {k: v for k, v in var_dict.items() if isinstance(k, str) and k}
    lookup.update(compartments)  # compartments take precedence on a name clash
    if not lookup:
        return r

    def _alternation(names):
        # Longest names first, so a name is never shadowed by a shorter name
        # that is a prefix of it (e.g. 'Blood' versus 'Blood-Lymph').
        ordered = sorted(names, key=len, reverse=True)
        return '|'.join(re.escape(name) for name in ordered)

    branches = []
    if compartments:
        # Bracketed compartment: the square brackets are part of the match.
        branches.append(rf'\[(?P<bracketed>{_alternation(compartments)})\]')
    # Any known name as a standalone word.
    branches.append(rf'\b(?P<bare>{_alternation(lookup)})\b')
    pattern = re.compile('|'.join(branches))

    def _placeholder(match):
        # Exactly one named group takes part in each match.
        return lookup[match.group(match.lastgroup)]

    return pattern.sub(_placeholder, r)


In [10]:
# Rewrite the reaction of every row; the two mapping dictionaries are passed
# to the helper as extra positional arguments after the row itself.
df['simplified_reaction'] = df.apply(
    replace_terms_in_equation,
    axis=1,
    args=(comp_mapping_dict, var_mapping_dict),
)
df.head()

Unnamed: 0,reaction number,reaction,reaction rate,description,simplified_reaction
0,1,[Blood-Lymph].CTLA4_mabB <-> Lymph_Node.CTLA4_mab,Kpa_LNB*S_LNB*VL*f_LN_CTLA4*([Blood-Lymph].CTL...,Distribution of Anti-CTLA-4 mAb between the ce...,alpha.CE <-> delta.CD
1,2,[Blood-Lymph].CTLA4_mabB <-> Peripheral.CTLA4_...,0.67*Q_L*[Blood-Lymph].CTLA4_mabB*(1-Sigma1_CT...,Distribution of Anti-CTLA-4 mAb between the ce...,alpha.CE <-> epsilon.CG
2,3,[Blood-Lymph].CTLA4_mabB <-> Peripheral.CTLA4_...,0.33*Q_L*[Blood-Lymph].CTLA4_mabB*(1-Sigma2_CT...,Distribution of Anti-CTLA-4 mAb between the ce...,alpha.CE <-> epsilon.CH
3,4,[Blood-Lymph].CTLA4_mabB <-> Tumor.CTLA4_mabt,Kpa_TB*S_TB*Vt_avg_const*[Blood-Lymph].CTLA4_m...,Distribution of Anti-CTLA-4 mAb between the ce...,alpha.CE <-> zeta.CJ
4,5,[Blood-Lymph].CTLA4_mabB -> null,Cl_CTLA4*[Blood-Lymph].CTLA4_mabB,Clearance of Anti-CTLA-4 mAb from the central ...,alpha.CE -> null


Save outputs

In [11]:
# Final table as comma-separated text, without the positional index.
df.to_csv('/home/jovyan/work/dufourspitze_16/data/final_table.txt', index=False)

# json.dump() writes exactly one JSON document per call. Dumping the two
# dictionaries one after the other into the same handle produced '{...}{...}',
# which json.load() cannot read back. Both mappings are therefore stored in a
# single object, keyed by the table-4 column each of them was built from.
with open('/home/jovyan/work/dufourspitze_16/data/mappings.json', 'w', encoding='utf-8') as f:
    json.dump(
        {
            'compartment_location': comp_mapping_dict,
            'variable_name': var_mapping_dict,
        },
        f,
        indent=2,
    )