# Parsing data

In [1]:
import xml.etree.ElementTree as ET
import simplesbml #ignore the tellurium message
from collections import defaultdict
import csv
import os
import re
import pickle
import halp

The tests rely on tellurium to construct the models
Since tellurium is not installed the tests can't be run
If you want to run the tests, pip install tellurium first


## Chemical reactions

### AstroChem

In [2]:
file = "Data/Astrochem-networks/osu2009.osu"

# Maps the 1-based row number to a (reactants, products) tuple of species lists.
filtered = {}

i = 0

# Open the file in a context manager so the handle is closed after parsing.
with open(file) as handle:
    reader = csv.reader(handle, delimiter=" ")
    for i, row in enumerate(reader, start=1):

        # Splitting on single spaces leaves many empty fields (presumably the
        # file pads its columns with runs of spaces -- confirm against the data).
        # The first 13 fields are taken as reactants, fields 13-39 as products.
        in_list = row[:13]
        out_list = row[13:40]

        # Re-join the fields, collapse runs of spaces and trim the ends so that
        # the final split returns only the species names.
        simple_in = re.sub(' +', ' ', " ".join(in_list)).strip()
        simple_out = re.sub(' +', ' ', " ".join(out_list)).strip()

        # NOTE: a side with no species yields [''] here, not [].
        filtered[i] = (simple_in.split(" "), simple_out.split(" "))

In [3]:
#print(filtered)

In [4]:
# Spot-check one parsed entry; keys are the 1-based row numbers assigned while parsing.
filtered[6010]

(['HE+', 'C10H'], ['C9+', 'C', 'H', 'HE'])

In [5]:
# Show the parsed AstroChem entries: row number -> (reactant list, product list).
filtered

{1: (['H', 'H'], ['H2']),
 2: (['E', 'GRAIN0'], ['GRAIN-']),
 3: (['C+', 'GRAIN-'], ['C', 'GRAIN0']),
 4: (['FE+', 'GRAIN-'], ['FE', 'GRAIN0']),
 5: (['H+', 'GRAIN-'], ['H', 'GRAIN0']),
 6: (['HE+', 'GRAIN-'], ['HE', 'GRAIN0']),
 7: (['MG+', 'GRAIN-'], ['MG', 'GRAIN0']),
 8: (['N+', 'GRAIN-'], ['N', 'GRAIN0']),
 9: (['NA+', 'GRAIN-'], ['NA', 'GRAIN0']),
 10: (['O+', 'GRAIN-'], ['O', 'GRAIN0']),
 11: (['S+', 'GRAIN-'], ['S', 'GRAIN0']),
 12: (['SI+', 'GRAIN-'], ['SI', 'GRAIN0']),
 13: (['H3+', 'GRAIN-'], ['H2', 'H', 'GRAIN0']),
 14: (['HCO+', 'GRAIN-'], ['H', 'CO', 'GRAIN0']),
 15: (['C'], ['C+', 'E']),
 16: (['CL'], ['CL+', 'E']),
 17: (['FE'], ['FE+', 'E']),
 18: (['H'], ['H+', 'E']),
 19: (['HE'], ['HE+', 'E']),
 20: (['MG'], ['MG+', 'E']),
 21: (['N'], ['N+', 'E']),
 22: (['NA'], ['NA+', 'E']),
 23: (['O'], ['O+', 'E']),
 24: (['P'], ['P+', 'E']),
 25: (['S'], ['S+', 'E']),
 26: (['SI'], ['SI+', 'E']),
 27: (['C2'], ['C', 'C']),
 28: (['CCL'], ['C', 'CL']),
 29: (['CH'], ['C', 'H'])

In [6]:
# Sanity check: print every entry in which either side contains an empty
# species name (a cell that prints nothing means none were found).
for reaction_id, sides in filtered.items():
    reactants, products = sides
    has_blank = any(name == '' for name in reactants) or any(name == '' for name in products)
    if has_blank:
        print(reaction_id, reactants, products)

In [7]:
# Serialize the parsed AstroChem hyperedges to disk.
astrochem_path = 'ParsedHyperedges/Chemical/astrochem.pkl'
with open(astrochem_path, 'wb') as out_file:
    pickle.dump(filtered, out_file)

### Pathways (PID + reactome)

Both datasets are already preprocessed within their Data/ subfolder as `halp` hypergraphs. We now turn them into simple dictionaries and simplify the node labels.

In [8]:
def halp_to_dict(H, weighted=False):
    """Convert a hypergraph object into a plain dictionary of hyperedges.

    Parameters
    ----------
    H : object
        Hypergraph exposing ``get_hyperedge_id_set()``,
        ``get_hyperedge_tail(ID)`` and ``get_hyperedge_head(ID)``; the node
        names returned by the latter two must be strings.
    weighted : bool, optional
        Weighted conversion is not supported; any truthy value raises.

    Returns
    -------
    dict
        Maps each hyperedge ID to a ``(tail_labels, head_labels)`` tuple of
        lists. Every label is the part of the node name after its last ``"/"``
        (the whole name when it contains no ``"/"``).

    Raises
    ------
    NotImplementedError
        If ``weighted`` is truthy.
    """
    if weighted:
        # NotImplementedError is a subclass of Exception, so callers that
        # catch Exception keep working.
        raise NotImplementedError("Not implemented yet")

    def _short_labels(nodes):
        # Keep only the last path component of every node name.
        return [node.split("/")[-1] for node in nodes]

    return {
        ID: (_short_labels(H.get_hyperedge_tail(ID)),
             _short_labels(H.get_hyperedge_head(ID)))
        for ID in H.get_hyperedge_id_set()
    }

In [9]:
folder = 'Data/mmunin/'

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this cell on files from a trusted source.
pid_path = folder + "allpid-halpHypergraph.pkl"
with open(pid_path, 'rb') as pid_file:
    H_pid = pickle.load(pid_file)

reactome_path = folder + "allreactome-halpHypergraph.pkl"
with open(reactome_path, 'rb') as reactome_file:
    H_reac = pickle.load(reactome_file)

In [10]:
# Convert both pathway hypergraphs (PID first, then Reactome) to plain dictionaries.
Hdict_pid, Hdict_reac = map(halp_to_dict, (H_pid, H_reac))

In [11]:
# Serialize each converted pathway dictionary to its own pickle file.
for out_path, hyperedges in (
    ('ParsedHyperedges/Chemical/pid.pkl', Hdict_pid),
    ('ParsedHyperedges/Chemical/reactome.pkl', Hdict_reac),
):
    with open(out_path, 'wb') as f:
        pickle.dump(hyperedges, f)

### KIDA Termolecular

In [12]:
file = "Data/KIDA-dataset/kida_TERMOLECULAR.dat"

# Maps the 1-based row number to a [reactants, products] pair of species lists.
filtered = {}

i = 0

# Open the file in a context manager so the handle is closed after parsing.
with open(file) as handle:
    reader = csv.reader(handle, delimiter=" ")
    for i, row in enumerate(reader, start=1):

        # Splitting on single spaces leaves many empty fields; the first 22
        # fields are taken as reactants, fields 22-77 as products.
        in_list = row[:22]
        out_list = row[22:78]

        # Re-join, collapse runs of spaces and trim, so the final split
        # returns only the species names.
        simple_in = re.sub(' +', ' ', " ".join(in_list)).strip()
        simple_out = re.sub(' +', ' ', " ".join(out_list)).strip()

        filtered[i] = [simple_in.split(" "), simple_out.split(" ")]

In [13]:
# Show the parsed termolecular entries: row number -> [reactant list, product list].
filtered

{1: [['H', 'H', 'N2'], ['H2', 'N2']],
 2: [['H', 'N2', 'CH2'], ['N2', 'CH3']],
 3: [['H', 'N2', 'CH3'], ['N2', 'CH4']],
 4: [['H', 'CH3', 'He'], ['He', 'CH4']],
 5: [['H', 'CH3', 'Ar'], ['CH4', 'Ar']],
 6: [['H', 'CH3', 'C2H6'], ['CH4', 'C2H6']],
 7: [['H', 'N2', 'CCH'], ['N2', 'C2H2']],
 8: [['H', 'N2', 'C2H2'], ['N2', 'C2H3']],
 9: [['H', 'N2', 'C2H3'], ['N2', 'C2H4']],
 10: [['H', 'C2H3', 'He'], ['He', 'C2H4']],
 11: [['H', 'N2', 'C2H4'], ['N2', 'C2H5']],
 12: [['H', 'N2', 'C2H4'], ['N2', 'C2H5']],
 13: [['H', 'C2H4', 'He'], ['He', 'C2H5']],
 14: [['H', 'N2', 'C2H5'], ['N2', 'C2H6']],
 15: [['H', 'N2', 'CH2CCH'], ['N2', 'CH3CCH']],
 16: [['H', 'N2', 'CH2CCH'], ['N2', 'CH2CCH2']],
 17: [['H', 'N2', 'CH3CCH'], ['N2', 'C3H5']],
 18: [['H', 'N2', 'CH3CCH'], ['N2', 'C2H2', 'CH3']],
 19: [['H', 'N2', 'CH2CCH2'], ['N2', 'C3H5']],
 20: [['H', 'N2', 'CH2CCH2'], ['N2', 'C2H2', 'CH3']],
 21: [['H', 'N2', 'C3H5'], ['N2', 'CH3CHCH2']],
 22: [['H', 'N2', 'CH3CHCH2'], ['N2', 'C3H7']],
 23: [['H', 

In [14]:
# Serialize the parsed termolecular hyperedges to disk.
termolecular_path = 'ParsedHyperedges/Chemical/KIDA_termolecular.pkl'
with open(termolecular_path, 'wb') as out_file:
    pickle.dump(filtered, out_file)

### KIDA Unibimolecular

In [15]:
file = "Data/KIDA-dataset/kida_UNIBIMOLECULAR.dat"

# Maps the 1-based row number to a [reactants, products] pair of species lists.
filtered = {}

i = 0

# Open the file in a context manager so the handle is closed after parsing.
with open(file) as handle:
    reader = csv.reader(handle, delimiter=" ")
    for i, row in enumerate(reader, start=1):

        # Splitting on single spaces leaves many empty fields; the first 18
        # fields are taken as reactants and fields 20-59 as products
        # (fields 18 and 19 are not used).
        in_list = row[:18]
        out_list = row[20:60]

        # Re-join, collapse runs of spaces and trim, so the final split
        # returns only the species names.
        simple_in = re.sub(' +', ' ', " ".join(in_list)).strip()
        simple_out = re.sub(' +', ' ', " ".join(out_list)).strip()

        filtered[i] = [simple_in.split(" "), simple_out.split(" ")]

In [16]:
# Show the parsed uni/bimolecular entries: row number -> [reactant list, product list].
filtered

{1: [['C4H2', 'Photon'], ['H', 'C4H']],
 2: [['C4H2', 'Photon'], ['C4H2+', 'e-']],
 3: [['C4H2', 'Photon'], ['CCH', 'CCH']],
 4: [['C5H', 'Photon'], ['CCH', 'C3']],
 5: [['C5H', 'Photon'], ['C2', 'c-C3H']],
 6: [['C5H', 'Photon'], ['C2', 'l-C3H']],
 7: [['C5H', 'Photon'], ['H', 'C5']],
 8: [['C5N', 'Photon'], ['CN', 'C4']],
 9: [['CH3OH', 'Photon'], ['H', 'H2COH+', 'e-']],
 10: [['CH3OH', 'Photon'], ['H', 'H2COH+', 'e-']],
 11: [['CH3OH', 'Photon'], ['H2', 'H2CO']],
 12: [['CH3OH', 'Photon'], ['H2', 'H2CO']],
 13: [['CH3OH', 'Photon'], ['CH3OH+', 'e-']],
 14: [['CH3OH', 'Photon'], ['CH3OH+', 'e-']],
 15: [['CH3OH', 'Photon'], ['OH', 'CH3']],
 16: [['CH3OH', 'Photon'], ['OH', 'CH3']],
 17: [['CH3CHO', 'Photon'], ['C2H4O+', 'e-']],
 18: [['CH3CHO', 'Photon'], ['C2H4O+', 'e-']],
 19: [['CH3CHO', 'Photon'], ['HCO', 'CH3']],
 20: [['CH3CHO', 'Photon'], ['HCO', 'CH3']],
 21: [['CH3CHO', 'Photon'], ['CO', 'CH4']],
 22: [['CH3CHO', 'Photon'], ['CO', 'CH4']],
 23: [['CH3CCH', 'Photon'], ['C3H4+

In [17]:
# Serialize the parsed uni/bimolecular hyperedges to disk.
unibimolecular_path = 'ParsedHyperedges/Chemical/KIDA_unibimolecular.pkl'
with open(unibimolecular_path, 'wb') as out_file:
    pickle.dump(filtered, out_file)

### KIDA Surface

In [18]:
file1 = "Data/KIDA-dataset/kida_SURFACE_REACTIONS.dat"
file2 = "Data/KIDA-dataset/kida_SURFACE_ENERGYACTIVATION.dat"

# Maps a running 1-based row number to a [reactants, products] pair of species
# lists; numbering continues from the first file into the second.
filtered = {}

i = 0

# Both files are parsed with the same column split, so handle them in one loop.
for path in (file1, file2):
    # Open each file in a context manager so the handle is closed after parsing.
    with open(path) as handle:
        reader = csv.reader(handle, delimiter=" ")
        for row in reader:

            i += 1

            # Splitting on single spaces leaves many empty fields; the first 25
            # fields are taken as reactants, fields 25-59 as products.
            in_list = row[:25]
            out_list = row[25:60]

            # Re-join, collapse runs of spaces and trim, so the final split
            # returns only the species names.
            simple_in = re.sub(' +', ' ', " ".join(in_list)).strip()
            simple_out = re.sub(' +', ' ', " ".join(out_list)).strip()

            filtered[i] = [simple_in.split(" "), simple_out.split(" ")]
     


In [19]:
# Show the parsed surface entries: running row number -> [reactant list, product list].
filtered

{1: [['C', 'C'], ['C2']],
 2: [['C', 'C2'], ['C3']],
 3: [['C', 'CCH'], ['c-C3H']],
 4: [['C', 'CCH'], ['l-C3H']],
 5: [['C', 'C2H3'], ['CH2CCH']],
 6: [['C', 'CCN'], ['C3N']],
 7: [['C', 'CCO'], ['C3O']],
 8: [['C', 'CCS'], ['C3S']],
 9: [['C', 'C3'], ['C4']],
 10: [['C', 'c-C3H'], ['C4H']],
 11: [['C', 'l-C3H'], ['C4H']],
 12: [['C', 'C4'], ['C5']],
 13: [['C', 'C4H'], ['C5H']],
 14: [['C', 'C5'], ['C6']],
 15: [['C', 'C5H'], ['C6H']],
 16: [['C', 'C6'], ['C7']],
 17: [['C', 'C6H'], ['C7H']],
 18: [['C', 'C7'], ['C8']],
 19: [['C', 'C7H'], ['C8H']],
 20: [['C', 'C8'], ['C9']],
 21: [['C', 'C8H'], ['C9H']],
 22: [['C', 'C9'], ['C10']],
 23: [['C', 'CH'], ['CCH']],
 24: [['C', 'CH2'], ['C2H2']],
 25: [['C', 'CH3'], ['C2H3']],
 26: [['C', 'CN'], ['CCN']],
 27: [['C', 'HS'], ['CS', 'H']],
 28: [['C', 'N'], ['CN']],
 29: [['C', 'NH'], ['HNC']],
 30: [['C', 'NH2'], ['HNC', 'H']],
 31: [['C', 'NO'], ['CN', 'O']],
 32: [['C', 'NO'], ['OCN']],
 33: [['C', 'NS'], ['CN', 'S']],
 34: [['C', 'O']

In [20]:
# Serialize the parsed surface hyperedges to disk.
surface_path = 'ParsedHyperedges/Chemical/KIDA_surface.pkl'
with open(surface_path, 'wb') as out_file:
    pickle.dump(filtered, out_file)

### KIDA uva2014

In [21]:
file = "Data/kida.uva.2014/kida.uva.2014/kida.uva.2014.dat"

# Maps the 1-based row number to a [reactants, products] pair of species lists.
# The first row is skipped, so keys start at 2.
filtered = {}

i = 0

# Open the file in a context manager so the handle is closed after parsing.
with open(file) as handle:
    reader = csv.reader(handle, delimiter=" ")
    for i, row in enumerate(reader, start=1):

        # The first line is not parsed (presumably a header -- confirm against the file).
        if i == 1:
            continue

        # Splitting on single spaces leaves many empty fields; the first 18
        # fields are taken as reactants and fields 20-59 as products
        # (fields 18 and 19 are not used).
        in_list = row[:18]
        out_list = row[20:60]

        # Re-join, collapse runs of spaces and trim, so the final split
        # returns only the species names.
        simple_in = re.sub(' +', ' ', " ".join(in_list)).strip()
        simple_out = re.sub(' +', ' ', " ".join(out_list)).strip()

        filtered[i] = [simple_in.split(" "), simple_out.split(" ")]

In [22]:
# Show the parsed entries: row number (starting at 2, the first row is skipped) -> [reactant list, product list].
filtered

{2: [['N2', 'CR'], ['N', 'N']],
 3: [['H', 'CR'], ['H+', 'e-']],
 4: [['He', 'CR'], ['He+', 'e-']],
 5: [['N', 'CR'], ['N+', 'e-']],
 6: [['O', 'CR'], ['O+', 'e-']],
 7: [['CO', 'CR'], ['C', 'O']],
 8: [['CO', 'CR'], ['CO+', 'e-']],
 9: [['H2', 'CR'], ['H', 'H']],
 10: [['H2', 'CR'], ['H', 'H+', 'e-']],
 11: [['H2', 'CR'], ['H+', 'H-']],
 12: [['H2', 'CR'], ['H2+', 'e-']],
 13: [['HC7N', 'CRP'], ['CN', 'C6H']],
 14: [['CH3COCH3', 'CRP'], ['H2CCO', 'CH4']],
 15: [['C8H2', 'CRP'], ['H', 'C8H']],
 16: [['C9H', 'CRP'], ['H', 'C9']],
 17: [['C9N', 'CRP'], ['CN', 'C8']],
 18: [['C10', 'CRP'], ['C3', 'C7']],
 19: [['C10', 'CRP'], ['C5', 'C5']],
 20: [['C10H', 'CRP'], ['H', 'C10']],
 21: [['CH3C5N', 'CRP'], ['CH3', 'C5N']],
 22: [['C9H2', 'CRP'], ['H', 'C9H']],
 23: [['CH3C6H', 'CRP'], ['CH3', 'C6H']],
 24: [['CH3C7N', 'CRP'], ['CH3', 'C7N']],
 25: [['HC9N', 'CRP'], ['CN', 'C8H']],
 26: [['HC6N', 'CRP'], ['CH', 'CN', 'C4']],
 27: [['HC8N', 'CRP'], ['CH', 'CN', 'C6']],
 28: [['NC4N', 'CRP'], ['

In [23]:
# Serialize the parsed kida.uva.2014 hyperedges to disk.
uva_path = 'ParsedHyperedges/Chemical/KIDA_uva.pkl'
with open(uva_path, 'wb') as out_file:
    pickle.dump(filtered, out_file)

### Reaction Template Generator

In [24]:
# Parse the reaction-template XML file and keep a handle on its root element.
template_path = 'Data/ReactionTemplate/13321_2018_269_MOESM5_ESM.xml'
tree = ET.parse(template_path)
root = tree.getroot()

In [25]:
# Collect the "name" attribute of every <inp-reaction-family> element that is a
# direct child of the root.
unprocessed_list = [
    family.attrib['name'] for family in root.findall('inp-reaction-family')
]

In [26]:
# Maps a running 1-based index (counting only the entries that are kept) to a
# [reactants, products] pair of species lists.
filtered = {}

i = 0

for entry in unprocessed_list:

    # Entries whose name contains a parenthesis are skipped.
    if '(' in entry:
        continue

    # The direction is the text after the LAST underscore; everything before
    # it is the reaction, which may itself contain underscores.
    reaction, _sep, direction = entry.rpartition("_")

    i += 1

    # The reaction reads "<left side>=<right side>", each side being a
    # "+"-separated list of species.
    in_list, out_list = reaction.split("=")

    if direction == "forward":
        filtered[i] = [in_list.split("+"), out_list.split("+")]

    elif direction == "reverse":
        # A reverse entry swaps the two sides of the written reaction.
        filtered[i] = [out_list.split("+"), in_list.split("+")]

    else:
        raise ValueError(
            f"Unknown reaction direction {direction!r} in entry {entry!r}"
        )
        

In [27]:
# Show the parsed reaction templates: running index -> [reactant list, product list].
filtered

{1: [['H', 'O2'], ['O', 'OH']],
 2: [['O', 'OH'], ['H', 'O2']],
 3: [['O', 'H2'], ['H', 'OH']],
 4: [['H', 'OH'], ['O', 'H2']],
 5: [['OH', 'H2'], ['H', 'H2O']],
 6: [['H', 'H2O'], ['OH', 'H2']],
 7: [['O', 'H2O'], ['OH', 'OH']],
 8: [['OH', 'OH'], ['O', 'H2O']],
 9: [['H2', 'M'], ['H', 'H', 'M']],
 10: [['H', 'H', 'M'], ['H2', 'M']],
 11: [['O', 'O', 'M'], ['O2', 'M']],
 12: [['O2', 'M'], ['O', 'O', 'M']],
 13: [['O', 'H', 'M'], ['OH', 'M']],
 14: [['OH', 'M'], ['O', 'H', 'M']],
 15: [['H', 'OH', 'M'], ['H2O', 'M']],
 16: [['H2O', 'M'], ['H', 'OH', 'M']],
 17: [['H2', 'O2'], ['H', 'HO2']],
 18: [['H', 'HO2'], ['H2', 'O2']],
 19: [['HO2', 'O'], ['OH', 'O2']],
 20: [['OH', 'O2'], ['HO2', 'O']],
 21: [['HO2', 'OH'], ['H2O', 'O2']],
 22: [['H2O', 'O2'], ['HO2', 'OH']],
 23: [['H2O2', 'H'], ['H2O', 'OH']],
 24: [['H2O', 'OH'], ['H2O2', 'H']],
 25: [['OH', 'HO2'], ['H2O2', 'O']],
 26: [['H2O2', 'OH'], ['H2O', 'HO2']],
 27: [['CO', 'O2'], ['CO2', 'O']],
 28: [['CO2', 'O'], ['CO', 'O2']],
 29

In [28]:
# Serialize the parsed reaction-template hyperedges to disk.
template_out_path = 'ParsedHyperedges/Chemical/ReactionTemplate.pkl'
with open(template_out_path, 'wb') as out_file:
    pickle.dump(filtered, out_file)

## Metabolic reactions

### BiGG datasets

In [None]:
folder = 'Data/BiGG/'
for file in os.listdir(folder):

    print(file)

    # Files that simplesbml rejects with a ValueError are reported and skipped.
    try:
        model = simplesbml.loadSBMLFile(folder + file)
    except ValueError:
        print("ERROR --- invalid SBML file ---", file)
        continue

    # Reaction position in the model -> [reactants, products].
    filtered = {}
    for i, ID in enumerate(model.getListOfReactionIds()):
        reactants = [
            model.getReactant(ID, position)
            for position in range(model.getNumReactants(ID))
        ]
        products = [
            model.getProduct(ID, position)
            for position in range(model.getNumProducts(ID))
        ]
        filtered[i] = [reactants, products]

    # One pickle per model, named after the input file without its ".xml" part.
    out_path = f'ParsedHyperedges/Metabolical-BiGG/{file.split(".xml")[0]}.pkl'
    with open(out_path, 'wb') as f:
        pickle.dump(filtered, f)

### BioModels

In [2]:
folder = 'Data/BioModels/'
for file in os.listdir(folder):

    # Only files whose name contains "BIOMD" are processed.
    if "BIOMD" not in file:
        continue

    print(file)

    # Files that simplesbml rejects with a ValueError are reported and skipped.
    try:
        model = simplesbml.loadSBMLFile(folder + file)
    except ValueError:
        print("ERROR --- invalid SBML file ---", file)
        continue

    fail = False
    # Reaction position in the model -> [reactants, products].
    filtered = {}
    for i, ID in enumerate(model.getListOfReactionIds()):
        reactants = [
            model.getReactant(ID, position)
            for position in range(model.getNumReactants(ID))
        ]
        products = [
            model.getProduct(ID, position)
            for position in range(model.getNumProducts(ID))
        ]

        # A single reaction with an empty side discards the whole model.
        if not (reactants and products):
            print("Empty sets...")
            fail = True
            break

        filtered[i] = [reactants, products]

    # Nothing is written for a discarded model.
    if fail:
        continue

    # One pickle per model, named after the input file without its ".xml" part.
    out_path = f'ParsedHyperedges/Metabolical-BioModels/{file.split(".xml")[0]}.pkl'
    with open(out_path, 'wb') as f:
        pickle.dump(filtered, f)

BIOMD0000000569.xml
BIOMD0000000289.xml
Empty sets...
BIOMD0000000917.xml
Empty sets...
BIOMD0000001015.xml
Empty sets...
BIOMD0000000137.xml
Empty sets...
BIOMD0000000397.xml
BIOMD0000000479.xml
Empty sets...
BIOMD0000000232.xml
Empty sets...
BIOMD0000000548.xml
BIOMD0000001045.xml
BIOMD0000000684.xml
BIOMD0000000914.xml
Empty sets...
BIOMD0000000880.xml
Empty sets...
BIOMD0000000029.xml
BIOMD0000001033.xml
Empty sets...
BIOMD0000000100.xml
Empty sets...
BIOMD0000000346.xml
BIOMD0000000356.xml
Empty sets...
BIOMD0000000003.xml
Empty sets...
BIOMD0000001058.xml
Empty sets...
BIOMD0000000843.xml
Empty sets...
BIOMD0000000872.xml
ERROR --- invalid SBML file --- BIOMD0000000872.xml
BIOMD0000000938.xml
Empty sets...
BIOMD0000000653.xml
BIOMD0000000599.xml
Empty sets...
BIOMD0000000032.xml
Empty sets...
BIOMD0000000780.xml
Empty sets...
BIOMD0000000906.xml
Empty sets...
BIOMD0000000551.xml
BIOMD0000000568.xml
Empty sets...
BIOMD0000000761.xml
Empty sets...
BIOMD0000000276.xml
BIOMD000000058