In [1]:
import pandas as pd

from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from IPython.display import SVG
import re

In [2]:
pairs = pd.read_csv('data/Main_RPAIRS_KEGG.tsv', sep='\t')
pairs['source'] = pairs['Reactant_pair'].apply(lambda x: x.split('_')[0])
pairs['target'] = pairs['Reactant_pair'].apply(lambda x: x.split('_')[1])
pairs.shape

(10747, 6)

In [3]:
df = pd.read_csv('data/compounds_list_KEGG.csv')
df.head()

Unnamed: 0,id,mol_weight,formula
0,C00002,507.181,C10H16N5O13P3
1,C00003,664.433,C21H28N7O14P2
2,C00005,745.4209,C21H30N7O17P3
3,C00007,31.9988,O2
4,C00011,44.0095,CO2


In [4]:
df.shape

(5620, 3)

How many nans

In [5]:
df.isna().sum()

id              0
mol_weight    680
formula         2
dtype: int64

In [6]:
# These are probably mistakes
display(df[df['formula'].isna()])

Unnamed: 0,id,mol_weight,formula
665,C15778,,
2845,C20798,,


### Fix manually these mistakes

**Correct *C15778***

In [7]:
pairs[pairs['source'] == 'C15778']

Unnamed: 0,Reactant_pair,CAR,KEGG_reactions,RPAIR_main,source,target
787,C15778_C15781,1.0,R07492,False,C15778,C15781


In [8]:
pairs[pairs['target'] == 'C15778']

Unnamed: 0,Reactant_pair,CAR,KEGG_reactions,RPAIR_main,source,target


According to **KEGG** for the Reaction *R07492*, *C15778* should be *C15780*

In [9]:
pairs.loc[pairs.index[787], 'Reactant_pair'] = 'C15780_C15781'

df.loc[df.index[665], 'id'] = 'C15780'
df.loc[df.index[665], 'mol_weight'] = 396.6484
df.loc[df.index[665], 'formula'] = 'C28H44O'

**Correct *C20798***

In [10]:
pairs[pairs['source'] == 'C20798']

Unnamed: 0,Reactant_pair,CAR,KEGG_reactions,RPAIR_main,source,target
5112,C20798_C21180,0.48,R11213,False,C20798,C21180
5767,C20798_C20831,0.349,R10760,True,C20798,C20831


In [11]:
pairs[pairs['target'] == 'C20798']

Unnamed: 0,Reactant_pair,CAR,KEGG_reactions,RPAIR_main,source,target
890,C19675_C20798,1.0,R10719,False,C19675,C20798
2787,C11499_C20798,0.855,"R11677,R11678",False,C11499,C20798


According to **KEGG** for the Reaction *R10719*, *C20798* should be *C21181*

In [12]:
pairs.loc[pairs.index[5112], 'Reactant_pair'] = 'C21181_C21180'
pairs.loc[pairs.index[5767], 'Reactant_pair'] = 'C21181_C20831'
pairs.loc[pairs.index[890], 'Reactant_pair'] = 'C19675_C21181'
pairs.loc[pairs.index[2787], 'Reactant_pair'] = 'C11499_C21181'

df.loc[df.index[2845], 'id'] = 'C21181'
df.loc[df.index[2845], 'mol_weight'] = 154.1417
df.loc[df.index[2845], 'formula'] = 'C3H6O5S'

Recalculate *source* and *target* in pairs

In [13]:
pairs['source'] = pairs['Reactant_pair'].apply(lambda x: x.split('_')[0])
pairs['target'] = pairs['Reactant_pair'].apply(lambda x: x.split('_')[1])

pairs.to_csv('data/Main_RPAIRS_KEGG_fixed.csv', index=None)

## Exctract Features From Chemical Formula

In [14]:
def extract_elements(df, column_name):
    # define the regular expression pattern to match the chemical formula
    pattern = r'[A-Z][a-z]?'
    # initialize a set to store the element symbols
    elements = set()
    # loop over the values in the specified column of the DataFrame
    for value in df[column_name].values:
        # find all matches of the pattern in the value string
        matches = re.findall(pattern, value)
        # add the matches to the set of elements
        elements.update(matches)
    return elements

def extract_stoichiometry(formula):
    # define the regular expression pattern to match the chemical formula
    pattern = r'([A-Z][a-z]?)(\d*)'
    # initialize the dictionary to store the element symbol and its stoichiometry
    stoichiometry = {}
    # loop over the matches of the pattern in the formula string
    for match in re.findall(pattern, formula):
        symbol, count = match
        # if the count is empty, set it to 1
        count = int(count) if count else 1
        # add the symbol and count to the stoichiometry dictionary
        stoichiometry[symbol] = count
    return stoichiometry

# example usage
elements = extract_elements(df, 'formula')
print(elements)

# Create a col for every element
for elm in elements: df[elm]=0

{'Se', 'I', 'F', 'Br', 'Mo', 'Co', 'Mg', 'Cl', 'R', 'X', 'Zn', 'N', 'C', 'S', 'P', 'O', 'As', 'H'}


In [15]:
for row in range(len(df)):
    
    formula = df['formula'].iloc[row]
    stoichiometry = extract_stoichiometry(formula)
    
    for key, value in stoichiometry.items():
        #df[key].iloc[row] = value
        df.loc[df.index[row], key] = value

In [16]:
df.head(3)

Unnamed: 0,id,mol_weight,formula,Se,I,F,Br,Mo,Co,Mg,...,R,X,Zn,N,C,S,P,O,As,H
0,C00002,507.181,C10H16N5O13P3,0,0,0,0,0,0,0,...,0,0,0,5,10,0,3,13,0,16
1,C00003,664.433,C21H28N7O14P2,0,0,0,0,0,0,0,...,0,0,0,7,21,0,2,14,0,28
2,C00005,745.4209,C21H30N7O17P3,0,0,0,0,0,0,0,...,0,0,0,7,21,0,3,17,0,30


In [17]:
# Many many compounds with R
print(len(df[df['R']!=0]))

# Col that contains the info if the compound is a polymer or not
df['polymer'] = 0

for row in range(len(df)):
    if 'n' in df['formula'].iloc[row]: 
        df.loc[df.index[row], 'polymer'] = 1
        
# Print how many polymers do we have
print(len(df[df['polymer'] == 1]))

597
129


In [18]:
df[df['mol_weight'].isna()]

Unnamed: 0,id,mol_weight,formula,Se,I,F,Br,Mo,Co,Mg,...,X,Zn,N,C,S,P,O,As,H,polymer
23,C00071,,CHOR,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
54,C00161,,C2HO3R,0,0,0,0,0,0,0,...,0,0,0,2,0,0,3,0,1,0
58,C00173,,C3H4OSR2,0,0,0,0,0,0,0,...,0,0,0,3,1,0,1,0,4,0
64,C00195,,C19H36NO3R,0,0,0,0,0,0,0,...,0,0,1,19,0,0,3,0,36,0
83,C00264,,C24H37N7O18P3SR,0,0,0,0,0,0,0,...,0,0,7,24,1,3,18,0,37,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5583,C15544,,C14H24NO11R,0,0,0,0,0,0,0,...,0,0,1,14,0,0,11,0,24,0
5602,C18009,,C9H15N2O5SR,0,0,0,0,0,0,0,...,0,0,2,9,1,0,5,0,15,0
5609,C02000,,HSO4R,0,0,0,0,0,0,0,...,0,0,0,0,1,0,4,0,1,0
5610,C02885,,HS2O3R,0,0,0,0,0,0,0,...,0,0,0,0,2,0,3,0,1,0


In [19]:
df['mol_weight'].fillna(999, inplace=True)

# Import to Networkx 

In [37]:
import networkx as nx

G = nx.Graph()

# add edges from the DataFrame
G.add_edges_from(pairs[['source', 'target']].values)

# Add df cols as node features
node_data = df.set_index('id').to_dict('index')

# Add the node features to the graph
for node, data in G.nodes(data=True):
    node_features = node_data.get(node)
    if node_features:
        data.update(node_features)

print(G.number_of_nodes(), G.number_of_edges())

5620 10747


In [38]:
# PageRank
pr = nx.pagerank(G)
pr = pd.DataFrame(list(pr.items()), columns=['Node', 'PageRank'])

# degree centrality
dc = nx.degree_centrality(G)
dc = pd.DataFrame(list(dc.items()), columns=['Node', 'Degree Centrality'])

# centralities
dc['PageRank'] = pr['PageRank'].copy()
dc.sort_values(by='PageRank', ascending=False, inplace=True)
dc = pd.merge(dc, df[['id','formula','mol_weight']].rename({'id':'Node'}, axis=1), on='Node')
dc.head(20)

Unnamed: 0,Node,Degree Centrality,PageRank,formula,mol_weight
0,C00001,0.163018,0.034408,H2O,18.0153
1,C00007,0.127247,0.030908,O2,31.9988
2,C00019,0.064068,0.016209,C15H22N6O5S,398.4374
3,C00029,0.033814,0.007744,C15H24N2O17P2,566.3018
4,C00002,0.040043,0.007642,C10H16N5O13P3,507.181
5,C00011,0.038263,0.007359,CO2,44.0095
6,C00014,0.037729,0.007142,NH3,17.0305
7,C00009,0.034348,0.006343,H3PO4,97.9952
8,C00024,0.028297,0.006223,C23H38N7O17P3S,809.5708
9,C00010,0.021178,0.004577,C21H36N7O16P3S,767.5341


#### Add edge attributes

In [88]:
def get_weights(a,b):
    return 2*abs(a-b)/b
    
for edge in G.edges():
    G.edges[(edge[0], edge[1])]['weight'] = get_weights(G.nodes[edge[0]]['mol_weight'], 
                                                        G.nodes[edge[1]]['mol_weight'])

ex. Coa is C00010

In [89]:
source = 'C00082'
target = 'C01533'
list(nx.shortest_path(G, source, target, weight='weight'))

['C00082',
 'C00013',
 'C01197',
 'C01494',
 'C05619',
 'C00482',
 'C05610',
 'C02325',
 'C01533']

In [62]:
df[df['id'] == 'C00082']

Unnamed: 0,id,mol_weight,formula,Se,I,F,Br,Mo,Co,Mg,...,X,Zn,N,C,S,P,O,As,H,polymer
29,C00082,181.1885,C9H11NO3,0,0,0,0,0,0,0,...,0,0,1,9,0,0,3,0,11,0


In [64]:
df[df['id'] == 'C00013']

Unnamed: 0,id,mol_weight,formula,Se,I,F,Br,Mo,Co,Mg,...,X,Zn,N,C,S,P,O,As,H,polymer
2292,C00013,177.9751,H4P2O7,0,0,0,0,0,0,0,...,0,0,0,0,0,2,7,0,4,0
