In [28]:
import pandas as pd

from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from IPython.display import SVG
import re

In [29]:
pairs = pd.read_csv('data/Main_RPAIRS_KEGG.tsv', sep='\t')
pairs['source'] = pairs['Reactant_pair'].apply(lambda x: x.split('_')[0])
pairs['target'] = pairs['Reactant_pair'].apply(lambda x: x.split('_')[1])
pairs.shape

(10747, 6)

In [30]:
df = pd.read_csv('data/compounds_list_KEGG.csv')
df.head()

Unnamed: 0,id,mol_weight,formula
0,C00002,507.181,C10H16N5O13P3
1,C00003,664.433,C21H28N7O14P2
2,C00005,745.4209,C21H30N7O17P3
3,C00007,31.9988,O2
4,C00011,44.0095,CO2


In [31]:
df.shape

(5620, 3)

How many nans

In [32]:
df.isna().sum()

id              0
mol_weight    680
formula         2
dtype: int64

In [33]:
# These are probably mistakes
display(df[df['formula'].isna()])

Unnamed: 0,id,mol_weight,formula
665,C15778,,
2845,C20798,,


### Fix manually these mistakes

**Correct *C15778***

In [34]:
pairs[pairs['source'] == 'C15778']

Unnamed: 0,Reactant_pair,CAR,KEGG_reactions,RPAIR_main,source,target
787,C15778_C15781,1.0,R07492,False,C15778,C15781


In [35]:
pairs[pairs['target'] == 'C15778']

Unnamed: 0,Reactant_pair,CAR,KEGG_reactions,RPAIR_main,source,target


According to **KEGG** for the Reaction *R07492*, *C15778* should be *C15780*

In [36]:
pairs.loc[pairs.index[787], 'Reactant_pair'] = 'C15780_C15781'

df.loc[df.index[665], 'id'] = 'C15780'
df.loc[df.index[665], 'mol_weight'] = 396.6484
df.loc[df.index[665], 'formula'] = 'C28H44O'

**Correct *C20798***

In [37]:
pairs[pairs['source'] == 'C20798']

Unnamed: 0,Reactant_pair,CAR,KEGG_reactions,RPAIR_main,source,target
5112,C20798_C21180,0.48,R11213,False,C20798,C21180
5767,C20798_C20831,0.349,R10760,True,C20798,C20831


In [38]:
pairs[pairs['target'] == 'C20798']

Unnamed: 0,Reactant_pair,CAR,KEGG_reactions,RPAIR_main,source,target
890,C19675_C20798,1.0,R10719,False,C19675,C20798
2787,C11499_C20798,0.855,"R11677,R11678",False,C11499,C20798


According to **KEGG** for the Reaction *R10719*, *C20798* should be *C21181*

In [39]:
pairs.loc[pairs.index[5112], 'Reactant_pair'] = 'C21181_C21180'
pairs.loc[pairs.index[5767], 'Reactant_pair'] = 'C21181_C20831'
pairs.loc[pairs.index[890], 'Reactant_pair'] = 'C19675_C21181'
pairs.loc[pairs.index[2787], 'Reactant_pair'] = 'C11499_C21181'

df.loc[df.index[2845], 'id'] = 'C21181'
df.loc[df.index[2845], 'mol_weight'] = 154.1417
df.loc[df.index[2845], 'formula'] = 'C3H6O5S'

Recalculate *source* and *target* in pairs

In [40]:
pairs['source'] = pairs['Reactant_pair'].apply(lambda x: x.split('_')[0])
pairs['target'] = pairs['Reactant_pair'].apply(lambda x: x.split('_')[1])

pairs.to_csv('data/Main_RPAIRS_KEGG_fixed.csv', index=None)

## Exctract Features From Chemical Formula

In [100]:
def extract_elements(df, column_name):
    # define the regular expression pattern to match the chemical formula
    pattern = r'[A-Z][a-z]?'
    # initialize a set to store the element symbols
    elements = set()
    # loop over the values in the specified column of the DataFrame
    for value in df[column_name].values:
        # find all matches of the pattern in the value string
        matches = re.findall(pattern, value)
        # add the matches to the set of elements
        elements.update(matches)
    return elements

def extract_stoichiometry(formula):
    # define the regular expression pattern to match the chemical formula
    pattern = r'([A-Z][a-z]?)(\d*)'
    # initialize the dictionary to store the element symbol and its stoichiometry
    stoichiometry = {}
    # loop over the matches of the pattern in the formula string
    for match in re.findall(pattern, formula):
        symbol, count = match
        # if the count is empty, set it to 1
        count = int(count) if count else 1
        # add the symbol and count to the stoichiometry dictionary
        stoichiometry[symbol] = count
    return stoichiometry

# example usage
elements = extract_elements(df, 'formula')
print(elements)

# Create a col for every element
for elm in elements: df[elm]=0

{'Zn', 'C', 'As', 'P', 'S', 'H', 'Br', 'Mo', 'R', 'Cl', 'F', 'X', 'Mg', 'Co', 'Se', 'I', 'N', 'O'}


In [42]:
for row in range(len(df)):
    
    formula = df['formula'].iloc[row]
    stoichiometry = extract_stoichiometry(formula)
    
    for key, value in stoichiometry.items():
        #df[key].iloc[row] = value
        df.loc[df.index[row], key] = value

In [43]:
df.head(3)

Unnamed: 0,id,mol_weight,formula,Zn,C,As,P,S,H,Br,...,R,Cl,F,X,Mg,Co,Se,I,N,O
0,C00002,507.181,C10H16N5O13P3,0,10,0,3,0,16,0,...,0,0,0,0,0,0,0,0,5,13
1,C00003,664.433,C21H28N7O14P2,0,21,0,2,0,28,0,...,0,0,0,0,0,0,0,0,7,14
2,C00005,745.4209,C21H30N7O17P3,0,21,0,3,0,30,0,...,0,0,0,0,0,0,0,0,7,17


In [45]:
# Many many compounds with R
print(len(df[df['R']!=0]))

# Col that contains the info if the compound is a polymer or not
df['polymer'] = 0

for row in range(len(df)):
    if 'n' in df['formula'].iloc[row]: 
        df.loc[df.index[row], 'polymer'] = 1
        
# Print how many polymers do we have
print(len(df[df['polymer'] == 1]))

597
129


In [48]:
df[df['mol_weight'].isna()]

Unnamed: 0,id,mol_weight,formula,Zn,C,As,P,S,H,Br,...,Cl,F,X,Mg,Co,Se,I,N,O,polymer
23,C00071,,CHOR,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
54,C00161,,C2HO3R,0,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,3,0
58,C00173,,C3H4OSR2,0,3,0,0,1,4,0,...,0,0,0,0,0,0,0,0,1,0
64,C00195,,C19H36NO3R,0,19,0,0,0,36,0,...,0,0,0,0,0,0,0,1,3,0
83,C00264,,C24H37N7O18P3SR,0,24,0,3,1,37,0,...,0,0,0,0,0,0,0,7,18,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5583,C15544,,C14H24NO11R,0,14,0,0,0,24,0,...,0,0,0,0,0,0,0,1,11,0
5602,C18009,,C9H15N2O5SR,0,9,0,0,1,15,0,...,0,0,0,0,0,0,0,2,5,0
5609,C02000,,HSO4R,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,4,0
5610,C02885,,HS2O3R,0,0,0,0,2,1,0,...,0,0,0,0,0,0,0,0,3,0


In [52]:
df['mol_weight'].fillna(999, inplace=True)

## Extract features at of chemical formula (Word2Vec)

In [72]:
df.sort_values(by='mol_weight')

Unnamed: 0,id,mol_weight,formula,Zn,C,As,P,S,H,Br,...,F,X,Mg,Co,Se,I,N,O,polymer,clusters
2940,C00014,17.0305,NH3,0,0,0,0,0,3,0,...,0,0,0,0,0,0,1,0,0,0
2939,C00001,18.0153,H2O,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,1,0,0
5618,C16487,20.0063,HF,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2747,C00177,26.0174,CN,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3103,C00237,28.0101,CO,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4465,C19876,2286.4136,C100H178N2O51P2,0,100,0,2,0,178,0,...,0,0,0,0,0,0,2,51,0,1
4027,C21462,2361.7661,C112H208N3O42P3,0,112,0,3,0,208,0,...,0,0,0,0,0,0,3,42,0,1
4026,C21173,2361.7661,C112H208N3O42P3,0,112,0,3,0,208,0,...,0,0,0,0,0,0,3,42,0,1
1290,C21461,2361.7661,C112H208N3O42P3,0,112,0,3,0,208,0,...,0,0,0,0,0,0,3,42,0,1


In [141]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# tokenize the chemical formulas and create a list of sentences
sentences = [formula.split(' ') for formula in df['formula'].tolist()]

model = Word2Vec(sentences, vector_size=18, window=1, \
                 min_count=1, alpha=.001, workers=4)\

# get the top 10 most similar chemical formulas to a given formula
similar_formulas = model.wv.most_similar('H2O', topn=10)
print(similar_formulas)

[('C8H11O3SR', 0.7614421844482422), ('C20H32N6O12S2', 0.7399899363517761), ('C12H20N4O10P3S', 0.7274306416511536), ('C10H15NO', 0.6987376809120178), ('C7H12NO7PR2', 0.6737675070762634), ('C30H44O4', 0.6696857213973999), ('C10H16O2', 0.661088764667511), ('C10H16N5O8P(C5H8O5PR)n(C5H8O5PR)n', 0.6554839015007019), ('C15H24O2', 0.6457510590553284), ('C34H48O9', 0.6424026489257812)]


In [131]:
s = '12abcd405'
result = ''.join([i for i in s if not i.isdigit()])
result

'abcd'

In [114]:
model.wv.similarity('H2O', 'C6H12O2')

0.13780722

In [125]:
model.wv.most_similar('C17H34N4O10')

[('C15H24N5O14P3', 0.8350638747215271),
 ('C6H7NO2', 0.7188903093338013),
 ('C11H23N2O7PS', 0.712706446647644),
 ('C11H21NO4S', 0.6968957185745239),
 ('C11H15O8P', 0.6923198699951172),
 ('C3H3O3SR', 0.6882240772247314),
 ('C15H24N2O17P2', 0.6732925772666931),
 ('C10H13NO', 0.6688765287399292),
 ('C6H13NO8S', 0.6626027226448059),
 ('C9H7BrO3S', 0.6525627970695496)]

# Import to Networkx 

In [117]:
pairs.head()

Unnamed: 0,Reactant_pair,CAR,KEGG_reactions,RPAIR_main,source,target
0,C00002_C07024,1.0,R05668,True,C00002,C07024
1,C00003_C00004,1.0,"R00091,R00094,R00143,R00145,R00203,R00209,R002...",True,C00003,C00004
2,C00005_C00006,1.0,"R00114,R00134,R00146,R00205,R00216,R00243,R002...",True,C00005,C00006
3,C00007_C00027,1.0,"R00033,R00069,R00072,R00250,R00279,R00277,R002...",True,C00007,C00027
4,C00011_C00058,1.0,"R00134,R00519,R03215,R09094,R09481,R09494",True,C00011,C00058


In [123]:
import networkx as nx

G = nx.Graph()

# add edges from the DataFrame
G.add_edges_from(pairs[['source', 'target']].values)

print(G.number_of_nodes(), G.number_of_edges())

5620 10747
