In [76]:
import pandas as pd
import numpy as np
import torch
import networkx as nx
import cobra
import matplotlib.pyplot as plt

import sys
sys.path.append("../src/")
import GEMtoGRAPH as gg

# Show every float in rendered DataFrames with exactly five decimal places.
pd.set_option('display.float_format', '{:.5f}'.format)

In [2]:
# Load the genome-scale model from JSON and derive its stoichiometric matrix
# as a DataFrame; the cell displays the matrix shape.
model = cobra.io.load_json_model('redYeast_ST8943_fdp1.json')
S = cobra.util.array.create_stoichiometric_matrix(
    model,
    array_type='DataFrame',
)
S.shape

(300, 373)

# MFG

#### Load TFA fluxes

In [3]:
# Load the TFA fluxes and keep only the first row; it is later passed to the
# graph construction function as `tfa_flux`.
tfa = pd.read_csv('fluxes_for_graph.csv', index_col=0)
# .copy(): the column edits below must act on an independent frame, not on a
# slice of the frame returned by read_csv.
tfa = tfa.head(1).copy()

# Columns whose flux is exactly zero in the retained row are discarded.
zero_flux = [col for col in tfa.columns if (tfa[col] == 0).all()]

print('Zero flux reactions:', len(zero_flux))

tfa = tfa.drop(columns=zero_flux)
print("TFA fluxes:", tfa.shape[1])

# For _reverse reactions we should change the sign of the flux to negative
reverse_cols = [col for col in tfa.columns if '_reverse' in col]
if reverse_cols:
    tfa[reverse_cols] = -tfa[reverse_cols]

# Strip the '_reverse_<suffix>' tail so every column carries the bare name.
tfa = tfa.rename(columns={col: col.split("_reverse_")[0] for col in tfa.columns})

# Align the fluxes with the columns of S *by name* whenever the two label sets
# agree, so a different column order in the CSV cannot silently attach a flux
# to the wrong reaction. Otherwise keep the positional assignment.
flux_row = tfa.iloc[0]
if flux_row.index.is_unique and set(flux_row.index) == set(S.columns):
    flux_row = flux_row.reindex(S.columns)
else:
    print('WARNING: TFA column names do not match S.columns; '
          'fluxes are assigned to reactions by position.')

# A length mismatch with S.columns raises ValueError here.
tfa_flux = pd.DataFrame({'fluxes': flux_row.to_numpy()}, index=S.columns)

Zero flux reactions: 373
TFA fluxes: 373


### Create MFG Graph

In [6]:
# gg.MFG returns three objects; only the graph G is used in the cells below.
M, S_2m, G = gg.MFG(S, model, tfa_flux)

# Drop nodes that have no edges at all. The isolates are materialised into a
# list first because the graph is modified while they are removed.
print('\nRemoving isolated nodes...')
isolated_nodes = list(nx.isolates(G))
G.remove_nodes_from(isolated_nodes)

print(f"# nodes: {G.number_of_nodes()} \n# edges: {G.number_of_edges()}")

# nodes: 746 
# edges: 6157

Removing isolated nodes...
# nodes: 373 
# edges: 6157


## Read ORACLE's data

In [7]:
# Read the three ORACLE tables and keep only the first row of each.
sigma = pd.read_csv('saturations.csv', index_col=0).head(1)
gamma = pd.read_csv('gamma.csv', index_col=0).head(1)
vmax = pd.read_csv('Vmax_matrix.csv', index_col=0).head(1)

In [8]:
# Collect the reaction ids that appear in the graph as 'rev?<id>' nodes.
rev_rxn = []
for node in list(G.nodes()):
    parts = node.split("?")
    if parts[0] == 'rev':
        rev_rxn.append(parts[1])

# Give the matching gamma columns the same 'rev?' prefix so that column names
# and graph node names line up. All renames are applied in one call.
reverse_ids = set(rev_rxn)
gamma.rename(
    columns={col: 'rev?' + col for col in gamma.columns if col in reverse_ids},
    inplace=True,
)

In [9]:
# Sanity check: which names exist on only one side (graph nodes vs gamma columns)?
listA = list(G.nodes())
listB = gamma.columns

graph_node_set = set(listA)
only_in_graph = [name for name in listA if name not in listB]
only_in_gamma = [name for name in listB if name not in graph_node_set]

print('In Graph but not in gamma:', only_in_graph)
print()
print('In gamma but not in Graph:', only_in_gamma)

In Graph but not in gamma: ['EX_lac__D_e', 'EX_mal__L_e', 'EX_akg_e', 'EX_2phetoh_e', 'EX_acald_e', 'EX_ac_e', 'EX_gam6p_e', 'EX_co2_e', 'EX_cit_e', 'EX_etoh_e', 'EX_fum_e', 'EX_gly_e', 'EX_gcald_e', 'EX_glx_e', 'EX_id3acald_e', 'EX_ala__L_e', 'EX_asn__L_e', 'EX_asp__L_e', 'EX_cys__L_e', 'EX_glu__L_e', 'EX_gln__L_e', 'EX_phe__L_e', 'EX_ser__L_e', 'EX_trp__L_e', 'EX_tyr__L_e', 'EX_oaa_e', 'EX_pacald_e', 'EX_pyr_e', 'EX_succ_e', 'EX_ind3eth_e', 'EX_h2o_e', 'EX_g6p_e', 'EX_g1p_e', 'EX_2pg_e', 'EX_pser__L_e', 'EX_ppi_e', 'EX_pep_e', 'EX_cbp_e', 'EX_6pgc_e', 'EX_3pg_e', 'EX_cmp_e', 'GROWTH', 'EX_ccm_e', 'EX_pca_e', 'rev?EX_nh4_e', 'rev?EX_glc__D_e', 'rev?EX_h_e', 'rev?EX_fe2_e', 'rev?EX_o2_e', 'rev?EX_pi_e', 'rev?EX_k_e', 'rev?EX_na1_e', 'rev?EX_so4_e', 'rev?EX_cl_e', 'rev?EX_cu2_e', 'rev?EX_mn2_e', 'rev?EX_zn2_e', 'rev?EX_mg2_e', 'rev?EX_ca2_e']

In gamma but not in Graph: []


#### Add `gamma` values as Graph node features

In [10]:
# Store the gamma value of every reaction that is also a graph node as the
# node attribute 'gamma'. Columns without a matching node are skipped
# explicitly instead of relying on a swallowed KeyError.
for node in gamma.columns:
    if node in G:
        G.nodes[node]['gamma'] = gamma[node].values[0]

# Every remaining node gets gamma = NaN. The test is "has no 'gamma' key"
# rather than "has no attributes at all", so a node that already carries some
# other attribute is still filled in.
no_gamma_nodes = [node for node, data in G.nodes(data=True) if 'gamma' not in data]

for node in no_gamma_nodes:
    G.nodes[node]['gamma'] = np.nan

### We now have the `NetworkX` graph `G`

In [11]:
G.number_of_nodes(), G.number_of_edges()

(373, 6157)

In [35]:
## Create features based on graph statistics such as degree_centrality etc

# Column name -> networkx function that returns a {node: score} mapping.
graph_measures = {
    'clustering': nx.clustering,
    'in_degree': nx.in_degree_centrality,
    'out_degree': nx.out_degree_centrality,
    'degree_centrality': nx.degree_centrality,
    'closeness': nx.closeness_centrality,
    'betweeness': nx.betweenness_centrality,
    'pr': nx.pagerank,
}

df_g = pd.DataFrame(index=G.nodes())
for column, measure in graph_measures.items():
    # Wrapping the mapping in a Series aligns the scores on the node index.
    df_g[column] = pd.Series(measure(G))
df_g = df_g.sort_index()

# Node attributes as a table (one row per node), used here for the target.
node_data = pd.DataFrame.from_dict(dict(G.nodes), orient='index')
node_data = node_data.sort_index()

df_g['gamma'] = node_data['gamma'].copy()
df_g.head()



Unnamed: 0,clustering,in_degree,out_degree,degree_centrality,closeness,betweeness,pr,gamma
2OBUTtm,0.0,0.002688,0.002688,0.005376,0.315599,0.0,0.0006,0.999
2OXOADPTm,0.04,0.034946,0.013441,0.048387,0.334415,0.001647,0.000789,0.999
2PHETOHtm,0.0,0.002688,0.002688,0.005376,0.254019,0.000191,0.001022,0.993217
6PHOPHO,0.142857,0.005376,0.010753,0.016129,0.227674,1.9e-05,0.0008,0.999
AATA,0.107803,0.056452,0.02957,0.086022,0.312799,0.001287,0.001056,0.308622


In [87]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Nodes without a gamma value cannot be used as regression targets.
nan_gamma_index = df_g[df_g['gamma'].isna()].index
df_g = df_g.drop(index=nan_gamma_index)

X = df_g.drop('gamma', axis=1)
y = df_g['gamma'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# The scaler is fitted on the training split only and then applied to both.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The regressor is named `rf_model`, not `model`: `model` is the cobra model
# loaded at the top of the notebook and is still read by later cells
# (model.reactions, model.metabolites, model.compartments).
# random_state makes the fit reproducible across runs.
rf_model = RandomForestRegressor(n_estimators=20, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# NOTE(review): the recorded run shows predictions as large as ~1e11, which
# suggests extreme gamma values among the training targets - confirm whether
# gamma should be clipped or transformed before regression.
print(metrics.mean_absolute_error(y_test, y_pred))

20716194897.041557


In [88]:
# True vs predicted gamma for the test split, indexed like y_test.
results = pd.DataFrame({'y_true': y_test, 'y_pred': y_pred})
results

Unnamed: 0,y_true,y_pred
ILEtmi,0.41710,1.89485
ALCD26xi,0.71787,0.67344
SUCD2_u6m,0.00000,7.21653
CBPS,0.05823,0.66199
rev?ALAt2r,1.93013,2.21824
...,...,...
ADK1,0.99900,26878689.93514
rev?2PGt6,2.11266,4.66059
GAPD,0.99900,239232500004.40216
E4Ptm,0.99900,0.65332


### Add more node features using data from GEM

### Create a _node features_ matrix

In [None]:
# Build one feature row per *reaction id*. A 'rev?<id>' node describes the same
# reaction as '<id>', so the prefix is stripped and duplicates are removed
# (first occurrence wins, graph order is kept). Indexing the table by the
# stripped ids means no row is left empty and every later lookup by stripped id
# finds exactly one row.
reaction_ids = list(dict.fromkeys(
    node.split("?")[1] if 'rev?' in node else node
    for node in G.nodes()
))

feature_rows = []
for rxn in reaction_ids:
    reaction = model.reactions.get_by_id(rxn)  # looked up once per reaction
    metabolite_ids = [m.id for m in reaction.metabolites]
    feature_rows.append({
        # sorted(): the join order must not depend on set iteration order,
        # otherwise the same compartment combination could get different labels.
        'compartements': "|".join(sorted(reaction.compartments)),
        'metabolites': "|".join(metabolite_ids),
        'num_of_mets': len(metabolite_ids),
    })

df = pd.DataFrame(
    feature_rows,
    index=reaction_ids,
    columns=['compartements', 'metabolites', 'num_of_mets'],
)

df.head(3)

In [None]:
# LABEL ENCODING to every compartement and metabolite
from sklearn.preprocessing import LabelEncoder

def enc_for_every():
    """Replace the "|"-joined id strings in ``df`` by lists of integer codes.

    Every metabolite id and every compartment id of the module-level cobra
    ``model`` gets an integer label from its own ``LabelEncoder``. For each row
    of the module-level ``df`` the 'compartements' and 'metabolites' cells are
    split on "|" and each id is replaced by its label; an id that has no label
    is kept unchanged. Cells that are not strings (for example NaN) are left
    untouched.

    Takes no arguments and returns nothing; ``df`` is modified in place.
    """
    COBRA_METABOLITES = pd.DataFrame([m.id for m in model.metabolites])
    COBRA_METABOLITES.rename(columns={0: 'id'}, inplace=True)
    COBRA_METABOLITES['enc'] = LabelEncoder().fit_transform(COBRA_METABOLITES['id'])

    COBRA_COMPARTEMETNS = pd.DataFrame(list(model.compartments.keys()))
    COBRA_COMPARTEMETNS.rename(columns={0: 'id'}, inplace=True)
    COBRA_COMPARTEMETNS['enc'] = LabelEncoder().fit_transform(COBRA_COMPARTEMETNS['id'])

    # id -> code lookups, built once instead of once per row.
    lookups = {
        'compartements': dict(COBRA_COMPARTEMETNS.values),
        'metabolites': dict(COBRA_METABOLITES.values),
    }

    for row_label in df.index:
        for column, codes in lookups.items():
            cell = df.at[row_label, column]
            if not isinstance(cell, str):
                continue  # nothing to encode (NaN or already encoded)
            # Split on the "|" separator so whole ids are looked up, not
            # single characters. `.at` stores the list as one cell value.
            df.at[row_label, column] = [codes.get(item, item) for item in cell.split("|")]

In [None]:
# Turn every distinct compartment-combination string (e.g. "c|m") into an
# integer class label.
compartment_labels = LabelEncoder().fit_transform(df['compartements'])
df['compartements'] = compartment_labels

In [None]:
# Copy the per-reaction features onto the graph nodes. A 'rev?<id>' node reads
# the feature row of reaction '<id>'. The 'metabolites' column is deliberately
# not attached.
for node in list(G.nodes()):
    rxn = node.split("?")[1] if 'rev?' in node else node
    feature_row = df.loc[rxn]

    nx.set_node_attributes(G, {node: {
        'compartements': feature_row['compartements'],
        'num_of_mets': feature_row['num_of_mets'],
    }})

In [None]:
from operator import itemgetter

def orderDict(x: dict, desc=True):
    """Return the (key, value) pairs of ``x`` as a list sorted by value.

    With ``desc=True`` (the default) the largest value comes first, otherwise
    the smallest. Pairs with equal values keep their original relative order.
    """
    pairs = list(x.items())
    pairs.sort(key=lambda pair: pair[1], reverse=desc)
    return pairs

# Rank the nodes under each measure, highest score first.
indg = orderDict(nx.in_degree_centrality(G))
ccen = orderDict(nx.closeness_centrality(G))
betcen = orderDict(nx.betweenness_centrality(G))
pgrk = orderDict(nx.pagerank(G))

# nx.hits returns the pair (hubs, authorities).
hubs, autr = (orderDict(scores) for scores in nx.hits(G))

col_names = ['in-degree', 'closeness', 'betweness', 'page rank', 'autr', 'hubs']

# One column per measure; row i holds the node ranked i-th under that measure.
ranked_by_measure = dict(zip(col_names, (indg, ccen, betcen, pgrk, autr, hubs)))
graphStats = pd.DataFrame(
    {name: [node for node, _ in ranked] for name, ranked in ranked_by_measure.items()},
    columns=col_names,
)

# print('Top:\n')
# display(df.head(10))
# print('Bottom:\n')
# display(df.tail(10))


## Node2Vec 

In [None]:
from karateclub import Node2Vec

# Relabel the nodes with consecutive integers before fitting.
# NOTE(review): presumably karateclub requires integer node labels - confirm.
G_labels = nx.convert_node_labels_to_integers(G)

# Perform node embedding using Node2Vec
N2vec_model = Node2Vec(
    walk_number=10,
    walk_length=80,
    p=0.9,
    q=0.1,
    dimensions=12,
)
N2vec_model.fit(G_labels)
N2Vec_embedding = N2vec_model.get_embedding()
print('Embedding array shape (nodes x features):', N2Vec_embedding.shape)

# Put the original node names back as the row index.
# NOTE(review): assumes embedding row i belongs to the i-th node of G - confirm.
df_embedding = pd.DataFrame(data=N2Vec_embedding, index=list(G.nodes))
df_embedding.head(3)

#### Clustering based on embeddings

In [None]:
from sklearn.cluster import KMeans

# Cluster the nodes into four groups based on their embedding vectors.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(df_embedding)

# One cluster label per node, indexed by node name.
results = pd.DataFrame({'cluster': kmeans.labels_}, index=df_embedding.index)
results

## NetworkX to PyTorch Geometric

In [None]:
import torch
from torch_geometric.utils.convert import from_networkx

# Selected here but not used anywhere else in this cell.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# NOTE(review): the builtin `all` is passed as the grouping argument, presumably
# as torch_geometric's "group every attribute" marker - confirm against the
# installed version.
data = from_networkx(G, group_node_attrs=all, group_edge_attrs=all)
print(data)
(data.num_nodes, data.num_edges)