# This project implemented by Elham Mahdipour
## She is a Ph.D. Candidate of computer engineering at Yazd University, Yazd, Iran.
### Please feel free and contact to me: elham.mahdipour@gmail.com/ elham.mahdipour@stu.yazd.ac.ir

# Phase 1 
## Create Dataset and Preprocessing

In [1]:
import networkx as nx
G1=nx.read_weighted_edgelist('large dataset\ec-ec.evals')
G1

<networkx.classes.graph.Graph at 0x2bff6888780>

In [2]:
G2=nx.read_weighted_edgelist('large dataset\hs-hs.evals')
G2

<networkx.classes.graph.Graph at 0x2bff71517f0>

In [3]:
### Check and Swap if G1 > G2 ###
if len(G1)>len(G2):
    temp=G1
    G1=G2
    G2=temp
print(len(G1))
print(len(G2))

4265
22232


In [4]:
G_target_na=nx.read_weighted_edgelist('large dataset\ec-hs.evals')
G_target_na  

<networkx.classes.graph.Graph at 0x2bff4ad6668>

In [5]:
ed1=G1.edges()
ed2=G2.edges()

nd1=G1.nodes()
nd2=G2.nodes()

el1=list(ed1)
el2=list(ed2)

nd1=list(nd1)
nd2=list(nd2)

degG1 = [val for (node, val) in G1.degree()]
degG2 = [val for (node, val) in G2.degree()]

# Feature Extraction

# compute score for create similarity matrix

In [6]:
def deg_Diff(G1,G2):
    Degree_Difference=np.zeros((len(G1),len(G2)))
    for i in range(len(G1)):
        for j in range(len(G2)):
            Degree_Difference[i][j]=abs(degG1[i]-degG2[j])/max(degG1[i],degG2[j])
    return Degree_Difference

In [7]:
def score_pageRank(X):
    a=nx.pagerank(X)
    return a

In [8]:
def coefficient_pagerank(x,y):  #x is G1, y is G2
   # print(len(x))
    p1=score_pageRank(x)
    b=p1.values()
    pr1=list(b)
    p2=score_pageRank(y)
    c=p2.values()
    pr2=list(c)
    pr=np.zeros((len(x),len(y)))
    for i in range(len(x)):
        for j in range(len(y)):
            #print(pr1[i],pr2[j])
            pr[i][j]=abs(pr1[i]-pr2[j])/max(pr1[i],pr2[j])   #minimum pr is maximum similarity of topology 
    return pr

In [9]:
def coefficient_edges(index_node, G, GraphNumber):
    if GraphNumber==1:
        sum_edge=0        
        for i in G.neighbors(nd1[index_node]):                       
            sum_edge=sum_edge+degG1[nd1.index(i)]
        #print(sum_edge)
        temp=(degG1[index_node]-1) if degG1[index_node]> 1 else 1        
        coeff_node=(2*sum_edge)/(degG1[index_node]*temp)
    else:
        sum_edge=0        
        for i in G.neighbors(nd2[index_node]):                       
            sum_edge=sum_edge+degG2[nd2.index(i)]
        #print(sum_edge)
        temp=(degG2[index_node]-1) if degG2[index_node]> 1 else 1        
        coeff_node=(2*sum_edge)/(degG2[index_node]*temp)
    return coeff_node

In [10]:
def compute_Ea(G1,G2):
    Ea_G1=np.zeros(len(G1))
    Ea_G2=np.zeros(len(G2))
    for i in range(len(G1)):
        Ea_G1[i]=coefficient_edges(i, G1, 1)
    for j in range(len(G2)):
        Ea_G2[j]=coefficient_edges(j,G2,2)
    ea=[Ea_G1, Ea_G2]
    return(ea)

In [11]:
# compute relative clustering coefficient difference between node a (in G1) and node b (in G2)
def CD(G1, G2):
    cd=np.zeros((len(G1),len(G2)))
    EA=compute_Ea(G1,G2)
    #print(EA[0])      #Ea for G1
    #print("===================")
    #print(EA[1])      #Ea for G2
    for i in range(len(G1)):
        for j in range(len(G2)):
            cd[i,j]=abs(EA[0][i]-EA[1][j])/max(EA[0][i],EA[1][j])
    return cd

In [12]:
def sequence_score(x,y):
    seq=np.zeros((len(x),len(y)))
    for i in range(len(x)):
        for j in range(len(y)):
            q1=G_target_na.get_edge_data(str(nd1[i]),str(nd2[j]))
            if q1==None:
                c=0
            else:
                c=list(q1.values())
                c=c[0]
            seq[i][j]=c    
            
    return seq

In [13]:
def compute_score(G1,G2):
    coeff_pr=coefficient_pagerank(G1,G2)
    dd=deg_Diff(G1,G2)
    cd=CD(G1,G2)
    seq_sc=sequence_score(G1,G2)
    
    alpha=0.1
    betta=0.2
    gamma=0.2
    zetta=1-alpha-betta-gamma
    s=alpha*(1-coeff_pr)+betta*(1-dd)+gamma*(1-cd)+zetta*seq_sc
    return s,coeff_pr, dd, cd

In [14]:
import numpy as np
m, coeff_pr, dd, cd=compute_score(G1,G2)
sim=m

# Phase 2: Dataset Generation

# create data for deep learning

# change problem to classification 
## [node of G1, node of G2, BLAST, Coefficient page rank, clustering coefficient difference,  similarity score, alignment=yes(1) or no(0)]

In [15]:
import numpy as np
int_nd1=np.zeros(len(nd1))
int_nd2=np.zeros(len(nd2))

species=['ec','sc','ce','dm','mm','hs']
ch1=0  #please set index for first species of species list, for example index of ec is 0
ch2=5  #please set index for second species of species list, for example index of hs is 5
# If don't set index with considering species may be given an error

for i in range(len(nd1)):
    if (species[ch1] in nd1[i] or species[ch2] in nd1[i]):
        s=nd1[i][2:]
        x=int(s)
        int_nd1[i]=x    
for i in range(len(nd2)):
    if (species[ch1] in nd2[i] or species[ch2] in nd2[i]):
        s=nd2[i][2:]
        x=int(s)
        int_nd2[i]=x    

In [16]:
# en_mat is encoding matrix
en_mat=[]

for i in range(len(nd1)):
    for j in range(len(nd2)):
        if G_target_na.has_edge(nd1[i],nd2[j]):
            align_class='Yes'
        else:
            align_class='No'
        
        sample=[int_nd1[i],int_nd2[j], coeff_pr[i][j], dd[i][j],cd[i][j],sim[i][j],align_class]
        en_mat.append(sample)
print(len(en_mat))

94819480


In [17]:
yc=[]
noc=[]
for i in range(len(en_mat)):
    if en_mat[i][6]=='Yes':
        yc.append(en_mat[i])
    else:
        noc.append(en_mat[i])
print(len(yc), len(noc))

8770 94810710


In [20]:
data=yc+sort_noc[0:30000000] 
len(data)
X=[]
y=[]
for i in range(len(data)):
    X.append(data[i][0:6])
    y.append(data[i][6])

len(X),len(y)

(30008770, 30008770)

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [23]:
print(len(X_train), len(y_train), len(X_test), len(y_test))

27007893 27007893 3000877 3000877


In [None]:
import pickle
with open('large dataset/ec-hs-data.pickle', 'wb') as f:
    pickle.dump([X_train, y_train,X_test,y_test],f)

In [1]:
import pickle
with open('large dataset/ec-hs-data.pickle','rb') as f:
    X_train, y_train,X_test,y_test=pickle.load(f)

In [2]:
print(len(X_train), len(y_train), len(X_test), len(y_test))

27032462 27032462 3003607 3003607


In [25]:
x_tr,y_tr,x_te,y_te=X_train, y_train,X_test,y_test

# test model for real data

In [26]:
# Label encode Class (Species)
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_Y = encoder.transform(y_train)
# One Hot Encode
y_train = np_utils.to_categorical(encoded_Y)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [27]:
# Label encode Class (Species)
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
encoder = LabelEncoder()
encoder.fit(y_test)
encoded_Y = encoder.transform(y_test)
# One Hot Encode
y_test = np_utils.to_categorical(encoded_Y)

In [28]:
import numpy as np
X_train=np.array(X_train)
y_train=np.array(y_train)
X_test=np.array(X_test)
y_test=np.array(y_test)

# Phase 3: RENA Network 


## Load tune RENA model

In [29]:
from keras.models import load_model
model_rnn = load_model('deep_model_resample_6features_rnn_ec-sc.h5')
model_rnn.load_weights('deep_model_resample_6features_rnn_weights_ec-sc.h5', by_name=True)




In [30]:
# without resample
result_tr = model_rnn.evaluate(X_train, y_train)
result_tr



[0.0015723015683626396, 1.0, 0.0015785049181431532, 2.4786654648778494e-06]

In [31]:
%%time
out_tr=model_rnn.predict(X_train)
out_tr

Wall time: 21min 38s


array([[0.9984296 , 0.00157033],
       [0.9984296 , 0.00157033],
       [0.9984296 , 0.00157033],
       ...,
       [0.9984296 , 0.00157033],
       [0.9984296 , 0.00157033],
       [0.9984296 , 0.00157033]], dtype=float32)

In [32]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train.argmax(axis=1), out_tr.argmax(axis=1)))

[[27000005        0]
 [       0     7888]]


In [33]:
from sklearn.metrics import f1_score, precision_score, recall_score
print(precision_score(y_train.argmax(axis=1), out_tr.argmax(axis=1) , average="macro"))
print(recall_score(y_train.argmax(axis=1), out_tr.argmax(axis=1) , average="macro"))
print(f1_score(y_train.argmax(axis=1), out_tr.argmax(axis=1) , average="macro"))

1.0
1.0
1.0


In [34]:
# without resample
result_te = model_rnn.evaluate(X_test, y_test)
result_te



[0.0015723059410443707, 1.0, 0.0015719984658062458, 2.4665660021128133e-06]

In [35]:
%%time
out_te=model_rnn.predict(X_test)
out_te

Wall time: 2min 31s


array([[0.9984296 , 0.00157033],
       [0.9984296 , 0.00157033],
       [0.9984296 , 0.00157033],
       ...,
       [0.9984296 , 0.00157033],
       [0.9984296 , 0.00157033],
       [0.9984296 , 0.00157033]], dtype=float32)

In [36]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test.argmax(axis=1), out_te.argmax(axis=1)))

[[2999995       0]
 [      0     882]]


In [37]:
from sklearn.metrics import f1_score, precision_score, recall_score
print(precision_score(y_test.argmax(axis=1), out_te.argmax(axis=1) , average="macro"))
print(recall_score(y_test.argmax(axis=1), out_te.argmax(axis=1) , average="macro"))
print(f1_score(y_test.argmax(axis=1), out_te.argmax(axis=1) , average="macro"))

1.0
1.0
1.0


# test other classifier without resample

In [47]:
x_tra,y_tra,x_tes,y_tes=X_train,y_train,X_test,y_test

In [48]:
X_train,y_train,X_test,y_test=x_tr,y_tr,x_te,y_te

In [66]:
%%time
# Linear Discriminant Analysis

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
print('Accuracy of LDA classifier on training set: {:.2f}'
     .format(lda.score(X_train, y_train)))
print('Accuracy of LDA classifier on test set: {:.2f}'
     .format(lda.score(X_test, y_test)))

Accuracy of LDA classifier on training set: 1.00
Accuracy of LDA classifier on test set: 1.00
Wall time: 2min 36s


In [67]:
%%time
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix
# Make predictions
preds_tr = lda.predict(X_train)
print(preds_tr)

# Evaluate accuracy
print(accuracy_score(y_train, preds_tr))

print(confusion_matrix(y_train, preds_tr))
print(precision_score(y_train, preds_tr , average="macro"))
print(recall_score(y_train, preds_tr , average="macro"))
print(f1_score(y_train, preds_tr , average="macro"))

['No' 'No' 'No' ... 'No' 'No' 'No']
0.9999966676408263
[[27000005        0]
 [      90     7798]]
0.9999983333391975
0.9942951318458417
0.9971303664642064
Wall time: 4min 20s


In [68]:
%%time
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix
# Make predictions
preds = lda.predict(X_test)
print(preds)

# Evaluate accuracy
print(accuracy_score(y_test, preds))

print(confusion_matrix(y_test, preds))
print(precision_score(y_test, preds , average="macro"))
print(recall_score(y_test, preds , average="macro"))
print(f1_score(y_test, preds , average="macro"))

['No' 'No' 'No' ... 'No' 'No' 'No']
0.9999973341126611
[[2999995       0]
 [      8     874]]
0.9999986666680001
0.9954648526077097
0.9977214290050929
Wall time: 27.3 s


In [1]:
%%time
# KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of KNN classifier on training set: 1.00
Accuracy of KNN classifier on test set: 1.00
Wall time: 3h 35min 54s


In [3]:
%%time
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix,accuracy_score
# Make predictions
preds_tr = knn.predict(X_train)
print(preds_tr)

# Evaluate accuracy
print(accuracy_score(y_train, preds_tr))

print(confusion_matrix(y_train, preds_tr))
print(precision_score(y_train, preds_tr , average="macro"))
print(recall_score(y_train, preds_tr , average="macro"))
print(f1_score(y_train, preds_tr , average="macro"))

['No' 'No' 'No' ... 'No' 'No' 'No']
0.9998747031275463
[[27000005        0]
 [    3510     4378]]
0.9999373413461971
0.7868480739872145
0.8645219834169753
Wall time: 3h 21min 25s


In [9]:
%%time
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix,accuracy_score
# Make predictions
preds =  knn.predict(X_test)
print(preds)

# Evaluate accuracy
print(accuracy_score(y_test, preds))

print(confusion_matrix(y_test, preds))
print(precision_score(y_test, preds , average="macro"))
print(recall_score(y_test, preds , average="macro"))
print(f1_score(y_test, preds , average="macro"))

['No' 'No' 'No' ... 'No' 'No' 'No']
0.9998700385146987
[[2999995        0]
 [    376      506]]
0.9999350084169723
0.777510142786848147
0.856889074864521931
Wall time: 3min 36s


In [8]:
%%time
# SVM
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 0.58
Accuracy of SVM classifier on test set: 0.58
Wall time: 10h 44min 39s


In [10]:
%%time
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix,accuracy_score
# Make predictions
preds_tr = svm.predict(X_train)
print(preds_tr)

# Evaluate accuracy
print(accuracy_score(y_train, preds_tr))

print(confusion_matrix(y_train, preds_tr))
print(precision_score(y_train, preds_tr , average="macro"))
print(recall_score(y_train, preds_tr , average="macro"))
print(f1_score(y_train, preds_tr , average="macro"))

['No' 'No' 'No' ... 'No' 'No' 'No']
0.5889524961954863
[[15898478   11101527]
 [       0       7888]]
0.500355014356181691
0.794416205264268464
0.371316523841365972
Wall time: 10h 33min 36s


In [11]:
%%time
from sklearn.metrics import f1_score, precision_score, recall_score,confusion_matrix,accuracy_score
# Make predictions
preds = svm.predict(X_test)
print(preds)

# Evaluate accuracy
print(accuracy_score(y_test, preds))

print(confusion_matrix(y_test, preds))
print(precision_score(y_test, preds , average="macro"))
print(recall_score(y_test, preds , average="macro"))
print(f1_score(y_test, preds , average="macro"))

['No' 'No' 'No' ... 'No' 'No' 'No']
0.5890181443697415
[[1766689    1233306]
 [      0        882]]
0.5003573262145774
0.7944486574832169
0.3713468388568891
Wall time: 14min 29s
