In [33]:
# Load the adjacency matrix into a numpy array.
normal_mat = np.loadtxt('../Normal.txt', dtype=int)
for i in range(0,70):
        normal_mat[i][i] = 0
G_Normal = nx.from_numpy_matrix(normal_mat)
print("Number Of Edges: ",len(G_Normal.edges()))
print("Number Of Nodes: ",len(G_Normal.nodes()))

Number Of Edges:  1490
Number Of Nodes:  70


In [39]:
def _complement(mat):
    mat_complement = mat.copy()
    for i in range(70):
        for j in range(70):
            if mat[i][j] == 0 and i!=j:
                mat_complement[i][j] = 1
            else:
                mat_complement[i][j] = 0
                
    return mat_complement
            

In [65]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from tqdm import tqdm
import re
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [4]:
# Load the adjacency matrix into a numpy array.
eMCI_mat = np.loadtxt('../eMCI.txt', dtype=int)
for i in range(0,70):
        eMCI_mat[i][i] = 0
G_eMCI = nx.from_numpy_matrix(eMCI_mat)
print("Number Of Edges: ",len(G_eMCI.edges()))
print("Number Of Nodes: ",len(G_eMCI.nodes()))

Number Of Edges:  1510
Number Of Nodes:  70


In [5]:
lmci_mat = np.loadtxt('../lMCI.txt', dtype=int)
#removing self loops
for i in range(0,70):
        lmci_mat[i][i] = 0
G_lMCI = nx.from_numpy_matrix(lmci_mat)
print("Number Of Edges: ",len(G_lMCI.edges()))
print("Number Of Nodes: ",len(G_lMCI.nodes()))

Number Of Edges:  1380
Number Of Nodes:  70


In [6]:
# Load the adjacency matrix into a numpy array.
ad_mat = np.loadtxt('../AD.txt', dtype=int)
for i in range(0,70):
        ad_mat[i][i] = 0
G_AD = nx.from_numpy_matrix(ad_mat)
print("Number Of Edges: ",len(G_AD.edges()))
print("Number Of Nodes: ",len(G_AD.nodes()))

Number Of Edges:  1364
Number Of Nodes:  70


## Taking complement

In [43]:
normal_mat_complement = _complement(normal_mat)
G_Normal = nx.from_numpy_matrix(normal_mat_complement)
print("Number Of Edges: ",len(G_AD.edges()))
print("Number Of Nodes: ",len(G_AD.nodes()))


Number Of Edges:  1364
Number Of Nodes:  70


In [45]:
#eMCI complement
eMCI_mat_complement = _complement(eMCI_mat)
G_eMCI_complement = nx.from_numpy_matrix(eMCI_mat_complement)
print("Number Of Edges: ",len(G_eMCI_complement.edges()))
print("Number Of Nodes: ",len(G_eMCI_complement.nodes()))


Number Of Edges:  905
Number Of Nodes:  70


In [49]:
#lMCI complement
lMCI_mat_complement = _complement(lmci_mat)
G_lMCI_complement = nx.from_numpy_matrix(lMCI_mat_complement)
print("Number Of Edges: ",len(G_lMCI_complement.edges()))
print("Number Of Nodes: ",len(G_lMCI_complement.nodes()))


Number Of Edges:  1035
Number Of Nodes:  70


In [50]:
#ad complement
ad_mat_complement = _complement(ad_mat)
G_AD_complement = nx.from_numpy_matrix(ad_mat_complement)
print("Number Of Edges: ",len(G_AD_complement.edges()))
print("Number Of Nodes: ",len(G_AD_complement.nodes()))


Number Of Edges:  1051
Number Of Nodes:  70


## Making pipeline

In [51]:
def Edge_List(Graph):
    edges = list(Graph.edges)
    node_list_1 = [item[0] for item in edges]
    node_list_2 = [item[1] for item in edges]
    # combine all nodes in a list
    node_list = node_list_1 + node_list_2
    # remove duplicate items from the list
    node_list = list(dict.fromkeys(node_list))
    df = pd.DataFrame({'node_1': node_list_1, 'node_2': node_list_2})
    return df,node_list

In [52]:
edges_Normal,edgeList_Normal = Edge_List(G_Normal)
edges_EMCI,edgeList_EMCI = Edge_List(G_eMCI)
edges_LMCI,edgeList_LMCI = Edge_List(G_lMCI)
edges_AD,edgeList_AD = Edge_List(G_AD)

In [53]:
edges_Normal_complement,edgeList_Normal_complement = Edge_List(G_Normal_complement)
edges_EMCI_complement,edgeList_EMCI_complement = Edge_List(G_eMCI_complement)
edges_LMCI_complement,edgeList_LMCI_complement = Edge_List(G_lMCI_complement)
edges_AD_complement,edgeList_AD_complement = Edge_List(G_AD_complement)

In [54]:
#recreate the graph
def recreateGraph(e_df,node_list):
    G = nx.from_pandas_edgelist(e_df, "node_1", "node_2", create_using=nx.Graph())
    adj_G = nx.to_numpy_matrix(G,nodelist = node_list)
    return G,adj_G
    

In [55]:
G_Normal_complement,adj_normal_complement = recreateGraph(edges_Normal_complement,edgeList_Normal_complement)
G_EMCI_complement,adj_emci_complement = recreateGraph(edges_EMCI_complement,edgeList_EMCI_complement)
G_LMCI_complement,adj_lmci_complement = recreateGraph(edges_LMCI_complement,edgeList_LMCI_complement)
G_AD_complement,adj_ad_complement = recreateGraph(edges_AD_complement,edgeList_AD_complement)

In [56]:
G_Normal_,adj_normal = recreateGraph(edges_Normal,edgeList_Normal)
G_EMCI,adj_emci = recreateGraph(edges_EMCI,edgeList_EMCI)
G_LMCI,adj_lmci = recreateGraph(edges_LMCI,edgeList_LMCI)
G_AD,adj_ad = recreateGraph(edges_AD,edgeList_AD)

In [57]:
def create_Dataset(Graph,G_data):
    AA = []
    PA = []
    RAI = []
    JC = []
    CSH = []
    RAISH = []
    WIC = []
    CN=[]
    for i,j in zip(G_data['node_1'],G_data['node_2']):
        AA.append(list(nx.adamic_adar_index(Graph,ebunch=[(i,j)]))[0][2])
        RAI.append(list(nx.resource_allocation_index(Graph,ebunch=[(i,j)]))[0][2])
        PA.append(list(nx.preferential_attachment(Graph,ebunch=[(i,j)]))[0][2])
        JC.append(list(nx.jaccard_coefficient(Graph,ebunch=[(i,j)]))[0][2])
        CN.append(len(list(nx.common_neighbors(Graph, i, j))))
        #CNC.append(list(nx.common_neighbor_centrality(G, ebunch=[(i, j)]))[0][2])
    G_data['AA'] = AA     
    G_data['RAI'] = RAI
    G_data['PA'] = PA
    G_data['JC'] = JC 
    G_data['CN'] = CN
   
    #df_AllNodes['CNC'] = CNC 
    return G_data

In [58]:
def create_Dataset_for_pred(Graph):
    edges = list(Graph.edges)
    node_list_1 = [item[0] for item in edges]
    node_list_2 = [item[1] for item in edges]
    # combine all nodes in a list
    node_list = node_list_1 + node_list_2

    # remove duplicate items from the list
    node_list = list(dict.fromkeys(node_list))
    G_data = pd.DataFrame()
    AA = []
    PA = []
    RAI = []
    JC = []
    CSH = []
    RAISH = []
    WIC = []
    CN=[]
    _node1 = []
    _node2 = []
    target = []
    for i in node_list:
        for j in node_list:
            if i != j:
                _node1.append(i)
                _node2.append(j)
                AA.append(list(nx.adamic_adar_index(Graph,ebunch=[(i,j)]))[0][2])
                RAI.append(list(nx.resource_allocation_index(Graph,ebunch=[(i,j)]))[0][2])
                PA.append(list(nx.preferential_attachment(Graph,ebunch=[(i,j)]))[0][2])
                JC.append(list(nx.jaccard_coefficient(Graph,ebunch=[(i,j)]))[0][2])
                CN.append(len(list(nx.common_neighbors(Graph, i, j))))
                #CNC.append(list(nx.common_neighbor_centrality(G, ebunch=[(i, j)]))[0][2])
                if Graph.has_edge(i,j)==True:
                    target.append(1)
                else:
                    target.append(0)
    G_data['node1'] = _node1
    G_data['node2'] = _node2     
    G_data['AA'] = AA     
    G_data['RAI'] = RAI
    G_data['PA'] = PA
    G_data['JC'] = JC 
    G_data['CN'] = CN
    G_data['Target'] = target
   
    #df_AllNodes['CNC'] = CNC 
    return G_data

In [59]:
def prepareDataSet(Graph,adj_G):
    edges = list(Graph.edges)
    node_list_1 = [item[0] for item in edges]
    node_list_2 = [item[1] for item in edges]
    # combine all nodes in a list
    node_list = node_list_1 + node_list_2
    # remove duplicate items from the list
    node_list = list(dict.fromkeys(node_list))
    G_df = pd.DataFrame({'node_1': node_list_1, 'node_2': node_list_2})
    # get unconnected node-pairs
    
    all_unconnected_pairs = []
    # traverse adjacency matrix
    offset = 0
    for i in tqdm(range(adj_G.shape[0])):
        for j in range(offset,adj_G.shape[1]):
            if i != j:
                if adj_G[i,j] == 0:
                    all_unconnected_pairs.append([node_list[i],node_list[j]])

        offset = offset + 1
    node_1_unlinked = [i[0] for i in all_unconnected_pairs]
    node_2_unlinked = [i[1] for i in all_unconnected_pairs]
    data = pd.DataFrame({'node_1':node_1_unlinked, 
                         'node_2':node_2_unlinked})
    # add target variable 'link'
    data['link'] = 0
    initial_node_count = len(Graph.nodes)

    G_df_temp = G_df.copy()
    # empty list to store removable links
    omissible_links_index = []
    for i in tqdm(G_df.index.values):
      # remove a node pair and build a new graph
      G_temp = nx.from_pandas_edgelist(G_df_temp.drop(index = i), "node_1", "node_2", create_using=nx.Graph())
      # check there is no spliting of graph and number of nodes is same
      if (nx.number_connected_components(G_temp) == 1) and (len(G_temp.nodes) == initial_node_count):
        omissible_links_index.append(i)
        G_df_temp = G_df_temp.drop(index = i)
    # create dataframe of removable edges
    G_df_ghost = G_df.loc[omissible_links_index]
    # add the target variable 'link'
    G_df_ghost['link'] = 1
    data = data.append(G_df_ghost[['node_1', 'node_2', 'link']], ignore_index=True)
    # drop removable edges
    G_df_partial = G_df.drop(index=G_df_ghost.index.values)
    # build graph
    G_data = nx.from_pandas_edgelist(G_df_partial, "node_1", "node_2", create_using=nx.Graph())
    return data,G_data

In [60]:
def model_(G_first,G_sec,adj_mat):
    data,G_data = prepareDataSet(G_first,adj_mat)
    df = create_Dataset(G_data,data)
    
    xtrain, xtest, ytrain, ytest = train_test_split(df[['AA','RAI','PA','JC','CN']], df['link'], test_size = 0.3, random_state = 35)
    lr = DecisionTreeClassifier(random_state=0)
    lr.fit(xtrain, ytrain)
    predictions = lr.predict(xtest)
    #roc_auc_score(ytest, predictions[:,1])
    print("Test accuracy",accuracy_score(ytest,predictions))
    first_dataset = create_Dataset_for_pred(G_first)
    second_dataset = create_Dataset_for_pred(G_sec)
    predictions = lr.predict(first_dataset[['AA','RAI','PA','JC','CN']])
    #roc_auc_score(ytest, predictions[:,1])
    print("predictions :\n",predictions)
    print("Training accuracy",accuracy_score(first_dataset['Target'],predictions))
    print("Next state prediction accuracy",accuracy_score(second_dataset['Target'],predictions))
    print("Confusion matrix: \n",confusion_matrix(second_dataset['Target'],predictions))

In [61]:
data,G_data = prepareDataSet(G_Normal_complement,adj_normal_complement)
df = create_Dataset(G_data,data)
    

100%|██████████| 70/70 [00:00<00:00, 4471.74it/s]
100%|██████████| 925/925 [00:05<00:00, 183.07it/s]


In [62]:
df.head(50)

Unnamed: 0,node_1,node_2,link,AA,RAI,PA,JC,CN
0,0,3,0,0.480898,0.125,1,1.0,1
1,0,25,0,0.0,0.0,1,0.0,0
2,0,35,0,0.0,0.0,1,0.0,0
3,0,36,0,0.0,0.0,1,0.0,0
4,0,37,0,0.0,0.0,1,0.0,0
5,0,38,0,0.480898,0.125,1,1.0,1
6,0,39,0,0.0,0.0,1,0.0,0
7,0,40,0,0.0,0.0,1,0.0,0
8,0,41,0,0.0,0.0,1,0.0,0
9,0,42,0,0.0,0.0,1,0.0,0


## Implementing pipeline on actual graph

In [66]:
model_(G_Normal_complement,G_EMCI,adj_normal)

100%|██████████| 70/70 [00:00<00:00, 3156.70it/s]
100%|██████████| 925/925 [00:05<00:00, 158.73it/s]


Test accuracy 0.6107954545454546
predictions :
 [0 0 0 ... 0 0 0]
Training accuracy 0.6169772256728778


ValueError: Found input variables with inconsistent numbers of samples: [4556, 4830]

In [67]:
model_(G_EMCI,G_LMCI,adj_emci)

100%|██████████| 68/68 [00:00<00:00, 4350.67it/s]
100%|██████████| 1510/1510 [00:08<00:00, 169.88it/s]


Test accuracy 0.6581325301204819
predictions :
 [0 0 0 ... 0 0 0]
Training accuracy 0.334503950834065
Next state prediction accuracy 0.3933274802458297
Confusion matrix: 
 [[1788    8]
 [2756    4]]


In [68]:
model_(G_LMCI,G_AD,adj_lmci)

100%|██████████| 68/68 [00:00<00:00, 4350.47it/s]
100%|██████████| 1380/1380 [00:10<00:00, 134.60it/s]


Test accuracy 0.6129518072289156
predictions :
 [1 1 1 ... 1 1 1]
Training accuracy 0.6229148375768218
Next state prediction accuracy 0.612379280070237
Confusion matrix: 
 [[  70 1758]
 [   8 2720]]


In [69]:
model_(G_Normal,G_AD,adj_normal)

100%|██████████| 70/70 [00:00<00:00, 3434.21it/s]
100%|██████████| 925/925 [00:04<00:00, 185.89it/s]


Test accuracy 0.6732954545454546
predictions :
 [0 0 0 ... 0 0 0]
Training accuracy 0.6774327122153209


ValueError: Found input variables with inconsistent numbers of samples: [4556, 4830]

## Implementing pipeline on complemented graph

In [70]:
model_(G_Normal_complement,G_EMCI_complement,adj_normal_complement)

100%|██████████| 70/70 [00:00<00:00, 4482.67it/s]
100%|██████████| 925/925 [00:04<00:00, 222.04it/s]


Test accuracy 0.6107954545454546
predictions :
 [0 0 0 ... 0 0 0]
Training accuracy 0.6169772256728778
Next state prediction accuracy 0.6252587991718427
Confusion matrix: 
 [[3020    0]
 [1810    0]]


In [71]:
model_(G_EMCI_complement,G_LMCI_complement,adj_emci_complement)

100%|██████████| 70/70 [00:00<00:00, 4479.11it/s]
100%|██████████| 905/905 [00:03<00:00, 232.27it/s]


Test accuracy 0.6491477272727273
predictions :
 [1 1 1 ... 0 0 0]
Training accuracy 0.4248447204968944
Next state prediction accuracy 0.45300207039337476
Confusion matrix: 
 [[ 714 2046]
 [ 596 1474]]


In [72]:
model_(G_LMCI_complement,G_AD_complement,adj_lmci_complement)

100%|██████████| 70/70 [00:00<00:00, 6879.94it/s]
100%|██████████| 1035/1035 [00:06<00:00, 165.06it/s]


Test accuracy 0.5909090909090909
predictions :
 [1 1 1 ... 1 1 1]
Training accuracy 0.36770186335403726
Next state prediction accuracy 0.41904761904761906
Confusion matrix: 
 [[ 306 2422]
 [ 384 1718]]


In [73]:
model_(G_Normal_complement,G_AD_complement,adj_normal_complement)

100%|██████████| 70/70 [00:00<00:00, 3412.86it/s]
100%|██████████| 925/925 [00:05<00:00, 158.34it/s]


Test accuracy 0.6107954545454546
predictions :
 [0 0 0 ... 0 0 0]
Training accuracy 0.6169772256728778
Next state prediction accuracy 0.5648033126293995
Confusion matrix: 
 [[2728    0]
 [2102    0]]
