<a href="https://colab.research.google.com/github/h-rathee30/Link-Prediction-II/blob/master/LinkPredictionII.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# required imports
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from sklearn.model_selection import GridSearchCV
from google.colab import files
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier

In [None]:
# Upload the data file(s) through google.colab's `files.upload()` helper;
# whatever it returns is kept in `uploaded`.
uploaded = files.upload()

In [None]:
# Use this cell when the edge list is a whitespace-separated .txt file.
# Each row is read as (SRC, TGT, IGNORE).
# `sep=r'\s+'` is the documented replacement for the deprecated
# `delim_whitespace=True` and splits on the same runs of whitespace.
data = pd.read_csv('Celegans.txt', sep=r'\s+', names=('SRC', 'TGT', 'IGNORE'))
data

In [None]:
# Clean-up specific to the uploaded edge list: drop the unused 'IGNORE'
# column, renumber the rows from 0 and cast every remaining value to int.
data.drop(['IGNORE'], axis = 1, inplace = True)
data.reset_index(drop = True, inplace = True)
data = data.astype(int)

In [None]:
# Alternative loader for a graph stored in GML format: read it, relabel the
# nodes as consecutive integers starting at 1, plot it, and keep the edge
# list as a DataFrame in `data`.
g = nx.read_gml('dolphins.gml')
G = nx.convert_node_labels_to_integers(g, first_label=1, ordering='default')
nx.draw(G, with_labels=True, font_weight='bold')
data = pd.DataFrame(list(G.edges))

In [None]:
def Pair_Of_Nodes(g):
    """Return every unordered pair of distinct nodes of ``g``.

    Args:
        g: Any object exposing an iterable ``nodes`` attribute
            (for example a ``networkx.Graph``).

    Returns:
        A list of two-element lists ``[u, v]`` where ``u`` comes before ``v``
        in the iteration order of ``g.nodes``. ``n`` nodes produce
        ``n * (n - 1) / 2`` pairs; fewer than two nodes produce ``[]``.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from itertools import combinations

    # combinations() yields pairs in the same order as the nested index loops
    # it replaces: (0,1), (0,2), ..., (1,2), ...
    return [[first, second] for first, second in combinations(g.nodes, 2)]

def MatrixForestIndex(g,node_pairs):
    """Look up ``(I + L)^-1`` entries for the requested node pairs.

    ``L`` is the Laplacian of ``g`` as returned by ``nx.laplacian_matrix`` and
    ``I`` is the identity matrix of the same size.

    Args:
        g: Graph passed to ``nx.laplacian_matrix``.
        node_pairs: Iterable of sequences whose first two items are node
            labels of ``g``.

    Returns:
        A list of ``[node1, node2, score]`` entries, one per input pair, where
        ``score`` is the element of ``(I + L)^-1`` at the row of ``node1`` and
        the column of ``node2``.

    Raises:
        KeyError: If a pair mentions a label that is not a node of ``g``.
    """
    nodes = list(g.nodes)
    # Rows/columns of the Laplacian follow `nodelist`, so labels are mapped to
    # their position explicitly. The previous `label - 1` indexing was only
    # right when the labels were exactly 1..n in that same order.
    position = {node: idx for idx, node in enumerate(nodes)}

    # toarray() yields a plain ndarray for both sparse matrices and arrays.
    L = nx.laplacian_matrix(g, nodelist=nodes).toarray()
    S = np.linalg.inv(np.identity(len(nodes)) + L)

    Matrix_Forest_Index = []
    for row in node_pairs:
        node1 = row[0]
        node2 = row[1]
        Matrix_Forest_Index.append([node1, node2, S[position[node1], position[node2]]])
    return Matrix_Forest_Index

def Common_Neighbours_List(g, pair_of_nodes):
    """Count the common neighbours of each node pair.

    Args:
        g: Graph handed to ``nx.common_neighbors``.
        pair_of_nodes: Iterable of ``(node1, node2)`` pairs.

    Returns:
        A list of ``[node1, node2, count]`` entries in input order, where
        ``count`` is how many items ``nx.common_neighbors`` yields for the
        pair.
    """
    return [
        [first, second, sum(1 for _ in nx.common_neighbors(g, first, second))]
        for first, second in pair_of_nodes
    ]

def LHN2_Index(graph, pair_of_nodes, beta, alpha, cutoff=4, verbose=True):
    """Score node pairs by a damped count of the simple paths joining them.

    For every pair the score is ``alpha * (1 + sum_k beta**k * n_k)``, where
    ``n_k`` is the number of simple paths returned by ``nx.all_simple_paths``
    whose node list has length ``k``. Note that ``k`` is ``len(path)``, i.e.
    the number of nodes on the path, not the number of edges.

    Args:
        graph: Graph handed to ``nx.all_simple_paths``.
        pair_of_nodes: Iterable of ``(node1, node2)`` pairs.
        beta: Base of the per-path damping factor.
        alpha: Multiplier applied to the final sum.
        cutoff: Forwarded to ``nx.all_simple_paths`` as its ``cutoff``
            argument. The default of 4 is the value that used to be
            hard-coded.
        verbose: When true (the default, matching the earlier behaviour),
            print a running 1-based counter of the pairs processed.

    Returns:
        A list of ``[node1, node2, score]`` entries in input order.
    """
    # Imported locally because the notebook's import cell does not provide it.
    from collections import Counter

    scores = []
    for cnt, (node1, node2) in enumerate(pair_of_nodes, start=1):
        if verbose:
            print(cnt)
        paths = nx.all_simple_paths(graph, source=node1, target=node2, cutoff=cutoff)
        # Maps len(path) -> number of paths with that length.
        freq = Counter(len(path) for path in paths)
        value = 1
        # Ascending length keeps the floating-point summation order of the
        # previous implementation, which sorted the lengths first.
        for pathlength, num_of_path in sorted(freq.items()):
            value += (beta**pathlength) * num_of_path
        scores.append([node1, node2, alpha * value])
    return scores

def getDictionaryFromList(input_list):
    """Turn ``[a, b, value]`` records into a ``{"a b": value}`` lookup table.

    Args:
        input_list: Iterable of indexable records with at least three items.

    Returns:
        A dict keyed by ``str(record[0])`` and ``str(record[1])`` joined with
        a single space, mapped to ``record[2]``. When a key occurs more than
        once, the last record wins.
    """
    return {f"{record[0]!s} {record[1]!s}": record[2] for record in input_list}

In [None]:
# Canonicalise the edge list: every edge is stored as [smaller id, larger id]
# and self-loops (both endpoints equal) are left out.
train = data.values.tolist()

tra = [
    [min(edge[0], edge[1]), max(edge[0], edge[1])]
    for edge in train
    if edge[0] != edge[1]
]

train = tra
   

In [None]:
# Build an undirected graph whose vertices are all distinct ids found in
# `data` and whose edges are the cleaned pairs held in `train`.
No_of_nodes = np.unique(data.to_numpy())
nodes_input = np.sort(No_of_nodes)

graph = nx.Graph()
graph.add_nodes_from(nodes_input)
graph.add_edges_from(train)

In [None]:
# Graph visualization: plot `graph` with node labels drawn in bold.
nx.draw(graph,with_labels=True, font_weight='bold', )

In [None]:
# Every unordered pair of distinct nodes of `graph`; these are the candidate
# links that get scored below.
all_pairs_of_nodes = Pair_Of_Nodes(graph)

In [None]:
# Evaluate the three similarity measures on every node pair and keep each
# result as a {"node1 node2": score} lookup table.
C_N = getDictionaryFromList(Common_Neighbours_List(graph, all_pairs_of_nodes))
M_F_Index = getDictionaryFromList(MatrixForestIndex(graph, all_pairs_of_nodes))
# LHN-II is called with beta = 0.98 and alpha = 1.
LHNII = getDictionaryFromList(LHN2_Index(graph, all_pairs_of_nodes, 0.98, 1))

In [None]:
# Assemble one row per node pair, keyed by "node1 node2":
# [node1, node2, <one score per measure>, label], where the label is 1 when
# the pair is an edge of `graph` and 0 otherwise.
measures = [C_N, M_F_Index, LHNII]
feature_names = ["Common_Neighbours", "Matrix_Forest_Index", " LHN-II"]
feature_table = {}
for node1, node2 in all_pairs_of_nodes:
    key = str(node1)+" "+str(node2)
    scores = [measures[j][key] for j in range(len(feature_names))]
    label = 1 if graph.has_edge(node1, node2) else 0
    feature_table[key] = [node1, node2] + scores + [label]

In [None]:
# One DataFrame row per node pair; the dict keys ("node1 node2") become the
# index and the list items become the named columns.
features_dataframe = pd.DataFrame.from_dict(feature_table, orient = 'index', columns = ["Node1", "Node2", "Common_Neighbours", "Matrix_Forest_Index", "LHN-II", "Category"])

In [None]:
# Sort in place by label so that all Category-0 rows precede the Category-1
# rows; the balancing cell below relies on this order.
features_dataframe.sort_values(by = 'Category', inplace = True)

In [None]:
# Display the sorted feature table.
features_dataframe

In [None]:
# Graphs are usually sparse, so take equally many rows of each category.
# This relies on `features_dataframe` being sorted by "Category" already,
# i.e. all Category-0 rows come first.
category = features_dataframe["Category"].to_numpy()
non_zero_rows = np.flatnonzero(category != 0)
# Number of leading Category-0 rows. Falls back to the full length when there
# is no other row at all, where the former row-by-row scan ran off the end.
cnt = int(non_zero_rows[0]) if non_zero_rows.size else len(category)
print(cnt)
# Open-ended slice: the former fixed upper bound (54946) only matched the row
# count of one particular data set.
cat1_features = features_dataframe.iloc[cnt:]
# Never reach past the Category-0 rows, even if the other rows outnumber them.
cat0_features = features_dataframe.iloc[0:min(cnt, len(cat1_features))]

In [None]:
# Data creation: build the balanced data set, split it into train and test
# parts, then normalise the features.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
E = pd.concat([cat1_features, cat0_features])
# Features are the three similarity scores; node ids and the label are dropped.
X = E.drop(["Node1", "Node2", "Category"], axis=1)
Y = E["Category"].copy()

# 70/30 split that keeps the class proportions of Y in both parts.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, stratify = Y )

# The scaler is fitted on the training part only and then applied to both
# parts, so the test data does not influence the scaling.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# **Logistic regression**

In [None]:
# Logistic regression with default settings; used as the base estimator of
# the grid search below.
std_reg = LogisticRegression( )

In [None]:
# Exhaustive search over penalty, C and solver, scored by accuracy with
# 10-fold cross-validation on the training data.
std_reg_param = {'penalty' : ['l1', 'l2'], 'C' : [0.1, 0.5, 1, 2, 5, 6, 7, 8, 10], 'solver' : ['liblinear', 'saga']}
LR1_reg_grid = GridSearchCV(estimator= std_reg, param_grid= std_reg_param, scoring = 'accuracy', cv = 10, return_train_score=True, verbose=True )
LR1_reg_grid_fit = LR1_reg_grid.fit(x_train, y_train)

In [None]:
# Tabulate the cross-validation results and show them ordered by test rank.
cv_results_reg= pd.DataFrame.from_dict(LR1_reg_grid_fit.cv_results_)
cv_results_reg.sort_values(by= 'rank_test_score')

In [None]:
# Train the final logistic regression and predict labels for the test set.
# NOTE(review): the hyper-parameters are written out by hand, presumably
# copied from the top-ranked grid-search row of one run; confirm, or read
# them from LR1_reg_grid_fit.best_params_.
LR1 = LogisticRegression(C= 6, solver= 'saga', penalty= 'l1')
LR1.fit(x_train,y_train)
y_pred = LR1.predict(x_test)

In [None]:
# Confusion matrix of the logistic-regression predictions on the test set.
LR1_confusion_mat = confusion_matrix(y_test, y_pred)
LR1_confusion_mat

# **XGBOOST**

In [None]:
# XGBoost classifier with default settings; used as the base estimator of
# the grid search below.
xgb = XGBClassifier( )

In [None]:
# Exhaustive search over the listed XGBoost hyper-parameters, scored by
# accuracy with 5-fold cross-validation on the training data.
xgb_param = {'learning_rate' : [0.01, 0.05, 0.1, 0.2, 0.5], 
             'max_depth' : [ 6, 8, 10], 
             'objective' : ['binary:logistic', 'reg:logistic'], 
             'subsample' : [0.5, 0.7], 
             'gamma' : [0, 0.5, 1, 5]  }

xgb_grid = GridSearchCV(estimator= xgb, param_grid= xgb_param, scoring = 'accuracy', cv = 5, return_train_score=True, verbose=True, )
xgb_grid_fit = xgb_grid.fit(x_train, y_train)

In [None]:
# Tabulate the cross-validation results and show them ordered by test rank.
XGB_cv_results_reg= pd.DataFrame.from_dict(xgb_grid_fit.cv_results_)
XGB_cv_results_reg.sort_values(by = 'rank_test_score')

In [None]:
# Rebind `xgb` to the classifier that is actually trained, with hand-written
# hyper-parameters.
# NOTE(review): max_depth=5 is not among the values searched in xgb_param
# ([6, 8, 10]); confirm this is intended.
xgb = XGBClassifier(max_depth= 5, learning_rate=0.2, gamma= 5, subsample= 0.7, objective= 'reg:logistic')

In [None]:
# Train the XGBoost classifier on the scaled training data.
xgb.fit(x_train, y_train)

In [None]:
# Predict labels for the scaled test data.
XGB_y_pred = xgb.predict(x_test)

In [None]:
# Confusion matrix of the XGBoost predictions on the test set.
XGB_confusion_mat = confusion_matrix(y_test, XGB_y_pred)
XGB_confusion_mat

# STACKING 

In [None]:
def get_stacking():
    """Build the stacked ensemble used for link prediction.

    Returns:
        An unfitted ``StackingClassifier`` whose base estimators are an
        L1-penalised logistic regression ('lr') and an XGBoost classifier
        ('xgb1'), combined by another L1-penalised logistic regression using
        ``cv=5``.
    """
    base_learners = [
        ('lr', LogisticRegression(C=6, solver='saga', penalty='l1')),
        ('xgb1', XGBClassifier(max_depth=6, learning_rate=0.2, gamma=7,
                               subsample=0.7, objective='reg:logistic')),
    ]
    meta_learner = LogisticRegression(C=6, solver='saga', penalty='l1')
    return StackingClassifier(estimators=base_learners,
                              final_estimator=meta_learner, cv=5)

In [None]:
# Instantiate the (still unfitted) stacked ensemble.
models = get_stacking()

In [None]:
# Train the stacked ensemble on the scaled training data.
models.fit(x_train,y_train)

In [None]:
# Predict labels for the scaled test data with the stacked ensemble.
stacking_y_pred = models.predict(x_test)

In [None]:
# Confusion matrix of the stacked-ensemble predictions on the test set.
stacking_confusion_mat = confusion_matrix(y_test, stacking_y_pred)
stacking_confusion_mat