In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
from sklearn.metrics import confusion_matrix

In [2]:
# Load the labelled node pairs and separate the 'prediction' label column (y)
# from the remaining columns that identify each pair (X).
edges_labeled = pd.read_csv('edges_labeled.csv')
X = edges_labeled.drop(columns=['prediction'])
y = edges_labeled['prediction']
print(X.shape)
X.head()

(42004, 2)


Unnamed: 0,source,target
0,David J. Hawkes,A. A. J. Marley
1,Steven Feiner,A. A. J. Marley
2,Jürgen Wüst,A. A. J. Marley
3,Shlomo Shamai,A. A. J. Marley
4,Maurizio Talamo,A. A. J. Marley


In [3]:
# Load the edge list used to build the graph; each row carries a 'source'
# and a 'target' column.
df = pd.read_csv('edges.csv')
print(df.shape)
df.head(n=10)

(284654, 2)


Unnamed: 0,source,target
0,Aly A. Farag,Elsayed E. Hemayed
1,Aly A. Farag,Moumen T. Ahmed
2,Aly A. Farag,Sameh M. Yamany
3,Aly A. Farag,Mostafa Gadal-Haqq M. Mostafa
4,Aly A. Farag,David Tasman
5,Aly A. Farag,Allan G. Farman
6,Aly A. Farag,Mohamed N. Ahmed
7,Aly A. Farag,S. Ahmed
8,Aly A. Farag,S. Roberts
9,Aly A. Farag,Edward J. Delp


In [4]:
# Build the undirected graph from the edge list in a single bulk call.
# Zipping the two columns avoids DataFrame.iterrows(), which materialises a
# Series for every row and is very slow on a frame of this size; the edges
# (and the order in which nodes are inserted) are the same as before.
G = nx.Graph()
G.add_edges_from(zip(df['source'], df['target']))

In [5]:
# Turn every row of X into a plain tuple so the pairs can be iterated
# directly when predicting.
tuples = list(map(tuple, X.to_numpy()))
tuples[:5]

[('David J. Hawkes', 'A. A. J. Marley'),
 ('Steven Feiner', 'A. A. J. Marley'),
 ('Jürgen Wüst', 'A. A. J. Marley'),
 ('Shlomo Shamai', 'A. A. J. Marley'),
 ('Maurizio Talamo', 'A. A. J. Marley')]

In [6]:
def Link_predict(s, t, j_c):
    """Predict whether an edge between nodes ``s`` and ``t`` should exist.

    The decision is made against the module-level graph ``G`` using these
    rules, applied in order:

    1. If either endpoint is not a node of ``G``, or no path connects the
       two nodes, no link is predicted.
    2. If the shortest path between them is shorter than 4 hops, a link is
       predicted.
    3. Otherwise the Jaccard coefficient of the pair plus the number of
       common neighbours divided by 400 is compared with ``j_c``.

    Args:
        s: Source node.
        t: Target node.
        j_c: Threshold the combined score of rule 3 must strictly exceed.

    Returns:
        True when a link is predicted, False otherwise.
    """
    if s not in G or t not in G:
        return False

    # One search answers both "is there a path?" and "how long is it?":
    # shortest_path_length raises NetworkXNoPath for disconnected pairs, so
    # a separate has_path() call (a second search) is not needed.
    try:
        distance = nx.shortest_path_length(G, s, t)
    except nx.NetworkXNoPath:
        return False

    if distance < 4:
        return True

    # NOTE(review): every pair that reaches this point is at distance >= 4,
    # while two nodes sharing a neighbour are at most 2 hops apart. Both the
    # common-neighbour term and the Jaccard term therefore look like they are
    # always 0 here, reducing the result to `0 > j_c`. Confirm whether the
    # distance shortcut was meant to run after the score instead.
    aux = len(list(nx.common_neighbors(G, s, t))) / 400
    for _, _, p in nx.jaccard_coefficient(G, [(s, t)]):
        # The iterator is consumed for a single pair; decide on its first triple.
        return p + aux > j_c

    return False

In [7]:
# Threshold passed to Link_predict as j_c (kept at the value used so far).
LINK_SCORE_THRESHOLD = 7/300

# One label per candidate pair, in the same order as `tuples`:
# 'P' when a link is predicted, 'N' otherwise.
predictions = [
    'P' if Link_predict(pair[0], pair[1], LINK_SCORE_THRESHOLD) else 'N'
    for pair in tuples
]

# Pin the label order so the matrix is always 2x2 and unpacks as
# (tn, fp, fn, tp), even if one of the classes never occurs.
tn, fp, fn, tp = confusion_matrix(y, predictions, labels=['N', 'P']).ravel()
print(tn, fp, fn, tp)

# Fall back to 0.0 instead of dividing by zero when a denominator is empty
# (no positive predictions, no positive labels, or both metrics at zero).
precision = tp/(tp+fp) if (tp+fp) else 0.0
recall = tp/(tp+fn) if (tp+fn) else 0.0
f_value = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0

print("precision", precision)
print("recall", recall)
print("f-value", f_value)

30887 616 4620 5881
precision 0.9051870093889487
recall 0.5600419007713551
f-value 0.6919637604424049


In [8]:
# Attach the predicted labels to a fresh copy of the pairs; X itself is left
# untouched.
respuesta = X.assign(prediction=predictions)
respuesta.head()

Unnamed: 0,source,target,prediction
0,David J. Hawkes,A. A. J. Marley,N
1,Steven Feiner,A. A. J. Marley,N
2,Jürgen Wüst,A. A. J. Marley,N
3,Shlomo Shamai,A. A. J. Marley,N
4,Maurizio Talamo,A. A. J. Marley,N


In [9]:
# Write the pairs with their predicted labels to disk, leaving out the
# DataFrame index.
output_path = 'edges_prediction.csv'
respuesta.to_csv(output_path, index=False)