In [16]:
#using 3.12.3
import pandas as pd
import networkx as nx
import random
import csv

In [17]:
# Load the raw edge list; later cells read its 'Customer' and 'Supplier' columns.
df = pd.read_csv('../data/edges.csv')
# Bare last expression so the notebook renders a preview of the frame.
df

Unnamed: 0,Customer,Supplier
0,Volvo,AAPICO Hitech
1,Volvo,ABC Technologies
2,Volvo,Adient plc
3,Volvo,AGC
4,Volvo,Aisin
...,...,...
51358,Unipres,voestalpine Rotec
51359,Unipres,Xiangyang Sunrise
51360,Unipres,Yakumo Manufacturing
51361,Unipres,Yamashita Rubber


In [18]:
# Map every company name (customer or supplier) to an integer node id and
# translate each row of df into an (id, id) edge.
#
# Ids are assigned in order of first appearance (all customers first, then
# suppliers). dict.fromkeys de-duplicates while preserving that order, so the
# numbering is identical on every run; enumerating a set of strings is not,
# because string hashing is randomised per interpreter process.
ordered_names = list(dict.fromkeys(df['Customer'].tolist() + df['Supplier'].tolist()))
all_suppliers = set(ordered_names)
nametoi = {name: i for i, name in enumerate(ordered_names)}

# Select the two columns by name rather than unpacking whole rows positionally:
# this is much faster than iterrows() and keeps working if df carries extra
# columns (e.g. a saved index column).
edges = [
    (nametoi[customer], nametoi[supplier])
    for customer, supplier in zip(df['Customer'], df['Supplier'])
]

In [19]:
# Build the undirected graph directly from the integer edge list
# (repeated pairs collapse into a single edge).
G = nx.Graph(edges)

In [20]:
# Hold out a random fraction of the edges as positive test examples. The
# training graph keeps every node but only the edges that were not held out.
test_split = .20

# Seed the RNG so the sampled positions are repeatable across kernel restarts.
# (The split is only fully reproducible if the edge order of G is itself
# deterministic.)
random.seed(42)

edges = list(G.edges())
test_edges = random.sample(edges, int(G.number_of_edges()*test_split))
test_G = nx.Graph()
test_G.add_edges_from(test_edges)

# Membership tests against a set are O(1); scanning the test_edges list for
# every edge made this filter quadratic in the number of edges.
held_out = set(test_edges)
train_edges = [edge for edge in edges if edge not in held_out]

train_G = G.copy()
train_G.remove_edges_from(test_G.edges())

In [21]:
# Pool of candidate negatives: every node pair that is NOT connected in the full graph G.
# This materialises all such pairs at once, so it is memory-hungry on large sparse graphs.
non_edges = [pair for pair in nx.non_edges(G)]

In [22]:
# Report the size of the negative-example pool.
print(f'there are {len(non_edges)} non-edges in the graph')

there are 112849590 non-edges in the graph


In [23]:
# Sample negative test examples: n_times_as_many_pos_test_edges (here 4) non-edges
# for every held-out positive edge, i.e. a 4:1 negative-to-positive ratio.
# NOTE: len(test_edges) must still count only the positive edges here, so run this
# before the cell that appends the negatives to test_edges.
n_times_as_many_pos_test_edges = 4
non_edges_test = random.sample(non_edges, len(test_edges)*n_times_as_many_pos_test_edges)

In [24]:
# Report the class balance of the test set.
# NOTE: the positive count is only correct while test_edges holds just the positive
# edges, i.e. before the negatives are appended to it in a later cell.
print(f'test set has {len(non_edges_test)} negative edges and {len(test_edges)} positive edges')

test set has 38636 negative edges and 9659 positive edges


In [25]:
# Ground-truth labels for the test set: one boolean per candidate pair.
test_edges_labels = [True for _ in test_edges]  # held-out real edges -> positive
non_edge_test_labels = [False for _ in non_edges_test]  # sampled non-edges -> negative

In [26]:
# Append the negative labels after the positive ones (in place).
# nx.jaccard_coefficient yields its predictions in the order the pairs are given,
# so labels and pairs stay aligned as long as both are concatenated the same way.
test_edges_labels += non_edge_test_labels

In [27]:
# Append the sampled non-edges after the positive test edges (in place), mirroring
# the order used for the labels. From here on test_edges holds positives followed
# by negatives.
test_edges += non_edges_test

In [28]:
# Score every test pair with the Jaccard coefficient computed on the training graph.
# Each item is a (u, v, score) triple, in the same order as test_edges.
preds = [(u, v, score) for u, v, score in nx.jaccard_coefficient(train_G, test_edges)]

In [29]:
#write results to csv so we can save them and use them in process_jaccard_results.py
# Each output row is "<label>,<jaccard score>"; no header line is written.

# Labels and predictions are paired purely by position, so verify they line up
# before touching the output file. This avoids leaving a half-written CSV
# (fewer predictions than labels) or silently dropping rows (more predictions).
if len(preds) != len(test_edges_labels):
    raise ValueError(
        f'expected one prediction per label, got {len(preds)} predictions '
        f'for {len(test_edges_labels)} labels'
    )

with open('jaccard_results.csv', 'w') as csvFile:
    # preds holds (u, v, score) triples; only the score is persisted.
    csvFile.writelines(
        f'{label},{score}\n'
        for label, (_, _, score) in zip(test_edges_labels, preds)
    )