In [1]:
#using 3.12.3
import pandas as pd
import networkx as nx
import random
import csv

In [2]:
#NOTE run from scripts directory!
#read in the edge data
df = pd.read_csv('../data/edges.csv')
df

Unnamed: 0,Customer,Supplier
0,Volvo,AAPICO Hitech
1,Volvo,ABC Technologies
2,Volvo,Adient plc
3,Volvo,AGC
4,Volvo,Aisin
...,...,...
51358,Unipres,voestalpine Rotec
51359,Unipres,Xiangyang Sunrise
51360,Unipres,Yakumo Manufacturing
51361,Unipres,Yamashita Rubber


In [3]:
#turn the edges into integer encodings, rather than text labels
all_suppliers = set(df['Customer'].tolist() + df['Supplier'].tolist())
nametoi = {name: i for i, name in enumerate(all_suppliers)}
edges = []

for _, row in df.iterrows():
    supplier1, supplier2 = row
    idx1, idx2 = nametoi[supplier1], nametoi[supplier2]
    edges.append((idx1, idx2))

In [4]:
# Create the graph
G = nx.Graph()
G.add_edges_from(edges)

In [23]:
#get a training-test split of edges
test_split = .20 #the percentage of edges to go into the test set
edges = list(G.edges())
#randomly sample the positive edges to get the specified percentage of test edges
test_edges = random.sample(edges, int(G.number_of_edges()*test_split))
#make a graph out of test edges, so we can remove edges from training set
test_G = nx.Graph()
test_G.add_edges_from(test_edges)
#take the positive edges that are not in the test set as the training set
train_edges = [edge for edge in edges if edge not in test_edges]
train_G = G.copy()
train_G.remove_edges_from(test_G.edges()) #remove the edges that are in the test set
print(f'There are {train_G.number_of_edges()} positive edges in the training set and {test_G.number_of_edges()} positive edges in the test set')

There are 38640 positive edges in the training set and 9659 positive edges in the test set


In [24]:
#collect all of the negative edges that exist in G
non_edges = list(nx.non_edges(G))

In [25]:
print(f'there are {len(non_edges)} non-edges in the graph')

there are 112849590 non-edges in the graph


In [26]:
#get the n_times_as_many_pos_test_edges more negative edges as there are positive in the test set
n_times_as_many_pos_test_edges = 4
non_edges_test = random.sample(non_edges, len(test_edges)*n_times_as_many_pos_test_edges) #randomly sample from the collected negative edges

In [27]:
print(f'test set has {len(non_edges_test)} negative edges and {len(test_edges)} positive edges')

test set has 38636 negative edges and 9659 positive edges


In [28]:
#create the labels for the test set
test_edges_labels = [True]*len(test_edges) #all of these are positive labels (they did exist in the graph)
non_edge_test_labels = [False]*len(non_edges_test) #all of these are negative labels (they did NOT exist in the graph)

In [29]:
#merge non_edge labels into test labels 
# (nx.jaccard_coefficient outputs predictions in order, so we can just extend them like this)
test_edges_labels.extend(non_edge_test_labels)

In [30]:
#merge non edge test into the test edge set
# (nx.jaccard_coefficient outputs predictions in order, so we can just extend them like this)
test_edges.extend(non_edges_test)

In [31]:
#get the jaccard prectiction on the test set, given the training graph
preds = list(nx.jaccard_coefficient(train_G, test_edges))

In [None]:
#write results to csv so we can save them and use them in process_jaccard_results.py
with open('../data/jaccard_results.csv', 'w') as csvFile:    
    for i in range(len(test_edges_labels)):
        csvFile.write(f'{test_edges_labels[i]},{preds[i][2]}\n')

In [55]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score
import numpy as np

test_positive_edges = random.sample(edges, int(G.number_of_edges()*test_split))

all_auc = []
all_f1 = []
all_precision = []

for n_times_as_many_pos_test_edges in reversed(range(1, 5)):
    test_negative_edges = random.sample(non_edges, len(test_positive_edges) * n_times_as_many_pos_test_edges)
    test_label = [True] * len(test_positive_edges) + [False] * len(test_negative_edges)
    all_test_edges = test_positive_edges + test_negative_edges

    preds = list(nx.jaccard_coefficient(train_G, all_test_edges))
    probs = [x[2] for x in preds]

    auc = roc_auc_score(test_label, probs)
    all_auc.append(auc)

    best_f1 = 0
    best_precision = 0
    thresholds = np.arange(0, 1.01, 0.01)  # Generate thresholds from 0 to 1
    for threshold in thresholds:
        predicted_labels = [p >= threshold for p in probs]
        f1 = f1_score(test_label, predicted_labels)
        precision = precision_score(test_label, predicted_labels)

        # Track the best F1 score and corresponding threshold
        if f1 > best_f1:
            best_f1 = f1
            best_precision = precision

    all_f1.append(best_f1)
    all_precision.append(best_precision)

    print(f'pos/neg ratio 1/{n_times_as_many_pos_test_edges}:')
    print(f'AUC score: {auc}')
    print(f'Best F1 score: {best_f1}')
    print(f'Precision for best F1 score: {best_precision}')
    print('=============')

pos/neg ratio 1/4:
AUC score: 0.6671504515812371
Best F1 score: 0.3333333333333333
Precision for best F1 score: 0.2
pos/neg ratio 1/3:
AUC score: 0.6673903932640858
Best F1 score: 0.4
Precision for best F1 score: 0.25
pos/neg ratio 1/2:
AUC score: 0.6666060193760563
Best F1 score: 0.5
Precision for best F1 score: 0.3333333333333333
pos/neg ratio 1/1:
AUC score: 0.6673988216100489
Best F1 score: 0.6666666666666666
Precision for best F1 score: 0.5


#### AUC

In [54]:
[x.item() for x in all_auc]

[0.6671611928989968,
 0.6679616378992285,
 0.6675719742783746,
 0.6665060046712902]

#### F1

In [56]:
[x.item() for x in all_f1]

[0.3333333333333333, 0.4, 0.5, 0.6666666666666666]

#### Precision

In [57]:
[x.item() for x in all_precision]

[0.2, 0.25, 0.3333333333333333, 0.5]