In [4]:
import re
import numpy as np

from segk import segk
from utils import read_edgelist
from evaluation import evaluate_classification, evaluate_clustering

import warnings
# Silence every warning raised from this point on (no category or module
# restriction is given, so the 'ignore' action applies to all of them).
warnings.filterwarnings('ignore')

In [11]:
# Build `class_labels`: "<first word>.<second word>" (lower-cased) of the
# employee's display name -> job title, parsed from the employee roster.
class_labels = dict()
with open("datasets/enron/employees.txt", "r") as f:
    for line in f:
        # Strip only the line terminator. Slicing the second column with
        # [:-1] would drop a real character of the title whenever the final
        # line has no trailing newline or the row has more than two columns.
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 2:
            # Blank line or row without a tab-separated second column.
            continue
        # The second column holds the display name followed by the title,
        # separated by runs of two or more whitespace characters.
        tokens = re.split(r'\s{2,}', fields[1])
        if len(tokens) >= 2:
            name_parts = tokens[0].split()
            if len(name_parts) < 2:
                # Two words are needed to form the "<first>.<second>" key;
                # skip the row like any other unparsable one.
                continue
            # NOTE(review): only the first two words are used, so a middle
            # name/initial ends up in the key instead of the surname —
            # confirm this is intended for matching against e-mail addresses.
            name = name_parts[0].lower()+'.'+name_parts[1].lower()
            class_labels[name] = tokens[1]

# Build `name2id`: employee key -> integer node id. The key is the part of
# the second whitespace-separated column that precedes '@'; only keys already
# present in `class_labels` are kept (a later row overwrites an earlier one).
name2id = {}
with open("datasets/enron/email-Enron-full-node-labels.txt", "r") as labels_file:
    for record in labels_file:
        columns = record.split()
        local_part = columns[1].partition('@')[0]
        if local_part not in class_labels:
            continue
        name2id[local_part] = int(columns[0])

# Tally how many employee keys carry each job title, then collect the
# distinct titles themselves.
jobs_count = {}
for title in class_labels.values():
    jobs_count[title] = jobs_count.get(title, 0) + 1
jobs = set(class_labels.values())

# Build `reduced_labels`: node id -> job title, restricted to employees that
# have a node id and whose title is neither 'N/A' nor 'In House Lawyer'.
# 'Managing Director' is folded into 'Director'.
excluded_titles = ('N/A', 'In House Lawyer')
reduced_labels = {}
for name, title in class_labels.items():
    if name not in name2id or title in excluded_titles:
        continue
    reduced_labels[name2id[name]] = 'Director' if title == 'Managing Director' else title


# Read the node collection and the edge list from the graph file.
nodes, edgelist = read_edgelist(
    "datasets/enron/email-Enron-full-proj-graph.txt",
    delimiter=' ',
    nodetype=int,
    cols=3,
)

# Labels of the nodes that have one, in the iteration order of `nodes`.
y = np.array([reduced_labels[node] for node in nodes if node in reduced_labels])

# Compute one SEGK embedding per kernel with identical hyper-parameters.
# `algorithms` and `embeddings` are kept in the same order so that position
# k of one describes position k of the other.
algorithms = ["SEGK-SP", "SEGK-WL"]
segk_settings = dict(radius=2, dim=20)

E_segk_sp = segk(nodes, edgelist, kernel='shortest_path', **segk_settings)
E_segk_wl = segk(nodes, edgelist, kernel='weisfeiler_lehman', **segk_settings)

embeddings = [E_segk_sp, E_segk_wl]

# For every embedding matrix keep only the rows of labelled nodes. Row i is
# taken for the i-th element of `nodes`, the same order used to build `y`.
labeled_rows = [row for row, node in enumerate(nodes) if node in reduced_labels]

reduced_embeddings = list()
for embedding_matrix in embeddings:
    X = np.array([embedding_matrix[row, :] for row in labeled_rows])
    reduced_embeddings.append(X)

In [12]:
# Score all reduced embedding matrices against the labels `y`; each returned
# sequence is indexed below by the position of the algorithm.
homogeneity, completeness, silhouette = evaluate_clustering(reduced_embeddings, y)
accs, f1 = evaluate_classification(reduced_embeddings, y)

# (printed label, per-algorithm values) pairs, in report order.
metrics = [
    ("Homogeneity:", homogeneity),
    ("Completeness:", completeness),
    ("Silhouette:", silhouette),
    ("Accuracy:", accs),
    ("F1-score:", f1),
]

for i, algorithm in enumerate(algorithms):
    print("\n"+algorithm)
    for label, values in metrics:
        print(label, values[i])


SEGK-SP
Homogeneity: 0.23073888883315125
Completeness: 0.08793008843427917
Silhouette: 0.018343633073345317
Accuracy: 0.27812222222222227
F1-score: 0.1647360987953845

SEGK-WL
Homogeneity: 0.2888437396537913
Completeness: 0.06353902966103214
Silhouette: 0.3975811007287109
Accuracy: 0.3499333333333334
F1-score: 0.21774039531896677
