In [1]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

import pickle

from random import sample, seed
from tqdm import tqdm
from utils.propagators.directed import PropagateDirected
from utils.metrics import accuracy, balanced_accuracy

seed(42)

In [5]:
# Load Graph.
G = nx.read_gexf("../../state_files/PyPi Network V4.gexf")
# Keep only giant component.
gc_nodes = sorted(nx.connected_components(G.to_undirected()), key = lambda x: len(x), reverse=True)[0]
not_gc_nodes = set(G.nodes()) - gc_nodes
G.remove_nodes_from(not_gc_nodes)

# Load labels.
with open("../../state_files/PyPi Dataframe V4.pickle", "rb") as f:
    df = pickle.load(f)
labels = df["Topic"].dropna().to_dict()
# Remove 'Other' topic.
for n, l in labels.items():
    if 'Other/Nonlisted Topic' in l:
        l.remove('Other/Nonlisted Topic')
# Filter nodes with no topics.
labels = dict(filter(lambda x: len(x[1]) > 0, labels.items()))
# Keep nodes from giant component.
labels = dict(filter(lambda x: x[0] in gc_nodes, labels.items()))

# Split train and test set.
test_size = 0.1

test_nodes = sample(list(labels.keys()), int(test_size*len(labels.keys())))
train_nodes = list(filter(lambda x: x not in test_nodes, labels.keys()))

train_labels = dict(filter(lambda x: x[0] in train_nodes, labels.items()))
test_labels = dict(filter(lambda x: x[0] in test_nodes, labels.items()))

In [7]:
# Método Probabilístico.
try:
    with open("pl_proba.pickle", "rb") as f:
        pl_proba = pickle.load(f)
        fl_proba = pl_proba.labels
        print("Loaded Proba.")
except FileNotFoundError:
    print("Calculating Proba.")
    pl_proba = PropagateDirected(G, train_labels, method = "probability")
    fl_proba = pl_proba.propagate_all()
    with open("pl_proba.pickle", "wb") as f:
        pickle.dump(pl_proba, f)

acc_proba = accuracy(test_labels, fl_proba)
bal_acc_proba, times_seen_proba = balanced_accuracy(test_labels, fl_proba)

# Método de detección local.
try:
    with open("pl_local.pickle", "rb") as f:
        pl_local = pickle.load(f)
        fl_local = pl_local.labels
        print("Loaded Local.")
except FileNotFoundError:
    print("Calculating Local.")
    pl_local = PropagateDirected(G, train_labels, method = "local")
    fl_local = pl_local.propagate_all()
    with open("pl_local.pickle", "wb") as f:
        pickle.dump(pl_local, f)
    
acc_local = accuracy(test_labels, fl_local)
bal_acc_local, times_seen_local = balanced_accuracy(test_labels, fl_local)

# Método ponderando contribuciones globales.
try:
    with open("pl_global.pickle", "rb") as f:
        pl_global = pickle.load(f)
        fl_global = pl_global.labels
        print("Loaded Global.")
except FileNotFoundError:
    print("Calculating Global.")
    pl_global = PropagateDirected(G, train_labels, method = "global")
    fl_global = pl_global.propagate_all()
    with open("pl_global.pickle", "wb") as f:
        pickle.dump(pl_global, f)

acc_global = accuracy(test_labels, fl_global)
bal_acc_global, times_seen_global = balanced_accuracy(test_labels, fl_global)

Loaded Proba.
Accuracy: 47%
Loaded Local.
Accuracy: 57%
Loaded Global.
Accuracy: 34%


In [9]:
print("Balanced Accuracy: Modelo de propagación aleatoria.")
print(f"{bal_acc_proba*100:.0f}%")

print("Balanced Accuracy: Modelo de propagación con criterios locales.")
print(f"{bal_acc_local*100:.0f}%")

print("Balanced Accuracy: Modelo de propagación con criterios globales.")
print(f"{bal_acc_global*100:.0f}%")

print("Balanced Accuracy: Modelo de tirar un dado de 23 caras")
label_collector = []
for n, l in test_labels.items():
    label_collector += l
topics = set(label_collector)
matches = {t: int(times_seen_local[t]/len(times_seen_local)) for t in topics}
accuracy_per_topic = {t: matches[t]/times_seen_global[t] for t in topics}
print(f"{sum(accuracy_per_topic.values())/len(accuracy_per_topic.values())*100:.0f}%")

print("Balanced Accuracy: Modelo de etiquetar TODO con la etiqueta mas común")
print(f'{1 * (times_seen_local["Software Development"]/sum(times_seen_local.values())) / len(times_seen_local)*100:.0f}%')

Balanced Accuracy: Modelo de propagación aleatoria.
13%
Balanced Accuracy: Modelo de propagación con criterios locales.
14%
Balanced Accuracy: Modelo de propagación con criterios globales.
25%
Balanced Accuracy: Modelo de tirar un dado de 23 caras
3%
Balanced Accuracy: Modelo de etiquetar TODO con la etiqueta mas común
2%
