In [1]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

import pickle

from random import sample, seed
from tqdm import tqdm
from utils.propagators.directed import PropagateDirected
from utils.metrics import accuracy, balanced_accuracy

seed(42)

In [2]:
# Load Graph.
G = nx.read_gexf("../../state_files/PyPi Network V4.gexf")
# Keep only giant component.
gc_nodes = sorted(nx.connected_components(G.to_undirected()), key = lambda x: len(x), reverse=True)[0]
not_gc_nodes = set(G.nodes()) - gc_nodes
G.remove_nodes_from(not_gc_nodes)

# Load labels.
with open("../../state_files/PyPi Dataframe V4 with subtopics.df", "rb") as f:
    df = pickle.load(f)
labels = df["Topics for Propagation"].dropna().to_dict()
# Remove 'Other' topic.
for n, l in labels.items():
    if 'Other/Nonlisted Topic' in l:
        l.remove('Other/Nonlisted Topic')
# Filter nodes with no topics.
labels = dict(filter(lambda x: len(x[1]) > 0, labels.items()))
# Keep nodes from giant component.
labels = dict(filter(lambda x: x[0] in gc_nodes, labels.items()))

# Split train and test set.
test_size = 0.1

test_nodes = sample(list(labels.keys()), int(test_size*len(labels.keys())))
train_nodes = list(filter(lambda x: x not in test_nodes, labels.keys()))

train_labels = dict(filter(lambda x: x[0] in train_nodes, labels.items()))
test_labels = dict(filter(lambda x: x[0] in test_nodes, labels.items()))

In [3]:
# Método Probabilístico.
try:
    with open("pl_sub_proba.pickle", "rb") as f:
        pl_proba = pickle.load(f)
        fl_proba = pl_proba.labels
        print("Loaded Proba.")
except FileNotFoundError:
    print("Calculating Proba.")
    pl_proba = PropagateDirected(G, train_labels, method = "probability")
    fl_proba = pl_proba.propagate_all()
    with open("pl_sub_proba.pickle", "wb") as f:
        pickle.dump(pl_proba, f)

acc_proba = accuracy(test_labels, fl_proba)
bal_acc_proba, times_seen_proba = balanced_accuracy(test_labels, fl_proba)

# Método de detección local.
try:
    with open("pl_sub_local.pickle", "rb") as f:
        pl_local = pickle.load(f)
        fl_local = pl_local.labels
        print("Loaded Local.")
except FileNotFoundError:
    print("Calculating Local.")
    pl_local = PropagateDirected(G, train_labels, method = "local")
    fl_local = pl_local.propagate_all()
    with open("pl_sub_local.pickle", "wb") as f:
        pickle.dump(pl_local, f)
    
acc_local = accuracy(test_labels, fl_local)
bal_acc_local, times_seen_local = balanced_accuracy(test_labels, fl_local)

# Método ponderando contribuciones globales.
try:
    with open("pl_sub_global.pickle", "rb") as f:
        pl_global = pickle.load(f)
        fl_global = pl_global.labels
        print("Loaded Global.")
except FileNotFoundError:
    print("Calculating Global.")
    pl_global = PropagateDirected(G, train_labels, method = "global")
    fl_global = pl_global.propagate_all()
    with open("pl_sub_global.pickle", "wb") as f:
        pickle.dump(pl_global, f)

acc_global = accuracy(test_labels, fl_global)
bal_acc_global, times_seen_global = balanced_accuracy(test_labels, fl_global)

Calculating Proba.


100%|██████████| 38664/38664 [18:38<00:00, 34.56it/s]  
100%|██████████| 101437/101437 [11:50<00:00, 142.75it/s]


140101 nodos etiquetados de 183495.


100%|██████████| 140101/140101 [09:39<00:00, 241.76it/s]  
100%|██████████| 13662/13662 [02:23<00:00, 95.16it/s] 


153763 nodos etiquetados de 183495.


100%|██████████| 153763/153763 [01:39<00:00, 1552.26it/s]   
100%|██████████| 2004/2004 [00:35<00:00, 56.30it/s] 


155767 nodos etiquetados de 183495.


100%|██████████| 155767/155767 [00:12<00:00, 12588.21it/s]  
100%|██████████| 305/305 [00:07<00:00, 40.92it/s]


156072 nodos etiquetados de 183495.


100%|██████████| 156072/156072 [00:02<00:00, 55699.54it/s]  
100%|██████████| 81/81 [00:01<00:00, 45.99it/s]


156153 nodos etiquetados de 183495.


100%|██████████| 156153/156153 [00:00<00:00, 296775.99it/s] 
100%|██████████| 14/14 [00:00<00:00, 19.24it/s]


156167 nodos etiquetados de 183495.


100%|██████████| 156167/156167 [00:00<00:00, 920753.22it/s] 
100%|██████████| 2/2 [00:00<00:00, 77.13it/s]


156169 nodos etiquetados de 183495.


100%|██████████| 156169/156169 [00:00<00:00, 1310258.03it/s]
100%|██████████| 1/1 [00:00<00:00, 55.71it/s]


156170 nodos etiquetados de 183495.


100%|██████████| 156170/156170 [00:00<00:00, 1436595.76it/s]
0it [00:00, ?it/s]


156170 nodos etiquetados de 183495.
No se encontraron nuevas etiquetas.
Accuracy: 31%
Calculating Local.


100%|██████████| 38664/38664 [18:30<00:00, 34.82it/s]  
100%|██████████| 101437/101437 [11:14<00:00, 150.32it/s]


140101 nodos etiquetados de 183495.


100%|██████████| 140101/140101 [09:04<00:00, 257.29it/s]  
100%|██████████| 13662/13662 [02:15<00:00, 101.14it/s]


153763 nodos etiquetados de 183495.


100%|██████████| 153763/153763 [01:30<00:00, 1698.37it/s]   
100%|██████████| 2004/2004 [00:33<00:00, 60.64it/s] 


155767 nodos etiquetados de 183495.


100%|██████████| 155767/155767 [00:11<00:00, 13935.70it/s]  
100%|██████████| 305/305 [00:07<00:00, 42.54it/s]


156072 nodos etiquetados de 183495.


100%|██████████| 156072/156072 [00:02<00:00, 61438.42it/s]  
100%|██████████| 81/81 [00:01<00:00, 48.53it/s]


156153 nodos etiquetados de 183495.


100%|██████████| 156153/156153 [00:00<00:00, 315667.61it/s] 
100%|██████████| 14/14 [00:00<00:00, 19.93it/s]


156167 nodos etiquetados de 183495.


100%|██████████| 156167/156167 [00:00<00:00, 972390.36it/s] 
100%|██████████| 2/2 [00:00<00:00, 80.21it/s]


156169 nodos etiquetados de 183495.


100%|██████████| 156169/156169 [00:00<00:00, 1338754.12it/s]
100%|██████████| 1/1 [00:00<00:00, 55.61it/s]


156170 nodos etiquetados de 183495.


100%|██████████| 156170/156170 [00:00<00:00, 1423146.35it/s]
0it [00:00, ?it/s]


156170 nodos etiquetados de 183495.
No se encontraron nuevas etiquetas.
Accuracy: 37%
Calculating Global.


100%|██████████| 38664/38664 [18:10<00:00, 35.47it/s]  
100%|██████████| 101437/101437 [10:46<00:00, 156.91it/s]


140101 nodos etiquetados de 183495.


100%|██████████| 140101/140101 [10:04<00:00, 231.60it/s]  
100%|██████████| 13662/13662 [02:23<00:00, 95.46it/s] 


153763 nodos etiquetados de 183495.


100%|██████████| 153763/153763 [01:35<00:00, 1612.73it/s]   
100%|██████████| 2004/2004 [00:37<00:00, 52.98it/s] 


155767 nodos etiquetados de 183495.


100%|██████████| 155767/155767 [00:11<00:00, 13071.12it/s]  
100%|██████████| 305/305 [00:07<00:00, 38.82it/s]


156072 nodos etiquetados de 183495.


100%|██████████| 156072/156072 [00:02<00:00, 57599.13it/s]  
100%|██████████| 81/81 [00:01<00:00, 45.26it/s]


156153 nodos etiquetados de 183495.


100%|██████████| 156153/156153 [00:00<00:00, 300508.08it/s] 
100%|██████████| 14/14 [00:00<00:00, 19.18it/s]


156167 nodos etiquetados de 183495.


100%|██████████| 156167/156167 [00:00<00:00, 894767.51it/s] 
100%|██████████| 2/2 [00:00<00:00, 77.12it/s]


156169 nodos etiquetados de 183495.


100%|██████████| 156169/156169 [00:00<00:00, 1252392.88it/s]
100%|██████████| 1/1 [00:00<00:00, 50.14it/s]


156170 nodos etiquetados de 183495.


100%|██████████| 156170/156170 [00:00<00:00, 1283511.04it/s]
0it [00:00, ?it/s]

156170 nodos etiquetados de 183495.





No se encontraron nuevas etiquetas.
Accuracy: 29%


In [4]:
print("Balanced Accuracy: Modelo de propagación aleatoria.")
print(f"{bal_acc_proba*100:.0f}%")

print("Balanced Accuracy: Modelo de propagación con criterios locales.")
print(f"{bal_acc_local*100:.0f}%")

print("Balanced Accuracy: Modelo de propagación con criterios globales.")
print(f"{bal_acc_global*100:.0f}%")

print("Balanced Accuracy: Modelo de tirar un dado de 23 caras")
label_collector = []
for n, l in test_labels.items():
    label_collector += l
topics = set(label_collector)
matches = {t: int(times_seen_local[t]/len(times_seen_local)) for t in topics}
accuracy_per_topic = {t: matches[t]/times_seen_global[t] for t in topics}
print(f"{sum(accuracy_per_topic.values())/len(accuracy_per_topic.values())*100:.0f}%")

print("Balanced Accuracy: Modelo de etiquetar TODO con la etiqueta mas común")
print(f'{1 * (times_seen_local["Software Development"]/sum(times_seen_local.values())) / len(times_seen_local)*100:.0f}%')

Balanced Accuracy: Modelo de propagación aleatoria.
11%
Balanced Accuracy: Modelo de propagación con criterios locales.
12%
Balanced Accuracy: Modelo de propagación con criterios globales.
19%
Balanced Accuracy: Modelo de tirar un dado de 23 caras
0%
Balanced Accuracy: Modelo de etiquetar TODO con la etiqueta mas común


KeyError: 'Software Development'