In [1]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import pickle

from collections import Counter
from tqdm import tqdm
from matplotlib import rcParams

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Global matplotlib styling shared by every figure in the notebook:
# serif text, Computer Modern math, and the default figure/legend/label sizes.
rcParams.update(
    {
        'font.family': 'serif',
        'mathtext.fontset': 'cm',
        'figure.dpi': 100,
        'legend.fontsize': 10,
        'axes.labelsize': 'large',
    }
)


In [4]:
# Load the graph.
G = nx.read_gexf("../../state_files/PyPi Network V4.gexf")
# Keep only the giant (largest) connected component, ignoring edge direction.
# An undirected *view* is enough to enumerate components and avoids
# deep-copying the whole graph; `max` picks the same component that
# `sorted(..., reverse=True)[0]` would (first one of maximal size).
gc_nodes = max(nx.connected_components(G.to_undirected(as_view=True)), key=len)
not_gc_nodes = set(G.nodes()) - gc_nodes
G.remove_nodes_from(not_gc_nodes)

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load state files that come from a trusted source.
with open("../../state_files/PyPi Dataframe V4 with subtopics.df", "rb") as f:
    df = pickle.load(f)

# Topic that carries no information and is therefore dropped from every node.
OTHER_TOPIC = "Other/Nonlisted Topic"

# Build `labels` as {node: set of topics} for nodes that
#   * have a non-null entry in "Topics for Propagation",
#   * belong to the giant component, and
#   * still have at least one topic once OTHER_TOPIC is removed.
# Fresh sets are created instead of calling `.remove()` on the stored values,
# so the collections held by `df` are never mutated (re-running the cell is
# safe) and every occurrence of OTHER_TOPIC is dropped, not just the first.
labels = {}
for node, node_topics in df["Topics for Propagation"].dropna().to_dict().items():
    if node not in gc_nodes:
        continue
    kept_topics = set(node_topics) - {OTHER_TOPIC}
    if kept_topics:
        labels[node] = kept_topics

In [5]:
# Count how many edges join two labelled nodes.
edges = list(G.edges())
nodes_with_label = labels.keys()

edges_with_label = []
for edge in tqdm(edges):
    # Skip the edge unless both endpoints are keys of `labels`.
    if edge[0] not in nodes_with_label or edge[1] not in nodes_with_label:
        continue
    edges_with_label.append(edge)

# Report the absolute counts and the share of edges that will be used.
print(f"Cantidad de enlaces cuyos nodos tienen etiquetas: {len(edges_with_label)}")
print(f"Cantidad de enlaces: {len(edges)}")
print(f"Porcentaje de enlaces a utilizar: {len(edges_with_label)/len(edges)*100:.1f}%")

100%|██████████| 878695/878695 [00:00<00:00, 2488369.39it/s]

Cantidad de enlaces cuyos nodos tienen etiquetas: 154893
Cantidad de enlaces: 878695
Porcentaje de enlaces a utilizar: 17.6%





In [7]:
# For every labelled edge, record which topics both endpoints share and which
# topics appear on either endpoint. One entry is appended per (edge, topic).
collector_in_common = []
collector_in_total = []

for edge in edges_with_label:
    source_topics = labels[edge[0]]
    target_topics = labels[edge[1]]
    # Topics present on both endpoints of the edge.
    collector_in_common.extend(source_topics & target_topics)
    # Topics present on at least one endpoint of the edge.
    collector_in_total.extend(source_topics | target_topics)

counter_in_common = dict(Counter(collector_in_common))
counter_in_total = dict(Counter(collector_in_total))

# Per topic: percentage of the edges touching the topic on which both
# endpoints carry it. Topics that are never shared are left out.
porcentaje = {}
for topic, n_total in counter_in_total.items():
    if topic not in counter_in_common:
        continue
    porcentaje[topic] = round(counter_in_common[topic]/n_total*100, 2)

# Display the topics from highest to lowest agreement.
dict(sorted(porcentaje.items(), key=lambda item: item[1], reverse=True))

{'WWW/HTTP': 28.63,
 'Hardware': 28.6,
 'Z39.50': 20.0,
 'Scientific/Engineering': 19.73,
 'Artificial Intelligence': 19.39,
 'GIS': 19.08,
 'Utilities': 19.03,
 'Office/Business': 18.59,
 'Astronomy': 17.62,
 'Electronic Design Automation (EDA)': 17.55,
 'Security': 16.15,
 'Multimedia': 15.65,
 'Bio-Informatics': 15.4,
 'Medical Science Apps.': 14.2,
 'Database': 12.75,
 'Text Processing': 11.72,
 'Power (UPS)': 11.54,
 'Physics': 11.51,
 'Text Editors': 11.42,
 'Games/Entertainment': 11.04,
 'Mathematics': 11.0,
 'Internet': 10.43,
 'Systems Administration': 10.1,
 'Chemistry': 9.8,
 'Communications': 9.79,
 'Monitoring': 9.63,
 'Visualization': 9.51,
 'Networking': 9.08,
 'Information Analysis': 8.73,
 'Distributed Computing': 8.32,
 'Documentation': 8.03,
 'Image Recognition': 7.79,
 'Boot': 7.06,
 'Artistic Software': 6.52,
 'Filesystems': 6.39,
 'Interface Engine/Protocol Translator': 6.37,
 'Software Distribution': 6.26,
 'Home Automation': 6.02,
 'Operating System Kernels': 6.

In [8]:
# Undirected working copy of the graph. `to_undirected()` already returns an
# independent deep copy, so a preceding `G.copy()` would only duplicate the
# whole graph a second time.
gc = G.to_undirected()
# Remove the nodes that have no label.
gc.remove_nodes_from(set(G.nodes) - set(nodes_with_label))
# Remove the nodes left without any neighbour (degree 0). The isolates are
# materialised into a list first because the graph is mutated while removing.
gc.remove_nodes_from(list(nx.isolates(gc)))
# Neighbour list of every remaining node; all neighbours are labelled nodes.
neighs_by_node = {n: list(nx.neighbors(gc, n)) for n in gc.nodes}

In [9]:
# Every topic that appears on at least one labelled node.
topics = {topic for node_topics in labels.values() for topic in node_topics}
# correct[t]: number of nodes whose most frequent neighbour topic is t AND
#             that actually carry t themselves.
# total[t]:   number of nodes that have t among their neighbours' topics.
correct = {t: 0 for t in topics}
total = {t: 0 for t in topics}

for node, node_neighs in neighs_by_node.items():
    # Tally the topics over all neighbours of the node.
    counter = Counter()
    for neigh in node_neighs:
        counter.update(labels[neigh])
    if not counter:
        # Nothing can be propagated to this node. Report it and move on
        # instead of failing with an IndexError on the lookup below.
        print(node)
        continue
    # Most frequent neighbour topic; ties go to the topic encountered first,
    # the same result a stable descending sort of the counts would give.
    propagated_topic = counter.most_common(1)[0][0]
    if propagated_topic in labels[node]:
        correct[propagated_topic] += 1
    for t in counter:
        total[t] += 1

# Percentage of hits per topic. Topics never seen among any node's neighbours
# (total == 0) have no defined rate and are skipped, which also avoids a
# ZeroDivisionError.
porcentaje = {
    t: round(correct[t]/total[t]*100, 2) for t in total if total[t] > 0
}
# Display the topics from highest to lowest hit rate.
dict(sorted(porcentaje.items(), key=lambda x: x[1], reverse=True))

{'WWW/HTTP': 22.46,
 'Hardware': 22.09,
 'Bio-Informatics': 20.54,
 'Electronic Design Automation (EDA)': 18.14,
 'Games/Entertainment': 17.67,
 'Scientific/Engineering': 17.16,
 'Utilities': 16.83,
 'Artificial Intelligence': 16.06,
 'GIS': 15.92,
 'Astronomy': 13.0,
 'Multimedia': 12.89,
 'Medical Science Apps.': 12.56,
 'Security': 10.38,
 'Database': 10.09,
 'Office/Business': 9.73,
 'Physics': 9.3,
 'Communications': 9.01,
 'Internet': 7.75,
 'Text Processing': 6.83,
 'Chemistry': 5.91,
 'Artificial Life': 5.62,
 'Hydrology': 5.34,
 'Monitoring': 5.02,
 'Home Automation': 4.69,
 'Networking': 4.58,
 'Mathematics': 4.32,
 'Distributed Computing': 3.95,
 'Information Analysis': 3.49,
 'Religion': 3.28,
 'Name Service (DNS)': 3.11,
 'Visualization': 3.07,
 'Emulators': 2.97,
 'Operating System Kernels': 2.83,
 'System': 2.82,
 'Operating System': 2.75,
 'Software Distribution': 2.7,
 'Filesystems': 2.38,
 'Atmospheric Science': 2.15,
 'Documentation': 2.1,
 'Image Processing': 2.01,
