In [193]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from collections import deque
from collections import defaultdict
import random
from collections import Counter

# 1. Data

In [2]:
df_hero_net = pd.read_csv(r"hero-network.csv")
df_edges = pd.read_csv(r"edges.csv")
df_nodes = pd.read_csv(r"nodes.csv")

In [3]:
df_hero_net

Unnamed: 0,hero1,hero2
0,"LITTLE, ABNER",PRINCESS ZANDA
1,"LITTLE, ABNER",BLACK PANTHER/T'CHAL
2,BLACK PANTHER/T'CHAL,PRINCESS ZANDA
3,"LITTLE, ABNER",PRINCESS ZANDA
4,"LITTLE, ABNER",BLACK PANTHER/T'CHAL
...,...,...
574462,COLOSSUS II/PETER RA,CALLISTO
574463,CALLISTO,ROGUE /
574464,CALLISTO,CALIBAN/
574465,CALIBAN/,ROGUE /


In [4]:
df_edges

Unnamed: 0,hero,comic
0,24-HOUR MAN/EMMANUEL,AA2 35
1,3-D MAN/CHARLES CHAN,AVF 4
2,3-D MAN/CHARLES CHAN,AVF 5
3,3-D MAN/CHARLES CHAN,COC 1
4,3-D MAN/CHARLES CHAN,H2 251
...,...,...
96099,ZZZAX,H2 326
96100,ZZZAX,H2 327
96101,ZZZAX,M/CP 8/4
96102,ZZZAX,PM 47


In [5]:
df_nodes

Unnamed: 0,node,type
0,2001 10,comic
1,2001 8,comic
2,2001 9,comic
3,24-HOUR MAN/EMMANUEL,hero
4,3-D MAN/CHARLES CHAN,hero
...,...,...
19085,"ZOTA, CARLO",hero
19086,ZOTA,hero
19087,ZURAS,hero
19088,ZURI,hero


In [6]:
df_nodes.shape

(19090, 2)

In [7]:
n_comics = df_nodes[df_nodes.type == 'comic'].count()
n_heros = df_nodes[df_nodes.type == 'hero'].count()
print(n_comics, n_heros)

node    12651
type    12651
dtype: int64 node    6439
type    6439
dtype: int64


## 1.1 Pre-processing 

In [8]:
#We remove extra spaces and '/' at the end of the names in the datasets. 

In [9]:
df_hero_net["hero1"] = df_hero_net["hero1"].apply(lambda x: x[0:-1] if list(x)[-1] in [' ', '/'] else x)
df_hero_net["hero2"] = df_hero_net["hero2"].apply(lambda x: x[0:-1] if list(x)[-1] in [' ', '/'] else x)

In [10]:
df_hero_net

Unnamed: 0,hero1,hero2
0,"LITTLE, ABNER",PRINCESS ZANDA
1,"LITTLE, ABNER",BLACK PANTHER/T'CHAL
2,BLACK PANTHER/T'CHAL,PRINCESS ZANDA
3,"LITTLE, ABNER",PRINCESS ZANDA
4,"LITTLE, ABNER",BLACK PANTHER/T'CHAL
...,...,...
574462,COLOSSUS II/PETER RA,CALLISTO
574463,CALLISTO,ROGUE
574464,CALLISTO,CALIBAN
574465,CALIBAN,ROGUE


In [11]:
df_edges["hero"] = df_edges["hero"].apply(lambda x: x[0:-1] if list(x)[-1] in [' ', '/'] else x)
df_edges["comic"] = df_edges["comic"].apply(lambda x: x[0:-1] if list(x)[-1] in [' ', '/'] else x)

In [12]:
df_edges

Unnamed: 0,hero,comic
0,24-HOUR MAN/EMMANUEL,AA2 35
1,3-D MAN/CHARLES CHAN,AVF 4
2,3-D MAN/CHARLES CHAN,AVF 5
3,3-D MAN/CHARLES CHAN,COC 1
4,3-D MAN/CHARLES CHAN,H2 251
...,...,...
96099,ZZZAX,H2 326
96100,ZZZAX,H2 327
96101,ZZZAX,M/CP 8/4
96102,ZZZAX,PM 47


In [13]:
df_nodes["node"] = df_nodes["node"].apply(lambda x: x[0:-1] if list(x)[-1] in [' ', '/'] else x)

In [14]:
df_nodes

Unnamed: 0,node,type
0,2001 10,comic
1,2001 8,comic
2,2001 9,comic
3,24-HOUR MAN/EMMANUEL,hero
4,3-D MAN/CHARLES CHAN,hero
...,...,...
19085,"ZOTA, CARLO",hero
19086,ZOTA,hero
19087,ZURAS,hero
19088,ZURI,hero


 We fix the 'spider-man/peter parker' error in the datasets.

In [15]:
df_hero_net = df_hero_net.replace('SPIDER-MAN/PETER PAR','SPIDER-MAN/PETER PARKER', regex=True)

In [16]:
df_nodes.node = df_nodes.node.replace('SPIDER-MAN/PETER PARKERKER','SPIDER-MAN/PETER PARKER', regex=True)

We check if the error is fixed and if there are other similar errors in the datasets.

At first, we check the hero names.

In [17]:
hero_edges = set(df_edges.hero)
hero_heronet = set(df_hero_net.hero1).union(set(df_hero_net.hero2))
hero_nodes = set(df_nodes.loc[df_nodes.type=='hero']['node'])

print(hero_edges - hero_heronet)
print(hero_heronet - hero_edges)

{'LUNATIK II', 'SHARKSKIN', 'RANDAK', 'FENRIS', 'RED WOLF II', 'GERVASE, LADY ALYSSA', 'DEATHCHARGE', 'BERSERKER II', 'MARVEL BOY II/MARTIN', 'KULL', 'JOHNSON, LYNDON BAIN', 'ZANTOR', 'RUNE', 'SEA LEOPARD', 'BLARE', 'CALLAHAN, DANNY', 'GIURESCU, RADU', 'CLUMSY FOULUP'}
set()


In [18]:
print(hero_edges - hero_nodes)
print(hero_nodes - hero_edges)

set()
set()


In [19]:
print(hero_nodes - hero_heronet)
print(hero_heronet - hero_nodes)

{'LUNATIK II', 'SHARKSKIN', 'RANDAK', 'FENRIS', 'RED WOLF II', 'GERVASE, LADY ALYSSA', 'DEATHCHARGE', 'BERSERKER II', 'MARVEL BOY II/MARTIN', 'KULL', 'JOHNSON, LYNDON BAIN', 'ZANTOR', 'RUNE', 'SEA LEOPARD', 'BLARE', 'CALLAHAN, DANNY', 'GIURESCU, RADU', 'CLUMSY FOULUP'}
set()


Then, we check the comic names.

In [20]:
comic_edges = set(df_edges.comic)
comic_nodes = set(df_nodes.loc[df_nodes.type=='comic']['node'])

In [21]:
print(comic_edges-comic_nodes)
print(comic_nodes-comic_edges)

set()
set()


We notice that some hero names are the same as comic names. We therefore decide to add word 'comic' at the end to the comic names equal to those of the heroes, in order to avoid problems in the creation and management of graphs.

In [22]:
equals = hero_edges.intersection(comic_edges)
equals

{'BLADE', 'REBEL', 'SABRE'}

In [23]:
hero_nodes.intersection(comic_nodes)

{'BLADE', 'REBEL', 'SABRE'}

In [24]:
for i in equals:
    df_edges.comic = df_edges.comic.replace(i, i+' comic', regex=True)


## 1.2 First graph

In [25]:
G1 = nx.MultiGraph()

In [26]:
df_hero_net.apply(lambda row: G1.add_edge(row['hero1'], row['hero2']), axis=1)

0          0
1          0
2          0
3          1
4          1
          ..
574462    27
574463    22
574464     8
574465    11
574466     1
Length: 574467, dtype: int64

In [27]:
#drop sefl-loops
G1.remove_edges_from(nx.selfloop_edges(G1))

In [28]:
list(G1.edges())[:10]

[('LITTLE, ABNER', 'PRINCESS ZANDA'),
 ('LITTLE, ABNER', 'PRINCESS ZANDA'),
 ('LITTLE, ABNER', 'PRINCESS ZANDA'),
 ('LITTLE, ABNER', 'PRINCESS ZANDA'),
 ('LITTLE, ABNER', 'PRINCESS ZANDA'),
 ('LITTLE, ABNER', 'PRINCESS ZANDA'),
 ('LITTLE, ABNER', 'PRINCESS ZANDA'),
 ('LITTLE, ABNER', 'PRINCESS ZANDA'),
 ('LITTLE, ABNER', "BLACK PANTHER/T'CHAL"),
 ('LITTLE, ABNER', "BLACK PANTHER/T'CHAL")]

In [29]:
dicOfOccurances = {}
for h1,h2 in df_hero_net.to_numpy():
    if h1 != h2:
        if (h1,h2) in dicOfOccurances:
            dicOfOccurances[(h1,h2)] += 1
        else:
            dicOfOccurances[(h1,h2)] = 1


In [30]:
G1_weighted = nx.MultiGraph()

In [31]:
i = 0
for edge in list(G1.edges()):
    try:
        G1_weighted.add_edge(edge[0], edge[1], weight = round(1/dicOfOccurances[edge], 3))
    except:
        G1_weighted.add_edge(edge[0], edge[1], weight = round(1/dicOfOccurances[(edge[1], edge[0])],3))


In [32]:
edges, weights = zip(*nx.get_edge_attributes(G1_weighted, 'weight').items())
for i in range(20):
    print((edges[i], weights[i]))

(('LITTLE, ABNER', 'PRINCESS ZANDA', 0), 0.2)
(('LITTLE, ABNER', 'PRINCESS ZANDA', 1), 0.2)
(('LITTLE, ABNER', 'PRINCESS ZANDA', 2), 0.2)
(('LITTLE, ABNER', 'PRINCESS ZANDA', 3), 0.2)
(('LITTLE, ABNER', 'PRINCESS ZANDA', 4), 0.2)
(('LITTLE, ABNER', 'PRINCESS ZANDA', 5), 0.2)
(('LITTLE, ABNER', 'PRINCESS ZANDA', 6), 0.2)
(('LITTLE, ABNER', 'PRINCESS ZANDA', 7), 0.2)
(('LITTLE, ABNER', "BLACK PANTHER/T'CHAL", 0), 0.25)
(('LITTLE, ABNER', "BLACK PANTHER/T'CHAL", 1), 0.25)
(('LITTLE, ABNER', "BLACK PANTHER/T'CHAL", 2), 0.25)
(('LITTLE, ABNER', "BLACK PANTHER/T'CHAL", 3), 0.25)
(('LITTLE, ABNER', "BLACK PANTHER/T'CHAL", 4), 0.25)
(('LITTLE, ABNER', "BLACK PANTHER/T'CHAL", 5), 0.25)
(('LITTLE, ABNER', "BLACK PANTHER/T'CHAL", 6), 0.25)
(('LITTLE, ABNER', 'CARNIVORE/COUNT ANDR', 0), 0.333)
(('LITTLE, ABNER', 'CARNIVORE/COUNT ANDR', 1), 0.333)
(('LITTLE, ABNER', 'CARNIVORE/COUNT ANDR', 2), 0.333)
(('LITTLE, ABNER', 'CARNIVORE/COUNT ANDR', 3), 0.333)
(('LITTLE, ABNER', 'CARNIVORE/COUNT ANDR', 4)

In [33]:
G1_weighted['LITTLE, ABNER']['PRINCESS ZANDA']

AtlasView({0: {'weight': 0.2}, 1: {'weight': 0.2}, 2: {'weight': 0.2}, 3: {'weight': 0.2}, 4: {'weight': 0.2}, 5: {'weight': 0.2}, 6: {'weight': 0.2}, 7: {'weight': 0.2}})

In [34]:
G1_weighted_graph = nx.Graph(G1_weighted)
G1_weighted_graph['LITTLE, ABNER']['PRINCESS ZANDA']

{'weight': 0.2}

In [35]:
edges, weights = zip(*nx.get_edge_attributes(G1_weighted_graph, 'weight').items())
for i in range(20):
    print((edges[i], weights[i]))

(('LITTLE, ABNER', 'PRINCESS ZANDA'), 0.2)
(('LITTLE, ABNER', "BLACK PANTHER/T'CHAL"), 0.25)
(('LITTLE, ABNER', 'CARNIVORE/COUNT ANDR'), 0.333)
(('LITTLE, ABNER', 'IRON MAN/TONY STARK'), 1.0)
(('LITTLE, ABNER', 'GOLDEN-BLADE'), 1.0)
(('LITTLE, ABNER', 'DIXON, GENERAL'), 1.0)
(('LITTLE, ABNER', 'IRON MAN IV/JAMES R.'), 1.0)
(('LITTLE, ABNER', 'JOCASTA'), 1.0)
(('LITTLE, ABNER', 'FUJIKAWA, RUMIKO'), 1.0)
(('LITTLE, ABNER', 'MADAME MENACE/SUNSET'), 1.0)
(('LITTLE, ABNER', 'JACOBS, GLENDA'), 1.0)
(('LITTLE, ABNER', 'WAR MACHINE II/PARNE'), 1.0)
(('LITTLE, ABNER', 'SAPPER'), 1.0)
(('LITTLE, ABNER', 'HOGAN, VIRGINIA PEPP'), 1.0)
(('LITTLE, ABNER', 'BINARY/CAROL DANVERS'), 1.0)
(('LITTLE, ABNER', 'FIN FANG FOOM/MIDGAR'), 1.0)
(('LITTLE, ABNER', 'MANN, DR. J. VERNON'), 0.5)
(('LITTLE, ABNER', 'THOR/DR. DONALD BLAK'), 1.0)
(('LITTLE, ABNER', 'TEMPEST II/NICOLETTE'), 1.0)
(('LITTLE, ABNER', 'JARVIS, EDWIN'), 1.0)


In [36]:
dicOfWeights = {}
for edge in G1_weighted_graph.edges():
    dicOfWeights[edge] = G1_weighted_graph.get_edge_data(edge[0], edge[1])['weight']


In [37]:
print('Number of nodes of the first graph: ' + str(len(G1_weighted_graph.nodes())))
print('Number of edges of the first graph: ' +  str(len(G1_weighted_graph.edges())))

Number of nodes of the first graph: 6421
Number of edges of the first graph: 167100


## 1.3 Second graph

In [38]:
attributes = {}
for elem in df_edges['hero']:
    attributes[elem] = {'type' : 'hero' } 
for elem in df_edges['comic']:
    attributes[elem] = {'type' : 'comic' } 

print('Number of elements in attributes list: ' + str(len(attributes)))

Number of elements in attributes list: 19090


In [178]:
G2 = nx.from_pandas_edgelist(df_edges, 'hero', 'comic')
nx.set_node_attributes(G2, attributes)
print('Number of nodes of the second graph: ' + str(len(G2.nodes())))
print('Number of edges of the second graph: ' +  str(len(G2.edges())))

Number of nodes of the second graph: 19090
Number of edges of the second graph: 96104


In [40]:
nodes, types = zip(*nx.get_node_attributes(G2, 'type').items())
for i in range(10):
    print(nodes[i], types[i])

24-HOUR MAN/EMMANUEL hero
AA2 35 comic
3-D MAN/CHARLES CHAN hero
AVF 4 comic
AVF 5 comic
COC 1 comic
H2 251 comic
H2 252 comic
M/PRM 35 comic
M/PRM 36 comic


In [218]:
def top_N(df,N):
    return df.groupby("hero", group_keys=False).count().sort_values(by='comic', ascending = False).head(N)

In [262]:
def create_G1_topN(N, df):
    G_topN = G1_weighted_graph.copy()
    most_popular_heros = df.groupby("hero", group_keys=False).count().sort_values(by='comic', ascending = False).head(N)
    notInTopN = set(G1_weighted_graph.nodes()) - set(most_popular_heros.index)
    G_topN.remove_nodes_from(notInTopN)
    return G_topN

In [263]:
def create_G2_topN(N, df):
    df_new = df.groupby(['hero'])['hero'].count().reset_index(name="count").sort_values(by = 'count',ascending=False).head(N)
    nodes=[df_new['hero'][i] for i in df_new.index]
    
    new_nodes = set(nodes)
    for i in nodes:
        app = list(df_edges.loc[df_edges['hero']==i]['comic'])
        new_nodes.update(app)
        
    G_top_N = G2.subgraph(new_nodes)
    return G_top_N

In [264]:
G1_topN = create_G1_topN(50, df_edges)
len(G1_topN.edges())

1194

In [271]:
G2_topN = create_G2_topN(50, df_edges)
# check if the number of nodes is 50
len(G2_topN.nodes())
# change the number of nodes of create_G2_topN(50, df_edges) to 50


9286

# Functionality 1

In [221]:
from collections import Counter


In [222]:
def degree_distG(G):
    degree_G = [G.degree(n) for n in G.nodes()]
    degree_counts = Counter(degree_G)
    return degree_counts


In [223]:
def hubsG(G):
    degreeG1 = [G.degree(n) for n in G.nodes()]
    ex_nodes = [n for n in G.nodes() if G.degree(n) > np.percentile(degreeG1,95)]
    return(ex_nodes)

In [224]:
def hubs(G):
    degreeG2 = [G.degree(n) for n in G.nodes()]
    ex_nodes = [n for n in G.nodes() if G.degree(n) > np.percentile(degreeG2,95) and G.nodes[n].get('type', '') == 'comic']
    return(ex_nodes)

In [225]:
def density_G(G):
    n = G.number_of_nodes()
    m = G.number_of_edges()
    return 2*m/(n*(n-1))

In [253]:
def functionality_1(G,G_type,N):
    G = G.copy()
    if G_type == 1:
        G = create_G1_topN(N,df_edges)
        
        n_nodesG1 = int(G.number_of_nodes())
        degree_G1 = G.degree()
        
        sorted_degree_G1 = sorted(degree_G1, key=lambda x: x[1], reverse=True)
    
        densityG1 = len(G.edges())/(n_nodesG1*(n_nodesG1-1)/2)
        
        degree_sequence = [deg for i, deg in G.degree()]
        avg_degreeG1 = sum(degree_sequence) / len(degree_sequence)
        
        type_graphG1 = ""    
        if density_G(G) >= 0.5:
            type_graphG1 = "DENSE"
        else:
            type_graphG1 = "SPARSE"
            
    
        print("Number of nodes of G1: ", n_nodesG1)
        print("sorted collaborations of each node:", sorted_degree_G1)
        print("The density of G1 is:", densityG1)
        print("the degree distribution of G1 is", degree_distG(G))
        print("Average degree of G1: ", avg_degreeG1)
        print("Nodes with degrees more extensive than the 95th percentile of the degree distribution: ", len(hubs(G)), 
        "\n", sorted(hubsG(G), key=lambda x: G.degree(x), reverse=True))
        print("The type of G1 is: ", type_graphG1)
    G = G.copy()
    if G_type == 2:
        G = create_G2_topN(N,df_edges)
        
        count_hero = 0
        count_comic = 0
        for elements in types:
            if elements == 'hero':
                count_hero += 1
            elif elements == 'comic':
                count_comic += 1
        degree_G = G.degree()
        
        hero_appeard = df_edges.groupby('comic').count().sort_values(by='hero', ascending=False)
    
        densityG2 = len(G.edges())/(count_hero*(count_hero-1)/2)
        
        degree_sequence = [deg for i, deg in G.degree()]
        avg_degreeG2 = sum(degree_sequence) / len(degree_sequence)
        
        type_graphG2 = ""    
        if density_G(G) >= 0.5:
            type_graphG2 = "DENSE"
        else:
            type_graphG2 = "SPARSE"
    
    
        print("Number of nodes of G2: ", count_hero, count_comic)
        print( "the number of heroes that have appeared in each comic of G2:" , "\n", hero_appeard)
        print("The density of G2 is:", densityG2)
        print("the distribution of G2 is", degree_distG(G))
        print("Average degree of G2: ", avg_degreeG2)
        print("Nodes with degrees more extensive than the 95th percentile of the degree distribution: ", len(hubs(G)),
         "\n", sorted(hubs(G), key=lambda x: G.degree(x), reverse=True))    
        print("The type of G2 is: ", type_graphG2)
    return


In [255]:
functionality_1(G1_topN,1,50)
# considering the top 50 nodes of G1:
# the density is 0.97, the value si high, so the graph is dense
# the average degree is 47.76, that means generally a node has 47.76 connections with other nodes
# in this case we don't have hubs, because don't have nodes with a degree more extensive than the 95th percentile of the degree distribution
# the type of graph is dense, so we have a lot of connections between nodes, otherwirs if that was "SPARSE" we would have a lot of nodes with a lower degree

Number of nodes of G1:  50
sorted collaborations of each node: [('IRON MAN/TONY STARK', 49), ('THOR/DR. DONALD BLAK', 49), ('CYCLOPS/SCOTT SUMMER', 49), ('BEAST/HENRY &HANK& P', 49), ('HULK/DR. ROBERT BRUC', 49), ('INVISIBLE WOMAN/SUE', 49), ('THING/BENJAMIN J. GR', 49), ('SCARLET WITCH/WANDA', 49), ('CAPTAIN AMERICA', 49), ('WONDER MAN/SIMON WIL', 49), ('WOLVERINE/LOGAN', 49), ('SHE-HULK/JENNIFER WA', 49), ('SILVER SURFER/NORRIN', 49), ('VISION', 49), ('MARVEL GIRL/JEAN GRE', 49), ('STORM/ORORO MUNROE S', 49), ('MR. FANTASTIC/REED R', 49), ('ANGEL/WARREN KENNETH', 49), ('ICEMAN/ROBERT BOBBY', 49), ('WASP/JANET VAN DYNE', 49), ('ANT-MAN/DR. HENRY J.', 49), ('BLACK WIDOW/NATASHA', 49), ('SPIDER-MAN/PETER PARKER', 49), ('SUB-MARINER/NAMOR MA', 49), ('HUMAN TORCH/JOHNNY S', 49), ('COLOSSUS II/PETER RA', 49), ('DR. STRANGE/STEPHEN', 49), ("BLACK PANTHER/T'CHAL", 48), ('JARVIS, EDWIN', 48), ('HAWK', 48), ('HERCULES [GREEK GOD]', 48), ('FURY, COL. NICHOLAS', 48), ('JONES, RICHARD MILHO', 48)

# Functionality 2 - Find top superheroes!

In [292]:
def fun_betweenness_centrality(G):
    betweenness = defaultdict(int)
    for s in G:
        for t in G:
            if s != t:
                # Calcolo i percorsi minimi tra s e t utilizzando l'algoritmo BFS
                path = nx.shortest_path(G, s, t)
                if path is not None:
                    for v in path:
                        betweenness[v] += 1
    # Normalizza i valori della betweenness
    n = len(G)
    for v in betweenness:
        betweenness[v] /= (n-1)*(n-2)
    return betweenness

In [290]:
def fun_pagerank_centrality(G, alpha=0.85, max_iter=100, tol=1e-6):
    # Inizializza i valori di PageRank di ogni nodo con 1/n, dove n è il numero di nodi del grafo
    n = len(G)
    pagerank = {node: 1/n for node in G}
    # Itera finché non si raggiunge la convergenza o il numero massimo di iterazioni
    for i in range(max_iter):
        diff = 0
        # Calcola il nuovo valore di PageRank per ogni nodo
        for node in pagerank:
            rank = sum(pagerank[neighbor] / len(G[neighbor]) for neighbor in G[node])
            # Aggiorna il valore di PageRank del nodo utilizzando la formula di PageRank
            new_rank = (1 - alpha) / n + alpha * rank
            diff += abs(new_rank - pagerank[node])
            pagerank[node] = new_rank
        # Se la differenza è minore della soglia di tolleranza, interrompi il loop
        if diff < tol:
            break
    return pagerank

In [274]:
def fun_degree_centrality(G, node):
    num_neighbors = len(list(G.neighbors(node)))
    fun_degree_centrality = num_neighbors / (len(G) - 1)
    return fun_degree_centrality

In [275]:
def fun_closeness_centrality(G, node):
  # Inizializza una coda e un dizionario per tenere traccia della distanza di ogni nodo dal nodo in questione
  queue = deque([node])
  distances = {node: 0}

  # Esegue la ricerca in larghezza a partire dal nodo in questione
  while queue:
    current_node = queue.popleft()
    for neighbor in G[current_node]:
      if neighbor not in distances:
        distances[neighbor] = distances[current_node] + 1
        queue.append(neighbor)

  # Calcola la somma delle distanze di tutti i nodi dal nodo in questione
  total_distance = sum(distances.values())
  # Calcola la centralità di prossimità come il numero di nodi del grafo diviso dalla somma delle distanze
  closeness_centrality = len(G) / total_distance
  return closeness_centrality

In [303]:
def functionality_2(G,node,metric,N):
    G = create_G1_topN(N,df_edges)
    for nodo in G.nodes():
        print("the node is: ", nodo)
        if G == G1_topN:
            if metric == "betweenness":
                print(" the betweenness centrality of G1: ", fun_betweenness_centrality(G))
                
            #print(" the degree centrality of G1: ", fun_degree_centrality(G, nodo))
            #print(" the pagerank centrality of G1: ", fun_pagerank_centrality(G1_topN))
            #print(" the closeness centrality of G1: ", fun_closeness_centrality(G,top_N))
        else:
            G == G2_topN
            print(" the betweenness centrality of G2: ", fun_betweenness_centrality(G))
            #print(" the degree centrality of G2: ", fun_degree_centrality(G2, nodo))
            #print(" the pagerank centrality of G2: ", fun_pagerank_centrality(G2_topN))
            #print(" the closeness centrality of G2: ", fun_closeness_centrality(G2, top_N))


In [306]:
Perchè me lo stampa tante volte?
def functionality_2(G,node,metric,N):
    G = create_G1_topN(N,df_edges)
    for nodo in G.nodes():
        print("the node is: ", nodo)
        if G == G1_topN:
            if metric == "betweenness":
                print(" the betweenness centrality of G1: ", fun_betweenness_centrality(G))
        else:
            G == G2_topN
            print(" the betweenness centrality of G2: ", fun_betweenness_centrality(G))
    return 
functionality_2(G1_topN,1,"betweenness",10)

In [307]:
functionality_2(G1_topN,1,"betweenness",10)

the node is:  IRON MAN/TONY STARK
 the betweenness centrality of G2:  defaultdict(<class 'int'>, {'IRON MAN/TONY STARK': 0.25, 'THOR/DR. DONALD BLAK': 0.25, 'HULK/DR. ROBERT BRUC': 0.25, 'INVISIBLE WOMAN/SUE': 0.25, 'THING/BENJAMIN J. GR': 0.25, 'CAPTAIN AMERICA': 0.25, 'WOLVERINE/LOGAN': 0.25, 'MR. FANTASTIC/REED R': 0.25, 'SPIDER-MAN/PETER PARKER': 0.25, 'HUMAN TORCH/JOHNNY S': 0.25})
the node is:  THOR/DR. DONALD BLAK
 the betweenness centrality of G2:  defaultdict(<class 'int'>, {'IRON MAN/TONY STARK': 0.25, 'THOR/DR. DONALD BLAK': 0.25, 'HULK/DR. ROBERT BRUC': 0.25, 'INVISIBLE WOMAN/SUE': 0.25, 'THING/BENJAMIN J. GR': 0.25, 'CAPTAIN AMERICA': 0.25, 'WOLVERINE/LOGAN': 0.25, 'MR. FANTASTIC/REED R': 0.25, 'SPIDER-MAN/PETER PARKER': 0.25, 'HUMAN TORCH/JOHNNY S': 0.25})
the node is:  HULK/DR. ROBERT BRUC
 the betweenness centrality of G2:  defaultdict(<class 'int'>, {'IRON MAN/TONY STARK': 0.25, 'THOR/DR. DONALD BLAK': 0.25, 'HULK/DR. ROBERT BRUC': 0.25, 'INVISIBLE WOMAN/SUE': 0.25, 'T

In [53]:
def functionality_2_nx(G,node,metric,N):
    
    G2_topN = create_G2_topN(G,N,df_edges)

    result = fun_betweenness_centrality(G)
    
    for nodo in G.nodes() and N:
        print("the node is: ", nodo) 
        pagerank = nx.pagerank(G)
        betweenness = nx.betweenness_centrality(G)
        closeness = nx.closeness_centrality(G)
        degree = nx.degree_centrality(G)


## Functionality 3

In [54]:
def top_N(df,N,G):
    df_new = df.groupby(['hero'])['hero'].count().reset_index(name="count")
    df_new= df_new.sort_values(by = 'count',ascending=False)
    df_new = df_new.head(N)
    nodes=[df_new['hero'][i] for i in df_new.index]
    
    new_nodes = set(nodes)
    for i in nodes:
        app = list(df_edges.loc[df_edges['hero']==i]['comic'])
        new_nodes.update(app)
        
    G_top_N = G.subgraph(new_nodes)
    return G_top_N

In [55]:
def shortest_path_two_nodes(node_1, node_2, G, list_nodes):
    if not nx.has_path(G,node_1,node_2):
        return []
    new_list = set(list_nodes)
    new_list.remove(node_1)
    new_list.remove(node_2)
    nodes = [i for i in G.nodes if i not in new_list]
    return list(nx.shortest_path(G.subgraph(nodes), source=node_1, target=node_2, weight=None))

In [56]:
def functionality_3(G2, N, h_1, h_n, h):
    if N:
        N = int(N)
        G2 = top_N(df_edges,N,G2)
    att = nx.get_node_attributes(G2, "type")
    if not nx.is_connected(G2):
        result = 'There is not such path because the graph is not connected'
        return result
    if h_1  not in list(G2.nodes) or h_n not in list(G2.nodes):
        result = 'There is not such path'
        return result
    shortest_path = np.array([h_1])
    s = h_1
    for t in h:
        app = shortest_path_two_nodes(s,t,G2,[h_1]+h+[h_n])
        if app == []:
            return [], G2
        #app = list(nx.shortest_path(G2, source=s, target=t, weight=None))
        s = t
        app = np.array(app)
        shortest_path = np.concatenate((shortest_path,app[1:]),axis=None)
    app = list(nx.shortest_path(G2, source=t, target=h_n, weight=None))
    app = np.array(app)
    shortest_path = np.concatenate((shortest_path,app[1:]),axis=None)
    
    return shortest_path, G2

# Functionality 4

In [57]:
def search_shortest_path(G, source, target):
    try:
        return nx.shortest_path(G, source, target, weight='weight', method='dijkstra')
    except: 
        return 0

In [58]:
def functionality_4(G, heroA, heroB, N, df):
    count = 0
    G_final = create_G_topN(G, N, df)

    while search_shortest_path(G_final, heroA, heroB) != 0:
        shortest_path = search_shortest_path(G_final, heroA, heroB)
        dic = {}
        for i in range(len(shortest_path)-1):
            weight = G.get_edge_data(shortest_path[i], shortest_path[i+1])['weight']
            dic[(shortest_path[i],shortest_path[i+1])] = weight
        
        dic = dict(sorted(dic.items(), key=lambda item: item[1], reverse = True))       
        G_final.remove_edge(list(dic.keys())[0][0], list(dic.keys())[0][1])
        count+=1
    
    colors = ['red' if node == heroA or node == heroB else 'blue' 
                for node in G_final.nodes()]
                
    return G_final, count, colors

# Functionality 5

In [59]:
def top_N_1(df,N,G):
    df_new = df.groupby(['hero'])['hero'].count().reset_index(name="count")
    df_new= df_new.sort_values(by = 'count',ascending=False)
    df_new = df_new.head(N)
    nodes=[df_new['hero'][i] for i in df_new.index]
    G_top_N = G.subgraph(nodes)
    return G_top_N

In [60]:
def edges_to_remove(G):
    
    num_components = nx.number_connected_components(G)
    
    edges_to_remove = 0
    while True:
        if num_components > 1:
                break
        G_dict = {}
        G_dict = nx.edge_betweenness_centrality(G)
        max_EBC = max(G_dict.values())
        edges = []
        for key, value in sorted(G_dict.items(), key=lambda item: item[1], reverse = True):
            if value < max_EBC:
                break
            edges.append(key)
        edges_to_remove += len(edge)
        for e in edges:
            G.remove_edge(e[0], e[1])
        components = nx.connected_components(G)
        num_components = nx.number_connected_components(G)
    return edges_to_remove, G

In [61]:
def functionality_5(G1,N, Hero_1, Hero_2):
    G_top_N = top_N_1(df_edges,N,G1)
    G = nx.Graph(G_top_N)
    
    hero_1_2_together = False
    number_of_edges, G = edges_to_remove(G)
    comunities = []
    for comunity in nx.connected_components(G):
        comunities.append(list(comunity))
        if Hero_1 in comunity and Hero_2 in comunity:
            hero_1_2_together = True
    

    return number_of_edges, G, comunities, hero_1_2_together, G_top_N

# 3. Frontend Implementationc

## Visualization 1 - Visualize some features of the network

In [None]:
hero_appeard = df_edges.groupby('comic').count().sort_values(by='hero', ascending=False)
print( "the number of heroes that have appeared in each comic of G2:" , "\n", hero_appeard)

In [142]:
def functionality_1(G,G_type,N):
    G = G.copy()
    if G_type == 1:
        G = create_G1_topN(G,N,df_edges)
        
        n_nodesG1 = int(G.number_of_nodes())
        degree_G1 = G.degree()
        
        sorted_degree_G1 = sorted(degree_G1, key=lambda x: x[1], reverse=True)
    
        densityG1 = len(G.edges())/(n_nodesG1*(n_nodesG1-1)/2)
        
        degree_sequence = [deg for i, deg in G.degree()]
        avg_degreeG1 = sum(degree_sequence) / len(degree_sequence)
        
        type_graphG1 = ""    
        if density_G(G) >= 0.5:
            type_graphG1 = "DENSE"
        else:
            type_graphG1 = "SPARSE"
            
    
        print("Number of nodes of G1: ", n_nodesG1)
        print("sorted collaborations of each node:", sorted_degree_G1)
        print("The density of G1 is:", densityG1)
        print(degree_distG(G))
        print("Average degree of G1: ", avg_degreeG1)
        print("Nodes with degrees more extensive than the 95th percentile of the degree distribution: ", len(hubs(G)), 
        "\n", sorted(hubs(G), key=lambda x: G.degree(x), reverse=True))
        print("The type of G1 is: ", type_graphG1)
    

        general_info_G1 = {
                "Number of nodes": n_nodesG1,
                "Density": densityG1,
                "Average degree": avg_degreeG1,
                "netowrk type" : type_graphG1,
                }
        # general info of G1
        general_info_G1= pd.DataFrame(general_info_G1, index=[0])
        print("General info of G1","/n", general_info_G1)
        # hubs of G1
        df_hubs = pd.DataFrame(hubsG(G), columns=['Hubs'])
        df_hubs
        print("hubs tabe of G1", df_hubs)
        # number of collaborations of each hero
        plt.figure(figsize=(10,10))
        plt.bar(*zip(*sorted_degree_G1))
        plt.title("Number of collaborations of each hero in descending order of G1")
        plt.xlabel("Hero")
        plt.ylabel("Number of collaborations")
        plt.show()

        # degree distribution of the network
        plt.figure(figsize=(10,5))
        plt.hist(degree_distG(G), bins=50)
        plt.xlabel('degree')
        plt.ylabel('number of nodes')
        plt.show()

    if G_type == 2:
        
        
        count_hero = 0
        count_comic = 0
        for elements in types:
            if elements == 'hero':
                count_hero += 1
            elif elements == 'comic':
                count_comic += 1
        degree_G = G.degree()
        
        hero_appeard = df_edges.groupby('comic').count().sort_values(by='hero', ascending=False)
    
        densityG2 = len(G.edges())/(count_hero*(count_hero-1)/2)
        
        degree_sequence = [deg for i, deg in G.degree()]
        avg_degreeG2 = sum(degree_sequence) / len(degree_sequence)
        
        type_graphG2 = ""    
        if density_G(G) >= 0.5:
            type_graphG2 = "DENSE"
        else:
            type_graphG2 = "SPARSE"
    
    
        print("Number of nodes of G2: ", count_hero, count_comic)
        print( "the number of heroes that have appeared in each comic of G2:" , "\n", hero_appeard)
        print("The density of G2 is:", densityG2)
        print(degree_distG(G))
        print("Average degree of G2: ", avg_degreeG2)
        print("Nodes with degrees more extensive than the 95th percentile of the degree distribution: ", len(hubs(G)),
         "\n", sorted(hubs(G), key=lambda x: G.degree(x), reverse=True))    
        print("The type of G2 is: ", type_graphG2)
       
        general_info_G2 = {
                "Number of nodes": count_hero and count_comic,
                "Density": densityG2,
                "Average degree": avg_degreeG2,
                "netowrk type" : type_graphG2,
                }
            # General info of G2 as a table with pandas
        genaral_info_G2 = pd.DataFrame(general_info_G2, index=[0])
        print("general info of G2",general_info_G2)

            # hubs table
        df_hubs = pd.DataFrame(hubs(G), columns=['Hubs'])
        df_hubs
        print("hubs tabe of G2", df_hubs)

            #number of heroes who appeared in each comic
        plt.figure(figsize=(10,5))
        plt.hist(hero_appeard['hero'], bins=100)
        plt.xlabel('number of heroes')
        plt.ylabel('number of comics')
        plt.show()

            # degree distribution of the network
        plt.figure(figsize=(10,5))
        plt.hist(degree_distG(G), bins=50)
        plt.xlabel('degree')
        plt.ylabel('number of nodes')
        plt.show()
           
        return

In [None]:

functionality_1(G2,2,10)