In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

In [2]:
pip install --upgrade scipy networkx

Collecting scipy
  Downloading scipy-1.9.3-cp39-cp39-win_amd64.whl (40.2 MB)
Collecting networkx
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
Installing collected packages: scipy, networkx
  Attempting uninstall: scipy
    Found existing installation: scipy 1.7.3
    Uninstalling scipy-1.7.3:
      Successfully uninstalled scipy-1.7.3
  Attempting uninstall: networkx
    Found existing installation: networkx 2.7.1
    Uninstalling networkx-2.7.1:
      Successfully uninstalled networkx-2.7.1
Successfully installed networkx-2.8.8 scipy-1.9.3
Note: you may need to restart the kernel to use updated packages.


# 1. Data

In [186]:
df_hero_net = pd.read_csv(r"C:/Users/auror/Desktop/data_science/ADM/HM/5/hero-network.csv")
df_edges = pd.read_csv(r"C:/Users/auror/Desktop/data_science/ADM/HM/5/edges.csv")
df_nodes = pd.read_csv(r"C:/Users/auror/Desktop/data_science/ADM/HM/5/nodes.csv")

In [187]:
df_nodes.loc[df_nodes['node']=='SPIDER-MAN/PETER PARKERKER']

Unnamed: 0,node,type
14618,SPIDER-MAN/PETER PARKERKER,hero


In [188]:
n_comics = df_nodes[df_nodes.type == 'comic'].count()
n_heros = df_nodes[df_nodes.type == 'hero'].count()
print(n_comics, n_heros)

node    12651
type    12651
dtype: int64 node    6439
type    6439
dtype: int64


# NEW pre-processing

In [5]:
df_hero_net.head(1)

Unnamed: 0,hero1,hero2
0,"LITTLE, ABNER",PRINCESS ZANDA


In [6]:
df_edges.head(1)

Unnamed: 0,hero,comic
0,24-HOUR MAN/EMMANUEL,AA2 35


In [8]:
df_nodes.head(1)

Unnamed: 0,node,type
0,2001 10,comic


### delete space and '/'

In [189]:
df_hero_net["hero1"] = df_hero_net["hero1"].apply(lambda x: x[0:-1] if list(x)[-1] in [' ', '/'] else x)
df_hero_net["hero2"] = df_hero_net["hero2"].apply(lambda x: x[0:-1] if list(x)[-1] in [' ', '/'] else x)

In [190]:
df_edges["hero"] = df_edges["hero"].apply(lambda x: x[0:-1] if list(x)[-1] in [' ', '/'] else x)
df_edges["comic"] = df_edges["comic"].apply(lambda x: x[0:-1] if list(x)[-1] in [' ', '/'] else x)

In [191]:
df_nodes["node"] = df_nodes["node"].apply(lambda x: x[0:-1] if list(x)[-1] in [' ', '/'] else x)

### fixing 'spider-man/peter parker'

In [192]:
df_hero_net = df_hero_net.replace('SPIDER-MAN/PETER PAR','SPIDER-MAN/PETER PARKER', regex=True)

In [193]:
df_nodes.node = df_nodes.node.replace('SPIDER-MAN/PETER PARKERKER','SPIDER-MAN/PETER PARKER', regex=True)

### checking hero names

In [194]:
hero_edges = set(df_edges.hero)
hero_heronet = set(df_hero_net.hero1).union(set(df_hero_net.hero2))
hero_nodes = set(df_nodes.loc[df_nodes.type=='hero']['node'])

In [195]:
print(hero_edges - hero_heronet)
print(hero_heronet - hero_edges)

{'KULL', 'SEA LEOPARD', 'FENRIS', 'CALLAHAN, DANNY', 'SHARKSKIN', 'LUNATIK II', 'BLARE', 'CLUMSY FOULUP', 'GIURESCU, RADU', 'JOHNSON, LYNDON BAIN', 'RED WOLF II', 'GERVASE, LADY ALYSSA', 'MARVEL BOY II/MARTIN', 'RANDAK', 'BERSERKER II', 'ZANTOR', 'DEATHCHARGE', 'RUNE'}
set()


In [196]:
print(hero_edges - hero_nodes)
print(hero_nodes - hero_edges)

set()
set()


In [197]:
print(hero_nodes - hero_heronet)
print(hero_heronet - hero_nodes)

{'KULL', 'SEA LEOPARD', 'FENRIS', 'CALLAHAN, DANNY', 'SHARKSKIN', 'LUNATIK II', 'BLARE', 'CLUMSY FOULUP', 'GIURESCU, RADU', 'JOHNSON, LYNDON BAIN', 'RED WOLF II', 'GERVASE, LADY ALYSSA', 'MARVEL BOY II/MARTIN', 'RANDAK', 'BERSERKER II', 'ZANTOR', 'DEATHCHARGE', 'RUNE'}
set()


### checking comic names

In [198]:
comic_edges = set(df_edges.comic)
comic_nodes = set(df_nodes.loc[df_nodes.type=='comic']['node'])

In [199]:
print(comic_edges-comic_nodes)
print(comic_nodes-comic_edges)

set()
set()


### hero names = comic names

In [200]:
equals = hero_edges.intersection(comic_edges)
equals

{'BLADE', 'REBEL', 'SABRE'}

In [201]:
for i in equals:
    df_edges.comic = df_edges.comic.replace(i, i+' ', regex=True)

# Second graph

In [206]:
attributes = {}
for elem in df_edges['hero']:
    attributes[elem] = {'type' : 'hero' } 
for elem in df_edges['comic']:
    attributes[elem] = {'type' : 'comic' } 

In [207]:
len(attributes)

19090

In [208]:
G2 = nx.from_pandas_edgelist(df_edges, 'hero', 'comic')
nx.set_node_attributes(G2, attributes)

In [209]:
len(G2.nodes)

19090

In [210]:
len(G2.edges)

96104

In [211]:
df_edges.shape

(96104, 2)

# 2. Backend Implementation

# Functionality 3 - Shortest ordered Route

Input:

- The graph data
- A sequence of superheroes h = [h_2, ..., h_n-1]
- Initial node h_1 and an end node h_n
- N: denoting the top N heroes that their data should be considered

Output:

- The shortest walk of comics that you need to read to get from hero_1 to hero_n
Considerations: For this functionality, you need to implement an algorithm that returns the shortest walk that goes from node h_j to h_n, which visits in order the nodes in h. The choice of h_j and h_n can be made randomly (or if it improves the performance of the algorithm, you can also define it in any other way)

Important Notes:

- This algorithm should be run only on the second graph (G2).
- The algorithm needs to handle the case that the graph is not connected. Thus, only some of the nodes in h are reachable from h_1. In such a scenario, it is enough to let the program give in the output the string "There is no such path".
- Since we are dealing with walks, you can pass on the same node h_i more than once, but you have to preserve order. E.g., if you start from Spiderman to reach deadpool, and your path requires you to visit iron-man and colossus, you can go back to any comics any time you want, assuming that the order in which you visit the heroes is still the same.

In [181]:
att = nx.get_node_attributes(G2, "type")

In [182]:
len(att)

19090

In [183]:
len(G2.nodes)

19090

In [184]:
att['SPIDER-MAN/PETER PARKER']

'hero'

In [185]:
print(att['BLADE'],att['BLADE '])

hero comic


In [None]:
# PROVA
nodes = set(nx.all_neighbors(G2,'3-D MAN/CHARLES CHAN' ))
nodes.add('3-D MAN/CHARLES CHAN')
l = list(nx.all_neighbors(G2,'ROSS, GEN. THADDEUS' ))
for i in l:
    nodes.add(i)
nodes.add('ROSS, GEN. THADDEUS')
len(nodes)
G2_new = G2.subgraph(nodes)

In [96]:
def top_N(df,N,G):
    df_new = df.groupby(['hero'])['hero'].count().reset_index(name="count")
    df_new= df_new.sort_values(by = 'count',ascending=False)
    df_new = df_new.head(N)
    nodes=[df_new['hero'][i] for i in df_new.index]
    
    new_nodes = set(nodes)
    for i in nodes:
        app = list(df_edges.loc[df_edges['hero']==i]['comic'])
        new_nodes.update(app)
        
    G_top_N = G.subgraph(new_nodes)
    return G_top_N

In [97]:
def shortest_path(G2,node,path,final_node,visited,shortest_one,att):
    if node == final_node and att[node]=='hero':
        path.append(node)
        if len(path) < len(shortest_one):
            shortest_one = path.copy()
        path.pop(len(path)-1)
        return shortest_one
    
    else:
        for new_node in list(nx.all_neighbors(G2, node)):
            if new_node not in visited:
                if att[new_node]=='comic':
                    path.append(new_node)
                visited.append(new_node)
                shortest_one = shortest_path(G2,new_node,path,final_node,visited,shortest_one,att)
                if att[new_node]=='comic':
                    path.pop(len(path)-1)
                visited.pop(len(visited)-1)   
        return shortest_one

In [98]:
def functionality_3(G2, N, h_1, h_2):
    if N:
        N = int(N)
        G2 = top_N(df_edges,N,G2)
    att = nx.get_node_attributes(G2, "type")
    if not nx.is_connected(G2):
        result = 'There is not such path because the graph is not connected'
        return result
    if h_1  not in list(G2.nodes) or h_2 not in list(G2.nodes):
        result = 'There is not such path'
        return result
    shortest_one = list(range(0,len(G2.nodes)))
    shortest_one = shortest_path(G2,h_1,[h_1],h_2,[h_1],shortest_one,att)
    return shortest_one

In [104]:
N = input()

5


In [105]:
print(functionality_3(G2,N,'SPIDER-MAN/PETER PARKER','THOR/DR. DONALD BLAK'))

['SPIDER-MAN/PETER PARKER', 'CAPTAIN AMERICA', 'IRON MAN/TONY STARK', 'THING/BENJAMIN J. GR', 'THOR/DR. DONALD BLAK']


KeyboardInterrupt: 

In [48]:
print(functionality_3(G2_new,N,'3-D MAN/CHARLES CHAN','ROSS, GEN. THADDEUS'))

['3-D MAN/CHARLES CHAN', 'H2 251', 'ROSS, GEN. THADDEUS']


In [35]:
nx.shortest_path(G2, source='THOR/DR. DONALD BLAK', target='SPIDER-MAN/PETER PARKER', weight=None)

['THOR/DR. DONALD BLAK', 'A 11', 'SPIDER-MAN/PETER PARKER']

In [45]:
nx.shortest_path(G, source='3-D MAN/CHARLES CHAN', target='ROSS, GEN. THADDEUS', weight=None)

['3-D MAN/CHARLES CHAN', 'H2 251', 'ROSS, GEN. THADDEUS']

## Functionality 4 - Disconnecting Graphs (da rivedere/continuare)

Input:

- The graph data
- heroA: a superhero to which will relate sub-graph G_a
- heroB: a superhero to which will relate sub-graph G_b
- N: denoting the top N heroes that their data should be considered


Output:

- The minimum number of links (by considering their weights) required to disconnect the original graph in two disconnected subgraphs: G_a and G_b.

In [69]:
print(nx.is_connected(G1))
print(nx.is_connected(G2))

False
False


In [14]:
# nx.connected_components gets the list of components,
# max() command returns the largest one
components = nx.connected_components(G1)
largest_component = max(components, key=len)

In [16]:
# returns neighbors
sorted(list(G1.neighbors('JOCASTA')))[:10]

['3-D MAN/CHARLES CHAN',
 'AJAK/TECUMOTZIN [ETE',
 'ANGEL/WARREN KENNETH',
 'ANT-MAN II/SCOTT HAR',
 'ANT-MAN/DR. HENRY J.',
 'ANTOINETTE, MARIE',
 'ARABIAN KNIGHT/ABDUL',
 'ARBOGAST, BAMBI',
 'ASTROVIK, NORMA',
 'ATTUMA']

In [105]:
# returns all nodes reachable from source in G
sorted(list(nx.descendants(G1, 'JOCASTA')))[:10]

['24-HOUR MAN/EMMANUEL',
 '3-D MAN/CHARLES CHAN',
 '4-D MAN/MERCURIO',
 '8-BALL',
 'A',
 "A'YIN",
 'ABBOTT, JACK',
 'ABCISSA',
 'ABEL',
 'ABOMINATION | MUTANT']

In [15]:
# add N
def disconneting_graphs(G, heroA, heroB):
    G_a = []
    G_b = []

    for edge in list(G.edges()):
        if heroA in edge:
            G_a.append(edge)

    for edge in list(G.edges()):
        if heroB in edge:
            G_b.append(edge)

    return G_a, G_b, len(G_a) + len(G_b)

In [17]:
G_a, G_b, result = disconneting_graphs(G1, 'JOCASTA', '8-BALL')
print(result)

760


In [102]:
newG1 = G1.copy()
newG1.remove_edges_from(G_a)
newG1.remove_edges_from(G_b)

In [108]:
G1.number_of_edges() - newG1.number_of_edges()

760

In [106]:
print(len(list(newG1.neighbors('JOCASTA'))))
print(len(list(newG1.neighbors('8-BALL'))))


0
0
