### instructions
0) Treat your graph as undirected and unweighted, and work on the resulting largest connected
component. Remove self-loops.
1) Create a function computing CN (common neighbours) and one of the topological indices among
JI (Jaccard index), PA (preferential attachment), AA (Adamic-Adar), and RA (resource allocation).
Your function should return a pandas DataFrame where each row is a missing link and each column is
an index. You are allowed to use built-in functions from NetworkX for computing individual
indices.
2) Create a third score by adding a column with the arithmetic mean of the two indices.
[NB: the arithmetic mean should be computed after rescaling each column between 0 and 1.]
3) For each of the 3 scores, identify as missing links the node pairs yielding the 5 largest values.
Briefly comment on the results.
4) Optional: Invent a new index/score and compare the results.

In [10]:
import networkx as nx
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def load_graph(nodes_file_path, edges_file_path):
    """Read a graph from two CSV files and report the size of its largest component.

    The nodes file must provide 'Id' and 'Label' columns, the edges file
    'Source' and 'Target' columns. The edges are first loaded as directed,
    then the graph is converted to an undirected one, self-loops are
    dropped, and the node and edge counts of the largest connected
    component (LCC) are printed. Nothing is returned.
    """
    directed = nx.DiGraph()

    # One node per row, keeping the human-readable label as an attribute.
    with open(nodes_file_path, 'r') as nodes_fh:
        for record in csv.DictReader(nodes_fh):
            directed.add_node(record['Id'], label=record['Label'])

    # One edge per row; every edge carries the same unit weight.
    with open(edges_file_path, 'r') as edges_fh:
        directed.add_edges_from(
            ((record['Source'], record['Target'])
             for record in csv.DictReader(edges_fh)),
            weight=1,
        )

    # Forget edge direction, then strip edges that join a node to itself.
    undirected = directed.to_undirected()
    self_loops = list(nx.selfloop_edges(undirected))
    undirected.remove_edges_from(self_loops)

    # The LCC is the connected component containing the most nodes.
    largest_component = max(nx.connected_components(undirected), key=len)
    lcc_view = undirected.subgraph(largest_component)
    print("The number of nodes in the LCC is:", lcc_view.number_of_nodes())
    print("The number of edges in the LCC is:", lcc_view.number_of_edges())
# Run the loader on the project CSV files; it prints the LCC node and edge counts.
load_graph('../Graph/nodes.csv', '../Graph/edges.csv')




The number of nodes in the LCC is: 70
The number of edges in the LCC is: 299


In [19]:
def load_graph(nodes_file_path, edges_file_path):
    """Load the graph from CSV files and return its largest connected component.

    The graph is built as an undirected, unweighted ``nx.Graph``. Self-loops
    are removed and only the largest connected component (LCC) is kept, so
    that link-prediction indices computed downstream are evaluated on the
    graph the exercise asks for.

    Parameters
    ----------
    nodes_file_path : path to a CSV file with 'Id' and 'Label' columns.
    edges_file_path : path to a CSV file with 'Source' and 'Target' columns.

    Returns
    -------
    networkx.Graph
        An independent copy of the LCC, with the 'label' attribute set on
        every node listed in the nodes file. An empty graph is returned
        when the files contain no nodes and no edges.

    Raises
    ------
    KeyError
        If a required column is missing from one of the CSV files.
    """
    G = nx.Graph()  # undirected by construction; no edge weights are stored

    # newline='' lets the csv module do its own line-ending handling, as
    # its documentation recommends.
    with open(nodes_file_path, 'r', newline='') as file:
        for row in csv.DictReader(file):
            G.add_node(row['Id'], label=row['Label'])

    with open(edges_file_path, 'r', newline='') as file:
        for row in csv.DictReader(file):
            G.add_edge(row['Source'], row['Target'])

    # A self-loop counts twice towards a node's degree, which would inflate
    # degree-based indices such as preferential attachment.
    G.remove_edges_from(list(nx.selfloop_edges(G)))

    # max() over zero components would raise; an empty graph is its own LCC.
    if G.number_of_nodes() == 0:
        return G

    largest_component = max(nx.connected_components(G), key=len)
    # .copy() turns the read-only subgraph view into a standalone graph.
    return G.subgraph(largest_component).copy()

def compute_CN_PA(graph):
    """Score every missing link of *graph* with the CN and PA indices.

    A missing link is a pair of distinct nodes that are not joined by an
    edge. For each such pair the function records:

    * ``Node1`` / ``Node2`` - the 'label' attribute of the two endpoints;
    * ``CN`` - common neighbours, the number of nodes adjacent to both;
    * ``PA`` - preferential attachment, the product of the two degrees.

    Returns a pandas DataFrame with one row per missing link and the
    columns listed above, in that order.
    """
    columns = {'Node1': [], 'Node2': [], 'CN': [], 'PA': []}
    node_data = graph.nodes

    # Single pass over the candidate pairs, filling all columns in lockstep.
    for a, b in nx.non_edges(graph):
        columns['Node1'].append(node_data[a]['label'])
        columns['Node2'].append(node_data[b]['label'])
        columns['CN'].append(sum(1 for _ in nx.common_neighbors(graph, a, b)))
        columns['PA'].append(graph.degree(a) * graph.degree(b))

    return pd.DataFrame(columns)

# Build the graph from the node and edge CSV files, score every non-adjacent
# node pair with compute_CN_PA, and print the resulting table.
graph = load_graph('../Graph/nodes.csv', '../Graph/edges.csv')
df = compute_CN_PA(graph)
print(df)


          Node1     Node2  CN   PA
0           IDA    GRACIE   3   75
1           IDA  PHILLIPS   0   10
2           IDA      RUTH   4  125
3           IDA    BODINE   0   40
4           IDA     BRIDE   1   20
...         ...       ...  ..  ...
2111  CAMERAMAN    DANIEL   0   14
2112  CAMERAMAN     BROCK   2   10
2113       JACK    DANIEL   6  273
2114       JACK     BROCK   1  195
2115     DANIEL     BROCK   1   35

[2116 rows x 4 columns]
