### instructions
0) Treat your graph as undirected and unweighted, and work on the resulting largest connected
component. Delete self-loops.
1) Create a function computing CN and one of the topological indices among JI, PA, AA, RA.
Your function should return a pandas DataFrame where each row is a missing link and each column is
an index. You are allowed to use built-in functions from NetworkX for computing individual
indices.
2) Create a third score by adding a column with the arithmetic mean between the two indices.
[NB: the arithmetic mean should be computed after rescaling each column between 0 and 1.]
3) For each of the 3 scores, identify as missing links the node pairs yielding the largest 5 values.
Briefly comment the results.
4) Optional: Invent a new index/score and compare the results.

In [2]:
import networkx as nx
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def load_graph(nodes_file_path, edges_file_path):
    """Read a graph from two CSV files and report the size of its LCC.

    The nodes file must provide the columns 'Id' and 'Label'; the edges file
    must provide 'Source' and 'Target'. The edges are first stored in a
    directed graph, which is then converted to an undirected one and stripped
    of self-loops. The node and edge counts of the largest connected
    component (LCC) are printed; nothing is returned.
    """
    directed = nx.DiGraph()

    with open(nodes_file_path, 'r') as nodes_file:
        for record in csv.DictReader(nodes_file):
            directed.add_node(record['Id'], label=record['Label'])

    with open(edges_file_path, 'r') as edges_file:
        for record in csv.DictReader(edges_file):
            directed.add_edge(record['Source'], record['Target'], weight=1)

    # Drop edge direction, then discard every edge joining a node to itself.
    undirected = directed.to_undirected()
    loops = list(nx.selfloop_edges(undirected))
    undirected.remove_edges_from(loops)

    # The LCC is the connected component containing the most nodes.
    largest_component = max(nx.connected_components(undirected), key=len)
    component_graph = undirected.subgraph(list(largest_component))
    print("The number of nodes in the LCC is:", component_graph.number_of_nodes())
    print("The number of edges in the LCC is:", component_graph.number_of_edges())
# Run the loader on the project CSV files; it prints the LCC node and edge counts.
load_graph('../Graph/nodes.csv', '../Graph/edges.csv')




The number of nodes in the LCC is: 70
The number of edges in the LCC is: 299


In [7]:
def load_graph(nodes_file_path, edges_file_path):
    """Build an undirected, unweighted graph without self-loops from CSV files.

    Parameters
    ----------
    nodes_file_path : path to a CSV file with the columns 'Id' and 'Label'.
        Each row becomes a node keyed by 'Id' carrying a 'label' attribute.
    edges_file_path : path to a CSV file with the columns 'Source' and
        'Target'. Each row becomes an undirected edge.

    Returns
    -------
    networkx.Graph
        The loaded graph with every self-loop removed.
    """
    G = nx.Graph()  # undirected; edges are added without a weight attribute

    with open(nodes_file_path, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            G.add_node(row['Id'], label=row['Label'])

    with open(edges_file_path, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            G.add_edge(row['Source'], row['Target'])

    # A self-loop adds 2 to the degree of its node, which would inflate the
    # degree-based scores computed later. Materialise the loop list first so
    # the graph is not modified while it is being iterated.
    G.remove_edges_from(list(nx.selfloop_edges(G)))

    return G

def compute_CN_PA(graph):
    """Score every unconnected node pair of *graph* with CN and PA.

    CN (Common Neighbors) is the number of neighbours shared by the two
    nodes; PA (Preferential Attachment) is the product of their degrees.

    Parameters
    ----------
    graph : networkx graph whose nodes carry a 'label' attribute.

    Returns
    -------
    pandas.DataFrame
        One row per non-edge, with the columns 'Node1' and 'Node2' (the node
        labels), 'CN' and 'PA'.
    """
    first_labels, second_labels = [], []
    cn_scores, pa_scores = [], []

    # Single pass over the candidate links, filling the four columns in step.
    for u, v in nx.non_edges(graph):
        first_labels.append(graph.nodes[u]['label'])
        second_labels.append(graph.nodes[v]['label'])
        cn_scores.append(sum(1 for _ in nx.common_neighbors(graph, u, v)))
        pa_scores.append(graph.degree(u) * graph.degree(v))

    return pd.DataFrame({
        'Node1': first_labels,
        'Node2': second_labels,
        'CN': cn_scores,
        'PA': pa_scores,
    })

# Build the graph from the CSV files and keep its largest connected component (LCC).
graph = load_graph('../Graph/nodes.csv', '../Graph/edges.csv')
lcc_nodes = max(nx.connected_components(graph), key=len)  # node set of the biggest component
lcc = graph.subgraph(lcc_nodes)

# Compute the CN and PA scores for every non-edge of the LCC and show the table.
df = compute_CN_PA(lcc)
print(df)


       Node1    Node2  CN   PA
0     GRACIE     BELL   1   90
1     GRACIE    BROCK   1   75
2     GRACIE     SVEN   1   60
3     GRACIE  HUSBAND   3   75
4     GRACIE    BUELL   1   90
...      ...      ...  ..  ...
2111  MOTHER     ROSE   1   92
2112  MOTHER      MAN   1   22
2113    ROWE   WAITER   2   30
2114    ROWE      MAN   2   33
2115  WAITER      MAN   4  110

[2116 rows x 4 columns]


In [8]:
def rescale(series):
    """Min-max rescale *series* to the interval [0, 1].

    Parameters
    ----------
    series : pandas.Series of numeric values.

    Returns
    -------
    pandas.Series
        ``(series - min) / (max - min)``. When every value is identical the
        range is zero; a series of 0.0 is returned in that case instead of
        the NaN values a 0/0 division would produce.
    """
    min_val = series.min()
    value_range = series.max() - min_val

    if value_range == 0:
        # Constant column: each value equals the minimum, so map it to 0.0.
        return (series - min_val).astype(float)

    return (series - min_val) / value_range

def add_mean_column(df):
    """Add rescaled CN/PA columns and their row-wise mean to *df*.

    The frame is modified in place: 'CN_scaled' and 'PA_scaled' hold the
    output of ``rescale`` for the 'CN' and 'PA' columns, and 'Mean_CN_PA'
    holds the arithmetic mean of those two scaled columns. The same frame is
    returned.
    """
    scaled_name_for = {'CN': 'CN_scaled', 'PA': 'PA_scaled'}

    # Rescale each raw score into its own new column.
    for raw_name, scaled_name in scaled_name_for.items():
        df[scaled_name] = rescale(df[raw_name])

    # Average the two scaled scores row by row.
    scaled_columns = list(scaled_name_for.values())
    df['Mean_CN_PA'] = df[scaled_columns].mean(axis=1)
    return df

# Rebuild the graph and extract its largest connected component (LCC)
graph = load_graph('../Graph/nodes.csv', '../Graph/edges.csv')
lcc_nodes = max(nx.connected_components(graph), key=len)  # node set of the biggest component
lcc = graph.subgraph(lcc_nodes)

# Compute the CN and PA scores for every non-edge of the LCC
df = compute_CN_PA(lcc)

# Append the rescaled columns and their mean (the third score), then show the table
df = add_mean_column(df)
print(df)

       Node1    Node2  CN   PA  CN_scaled  PA_scaled  Mean_CN_PA
0     GRACIE     BELL   1   90        0.1   0.161525    0.130762
1     GRACIE    BROCK   1   75        0.1   0.134301    0.117151
2     GRACIE     SVEN   1   60        0.1   0.107078    0.103539
3     GRACIE  HUSBAND   3   75        0.3   0.134301    0.217151
4     GRACIE    BUELL   1   90        0.1   0.161525    0.130762
...      ...      ...  ..  ...        ...        ...         ...
2111  MOTHER     ROSE   1   92        0.1   0.165154    0.132577
2112  MOTHER      MAN   1   22        0.1   0.038113    0.069056
2113    ROWE   WAITER   2   30        0.2   0.052632    0.126316
2114    ROWE      MAN   2   33        0.2   0.058076    0.129038
2115  WAITER      MAN   4  110        0.4   0.197822    0.298911

[2116 rows x 7 columns]


In [9]:
def find_top_missing_links(df, indices, top_n=5):
    """Return the highest-scoring node pairs for each score column.

    Parameters
    ----------
    df : pandas.DataFrame with the columns 'Node1' and 'Node2' plus one
        column per entry of *indices*.
    indices : iterable of column names to rank by.
    top_n : number of pairs to keep per score (default 5).

    Returns
    -------
    dict
        Maps each column name to a list of ``[Node1, Node2]`` pairs, ordered
        from the highest score downwards.
    """
    predicted_missing_links = {}
    for index in indices:
        # A stable sort keeps tied rows in their original order, so the
        # selection is reproducible when many pairs share the same score.
        best_rows = df.sort_values(
            by=index, ascending=False, kind='stable'
        ).head(top_n)
        predicted_missing_links[index] = best_rows[['Node1', 'Node2']].values.tolist()
    return predicted_missing_links

# Score columns to rank by: the two raw indices and their rescaled mean
indices = ['CN', 'PA', 'Mean_CN_PA']

# Identify the top missing links for each index
top_missing_links = find_top_missing_links(df, indices)

# Display the results, one "Node1 - Node2" line per predicted link
for index, links in top_missing_links.items():
    print(f"The top 5 missing links for the {index} index are:")
    for link in links:
        print(f"{link[0]} - {link[1]}")
        

The top 5 missing links for the CN index are:
MOLLY - LOVEJOY
MURDOCH - JACK
MURDOCH - ROSE
WOMAN - RUTH
CAL - SMITH
The top 5 missing links for the PA index are:
MURDOCH - ROSE
CAL - SMITH
MURDOCH - JACK
MOLLY - LOVEJOY
LOVETT - JACK
The top 5 missing links for the Mean_CN_PA index are:
MURDOCH - ROSE
MURDOCH - JACK
CAL - SMITH
MOLLY - LOVEJOY
WOMAN - RUTH
