# Week 7

### Instructions
0) Treat your graph as undirected and unweighted, and work on the resulting largest connected
component. Delete self-loops.
1) Create a function computing CN and one of the following topological indices: JI, PA, AA, RA.
Your function should return a pandas DataFrame where each row is a missing link and each column is
an index. You are allowed to use built-in functions from NetworkX for computing individual
indices.
2) Create a third score by adding a column with the arithmetic mean between the two indices.
[NB: the arithmetic mean should be computed after rescaling each column between 0 and 1.]
3) For each of the 3 scores, identify as missing links the node pairs yielding the largest 5 values.
Briefly comment on the results.
4) Optional: Invent a new index/score and compare the results.

In [39]:
import networkx as nx
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def load_graph(nodes_file_path='../Graph/nodes.csv', edges_file_path='../Graph/edges.csv'):
    """Print the node and edge counts of the graph's largest connected component.

    The node and edge CSV files are read into a directed graph, which is then
    converted to an undirected graph with self-loops removed. The largest
    connected component (LCC) of that graph is extracted and its size printed.

    Args:
        nodes_file_path: CSV file with 'Id' and 'Label' columns, one node per row.
        edges_file_path: CSV file with 'Source' and 'Target' columns, one edge per row.

    Returns:
        None. The function only reports the LCC size on standard output.
    """
    G = nx.DiGraph()

    # The csv module expects files opened with newline='' so that line breaks
    # embedded in quoted fields are parsed correctly.
    with open(nodes_file_path, 'r', newline='') as file:
        for row in csv.DictReader(file):
            G.add_node(row['Id'], label=row['Label'])

    with open(edges_file_path, 'r', newline='') as file:
        for row in csv.DictReader(file):
            # Every edge gets the same weight, i.e. the graph is unweighted
            G.add_edge(row['Source'], row['Target'], weight=1)

    U = G.to_undirected()
    # Materialise the self-loops first so the graph is not mutated while the
    # generator is still walking it.
    U.remove_edges_from(list(nx.selfloop_edges(U)))

    # default=set() keeps an empty graph from raising ValueError in max()
    LCCNodes = max(nx.connected_components(U), key=len, default=set())
    LCC = U.subgraph(LCCNodes)

    print(f"The number of \033[1mnodes\033[0m in the LCC is: \033[1m{LCC.number_of_nodes()}\033[0m")
    print(f"The number of \033[1medges\033[0m in the LCC is: \033[1m{LCC.number_of_edges()}\033[0m")

    return

# Build the graph and print the node/edge counts of its largest connected component
load_graph()

The number of [1mnodes[0m in the LCC is: [1m70[0m
The number of [1medges[0m in the LCC is: [1m299[0m


In [40]:
def load_graph(nodes_file_path='../Graph/nodes.csv', edges_file_path='../Graph/edges.csv'):
    """Load the node and edge CSV files into an undirected graph without self-loops.

    Args:
        nodes_file_path: CSV file with 'Id' and 'Label' columns, one node per row.
        edges_file_path: CSV file with 'Source' and 'Target' columns, one edge per row.

    Returns:
        An ``nx.Graph`` whose nodes carry a 'label' attribute. Edges are
        unweighted and self-loops are removed.
    """
    G = nx.Graph()

    # The csv module expects files opened with newline='' so that line breaks
    # embedded in quoted fields are parsed correctly.
    with open(nodes_file_path, 'r', newline='') as file:
        for row in csv.DictReader(file):
            G.add_node(row['Id'], label=row['Label'])

    with open(edges_file_path, 'r', newline='') as file:
        for row in csv.DictReader(file):
            G.add_edge(row['Source'], row['Target'])

    # The exercise asks for loops to be deleted. A self-loop would otherwise
    # inflate the node's degree and therefore every degree-based score.
    # The list() avoids mutating the graph while the generator walks it.
    G.remove_edges_from(list(nx.selfloop_edges(G)))

    return G


def compute_CN_PA(graph):
    """Score every unconnected node pair of ``graph`` with CN and PA.

    Args:
        graph: NetworkX graph whose nodes carry a 'label' attribute.

    Returns:
        A DataFrame with one row per non-edge and the columns
        'Node1' / 'Node2' (labels of the two endpoints),
        'CN' (number of neighbours the endpoints share) and
        'PA' (product of the endpoints' degrees).
    """
    first_labels, second_labels = [], []
    cn_scores, pa_scores = [], []

    # Each non-edge is a candidate missing link
    for a, b in nx.non_edges(graph):
        first_labels.append(graph.nodes[a]['label'])
        second_labels.append(graph.nodes[b]['label'])
        # Common Neighbours: how many nodes are adjacent to both endpoints
        cn_scores.append(sum(1 for _ in nx.common_neighbors(graph, a, b)))
        # Preferential Attachment: degree(a) * degree(b)
        pa_scores.append(graph.degree(a) * graph.degree(b))

    return pd.DataFrame({
        'Node1': first_labels,
        'Node2': second_labels,
        'CN': cn_scores,
        'PA': pa_scores,
    })

# Build the undirected graph from the CSV files
graph = load_graph()
lcc_nodes = max(nx.connected_components(graph), key=len)  # Node set of the largest connected component
lcc = graph.subgraph(lcc_nodes)  # Subgraph induced by the nodes of the largest connected component

# Score every non-edge of the LCC with CN and PA (one row per candidate link)
df = compute_CN_PA(lcc)
print(df)


       Node1                  Node2  CN   PA
0     SEAMAN                MURDOCH   1   12
1     SEAMAN                ANATOLY   0    2
2     SEAMAN                  BROCK   0    5
3     SEAMAN                LOVEJOY   1   19
4     SEAMAN  FIRST OFFICER MURDOCH   0    5
...      ...                    ...  ..  ...
2111   SMITH                HUSBAND   3   75
2112    RUTH                  MOODY   2  200
2113    RUTH                HUSBAND   5  125
2114   MOLLY                HUSBAND   4  115
2115   MOODY                HUSBAND   1   40

[2116 rows x 4 columns]


In [41]:
def rescale(series):
    """Min-max rescale ``series`` to the interval [0, 1].

    Args:
        series: numeric pandas Series.

    Returns:
        A Series of the same length with ``(x - min) / (max - min)`` applied
        to every value. When all values are equal the range is zero and the
        formula would produce NaN (0/0), so every value is mapped to 0.0
        instead.
    """
    lowest = series.min()
    span = series.max() - lowest
    if span == 0:
        # Constant column: (x - min) is already 0 everywhere, multiplying by
        # 0.0 just turns it into a float result without dividing by zero.
        return (series - lowest) * 0.0
    return (series - lowest) / span

def add_mean_column(df):
    """Add rescaled CN/PA columns and their arithmetic mean to ``df``.

    The columns 'CN_scaled', 'PA_scaled' and 'Mean_CN_PA' are written into
    the DataFrame that was passed in, and that same DataFrame is returned.

    Args:
        df: DataFrame with numeric 'CN' and 'PA' columns.

    Returns:
        ``df`` itself, extended with the three new columns.
    """
    # Bring both indices onto a common scale before averaging them
    for raw_name in ('CN', 'PA'):
        df[raw_name + '_scaled'] = rescale(df[raw_name])

    scaled_columns = ['CN_scaled', 'PA_scaled']
    df['Mean_CN_PA'] = df[scaled_columns].mean(axis=1)
    return df

# Load the graph and restrict it to its largest connected component (LCC)
graph = load_graph()
lcc_nodes = max(nx.connected_components(graph), key=len)
lcc = graph.subgraph(lcc_nodes)

# Compute CN and PA for every non-edge of the LCC
df = compute_CN_PA(lcc)

# Add the rescaled columns and their mean as a third score
df = add_mean_column(df)
print(df)

       Node1                  Node2  CN   PA  CN_scaled  PA_scaled  Mean_CN_PA
0     SEAMAN                MURDOCH   1   12        0.1   0.019964    0.059982
1     SEAMAN                ANATOLY   0    2        0.0   0.001815    0.000907
2     SEAMAN                  BROCK   0    5        0.0   0.007260    0.003630
3     SEAMAN                LOVEJOY   1   19        0.1   0.032668    0.066334
4     SEAMAN  FIRST OFFICER MURDOCH   0    5        0.0   0.007260    0.003630
...      ...                    ...  ..  ...        ...        ...         ...
2111   SMITH                HUSBAND   3   75        0.3   0.134301    0.217151
2112    RUTH                  MOODY   2  200        0.2   0.361162    0.280581
2113    RUTH                HUSBAND   5  125        0.5   0.225045    0.362523
2114   MOLLY                HUSBAND   4  115        0.4   0.206897    0.303448
2115   MOODY                HUSBAND   1   40        0.1   0.070780    0.085390

[2116 rows x 7 columns]


In [42]:
def find_top_missing_links(df, indices, top_n=5):
    """Return the best-scoring node pairs for each requested score column.

    Args:
        df: DataFrame with 'Node1' and 'Node2' columns plus one column per
            entry of ``indices``.
        indices: iterable of column names to rank by.
        top_n: how many of the highest-scoring pairs to keep per column
            (defaults to 5).

    Returns:
        A dict mapping each column name to a list of ``[Node1, Node2]`` pairs,
        ordered from the highest score downwards.
    """
    predicted_missing_links = {}
    for index in indices:
        # Highest scores first, then keep only the leading rows
        best_rows = df.sort_values(by=index, ascending=False).head(top_n)
        predicted_missing_links[index] = best_rows[['Node1', 'Node2']].values.tolist()
    return predicted_missing_links

# Score columns to rank by: the two indices and their rescaled mean
indices = ['CN', 'PA', 'Mean_CN_PA']

# For each score, collect the highest-scoring node pairs
top_missing_links = find_top_missing_links(df, indices)

# Print the pairs per score, one pair per line in bold (ANSI escape codes)
for index, links in top_missing_links.items():
    print(f"The top 5 missing links for the {index} index are:")
    for link in links:
        print(f"\033[1m{link[0]} - {link[1]}\033[0m")
    print('')
        

The top 5 missing links for the CN index are:
[1mLOVEJOY - MOLLY[0m
[1mMURDOCH - JACK[0m
[1mMURDOCH - ROSE[0m
[1mWOMAN - RUTH[0m
[1mCAL - SMITH[0m

The top 5 missing links for the PA index are:
[1mMURDOCH - ROSE[0m
[1mCAL - SMITH[0m
[1mMURDOCH - JACK[0m
[1mLOVEJOY - MOLLY[0m
[1mLOVETT - JACK[0m

The top 5 missing links for the Mean_CN_PA index are:
[1mMURDOCH - ROSE[0m
[1mMURDOCH - JACK[0m
[1mCAL - SMITH[0m
[1mLOVEJOY - MOLLY[0m
[1mWOMAN - RUTH[0m

