In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import networkx as nx
import random
import copy
import matplotlib.pyplot as plt

# Directory prefix used when loading/saving the cached .npy distributions
DATA_PATH = "data/"

In [2]:
# Load graph
# Each name is bound to its own, independent, empty undirected graph.
shelf_graph, shelf_giant, NLP_graph, NLP_giant = (nx.Graph() for _ in range(4))

In [None]:
# Function to calculate the fraction of neighbors that have the same attribute value
def get_matching_att(graph, att):
    """
    Compute, for every node, the fraction of its neighbors that share its attribute value.

    Args:
        graph (nx.Graph): The graph to be analyzed
        att (str): Name of the node attribute to compare

    Returns:
        match_frac (dict): Maps each node to the fraction (between 0 and 1) of its
            neighbors whose ``att`` value equals the node's own value. Nodes
            without neighbors map to 0.

    Raises:
        KeyError: If a node that has neighbors, or one of those neighbors, does not
            carry the attribute ``att``.
    """
    # graph.nodes[n] is the attribute dict of node n, so each lookup is O(1).
    # (Rebuilding the full node -> value dict for every comparison is quadratic.)
    node_data = graph.nodes

    match_frac = {}
    for node in graph.nodes:
        neighbors = list(graph.neighbors(node))

        # Isolated nodes have nothing to compare against; also avoids dividing by zero
        if not neighbors:
            match_frac[node] = 0
            continue

        own_value = node_data[node][att]
        matches = sum(1 for neighbor in neighbors if node_data[neighbor][att] == own_value)
        match_frac[node] = matches / len(neighbors)

    return match_frac

In [None]:
# Per-node fraction of neighbors sharing the node's 'top_genre', for each graph
top_field_fracs_shelves = get_matching_att(shelf_giant, 'top_genre')
top_field_fracs_NLP = get_matching_att(NLP_giant, 'top_genre')

# Report the mean of those per-node fractions
avg_frac_shelves = np.mean(list(top_field_fracs_shelves.values()))
print(f"(shelves) Average across all nodes {avg_frac_shelves}")
avg_frac_NLP = np.mean(list(top_field_fracs_NLP.values()))
print(f"(NLP) Average across all nodes {avg_frac_NLP}")

### Create a new graph with the same nodes and edges, but where the assignment of genres to nodes is shuffled. Compute the measure above for this randomized graph.

In [None]:
# Function that shuffles the attribute values of a graph
def shuffle_node_att(graph, att, rng=None):
    """
    Randomly permute the values of a node attribute among the nodes that carry it.

    The graph is modified IN PLACE and the same object is returned; pass a copy
    if the original attribute assignment must be preserved. The multiset of
    attribute values is unchanged, only their assignment to nodes is shuffled.

    Args:
        graph (nx.Graph): The graph whose attribute values are shuffled
        att (str): Name of the node attribute to shuffle
        rng (optional): Object exposing ``shuffle(list)``, e.g.
            ``random.Random(seed)``, for reproducible shuffles. Defaults to the
            module-level ``random`` generator.

    Returns:
        graph (nx.Graph): The same graph object, with shuffled attributes
    """
    shuffler = random if rng is None else rng

    # Node -> current value, for the nodes that have the attribute
    current_atts = nx.get_node_attributes(graph, att)

    # Permute the values while keeping the node order fixed
    shuffled_values = list(current_atts.values())
    shuffler.shuffle(shuffled_values)
    new_atts = dict(zip(current_atts, shuffled_values))

    # Write the permuted values back onto the graph
    nx.set_node_attributes(graph, new_atts, att)

    return graph

In [None]:
# Shuffle deep copies so the observed graphs keep their real genre labels.
# shuffle_node_att mutates its argument and returns that same object, so
# *_copy and *_shuffled are two names for one shuffled graph.
shelf_giant_copy = copy.deepcopy(shelf_giant)
shelf_giant_shuffled = shuffle_node_att(shelf_giant_copy, 'top_genre')

NLP_giant_copy = copy.deepcopy(NLP_giant)
NLP_giant_shuffled = shuffle_node_att(NLP_giant_copy, 'top_genre')

print(f"(Shelves) When the genres are randomly assigned, on average a node has {np.mean(list(get_matching_att(shelf_giant_shuffled, 'top_genre').values()))} of its neighbors in the same genre")
print(f"(NLP) When the genres are randomly assigned, on average a node has {np.mean(list(get_matching_att(NLP_giant_shuffled, 'top_genre').values()))} of its neighbors in the same genre")

### Simulate 100 points for the random distributions and compare to the real graphs

In [None]:
# Reuse the cached null distributions when both files exist; otherwise simulate
# them and write the cache. The file names must match the np.save calls below.
try:
    distribution_shelves = np.load(DATA_PATH + "avg_same_field_distribution_shelves.npy")
    distribution_NLP = np.load(DATA_PATH + "avg_same_field_distribution_NLP.npy")
except FileNotFoundError:
    # Shuffles the graph attributes N times and stores the distribution of the average fraction of neighbors that have the same attribute value
    N = 100
    distribution_shelves = np.zeros(N)
    distribution_NLP = np.zeros(N)
    for i in tqdm(range(N)):
        # Each iteration re-shuffles the (already shuffled) copies in place
        shelf_giant_copy = shuffle_node_att(shelf_giant_copy, 'top_genre')
        avg = np.mean(list(get_matching_att(shelf_giant_copy, 'top_genre').values()))
        distribution_shelves[i] = avg

        NLP_giant_copy = shuffle_node_att(NLP_giant_copy, 'top_genre')
        avg = np.mean(list(get_matching_att(NLP_giant_copy, 'top_genre').values()))
        distribution_NLP[i] = avg

    np.save(DATA_PATH + "avg_same_field_distribution_shelves.npy", distribution_shelves)
    np.save(DATA_PATH + "avg_same_field_distribution_NLP.npy", distribution_NLP)

### Compare values on plot

In [3]:
# Fall back to the cached .npy files when either distribution is missing from the namespace
if not {'distribution_shelves', 'distribution_NLP'} <= set(locals()):
    distribution_shelves = np.load(DATA_PATH + "avg_same_field_distribution_shelves.npy")
    distribution_NLP = np.load(DATA_PATH + "avg_same_field_distribution_NLP.npy")

FileNotFoundError: [Errno 2] No such file or directory: 'data/avg_same_field_distribution.npy'

In [4]:
def plot_avg_frac_neighbors(distribution, top_field_fracs, bins=10):
    """
    Plot a histogram of randomized averages together with the observed average.

    Draws a density-normalized histogram of ``distribution`` and two vertical
    lines: the mean of the values in ``top_field_fracs`` (red) and the maximum
    of ``distribution`` (blue). The figure is shown with ``plt.show()``.

    Args:
        distribution (array-like): Average same-attribute fractions obtained
            from the shuffled graphs
        top_field_fracs (dict): Per-node fractions for the observed graph; only
            the values are used
        bins (int): Number of histogram bins (default 10)

    Returns:
        None
    """
    avg_value = np.mean(list(top_field_fracs.values()))
    max_random = np.max(distribution)

    figure, ax = plt.subplots(1)

    ax.hist(distribution, density=True, bins=bins)
    ax.axvline(x=avg_value, color='r', label=f"Observed value: {avg_value:.2f}")
    ax.axvline(x=max_random, color='b', label=f"Maximum random value: {max_random:.2f}")
    ax.legend()
    ax.set_title("Distribution of the average fraction of neighbors that have the same attribute value")
    ax.set_xlabel("Fraction of neighbours with the same attribute value")
    # density=True normalizes the histogram, so the y-axis is a density, not a count
    ax.set_ylabel("Density")

    plt.show()

In [None]:
# Shelf plot
plot_avg_frac_neighbors(distribution_shelves, top_field_fracs_shelves)

In [None]:
# NLP plot
plot_avg_frac_neighbors(distribution_NLP, top_field_fracs_NLP)

# Assortativity

In [None]:
# Attribute assortativity of the genre labels, computed by networkx.
# The attribute name matches the 'top_genre' key used in the rest of the notebook.
print(f"(shelf) The assortativity for top genres is {nx.attribute_assortativity_coefficient(shelf_giant, 'top_genre'):.2f}, calculated with the networkx library")
print(f"(NLP) The assortativity for top genres is {nx.attribute_assortativity_coefficient(NLP_giant, 'top_genre'):.2f}, calculated with the networkx library")

In [5]:
# Helpers for an assortativity calculation based on node degree instead of a node attribute
def get_i(x, threshold):
    """Return the mixing-matrix index for a value: 1 if ``x`` exceeds ``threshold``, otherwise 0."""
    return 1 if x > threshold else 0


def calculate_assortiative_coef(E):
    """
    Compute the coefficient (trace(E) - s) / (1 - s), with s = sum of all entries of E @ E.T.

    Args:
        E (np.ndarray): Square mixing matrix. For a symmetric E, ``E @ E.T``
            equals ``E @ E``; the callers in this notebook build E symmetrically.

    Returns:
        The coefficient as a NumPy float. When s equals 1 the expression is 0/0
        and NumPy yields nan.
    """
    squared_sum = np.sum(E @ E.T)
    return (np.trace(E) - squared_sum) / (1 - squared_sum)

max_threshold = 10
assosiations = np.zeros(max_threshold - 2)

# Node degrees do not depend on the threshold, so look them up once
degrees = dict(shelf_giant.degree())

for threshold in range(2, max_threshold):
    # 2x2 mixing matrix: index 0 = degree <= threshold, index 1 = degree > threshold
    E = np.zeros((2, 2))

    # Fill the matrix
    for edge in shelf_giant.edges:
        # Classify the degree of the node at each end of the edge
        i = get_i(degrees[edge[0]], threshold)
        j = get_i(degrees[edge[1]], threshold)
        # Count the edge in both directions so E stays symmetric
        E[i, j] += 1
        E[j, i] += 1

    # Every edge was counted twice, so normalize by 2 * number of edges
    E = E / (2*shelf_giant.number_of_edges())

    # Store the coefficient for this threshold
    assosiations[threshold - 2] = calculate_assortiative_coef(E)

plt.plot(range(2, max_threshold), assosiations, "-o")
plt.xlabel("Threshold")
plt.ylabel("Assortativity coefficient")
plt.title("Degree assortativity for different thresholds (shelf)")
plt.show()

NameError: name 'shelf_giant' is not defined