In [None]:
# Import necessary libraries
from collections import Counter
import numpy as np
import pandas as pd
import datetime as dt
import networkx as nx
import json
import urllib.request
from networkx.algorithms import bipartite
import powerlaw
import matplotlib.pyplot as plt
import os

# Global matplotlib styling: white backgrounds for figures, axes and saved
# files, and axis grid lines/ticks drawn beneath the plotted data.
plt.rcParams.update({
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'savefig.facecolor': 'white',
    'axes.axisbelow': True,
})
%matplotlib inline


In [None]:
def findFilesInFolder(path, pathList, extension, subFolders = True):
    """
    Append to `pathList` the path of every file under `path` whose path ends
    with `extension`, and return that same list.

    Parameters
    ----------
    path (str):
        directory to scan
    pathList (list):
        list that receives the matches; it is extended in place
    extension (str):
        suffix a file path must end with to be collected
    subFolders (bool):
        when True, sub-directories are scanned recursively

    Returns
    -------
    pathList (list):
        the list passed in, now including every match found

    An OSError raised while scanning is not propagated: the current working
    directory and a diagnostic message are printed and whatever was collected
    so far is returned.
    """
    try:
        for item in os.scandir(path):
            is_match = item.is_file() and item.path.endswith(extension)
            if is_match:
                pathList.append(item.path)
                continue
            if item.is_dir() and subFolders:
                # The recursive call extends (and returns) the very same list.
                findFilesInFolder(item.path, pathList, extension, subFolders)
    except OSError as err:
        print(os.getcwd())
        print('Cannot access ' + path +'. Probably a permissions error', err)
    return pathList

# Root of the local checkout of the data repository.  The original author's
# location stays the default; set the IRISH_SCHOOLS_ROOT environment variable
# to run the notebook from another checkout without editing this cell.
project_root = os.environ.get(
    'IRISH_SCHOOLS_ROOT',
    '/Users/aurora/northeastern/PHYS5116/irish-schools-collection')
# Later cells use paths relative to this directory (data/out, ./figs/...).
os.chdir(project_root)

dir_name_el = 'data/out'
extension_el = '.json'
pathList_el = []
# Recursively gather every .json file below data/out; sort for a stable order.
flist_el = findFilesInFolder(dir_name_el, pathList_el, extension_el, True)
flist_el = sorted(flist_el)
print(len(flist_el),"files total. E.g.:")
flist_el[:5]

In [None]:
# Bipartite graph linking JSON files to the schools mentioned in them:
#   * one node per file path                     (bipartite=0)
#   * one node per distinct `school_info` value  (bipartite=1)
#   * edge weight = number of records in that file naming that school
G = nx.Graph()

# Track which school / file nodes have already been created.
school = set()
files = set()

for file in flist_el:
    # JSON is UTF-8 by specification; decode it explicitly instead of relying
    # on the platform's locale default, and close the file as soon as the
    # content is parsed rather than holding it open while the graph is built.
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if file not in files:
        G.add_node(file, bipartite=0)
        files.add(file)

    for keys, record in data.items():
        school_name = record['school_info']

        if school_name not in school:
            # `key` stores the record key under which the school was first seen.
            G.add_node(school_name, key=keys, bipartite=1)
            school.add(school_name)

        if G.has_edge(file, school_name):
            # Another record in the same file for the same school.
            G[file][school_name]['weight'] += 1
        else:
            G.add_edge(file, school_name, weight=1)






In [None]:

plt.figure(figsize=(20, 20))

# Keep only nodes that have at least one edge.
degree_threshold = 0
filtered_nodes = [node for node, degree in G.degree() if degree > degree_threshold]
subgraph = G.subgraph(filtered_nodes)

# Split the node sets using the 'bipartite' attribute stored on every node.
# Without `top_nodes`, bipartite.sets() has to 2-colour the graph, which raises
# AmbiguousSolution when the graph is disconnected and leaves it up to the
# colouring which side becomes set1.  With `top_nodes`, set1 is always the
# file nodes (bipartite == 0) and set2 the school nodes.
file_nodes = {node for node, attrs in subgraph.nodes(data=True) if attrs["bipartite"] == 0}
set1, set2 = nx.bipartite.sets(subgraph, top_nodes=file_nodes)

# Assign colors: files red, schools sky blue
node_colors = ["red" if node in set1 else "skyblue" for node in subgraph.nodes()]
# Layout (seeded so that re-running the cell reproduces the same picture)
pos = nx.spring_layout(subgraph, seed=42)

# Draw the graph
nx.draw(subgraph, pos,  node_size=30, font_size=8, node_color=node_colors, font_color="black", edge_color="gray", alpha=1)

plt.title("Filtered Bipartite Graph of Schools and Files")
plt.show()

In [None]:

plt.figure(figsize=(20, 20))

# Keep only nodes that have at least one edge.
degree_threshold = 0
filtered_nodes = [node for node, degree in G.degree() if degree > degree_threshold]
subgraph = G.subgraph(filtered_nodes)

# Derive the two node sets from the 'bipartite' node attribute.  Calling
# bipartite.sets() without `top_nodes` raises AmbiguousSolution on a
# disconnected graph and does not guarantee which side is returned first;
# passing the file nodes makes set1 = files and set2 = schools every time.
file_nodes = {node for node, attrs in subgraph.nodes(data=True) if attrs["bipartite"] == 0}
set1, set2 = nx.bipartite.sets(subgraph, top_nodes=file_nodes)

# Assign colors: files red, schools sky blue
node_colors = ["red" if node in set1 else "skyblue" for node in subgraph.nodes()]
# Layout (seeded so that re-running the cell reproduces the same picture)
pos = nx.spring_layout(subgraph, seed=42)

# Draw the graph
nx.draw(subgraph, pos,  node_size=30, font_size=8, node_color=node_colors, font_color="black", edge_color="gray", alpha=1)
# print number of nodes and edges
print("Number of nodes:", subgraph.number_of_nodes())
print("Number of edges:", subgraph.number_of_edges())

plt.title("Number of nodes: " + str(subgraph.number_of_nodes()) + " Number of edges: " + str(subgraph.number_of_edges()))
plt.show()

In [None]:
# Sorted listing of every distinct school_info value collected while loading.
school_sorted = list(school)
school_sorted.sort()
print(school_sorted)

In [None]:
# NOTE: this function was taken from the NETS_5116_2023-python-tutorial.ipynb notebook that was used during the NetworkX tutorial class

def plot_degree(degree, number_of_bins=50, log_binning=True, base=2):
    """
    Given a degree sequence, return the x values (support) and the y values
    (probability density) of the degree distribution you are going to plot.
    Despite its name this function does not draw anything.

    Parameters
    ----------
    degree (np.ndarray or list):
        a vector of length N that corresponds to the degree, k_i, of every
        node, v_i, in the network

    number_of_bins (int):
        number of bin edges; the returned vectors have number_of_bins - 1
        entries

    log_binning (bool)
        if True the bin edges are spaced logarithmically, which is useful
        when plotting on a log-log axis; otherwise they are spaced linearly

    base (int or float):
        log base used for logarithmic binning, defaults to 2

    Returns
    -------
    x, y (np.ndarray):
        the bin midpoints and the density of the degree distribution

    Raises
    ------
    ValueError
        if `degree` is empty
    """
    if len(degree) == 0:
        raise ValueError("degree sequence is empty; there is nothing to bin")

    # We need to define the support of our distribution
    lower_bound = min(degree)
    upper_bound = max(degree)

    # And the bins
    if log_binning:
        # The exponents must be taken in the same base that logspace uses.
        # Keep the dedicated log2/log10 routines for the common bases and use
        # the change-of-base formula for everything else (previously any base
        # other than 2 silently used log10, giving bins that did not span the
        # data).
        if base == 2:
            log = np.log2
        elif base == 10:
            log = np.log10
        else:
            def log(value):
                return np.log(value) / np.log(base)

        # Degrees below 1 cannot be placed on a log axis; start the bins at
        # base**0 == 1 in that case.
        lower_exponent = log(lower_bound) if lower_bound >= 1 else 0.0
        upper_exponent = log(upper_bound)
        bins = np.logspace(lower_exponent, upper_exponent, number_of_bins, base = base)

        # base**log(k) can differ from k by floating-point round-off, which
        # would push the smallest or largest degree just outside the outer
        # edges and drop those nodes from the histogram.  Pin the edges to the
        # exact data bounds.
        if lower_bound >= 1:
            bins[0] = lower_bound
        bins[-1] = upper_bound
    else:
        bins = np.linspace(lower_bound,upper_bound,number_of_bins)

    # Then we can compute the histogram using numpy
    y, __ = np.histogram(degree,
                         bins=bins,
                         density=True)
    # Now, we need to compute for each y the value of x (the bin midpoint)
    x = bins[1:] - np.diff(bins)/2.0

    return x, y

In [None]:
# Nodes tagged bipartite == 0 are the file nodes; every other node of the
# subgraph goes into school_set.
file_set = set()
for node, attrs in subgraph.nodes(data=True):
    if attrs["bipartite"] == 0:
        file_set.add(node)
print(file_set)
school_set = set(subgraph).difference(file_set)

In [None]:
# bipartite.degrees returns a pair: first the degrees of the nodes NOT in the
# given set (here the schools), then the degrees of the given set (the files).
degree_views = bipartite.degrees(subgraph, file_set)
deg_tup_schools = degree_views[0]
deg_tup_files = degree_views[1]
print(deg_tup_files)

In [None]:
# Drop the node labels and keep only the degree values of each node set.
deg_schools = [k for _, k in deg_tup_schools]
deg_files = [k for _, k in deg_tup_files]

In [None]:
# Bin both degree sequences with plot_degree's default logarithmic bins.
(x_school, y_school), (x_topic, y_topic) = (
    plot_degree(sequence) for sequence in (deg_schools, deg_files)
)

In [None]:
# Log-log degree distribution of the two node sets on shared axes.
fig, ax = plt.subplots(1,1,figsize=(6,4),dpi=150)

series = (
    (x_school, y_school, 'o', 'blue', 'school'),
    (x_topic, y_topic, 's', 'red', 'topic'),
)
for xs, ys, marker, colour, name in series:
    ax.loglog(xs, ys, marker, color=colour, label=name, alpha=0.8)

ax.set_xlabel(r"$k$", fontsize=16)
ax.set_ylabel(r"$P(k)$", fontsize=16)
ax.set_title("Degree distribution of bipartite network by node set")
ax.legend(fontsize=14)
ax.grid(linewidth=1.25, color='#999999', alpha=0.2, linestyle='-')

# Save a high-resolution PNG and a vector PDF of the same figure.
fig.savefig('./figs/pngs/bipartite_degreedist.png', dpi=425, bbox_inches='tight')
fig.savefig('./figs/pdfs/bipartite_degreedist.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Degree of every node in the subgraph, regardless of which set it is in.
all_degrees = [k for _, k in subgraph.degree]
print(all_degrees)

In [None]:
# Bin the combined degree sequence (plot_degree defaults spelled out).
x_all, y_all = plot_degree(all_degrees, number_of_bins=50, log_binning=True, base=2)

In [None]:
# Log-log degree distribution over all nodes of the bipartite network.
fig, ax = plt.subplots(1,1,figsize=(6,4),dpi=150)

ax.loglog(x_all, y_all, 'o', color='blue', label='node', alpha=0.8)

ax.set_xlabel(r"$k$", fontsize=16)
ax.set_ylabel(r"$P(k)$", fontsize=16)
ax.set_title("Degree distribution of bipartite network")
ax.legend(fontsize=14)
ax.grid(linewidth=1.25, color='#999999', alpha=0.2, linestyle='-')

# Save a high-resolution PNG and a vector PDF of the same figure.
fig.savefig('./figs/pngs/all_bipartite_degreedist.png', dpi=425, bbox_inches='tight')
fig.savefig('./figs/pdfs/all_bipartite_degreedist.pdf', bbox_inches='tight')
plt.show()

# Gamma estimate

In [None]:
# Power-law fit of the degree sequence of all nodes.  Degrees are integers, so
# use powerlaw's discrete estimator; the default treats the data as
# continuous, which biases the estimated exponent for integer-valued data.
bipartite_gamma = powerlaw.Fit(all_degrees, discrete=True)
print(bipartite_gamma.alpha)

In [None]:
# Power-law fit of the school-node degrees.  Degrees are integers, so use
# powerlaw's discrete estimator; the default treats the data as continuous,
# which biases the estimated exponent for integer-valued data.
schools_gamma = powerlaw.Fit(deg_schools, discrete=True)
print(schools_gamma.alpha)

In [None]:
# Power-law fit of the file-node degrees.  Degrees are integers, so use
# powerlaw's discrete estimator; the default treats the data as continuous,
# which biases the estimated exponent for integer-valued data.
topics_gamma = powerlaw.Fit(deg_files, discrete=True)
print(topics_gamma.alpha)

In [None]:
# Sanity check: does school_set form one valid side of the bipartition?
schools_form_valid_side = bipartite.is_bipartite_node_set(subgraph, school_set)
print(schools_form_valid_side)

In [None]:
# One-mode projection onto the schools: two schools are linked when they
# share at least one file in the bipartite subgraph.
projected_graph = bipartite.projected_graph(subgraph, school_set)

plt.figure(figsize=(20, 20))

# Draw the projected graph
draw_options = dict(
    with_labels=False,
    node_size=30,
    font_size=8,
    node_color="skyblue",
    font_color="black",
    edge_color="gray",
    alpha=1,
)
nx.draw(projected_graph, **draw_options)

node_count = projected_graph.number_of_nodes()
edge_count = projected_graph.number_of_edges()
plt.title(f"Number of nodes: {node_count} Number of edges: {edge_count}")
plt.show()

In [None]:
# Degree sequence of the school projection (node labels dropped).
projected_degree = [k for _, k in projected_graph.degree]
print(projected_degree)

In [None]:
# Bin the school-projection degrees (plot_degree defaults spelled out).
x_proj_school, y_proj_school = plot_degree(projected_degree, number_of_bins=50, log_binning=True, base=2)

In [None]:
# Log-log degree distribution of the school projection.
fig, ax = plt.subplots(1,1,figsize=(6,4),dpi=150)

ax.loglog(x_proj_school, y_proj_school, 'o', label='degree', alpha=0.8)

ax.set_xlabel(r"$k$", fontsize=16)
ax.set_ylabel(r"$P(k)$", fontsize=16)
ax.legend(fontsize=14)
ax.grid(linewidth=1.25, color='#999999', alpha=0.2, linestyle='-')

# Save a high-resolution PNG and a vector PDF of the same figure.
fig.savefig('./figs/pngs/og_projected_degreedist.png', dpi=425, bbox_inches='tight')
fig.savefig('./figs/pdfs/og_projected_degreedist.pdf', bbox_inches='tight')
plt.show()

In [None]:
# One-mode projection onto the files: two files are linked when they share
# at least one school in the bipartite subgraph.
projected_topic_graph = bipartite.projected_graph(subgraph, file_set)

plt.figure(figsize=(20, 20))

# Draw the projected graph
draw_options = dict(
    with_labels=False,
    node_size=30,
    font_size=8,
    node_color="skyblue",
    font_color="black",
    edge_color="gray",
    alpha=1,
)
nx.draw(projected_topic_graph, **draw_options)

node_count = projected_topic_graph.number_of_nodes()
edge_count = projected_topic_graph.number_of_edges()
plt.title(f"Number of nodes: {node_count} Number of edges: {edge_count}")
plt.show()

In [None]:
# Degree sequence of the file projection (node labels dropped).
projected_topic_degree = [k for _, k in projected_topic_graph.degree]

In [None]:
# Dump the raw degree sequence of the projected topic graph for inspection.
print(projected_topic_degree)

In [None]:
# Bin the file-projection degrees (plot_degree defaults spelled out).
x_proj_topic, y_proj_topic = plot_degree(projected_topic_degree, number_of_bins=50, log_binning=True, base=2)

In [None]:
# Log-log degree distribution of the file ("topic") projection.
fig, ax = plt.subplots(1,1,figsize=(6,4),dpi=150)

ax.loglog(x_proj_topic, y_proj_topic, 'o', label='degree', alpha=0.8)

ax.set_xlabel(r"$k$", fontsize=16)
ax.set_ylabel(r"$P(k)$", fontsize=16)
ax.set_title("Degree Distribution in the Projected Topic Graph")
ax.legend(fontsize=14)
ax.grid(linewidth=1.25, color='#999999', alpha=0.2, linestyle='-')

# Save a high-resolution PNG and a vector PDF of the same figure.
fig.savefig('./figs/pngs/og_projected_topic_degreedist.png', dpi=425, bbox_inches='tight')
fig.savefig('./figs/pdfs/og_projected_topic_degreedist.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Linear-scale histogram of the file-projection degrees (100 equal-width bins).
fig = plt.figure(figsize=(10, 6))
ax = fig.gca()
ax.hist(projected_topic_degree, bins=100, color='blue', alpha=0.7)  # Adjust 'bins' as needed
ax.set_title("Degree Distribution in the Projected Topic Graph")
ax.set_xlabel("Degree")
ax.set_ylabel("Number of Nodes")

# Save a high-resolution PNG and a vector PDF of the same figure.
fig.savefig('./figs/pngs/og_projected_topic_degreedist_hist.png', dpi=425, bbox_inches='tight')
fig.savefig('./figs/pdfs/og_projected_topic_degreedist_hist.pdf', bbox_inches='tight')

plt.show()

In [None]:
# Number of nodes in the projected topic graph whose degree is at least 190.
sum(1 for deg in projected_topic_degree if deg >= 190)

## With some edge-weight threshold

In [None]:
# Keep only the edges whose weight exceeds the threshold.
edge_threshold = 0
filtered_edges = [(source, target) for source, target, weight in subgraph.edges(data="weight") if weight > edge_threshold]

# New graph built from the surviving edges only: nodes that lose all their
# edges are left out, and node/edge attributes are not copied over (sizes and
# widths below are therefore looked up in the full graph G).
filtered_graph = nx.Graph()
filtered_graph.add_edges_from(filtered_edges)

# Assign colors: members of set1 red, all other nodes sky blue
node_colors = ["red" if node in set1 else "skyblue" for node in filtered_graph.nodes()]

# Node sizes follow the degree in the full graph G
node_sizes = [G.degree(node) for node in filtered_graph.nodes()]

# Edge widths follow the edge weight stored in G
edge_widths = [G[u][v]['weight'] * 0.1 for u, v in filtered_graph.edges()]

# Layout.  It is computed once (an earlier default-parameter layout was
# calculated and immediately overwritten) and seeded so that re-running the
# cell reproduces the same picture.
pos = nx.spring_layout(filtered_graph, k=0.1, iterations=50, seed=42)

# Drawing the graph
plt.figure(figsize=(20, 20))
nx.draw(filtered_graph, pos, node_size=node_sizes, width=edge_widths, 
        node_color=node_colors, font_color="black", edge_color="gray", alpha=0.7)

plt.title("Number of nodes: " + str(filtered_graph.number_of_nodes()) + " Number of edges: " + str(filtered_graph.number_of_edges()))
plt.show()


In [None]:
# Restrict set2 to the nodes that survived the edge filter.  (The variable
# keeps its historical name `set1_filtered` even though it holds set2 nodes.)
set1_filtered = [node for node in set2 if node in filtered_graph.nodes()]

# Project onto those nodes.  weighted_projected_graph yields the same nodes
# and edges as projected_graph but stores the number of shared neighbours in
# each edge's 'weight' attribute.  The plain projection has no 'weight'
# attribute at all, so the width list below used to come out empty and the
# intended edge scaling was never applied.
projected_graph = bipartite.weighted_projected_graph(filtered_graph, nodes=set1_filtered)

# Node sizes follow the degree in the projection; edge widths follow the
# number of shared neighbours.
node_sizes_projected = [projected_graph.degree(node) * 1 for node in projected_graph.nodes()]
edge_widths_projected = [data['weight'] * 0.1 for u, v, data in projected_graph.edges(data=True)]

# Using the spring layout for visualization.  spring_layout reads the 'weight'
# edge attribute by default, so heavier edges now pull their endpoints closer.
pos_projected = nx.spring_layout(projected_graph, k=0.1, iterations=50)

# Drawing the projected graph
plt.figure(figsize=(20, 20))
nx.draw(projected_graph, pos_projected, node_size=node_sizes_projected, width=edge_widths_projected,
        node_color="red", font_color="black", edge_color="gray", alpha=0.7)

plt.title("Projected Graph: Number of nodes: " + str(projected_graph.number_of_nodes()) + " Number of edges: " + str(projected_graph.number_of_edges()))
plt.show()


In [None]:
# Export the current projection as GEXF into the chosen output directory.
# The target path is built explicitly instead of calling os.chdir(): changing
# the process working directory here would silently break every relative path
# used by earlier cells (data/out, ./figs/...) when they are re-run.
desired_directory = "/Users/bijinjoseph/Documents/University/Research/Etoro"
gexf_path = os.path.join(desired_directory, "filtered_graph_with_nodes"+str(edge_threshold)+".gexf")
nx.write_gexf(projected_graph, gexf_path)

In [None]:
# Restrict set1 to the nodes that survived the edge filter.
set1_filtered = [node for node in set1 if node in filtered_graph.nodes()]

# Project onto those nodes.  weighted_projected_graph yields the same nodes
# and edges as projected_graph but stores the number of shared neighbours in
# each edge's 'weight' attribute.  The plain projection has no 'weight'
# attribute at all, so the width list below used to come out empty and the
# intended edge scaling was never applied.
projected_graph = bipartite.weighted_projected_graph(filtered_graph, nodes=set1_filtered)

# Node sizes grow as degree * exp(degree).
# NOTE(review): this explodes quickly (np.exp overflows to inf for degrees
# above ~709 and is already astronomically large long before that) - confirm
# this scaling is intended for the degree range of this projection.
node_sizes_projected = [projected_graph.degree(node) * np.exp(projected_graph.degree(node)) for node in projected_graph.nodes()]
# Edge widths follow the number of shared neighbours.
edge_widths_projected = [data['weight'] * 0.1 for u, v, data in projected_graph.edges(data=True)]

# Using the spring layout for visualization.  spring_layout reads the 'weight'
# edge attribute by default, so heavier edges now pull their endpoints closer.
pos_projected = nx.spring_layout(projected_graph, k=0.1, iterations=50)

# Drawing the projected graph
plt.figure(figsize=(20, 20))
nx.draw(projected_graph, pos_projected, node_size=node_sizes_projected, width=edge_widths_projected,
        node_color="red", font_color="black", edge_color="gray", alpha=0.7)

plt.title("Projected Graph: Number of nodes: " + str(projected_graph.number_of_nodes()) + " Number of edges: " + str(projected_graph.number_of_edges()))
plt.show()


In [None]:
import networkx as nx

# Report whether the most recently built projection is a single connected
# component.
connectivity_message = (
    "The graph is fully connected."
    if nx.is_connected(projected_graph)
    else "The graph is not fully connected."
)
print(connectivity_message)
