In [1]:
#import necessary packages
import networkx as nx
import numpy as np
from community import community_louvain #pip install python-louvain package
import leidenalg as la
import igraph as ig
import pandas as pd
import matplotlib.pyplot as plt

#Read the data

In [2]:
# Load the data
data_file = '../data/openAlex/test/pub2ref.csv'
#data_file = '../data/fake/citation_data_fake.csv'
citation_data = pd.read_csv(data_file)

In [3]:
citation_data.info()
print(citation_data.head(20))

<class 'pandas.DataFrame'>
RangeIndex: 74391 entries, 0 to 74390
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   publication_id  74391 non-null  str  
 1   reference_id    74391 non-null  str  
dtypes: str(2)
memory usage: 5.7 MB
                      publication_id                      reference_id
0   https://openalex.org/W2184854514     https://openalex.org/W9119819
1   https://openalex.org/W2184854514    https://openalex.org/W35899286
2   https://openalex.org/W2184854514   https://openalex.org/W627963485
3   https://openalex.org/W2184854514   https://openalex.org/W768552817
4   https://openalex.org/W2184854514  https://openalex.org/W1499933681
5   https://openalex.org/W2184854514  https://openalex.org/W1549940068
6   https://openalex.org/W2184854514  https://openalex.org/W1565485882
7   https://openalex.org/W2184854514  https://openalex.org/W1574930552
8   https://openalex.org/W2184854514  https://openalex.o

<h1>Perform the community analysis</h1>

In [4]:
# Create a directed graph
G = nx.DiGraph()
G.add_edges_from(citation_data[['publication_id', 'reference_id']].values)
#G.add_edges_from(citation_data[['source', 'target']].values)

# Convert the NetworkX graph to an iGraph object
g = ig.Graph.TupleList(G.edges(), directed=True)

# Perform Leiden community detection
partition = la.find_partition(g, la.ModularityVertexPartition, n_iterations=10)

# Get the community assignments
community_ids = partition.membership

<h1>Get community metrics and print results data</h1>

In [5]:
#Print overview of communities identied
print("")
print("----Partition Overview----")
print("Number of papers:",len(community_ids))
print("Number of communities:",max(community_ids)+1)
print("")

# Create a dictionary mapping nodes to community IDs
node_community = {node: community_id for node, community_id in zip(G.nodes(), community_ids)}

# Calculate community metrics (e.g., size, density)
community_metrics = {}
for community_id in set(community_ids):
    #get number of papers (nodes) in each community
    community_nodes = [node for node, cid in node_community.items() if cid == community_id]
    #get density in each node [what is this?!?!]
    community_graph = G.subgraph(community_nodes)
    community_metrics[community_id] = {
        'size': len(community_nodes),
        'density': nx.density(community_graph)
    }

# Print community metrics (number of papers in each community and the density)
for community_id, metrics in community_metrics.items():
    print(f"Community {community_id}: size={metrics['size']}, density={metrics['density']}")
print("-------------")
print("")


----Partition Overview----
Number of papers: 48984
Number of communities: 466

Community 0: size=3687, density=0.00018160035236943574
Community 1: size=2408, density=0.00015165484943554722
Community 2: size=2154, density=0.00022404013143112697
Community 3: size=1966, density=0.0002042353598968728
Community 4: size=1363, density=0.00020577395246513963
Community 5: size=1316, density=0.0003449790238884972
Community 6: size=1284, density=0.00034236347345954646
Community 7: size=1238, density=0.00028535868345820766
Community 8: size=1192, density=0.00027330256566305454
Community 9: size=1003, density=0.00042586810427002424
Community 10: size=931, density=0.0004284905812919395
Community 11: size=840, density=0.0004994608093535388
Community 12: size=777, density=0.0006567686980058114
Community 13: size=746, density=0.0004822138654479371
Community 14: size=736, density=0.00041407867494824016
Community 15: size=679, density=0.000601700400988787
Community 16: size=647, density=0.00084696694914

<h1>Make figure</h1>

In [None]:
#here would like to obtain the number of edges to each node and size the nodes by this

# Sort out colors to match the community IDs ***not working**
colors = [community_ids]
print("")
#print(colors)
    

#Plotting options
options = {
    'node_size': 10,
    'width': 0.1,
    'arrowsize': 5,
    'with_labels': False
}

nx.draw_networkx(G, pos = nx.spring_layout(G), node_color=colors, cmap='viridis', **options)

plt.savefig("../output/figures/output.png",dpi=300)
plt.show()


