In [23]:
#import necessary packages
import networkx as nx
import numpy as np
from community import community_louvain #pip install python-louvain package
import leidenalg as la
import igraph as ig
import pandas as pd
import matplotlib.pyplot as plt

<h1>Read and format the data</h1>

In [24]:
# Load the exported Web of Science file
wos_data_orig = pd.read_csv('data/wos_export.csv')

#remove NANs from the Cited References Column
wos_data = wos_data_orig[wos_data_orig['Cited References'].notnull()]

# Extract cited references and reformat in to a list of sources (cited paper) and targets (citing paper) **need to check this is correct behaviour**
cited_references = []
for index, row in wos_data.iterrows():
    refs = row['Cited References'].split('; ')
    for ref in refs:
        cited_references.append({
            'source': row['UT (Unique WOS ID)'],
            'target': ref
        })

# Create a DataFrame
citation_data = pd.DataFrame(cited_references)

# Save to a new CSV file
citation_data.to_csv('citation_data.csv', index=False)

<h1>Perform the analysis</h1>

In [86]:
#read in data, in format of source (citing paper) and target (cited paper)
citation_data = pd.read_csv('citation_data_fake.csv')

# Create a directed graph
G = nx.DiGraph()
G.add_edges_from(citation_data[['source', 'target']].values)

# Convert the NetworkX graph to an iGraph object
g = ig.Graph.TupleList(G.edges(), directed=True)

# Perform Leiden community detection
partition = la.find_partition(g, la.ModularityVertexPartition, n_iterations=10)

# Get the community assignments
community_ids = partition.membership

print("")
print("----Partition Overview----")
print("Number of papers:",len(community_ids))
print("Number of communities:",max(community_ids)+1)
print("")

# Create a dictionary mapping nodes to community IDs
node_community = {node: community_id for node, community_id in zip(G.nodes(), community_ids)}

# Calculate community metrics (e.g., size, density)
community_metrics = {}
for community_id in set(community_ids):
    #get number of papers (nodes) in each community
    community_nodes = [node for node, cid in node_community.items() if cid == community_id]
    #get density in each node [what is this?!?!]
    community_graph = G.subgraph(community_nodes)
    community_metrics[community_id] = {
        'size': len(community_nodes),
        'density': nx.density(community_graph)
    }

# Print community metrics (number of papers in each community and the density)
for community_id, metrics in community_metrics.items():
    print(f"Community {community_id}: size={metrics['size']}, density={metrics['density']}")
print("-------------")
print("")

#here would like to obtain the number of edges to each node


# Visualize the community structure
colors = [community_ids]
print("")
print(colors)
    

#Plotting options
options = {
    'node_size': 10,
    'width': 0.1,
    'arrowsize': 5,
    'with_labels': False
}

nx.draw_networkx(G, pos = nx.spring_layout(G), node_color=colors, cmap='viridis', **options)

plt.savefig("output.png",dpi=300)
plt.show()



----Partition Overview----
Number of papers: 26
Number of communities: 5

Community 0: size=7, density=0.9761904761904762
Community 1: size=6, density=0.13333333333333333
Community 2: size=5, density=0.3
Community 3: size=4, density=0.25
Community 4: size=4, density=0.5
-------------



NetworkXError: Node 0 is not in the graph.