In [None]:
# Libraries

import pandas as pd  # Start SEC 1
import networkx as nx  # Start SEC 2
from networkx.algorithms import bipartite  # Start SEC 2
import community as community_louvain  # Start SEC 4
import random, copy  # Start SEC 5
from pyvis.network import Network  # Start SEC 6

In [None]:
## DSC291_Final_Project.ipynb
# GitHub Developer Collaboration Network Analysis
# -------------------------------------------------------------
# This notebook loads the CSV data exported by the earlier notebook, preprocesses it into a
# clean pivot table (contributors vs. repos), constructs a bipartite graph, projects it
# onto a contributor network, performs network analysis (centrality, community detection,
# robustness simulation), and exports visualizations for Gephi and Pyvis.
#
# NOTE: Ensure the CSV file ("top_100_stars_nodes_df.csv") is in your working directory.
# -------------------------------------------------------------

## SEC 1. Data Loading & Preprocessing
# Load the CSV file and normalise it into `df_clean`: one row per contributor, one column
# per repo, each cell holding a contribution count (0 where there is no contribution).
# The cleaned table is also written to "top_100_stars_nodes_clean.csv".

csv_file = "top_100_stars_nodes_df.csv"
df_raw = pd.read_csv(csv_file)
print("Raw Data:")
print(df_raw.head())

if 'contributors' in df_raw.columns:
    # Wide format: a 'contributors' column plus one column per repo.
    # 'Unnamed: 0' is presumably a stray index column from an earlier export; it is
    # dropped when present and ignored otherwise.
    df_clean = (
        df_raw
        .set_index('contributors')
        .drop(columns=['Unnamed: 0'], errors='ignore')
        .fillna(0)
    )
    print("Data assumed to be in pivoted wide format:")
else:
    # Long format (assumed columns: contributor, repo, contributions).
    # pivot_table aggregates with the mean by default, which would average repeated
    # (contributor, repo) rows; contribution counts are summed instead, matching how
    # duplicate contributors are merged below.
    df_clean = df_raw.pivot_table(index='contributor',
                                  columns='repo',
                                  values='contributions',
                                  aggfunc='sum',
                                  fill_value=0)
    print("Pivoted Data (long format to wide):")

print(df_clean.head())

# A contributor may appear on several rows of the wide file; merge such rows by summing
# their counts so that every contributor maps to exactly one row (and one graph node).
if not df_clean.index.is_unique:
    df_clean = df_clean.groupby(df_clean.index).sum()

print("Is the index unique now?", df_clean.index.is_unique)
print(df_clean.head())
df_clean.to_csv("top_100_stars_nodes_clean.csv")


In [None]:
## SEC 2. Building the Bipartite Graph and Contributor Projection
# B links each contributor to every repository they contributed to. G is the projection of
# B onto the contributors: two developers are adjacent when they share at least one
# repository, and the edge weight is the number of repositories they share.
# NOTE(review): contributors and repos share one node namespace, so a contributor whose
# name equals a repo name would collapse into a single node -- confirm the labels are disjoint.

B = nx.Graph()

# Row labels are the contributors, column labels are the repositories.
contributors = list(df_clean.index)
repos = list(df_clean.columns)

# Tag every node with the side of the bipartite graph it belongs to.
B.add_nodes_from(contributors, bipartite='contributors')
B.add_nodes_from(repos, bipartite='repos')

# Walk the table row by row and keep only cells with a positive contribution count;
# each surviving cell becomes a contributor-repo edge weighted by that count (as a float).
positive_contributions = (
    (contributor, repo, float(cell))
    for contributor, cells in zip(contributors,
                                  df_clean.itertuples(index=False, name=None))
    for repo, cell in zip(repos, cells)
    if float(cell) > 0
)
B.add_weighted_edges_from(positive_contributions)

print("Bipartite graph created:")
print("Number of contributor nodes:", len(contributors))
print("Number of repo nodes:", len(repos))
print("Total edges in bipartite graph:", B.number_of_edges())

# Collapse the repositories away, leaving the contributor-to-contributor network.
G = bipartite.weighted_projected_graph(B, contributors)
print("Contributor collaboration network:")
print("Nodes:", G.number_of_nodes(), "Edges:", G.number_of_edges())

In [None]:
## SEC 3. Network Analysis: Centrality Measures
# Score every contributor with four centrality measures (degree, betweenness, closeness,
# eigenvector), store each score on the node, and list the five best-connected nodes.

deg_centrality = nx.degree_centrality(G)
betw_centrality = nx.betweenness_centrality(G, normalized=True)
closeness_centrality = nx.closeness_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)

# Node attribute name -> {node: score}; each mapping is written onto G in this order.
centrality_by_attribute = {
    'deg_cent': deg_centrality,
    'betw_cent': betw_centrality,
    'clos_cent': closeness_centrality,
    'eig_cent': eigenvector_centrality,
}
for attribute, scores in centrality_by_attribute.items():
    nx.set_node_attributes(G, scores, attribute)

# Rank node names by degree centrality (highest first) and keep the first five.
ranked_by_degree = sorted(deg_centrality, key=deg_centrality.get, reverse=True)
top_deg = [(name, deg_centrality[name]) for name in ranked_by_degree[:5]]
print("Top 5 by degree centrality:")
for node, score in top_deg:
    print(f"  {node}: {score:.4f}")

In [None]:
## SEC 4. Community Detection Using the Louvain Method
# Use the python-louvain package to detect communities and store each node's community
# id in its 'community' attribute.

# Louvain is a randomised heuristic: without a fixed seed both the grouping and the integer
# ids it hands out (including which community ends up as id 0) can change between runs.
LOUVAIN_SEED = 42

partition = community_louvain.best_partition(G, random_state=LOUVAIN_SEED)
nx.set_node_attributes(G, partition, 'community')

num_communities = len(set(partition.values()))
print(f"Detected {num_communities} communities using Louvain.")

In [None]:
## SEC 5. Network Robustness Simulation
# Simulate how the largest connected component (LCC) of the network shrinks as nodes are
# removed, both in targeted order (highest degree first) and in random order.

import random, copy

def largest_component_size(graph):
    """Return the number of nodes in the largest connected component of `graph`.

    A graph without any nodes has no components, so its size is reported as 0.
    """
    if not graph.number_of_nodes():
        return 0
    return max(map(len, nx.connected_components(graph)))

# Seed for the random removal order, so the random baseline is reproducible across runs.
REMOVAL_SEED = 42
# The largest component is measured (and reported) once per this many removed nodes.
REPORT_EVERY = 100


def _simulate_removal(graph, removal_order, report_every=REPORT_EVERY):
    """Remove nodes from `graph` one by one and track the largest component size.

    `graph` is modified in place. After every `report_every` removals the size of the
    largest connected component is measured, printed and recorded.

    Parameters:
        graph: networkx graph to dismantle.
        removal_order: iterable of node labels, in the order they are removed.
        report_every: number of removals between two measurements.

    Returns:
        List of largest-component sizes, one per measurement.
    """
    sizes = []
    for removed, node in enumerate(removal_order, start=1):
        graph.remove_node(node)
        if removed % report_every == 0:
            size = largest_component_size(graph)
            sizes.append(size)
            print(f"  Removed {removed} nodes, largest component size: {size}")
    return sizes


# Work on copies so the analysed network G itself is left intact.
G_targeted = G.copy()
G_random = G.copy()

# Targeted attack: the order is fixed up front from the degrees in the intact network
# (highest degree first); degrees are not recomputed as nodes disappear.
targeted_nodes = sorted(G_targeted.degree(), key=lambda x: x[1], reverse=True)

print("Starting targeted removal simulation:")
component_sizes_targeted = _simulate_removal(
    G_targeted, [node for node, _ in targeted_nodes]
)

# Random failure: same procedure with a shuffled order. A dedicated seeded generator is
# used so the shuffle is repeatable and the global `random` state is left alone.
nodes_random = list(G_random.nodes())
random.Random(REMOVAL_SEED).shuffle(nodes_random)

print("Starting random removal simulation:")
component_sizes_random = _simulate_removal(G_random, nodes_random)

In [None]:
## SEC 6. Visualization Export
# Write the contributor network to a GEXF file for external visualization in Gephi,
# then build an interactive Pyvis view of the same network and save it as HTML.

gephi_filename = "github_collaboration.gexf"
nx.write_gexf(G, gephi_filename)
print(f"Graph exported to {gephi_filename} for Gephi visualization.")

from pyvis.network import Network

net = Network(height="750px", width="100%", notebook=True)
net.from_nx(G)

# Two-colour scheme: community 0 (also the fallback for nodes that carry no 'community'
# attribute) is drawn in red, every other community in blue.
color_for_is_community_zero = {True: 'red', False: 'blue'}
for node in net.nodes:
    comm = G.nodes[node['id']].get('community', 0)
    node['color'] = color_for_is_community_zero[comm == 0]

pyvis_filename = "github_collaboration.html"
net.show(pyvis_filename)
print(f"Interactive network saved as {pyvis_filename}")

In [None]:
##  Outline of this notebook:
# - Loads and preprocesses the CSV data.
# - Aggregates duplicate contributors.
# - Constructs a bipartite graph and projects it into a contributor network.
# - Computes centrality measures.
# - Detects communities with the Louvain method.
# - Simulates network robustness via node removal.
# - Exports visualizations for Gephi and Pyvis.
#