# Network Exploration and Baseline Analysis

This notebook performs exploratory data analysis (EDA) on the citation network and establishes baseline metrics for our TransE model. We'll examine:

- Network structure and basic statistics
- Node degrees, clustering, and centrality metrics
- Visualization of the current citation network
- Baseline analysis around the seed paper

In [None]:
# Import required libraries
import sys
from pathlib import Path

sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from dotenv import load_dotenv

# Import our modules
from src.db import Neo4jConnection
from src.data_extraction import load_citation_graph
from src.visualization import (
    plot_network_overview, 
    plot_network_statistics,
    set_portfolio_style
)

# Set up plotting style
# set_portfolio_style is a project helper from src.visualization; presumably it
# applies shared matplotlib/seaborn defaults — TODO confirm in that module.
set_portfolio_style()
# Load settings from the .env file; the connection cell's error hints name
# NEO4J_URI, NEO4J_USER and NEO4J_PWD as the entries expected there.
load_dotenv()

print("Libraries imported successfully!")

## 1. Connect to Neo4j Database

In [None]:
# Open the Neo4j connection and verify it before any query is issued.
# Any failure (including a failed connectivity test) prints the .env
# checklist and re-raises so the notebook run stops at this cell.
try:
    db = Neo4jConnection()
    if not db.test_connection():
        print("❌ Failed to connect to Neo4j database")
        raise ConnectionError("Database connection failed")
    print("✅ Successfully connected to Neo4j database")
except Exception as exc:
    print(f"❌ Database connection error: {exc}")
    env_hints = (
        "\nPlease check your .env file contains:",
        "NEO4J_URI=neo4j+s://your-instance.databases.neo4j.io",
        "NEO4J_USER=your-username",
        "NEO4J_PWD=your-password",
    )
    for env_hint in env_hints:
        print(env_hint)
    raise

## 2. Extract Citation Network Data

In [None]:
# Pull the citation graph out of Neo4j through the project extractor.
print("Extracting citation network from Neo4j...")
extractor = load_citation_graph(db)

# Headline counts, gathered as (label, value) pairs and printed uniformly.
overview_rows = (
    ("Papers", extractor.num_entities),
    ("Citations", len(extractor.citation_edges)),
    ("Entity mapping size", len(extractor.paper_to_id)),
)

print("\nDataset Overview:")
for overview_label, overview_count in overview_rows:
    print(f"• {overview_label}: {overview_count:,}")

## 3. Build NetworkX Graph for Analysis

In [None]:
# Materialise the extracted citations as a NetworkX graph `G`.
print("Building NetworkX graph for analysis...")
G = extractor.build_networkx_graph()

# Basic structural properties, reported as one bullet per line.
graph_property_lines = [
    f"• Nodes: {G.number_of_nodes():,}",
    f"• Edges: {G.number_of_edges():,}",
    f"• Is directed: {G.is_directed()}",
    f"• Density: {nx.density(G):.6f}",
]

print("\nGraph Properties:")
print("\n".join(graph_property_lines))

## 4. Comprehensive Network Statistics

In [None]:
# Ask the extractor for its dataset statistics and print them as a table.
print("Computing comprehensive network statistics...")
stats = extractor.get_dataset_stats()

print("\n📊 Network Statistics:")
print("=" * 40)
for stat_name, stat_value in stats.items():
    # Floats get four decimals; every other value uses its default formatting.
    if isinstance(stat_value, float):
        stat_text = f"{stat_value:.4f}"
    else:
        stat_text = f"{stat_value}"
    print(f"{stat_name:<25}: {stat_text}")

# Later cells (statistics figure, summary) read the stats under this name.
network_stats = stats

## 5. Node Degree Analysis

In [None]:
# Snapshot the total, in- and out-degree views of G as plain dicts.
degrees, in_degrees, out_degrees = (
    dict(degree_view)
    for degree_view in (G.degree(), G.in_degree(), G.out_degree())
)

# One row per node. The three dicts come from views over the same graph, so
# their values line up positionally with the node ids taken from `degrees`.
degree_df = pd.DataFrame({
    'node_id': list(degrees),
    'total_degree': list(degrees.values()),
    'in_degree': list(in_degrees.values()),
    'out_degree': list(out_degrees.values()),
})

degree_columns = ['total_degree', 'in_degree', 'out_degree']
print("\n📈 Degree Statistics:")
print(degree_df[degree_columns].describe())

In [None]:
# Plot degree distributions
# The three panels differ only in column, labels and colour, so each is
# described once in this table and drawn by a single loop.
degree_panels = [
    # (degree_df column, x-axis label, panel title, bar colour)
    ('total_degree', 'Total Degree', 'Total Degree Distribution', 'skyblue'),
    ('in_degree', 'In-Degree (Citations Received)', 'In-Degree Distribution', 'lightcoral'),
    ('out_degree', 'Out-Degree (Citations Given)', 'Out-Degree Distribution', 'lightgreen'),
]

fig, axes = plt.subplots(1, len(degree_panels), figsize=(15, 5))

for panel_ax, (panel_column, panel_xlabel, panel_title, panel_color) in zip(axes, degree_panels):
    panel_ax.hist(degree_df[panel_column], bins=30, alpha=0.7,
                  color=panel_color, edgecolor='black')
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()

# Create the output directory first: savefig raises if it does not exist,
# which is what happens on a fresh checkout with no ../outputs folder.
degree_plot_path = Path('../outputs/degree_distributions.png')
degree_plot_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(degree_plot_path, dpi=300, bbox_inches='tight')
plt.show()

## 6. Identify High-Impact Papers

In [None]:
# Find most highly cited papers (high in-degree) and the papers that cite the most
def _print_top_papers(ranking, count_column, unit):
    """Print one line per row of `ranking`: count, unit label and paper title.

    Args:
        ranking: slice of `degree_df` holding the rows to report, in order.
        count_column: name of the degree column whose value is printed.
        unit: text printed after the count (e.g. "citations", "refs").
    """
    for node_id, count in zip(ranking['node_id'], ranking[count_column]):
        # `or` covers both a missing attribute and a title stored as None or '',
        # which would otherwise make the [:80] slice raise TypeError.
        title = str(G.nodes[node_id].get('title') or 'No title')[:80]
        print(f"{int(count):2d} {unit}: {title}...")


top_cited = degree_df.nlargest(10, 'in_degree')

print("\n🏆 Top 10 Most Cited Papers (by in-degree):")
print("=" * 50)
_print_top_papers(top_cited, 'in_degree', 'citations')

print("\n🔗 Top 10 Papers by Citations Given (out-degree):")
print("=" * 50)
top_citing = degree_df.nlargest(10, 'out_degree')
_print_top_papers(top_citing, 'out_degree', 'refs')

## 7. Network Visualization

In [None]:
# Create network overview visualization
print("Creating network overview visualization...")

# Sampling parameters for the drawn subgraph.
VIZ_FULL_GRAPH_LIMIT = 200   # graphs with more nodes than this are subsampled
VIZ_TOP_NODES = 100          # highest-degree nodes that are always kept
VIZ_EXTRA_NODES = 100        # at most this many additional sampled nodes
VIZ_EXTRA_MIN_DEGREE = 3     # minimum total degree for an additional node

# For visualization, sample a subset if network is too large
if G.number_of_nodes() > VIZ_FULL_GRAPH_LIMIT:
    # Always keep the highest-degree nodes
    high_degree_nodes = degree_df.nlargest(VIZ_TOP_NODES, 'total_degree')['node_id'].tolist()

    # Candidates for the random top-up: reasonably connected nodes not already kept
    medium_degree_candidates = degree_df[
        (degree_df['total_degree'] >= VIZ_EXTRA_MIN_DEGREE)
        & (~degree_df['node_id'].isin(high_degree_nodes))
    ]
    # Cap the sample size by the number of candidates actually available;
    # asking .sample() for more rows than exist raises ValueError.
    medium_degree_nodes = medium_degree_candidates.sample(
        n=min(VIZ_EXTRA_NODES, len(medium_degree_candidates)),
        random_state=42,
    )['node_id'].tolist()

    viz_nodes = high_degree_nodes + medium_degree_nodes
    G_viz = G.subgraph(viz_nodes).copy()
    print(f"Visualizing subset: {G_viz.number_of_nodes()} nodes, {G_viz.number_of_edges()} edges")
else:
    G_viz = G
    print(f"Visualizing full network: {G_viz.number_of_nodes()} nodes")

# Create and save network overview
fig = plot_network_overview(
    G_viz,
    title="Citation Network Overview",
    figsize=(14, 8),
    save_path="../outputs/network_overview.png"
)
plt.show()

## 8. Network Statistics Visualization

In [None]:
# Draw the statistics figure from the stats gathered in section 4.
# The figure options are collected in one dict and unpacked into the call.
stats_plot_options = {
    "figsize": (16, 10),
    "save_path": "../outputs/network_statistics.png",
}
fig = plot_network_statistics(network_stats, **stats_plot_options)
plt.show()

## 9. Paper Metadata Analysis

In [None]:
# Fetch per-paper metadata. This step is best-effort: on any error the cell
# reports it and leaves an empty DataFrame in `metadata_df`.
print("Loading paper metadata...")
try:
    metadata_df = extractor.get_paper_metadata()
    print(f"Loaded metadata for {len(metadata_df)} papers")

    # Preview the first rows
    print("\nSample Paper Metadata:")
    print(metadata_df.head())

    # Summarise citation counts only when the column is present
    has_citation_column = 'citations' in metadata_df.columns
    if has_citation_column:
        citation_stats = metadata_df['citations'].describe()
        print("\n📊 Citation Count Statistics:")
        print(citation_stats)

except Exception as exc:
    print(f"Could not load metadata: {exc}")
    metadata_df = pd.DataFrame()

## 10. Summary and Key Insights

In [None]:
# Generate comprehensive summary
# Reads network_stats (section 4), degree_df (section 5) and extractor (section 2).
summary_rule = "=" * 60
print("\n" + summary_rule)
print("📋 NETWORK EXPLORATION SUMMARY")
print(summary_rule)

print("\n🔢 Scale:")
print(f"   • {network_stats['num_papers']:,} papers in the network")
print(f"   • {network_stats['num_citations']:,} citation relationships")
print(f"   • {network_stats['density']:.6f} network density")

print("\n🔗 Connectivity:")
print(f"   • {'Connected' if network_stats['is_connected'] else 'Not fully connected'}")
print(f"   • {network_stats['num_components']} connected components")
print(f"   • {network_stats['avg_degree']:.2f} average degree")

# Structure metrics are treated as optional. Each one is checked on its own so
# that a missing (or None) clustering coefficient cannot raise just because the
# average path length happens to be present, and vice versa.
structure_lines = []
avg_path_length = network_stats.get('avg_path_length')
if avg_path_length is not None:
    structure_lines.append(f"   • {avg_path_length:.2f} average path length")
clustering_coefficient = network_stats.get('clustering_coefficient')
if clustering_coefficient is not None:
    structure_lines.append(f"   • {clustering_coefficient:.4f} clustering coefficient")

if structure_lines:
    print("\n📏 Structure:")
    for structure_line in structure_lines:
        print(structure_line)

print("\n📈 Degree Distribution:")
print(f"   • Max in-degree (most cited): {degree_df['in_degree'].max()}")
print(f"   • Max out-degree (most refs): {degree_df['out_degree'].max()}")
print(f"   • Mean degree: {degree_df['total_degree'].mean():.2f}")

# Every ordered pair of distinct papers that is not an observed citation is a
# candidate negative example: n * (n - 1) ordered pairs minus the positives.
num_positive_edges = len(extractor.citation_edges)
num_negative_candidates = (
    extractor.num_entities * (extractor.num_entities - 1) - num_positive_edges
)

print("\n🎯 Next Steps for TransE Model:")
print(f"   • Network is suitable for link prediction with {num_positive_edges:,} positive examples")
print(f"   • Can generate negative samples from {num_negative_candidates:,} non-citations")
print("   • Embedding dimension should capture the network's complexity")

print("\n💾 Files saved:")
print("   • ../outputs/network_overview.png")
print("   • ../outputs/network_statistics.png")
print("   • ../outputs/degree_distributions.png")

print("\n✅ Network exploration complete! Ready for model training.")

In [None]:
# Clean up database connection
# `db` is the Neo4jConnection created in section 1 and passed to
# load_citation_graph; close it only once no further queries are needed.
db.close()
print("Database connection closed.")