In [7]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
import seaborn as sns
import networkx as nx
from typing import List, Dict, Tuple, Union

# Add the scripts directory to the import path so the `network` import below
# resolves. The location is relative to the notebook's working directory.
scripts_dir = Path("..") / "scripts"
scripts_path = str(scripts_dir.resolve())
# This cell gets re-run during development; only append when the entry is
# missing so sys.path does not accumulate duplicates.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

from network import clean_author_names, analyze_coauthorship_network

# Reload imported modules automatically before each cell executes, so edits to
# the project scripts are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
# Read the processed arXiv dataset and show a truncated preview of it.
dataset_path = '../data/processed/arxiv_scientific_dataset_final.parquet'
df = pd.read_parquet(dataset_path)
display(df)

Unnamed: 0,id,title,category,category_code,published_date,updated_date,authors,first_author,summary,summary_word_count,...,title_count,author_count_boxcox,title_count_sqrt,published_year,published_quarter,published_month,updated_year,updated_quarter,updated_month,year_period
0,cs-9308101v1,Dynamic Backtracking,Artificial Intelligence,cs.AI,1993-08-01,1993-08-01,['M. L. Ginsberg'],'M. L. Ginsberg',Because of their occasional need to return to ...,79,...,2,0.000000,1.414214,1993,1993Q3,1993-08,1993,1993Q3,1993-08,1990s
1,cs-9308102v1,A Market-Oriented Programming Environment and ...,Artificial Intelligence,cs.AI,1993-08-01,1993-08-01,['M. P. Wellman'],'M. P. Wellman',Market price systems constitute a well-underst...,119,...,12,0.000000,3.464102,1993,1993Q3,1993-08,1993,1993Q3,1993-08,1990s
2,cs-9309101v1,An Empirical Analysis of Search in GSAT,Artificial Intelligence,cs.AI,1993-09-01,1993-09-01,"['I. P. Gent', 'T. Walsh']",'I. P. Gent',We describe an extensive study of search in GS...,167,...,7,0.715010,2.645751,1993,1993Q3,1993-09,1993,1993Q3,1993-09,1990s
3,cs-9311101v1,The Difficulties of Learning Logic Programs wi...,Artificial Intelligence,cs.AI,1993-11-01,1993-11-01,"['F. Bergadano', 'D. Gunetti', 'U. Trinchero']",'F. Bergadano',As real logic programmers normally use cut (!)...,174,...,8,1.154208,2.828427,1993,1993Q4,1993-11,1993,1993Q4,1993-11,1990s
4,cs-9311102v1,Software Agents: Completing Patterns and Const...,Artificial Intelligence,cs.AI,1993-11-01,1993-11-01,"['J. C. Schlimmer', 'L. A. Hermens']",'J. C. Schlimmer',To support the goal of allowing users to recor...,187,...,8,0.715010,2.828427,1993,1993Q4,1993-11,1993,1993Q4,1993-11,1990s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112516,abs-2501.18184v1,Genetic Algorithm with Border Trades (GAB),Machine Learning,cs.LG,2025-01-30,2025-01-30,['Qingchuan Lyu'],'Qingchuan Lyu',This paper introduces a novel approach to impr...,74,...,6,0.000000,2.449490,2025,2025Q1,2025-01,2025,2025Q1,2025-01,2020s
112517,abs-2501.18280v1,Jailbreaking LLMs' Safeguard with Universal Ma...,Computation and Language (Natural Language Pro...,cs.CL,2025-01-30,2025-01-30,"['Haoyu Liang', 'Youran Sun', 'Yunfeng Cai', '...",'Haoyu Liang',The security issue of large language models (L...,150,...,11,1.730617,3.316625,2025,2025Q1,2025-01,2025,2025Q1,2025-01,2020s
108722,abs-2405.20132v4,LLaMEA: A Large Language Model Evolutionary Al...,Neural and Evolutionary Computing,cs.NE,2024-05-30,2025-01-30,"['Niki van Stein', 'Thomas Bäck']",'Niki van Stein',Large Language Models (LLMs) such as GPT-4 hav...,177,...,11,0.715010,3.316625,2024,2024Q2,2024-05,2025,2025Q1,2025-01,2020s
112519,abs-2501.18504v1,CLEAR: Cue Learning using Evolution for Accura...,Computer Vision and Pattern Recognition,cs.CV,2025-01-30,2025-01-30,"['Peter J. Bentley', 'Soo Ling Lim', 'Fuyuki I...",'Peter J. Bentley',Large Language Model (LLM) image recognition i...,170,...,13,1.154208,3.605551,2025,2025Q1,2025-01,2025,2025Q1,2025-01,2020s


# Clean author names

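Some authors are listed with their full names and others with initials only. Initials cannot be reliably expanded back to full names, so we normalise every author to initials + last name instead. Note that this can also merge distinct people who share the same initials and surname.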

In [3]:
# Normalise the author names and keep the result in a separate frame so the
# raw `df` stays available for the before/after comparison.
clean_df = clean_author_names(df)
display(clean_df)

Unnamed: 0,id,title,category,category_code,published_date,updated_date,authors,first_author,summary,summary_word_count,...,title_count,author_count_boxcox,title_count_sqrt,published_year,published_quarter,published_month,updated_year,updated_quarter,updated_month,year_period
0,cs-9308101v1,Dynamic Backtracking,Artificial Intelligence,cs.AI,1993-08-01,1993-08-01,[M. L. Ginsberg],M. L. Ginsberg,Because of their occasional need to return to ...,79,...,2,0.000000,1.414214,1993,1993Q3,1993-08,1993,1993Q3,1993-08,1990s
1,cs-9308102v1,A Market-Oriented Programming Environment and ...,Artificial Intelligence,cs.AI,1993-08-01,1993-08-01,[M. P. Wellman],M. P. Wellman,Market price systems constitute a well-underst...,119,...,12,0.000000,3.464102,1993,1993Q3,1993-08,1993,1993Q3,1993-08,1990s
2,cs-9309101v1,An Empirical Analysis of Search in GSAT,Artificial Intelligence,cs.AI,1993-09-01,1993-09-01,"[I. P. Gent, T. Walsh]",I. P. Gent,We describe an extensive study of search in GS...,167,...,7,0.715010,2.645751,1993,1993Q3,1993-09,1993,1993Q3,1993-09,1990s
3,cs-9311101v1,The Difficulties of Learning Logic Programs wi...,Artificial Intelligence,cs.AI,1993-11-01,1993-11-01,"[F. Bergadano, D. Gunetti, U. Trinchero]",F. Bergadano,As real logic programmers normally use cut (!)...,174,...,8,1.154208,2.828427,1993,1993Q4,1993-11,1993,1993Q4,1993-11,1990s
4,cs-9311102v1,Software Agents: Completing Patterns and Const...,Artificial Intelligence,cs.AI,1993-11-01,1993-11-01,"[J. C. Schlimmer, L. A. Hermens]",J. C. Schlimmer,To support the goal of allowing users to recor...,187,...,8,0.715010,2.828427,1993,1993Q4,1993-11,1993,1993Q4,1993-11,1990s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112516,abs-2501.18184v1,Genetic Algorithm with Border Trades (GAB),Machine Learning,cs.LG,2025-01-30,2025-01-30,[Q. Lyu],Q. Lyu,This paper introduces a novel approach to impr...,74,...,6,0.000000,2.449490,2025,2025Q1,2025-01,2025,2025Q1,2025-01,2020s
112517,abs-2501.18280v1,Jailbreaking LLMs' Safeguard with Universal Ma...,Computation and Language (Natural Language Pro...,cs.CL,2025-01-30,2025-01-30,"[H. Liang, Y. Sun, Y. Cai, J. Zhu, B. Zhang]",H. Liang,The security issue of large language models (L...,150,...,11,1.730617,3.316625,2025,2025Q1,2025-01,2025,2025Q1,2025-01,2020s
108722,abs-2405.20132v4,LLaMEA: A Large Language Model Evolutionary Al...,Neural and Evolutionary Computing,cs.NE,2024-05-30,2025-01-30,"[N.V. Stein, T. Bäck]",N.V. Stein,Large Language Models (LLMs) such as GPT-4 hav...,177,...,11,0.715010,3.316625,2024,2024Q2,2024-05,2025,2025Q1,2025-01,2020s
112519,abs-2501.18504v1,CLEAR: Cue Learning using Evolution for Accura...,Computer Vision and Pattern Recognition,cs.CV,2025-01-30,2025-01-30,"[P.J. Bentley, S.L. Lim, F. Ishikawa]",P.J. Bentley,Large Language Model (LLM) image recognition i...,170,...,13,1.154208,3.605551,2025,2025Q1,2025-01,2025,2025Q1,2025-01,2020s


In [4]:
# Compare how many distinct first authors exist before and after cleaning.
# nunique(dropna=False) counts missing values too, matching len(.unique()).
n_first_authors_raw = df['first_author'].nunique(dropna=False)
n_first_authors_clean = clean_df['first_author'].nunique(dropna=False)
print(f"Number unique first authors before cleaning: {n_first_authors_raw}")
print(f"Number unique first authors after cleaning: {n_first_authors_clean}")

Number unique first authors before cleaning: 77733
Number unique first authors after cleaning: 51929


In [5]:
# Debug helper to find problematic author entries.
#
# A row is flagged when its author collection contains a duplicate name or an
# empty name. The cleaned frame appears to store authors as list-like values
# rather than strings, so both representations are handled; checking only
# `str` values would silently skip every list-valued row.
def _split_author_names(value):
    """Return the author names held in `value` as stripped strings.

    Strings are split on commas; list-like values (list, tuple, NumPy array)
    are taken element by element, with None treated as an empty name.
    Returns None for any other type so the caller can skip the row.
    """
    if isinstance(value, str):
        return [name.strip() for name in value.split(',')]
    if isinstance(value, (list, tuple, np.ndarray)):
        return ['' if name is None else str(name).strip() for name in value]
    return None


problem_rows = []
# Only the 'authors' column is needed; skip the scan if it is absent.
if 'authors' in clean_df.columns:
    for row_label, authors_value in clean_df['authors'].items():
        try:
            names = _split_author_names(authors_value)
            if names is None:
                continue
            # Check for duplicates or empty names
            if len(set(names)) != len(names) or '' in names:
                problem_rows.append((row_label, authors_value))
        except Exception as e:
            # Best-effort debugging aid: record the failure instead of aborting the scan.
            problem_rows.append((row_label, f"Error: {e}"))

if problem_rows:
    print(f"Found {len(problem_rows)} potentially problematic rows:")
    for idx, detail in problem_rows[:10]:  # Show first 10
        print(f"Row {idx}: {detail}")
else:
    # Make a clean result distinguishable from a scan that checked nothing.
    print("No problematic author entries found.")

# Generate network information

In [None]:
# Run the full analysis
# NOTE(review): this analyses the raw `df`, not `clean_df`; the printed
# rankings accordingly show full names. If the cleaned names were meant to be
# used, confirm that analyze_coauthorship_network accepts the cleaned
# 'authors' column before switching frames.
network_options = dict(
    author_column='authors',
    centrality_metrics=['degree', 'eigenvector', 'pagerank'],
    top_n=10,
    visualize=True,
    save_visualization='coauthorship_network.png',  # Optional
)
graph, influence = analyze_coauthorship_network(df, **network_options)

Starting co-authorship network analysis on 136160 records...


Building network: 100%|██████████| 136160/136160 [00:05<00:00, 23093.89it/s]


Network created with 196969 authors and 1744406 connections
Network density: 0.000090
Number of connected components: 11925
Size of largest component: 160711 authors

Top 10 Authors by Degree Centrality:
1. Bo Li: 0.0094
2. Rui Wang: 0.0085
3. Dian Yu: 0.0083
4. Dan Garrette: 0.0081
5. Lei Zhang: 0.0080
6. Fan Yang: 0.0080
7. Albert Webson: 0.0080
8. Wei Chen: 0.0079
9. Nora Kassner: 0.0078
10. Xi Chen: 0.0078

Top 10 Authors by Eigenvector Centrality:
1. Dan Garrette: 0.0299
2. Albert Webson: 0.0299
3. Dian Yu: 0.0299
4. Nora Kassner: 0.0299
5. Oriol Vinyals: 0.0298
6. Demis Hassabis: 0.0298
7. Koray Kavukcuoglu: 0.0298
8. Noah Fiedel: 0.0298
9. Aakanksha Chowdhery: 0.0298
10. Zhitao Gong: 0.0298

Top 10 Authors by Pagerank Centrality:
1. Yang Liu: 0.0004
2. Yoshua Bengio: 0.0003
3. Jun Wang: 0.0002
4. Wei Wang: 0.0002
5. Dacheng Tao: 0.0002
6. Hao Wang: 0.0002
7. Bo Li: 0.0002
8. Xiang Li: 0.0002
9. Wei Liu: 0.0002
10. Wei Chen: 0.0002

Comparison of Top 10 Authors Across Centrality 