In [1]:
import arxiv
import pandas as pd

In [16]:
def collect_arxiv_data(category="cs.AI", max_results=500):
    """Fetch the most recently submitted arXiv papers in a category and save them.

    Queries arXiv for ``cat:{category}``, sorted by submission date (newest
    first), keeps every result that has a non-empty title, abstract and
    publication date, normalises the abstracts (stripped and lower-cased) and
    writes the table to ``data/arxiv_{category}_papers.csv``.

    Args:
        category: arXiv category identifier used in the query and in the
            output file name.
        max_results: Upper bound on the number of results requested.

    Returns:
        pandas.DataFrame with the columns ``title``, ``abstract`` and ``date``.
        The frame is empty (but still has these columns) when no usable
        result was returned.
    """
    # Function-local import so the function does not rely on another cell
    # having imported `os`.
    import os

    columns = ['title', 'abstract', 'date']

    # Create client and search
    client = arxiv.Client()
    search = arxiv.Search(
        query=f"cat:{category}",
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending
    )

    # Extract results into a list of dictionaries
    records = []
    for result in client.results(search):
        title = getattr(result, 'title', None)
        abstract = getattr(result, 'summary', None)  # arXiv calls abstracts 'summary'
        date = getattr(result, 'published', None)
        if not (title and abstract and date):
            # Report incomplete entries instead of dropping them silently.
            print(f"Skipping result: {result}")
            continue
        records.append({'title': title, 'abstract': abstract, 'date': date})

    # Passing the column list explicitly keeps the schema stable even when
    # `records` is empty; otherwise the 'abstract' lookup below would raise
    # KeyError on a column-less frame.
    df = pd.DataFrame(records, columns=columns)

    # Basic preprocessing - clean abstracts
    df['abstract'] = df['abstract'].str.strip().str.lower()

    os.makedirs('data', exist_ok=True)
    df.to_csv(f'data/arxiv_{category}_papers.csv', index=False)
    return df

In [17]:
# Pull the latest papers using the function defaults (category "cs.AI")
df = collect_arxiv_data()

# Report how many papers were gathered, then show the leading rows
print(
    f"Collected {len(df)} papers",
    "\nFirst few entries:",
    df.head(),
    sep="\n",
)

Collected 500 papers

First few entries:
                                               title  \
0  RIG: Synergizing Reasoning and Imagination in ...   
1  UniOcc: A Unified Benchmark for Occupancy Fore...   
2  Any2Caption:Interpreting Any Condition to Capt...   
3  ACPBench Hard: Unrestrained Reasoning about Ac...   
4  Harnessing the Reasoning Economy: A Survey of ...   

                                            abstract                      date  
0  reasoning before action and imagining potentia... 2025-03-31 17:59:52+00:00  
1  we introduce uniocc, a comprehensive, unified ... 2025-03-31 17:59:24+00:00  
2  to address the bottleneck of accurate user int... 2025-03-31 17:59:01+00:00  
3  the acpbench dataset provides atomic reasoning... 2025-03-31 17:58:25+00:00  
4  recent advancements in large language models (... 2025-03-31 17:58:07+00:00  
