In [None]:
import arxiv
import pandas as pd
from tqdm import tqdm
import time
import datetime

def collect_arxiv_data(
    search_query: str,
    max_results: int = 1000,
    start_year: int = 2019,
    end_year: int = 2024,
) -> pd.DataFrame:
    """Collect arXiv article metadata matching a query into a DataFrame.

    Results are requested sorted by submission date, newest first, and only
    those whose publication year lies in ``[start_year, end_year]`` are kept.

    Args:
        search_query: Query string passed to ``arxiv.Search``.
        max_results: Maximum number of results requested from arXiv. The
            year filter is applied after retrieval, so the returned frame
            may contain fewer rows than this.
        start_year: First publication year to keep (inclusive).
        end_year: Last publication year to keep (inclusive).

    Returns:
        A DataFrame with one row per kept article and the columns ``id``,
        ``title``, ``authors``, ``abstract``, ``categories``,
        ``primary_category``, ``published_date``, ``year``, ``month``,
        ``comment`` and ``doi``. The columns are present even when no
        article was kept.
    """
    print(f"Collecte de {max_results} articles maximum depuis arXiv avec la requête: {search_query}")

    search = arxiv.Search(
        query=search_query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending
    )

    # Fixed schema so that an empty collection still yields a frame whose
    # columns callers can index (e.g. df['year']) without a KeyError.
    columns = [
        'id', 'title', 'authors', 'abstract', 'categories',
        'primary_category', 'published_date', 'year', 'month',
        'comment', 'doi',
    ]
    articles = []

    # Client.results() supersedes the deprecated Search.results(). The client
    # spaces out its own page requests, so no extra per-result sleep is added
    # here: results inside a page are already downloaded when they are yielded.
    results = arxiv.Client().results(search)

    for result in tqdm(results, total=max_results):
        published = result.published

        # Keep only articles published inside the requested year window.
        if not start_year <= published.year <= end_year:
            continue

        articles.append({
            # entry_id is split on '/' and the last segment is used as the id.
            'id': result.entry_id.split('/')[-1],
            'title': result.title,
            'authors': ', '.join(author.name for author in result.authors),
            # Flatten the abstract onto a single line.
            'abstract': result.summary.replace('\n', ' '),
            'categories': ', '.join(result.categories),
            'primary_category': result.primary_category,
            'published_date': published.strftime('%Y-%m-%d'),
            'year': published.year,
            'month': published.month,
            'comment': getattr(result, 'comment', None),
            'doi': getattr(result, 'doi', None),
        })

    df = pd.DataFrame(articles, columns=columns)

    print(f"Collecte terminée. {len(df)} articles récupérés.")
    return df

if __name__ == "__main__":
    # Local import: only this script entry point needs pathlib.
    from pathlib import Path

    # The topic alternatives are wrapped in a single parenthesised group so
    # the category restriction applies to every alternative rather than only
    # to the trailing abs:"generative AI" clause.
    # NOTE(review): assumes the category filter was meant to cover all topic
    # clauses - confirm against the intended corpus.
    generative_ai_query = (
        '(ti:"generative AI" OR ti:"generative model" OR ti:"generative adversarial" OR '
        'ti:"diffusion model" OR ti:"large language model" OR ti:"text-to-image" OR ti:"text to image" OR '
        'ti:"image generation" OR ti:"text generation" OR ti:"stable diffusion" OR '
        'ti:"GPT" OR ti:"LLM" OR ti:"generative pre-trained" OR '
        'abs:"generative AI") AND (cat:cs.AI OR cat:cs.CL OR cat:cs.CV OR cat:cs.LG)'
    )

    generative_ai_df = collect_arxiv_data(
        search_query=generative_ai_query,
        max_results=2000,
        start_year=2019,
        end_year=2024
    )

    # The output file name carries the date of the run (YYYYMMDD).
    output_file = f'data/arxiv_generative_ai_data_{datetime.datetime.now().strftime("%Y%m%d")}.csv'
    # Create the target directory first; to_csv does not create it.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    generative_ai_df.to_csv(output_file, index=False)
    print(f"Données sauvegardées dans {output_file}")

    print("\nRésumé des données collectées:")
    print(f"Nombre total d'articles: {len(generative_ai_df)}")

    if generative_ai_df.empty:
        # Nothing was collected: skip the per-column summaries, which would
        # otherwise fail on a frame that has no 'year' column.
        print("Aucun article collecté : aucune distribution à afficher.")
    else:
        # Articles per publication year, in chronological order.
        years_count = generative_ai_df['year'].value_counts().sort_index()
        print("\nDistribution par année:")
        for year, count in years_count.items():
            print(f"  {year}: {count} articles")

        # Ten most frequent primary categories.
        categories_count = generative_ai_df['primary_category'].value_counts().head(10)
        print("\nTop 10 des catégories principales:")
        for category, count in categories_count.items():
            print(f"  {category}: {count} articles")

In [None]:
import pandas as pd
# Load a previously saved collection snapshot (file name is date-stamped).
csv_path = "data/arxiv_generative_ai_data_20250516.csv"
arxiv_df = pd.read_csv(csv_path)

In [None]:
# Count missing values per column; the bare name on the last line makes the
# notebook display the result.
null_counts = arxiv_df.isnull().sum()
null_counts

In [None]:
# Show the dtype pandas inferred for each column when reading the CSV.
arxiv_df.dtypes