# OpenAlex

In [None]:
import requests
import pandas as pd
import time
from tqdm import tqdm
import json
import datetime

def collect_openalex_data(query, start_year=2019, end_year=2024, max_results=2000, email=None):
    """Collect article metadata from the OpenAlex ``/works`` endpoint.

    Results are restricted to non-paratext articles published between
    ``start_year`` and ``end_year`` (inclusive), sorted by citation count
    (descending) and fetched with cursor pagination, 100 works per page.

    Args:
        query: Full-text search expression sent as the ``search`` parameter.
        start_year: First publication year to include.
        end_year: Last publication year to include.
        max_results: Upper bound on the number of works to collect.
        email: Optional contact address. It is sent both in the User-Agent
            header and as the ``mailto`` query parameter.

    Returns:
        A ``pandas.DataFrame`` with one row per work, as produced by
        ``extract_work_data``. It may hold fewer than ``max_results`` rows if
        the API runs out of results or the request keeps failing.
    """
    base_url = "https://api.openalex.org/works"
    # Without a timeout a stalled connection would block the script forever.
    request_timeout = 30
    # Stop retrying after this many failures in a row instead of looping
    # indefinitely on a persistent error (bad request, outage, ...).
    max_consecutive_failures = 5

    headers = {}
    if email:
        headers['User-Agent'] = f'Python-Research-Script ({email})'

    filters = [
        f"publication_year:{start_year}-{end_year}",
        "type:article",
        "is_paratext:false"
    ]

    params = {
        'search': query,
        'filter': ','.join(filters),
        'sort': 'cited_by_count:desc',
        'per_page': 100,
        'cursor': '*'
    }
    if email:
        # Documented OpenAlex way to identify the caller (polite pool).
        params['mailto'] = email

    all_works = []
    consecutive_failures = 0

    print(f"Collecte de données depuis OpenAlex pour la requête: {query}")

    with tqdm(total=max_results) as pbar:
        while len(all_works) < max_results:
            try:
                response = requests.get(
                    base_url,
                    params=params,
                    headers=headers,
                    timeout=request_timeout,
                )
                response.raise_for_status()
                data = response.json()
            except requests.exceptions.RequestException as e:
                consecutive_failures += 1
                print(f"Erreur lors de la requête à l'API: {e}")
                if consecutive_failures >= max_consecutive_failures:
                    print(f"Abandon après {consecutive_failures} échecs consécutifs.")
                    break
                time.sleep(2)
                continue

            consecutive_failures = 0
            works = data.get('results', [])

            if not works:
                print("Plus de résultats disponibles.")
                break

            # Only keep as many works from this page as are still needed.
            remaining = max_results - len(all_works)
            batch = [extract_work_data(work) for work in works[:remaining]]
            all_works.extend(batch)
            pbar.update(len(batch))

            if len(all_works) >= max_results:
                break

            next_cursor = data.get('meta', {}).get('next_cursor')
            if not next_cursor:
                print("Fin des résultats (pas de curseur suivant).")
                break

            params['cursor'] = next_cursor

            # Short pause between pages to stay gentle on the API.
            time.sleep(0.3)

    df = pd.DataFrame(all_works)

    print(f"Collecte terminée. {len(df)} articles récupérés depuis OpenAlex.")
    return df

def extract_work_data(work):
    """Flatten one OpenAlex work record into a dict of plain values.

    Missing keys and JSON ``null`` values for nested objects/lists
    (``authorships``, ``author``, ``institutions``, ``concepts``,
    ``open_access``, ``referenced_works``, ``id``) are treated as empty
    instead of raising.

    Args:
        work: A dict describing a single work, as found in the ``results``
            list of an OpenAlex ``/works`` response.

    Returns:
        A dict with the keys ``id``, ``title``, ``doi``, ``authors``,
        ``publication_date``, ``year``, ``open_access``, ``cited_by_count``,
        ``abstract``, ``concepts``, ``concept_scores``, ``author_countries``,
        ``author_institutions``, ``referenced_works_count``, ``type`` and
        ``categories``. Multi-valued fields are joined into one string.
    """
    authors = []
    author_countries = []
    author_institutions = []

    for authorship in work.get('authorships') or []:
        author_name = (authorship.get('author') or {}).get('display_name', '')
        if author_name:
            authors.append(author_name)

        for institution in authorship.get('institutions') or []:
            country = institution.get('country_code', '')
            if country:
                author_countries.append(country)

            institution_name = institution.get('display_name', '')
            if institution_name:
                author_institutions.append(institution_name)

    concepts = []
    concept_scores = []
    for concept in work.get('concepts') or []:
        concept_name = concept.get('display_name', '')
        if concept_name:
            concepts.append(concept_name)
            concept_scores.append(concept.get('score', 0))

    # The abstract arrives as an inverted index {word: [positions]}; rebuild
    # the running text by ordering every (position, word) pair.
    abstract = ""
    abstract_index = work.get('abstract_inverted_index')
    if abstract_index:
        positioned_words = sorted(
            (pos, word)
            for word, positions in abstract_index.items()
            for pos in positions
        )
        abstract = ' '.join(word for _, word in positioned_words)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # joined strings are identical from one run to the next (a plain set()
    # would order them differently depending on string hashing).
    unique_countries = list(dict.fromkeys(author_countries))
    # Only the first three institution entries are considered, then de-duplicated.
    unique_institutions = list(dict.fromkeys(author_institutions[:3]))

    work_data = {
        'id': (work.get('id') or '').replace('https://openalex.org/', ''),
        'title': work.get('title', ''),
        'doi': work.get('doi', ''),
        'authors': '; '.join(authors),
        'publication_date': work.get('publication_date', ''),
        'year': work.get('publication_year', None),
        'open_access': (work.get('open_access') or {}).get('is_oa', False),
        'cited_by_count': work.get('cited_by_count', 0),
        'abstract': abstract,
        'concepts': '; '.join(concepts[:5]),
        'concept_scores': '; '.join(str(score) for score in concept_scores[:5]),
        'author_countries': '; '.join(unique_countries),
        'author_institutions': '; '.join(unique_institutions),
        'referenced_works_count': len(work.get('referenced_works') or []),
        'type': work.get('type', ''),
        # Every concept name (not only the top five), comma-separated.
        'categories': ', '.join(concepts),
    }

    return work_data

if __name__ == "__main__":
    # Script entry point: collect generative-AI related works from OpenAlex,
    # save them as a dated CSV under data/, and print a short summary.
    # Imported locally because only this entry point needs it.
    from pathlib import Path

    generative_ai_query = '"generative AI" OR "generative model" OR "diffusion model" OR "large language model" OR "LLM" OR "GANs" OR "transformer model" OR "stable diffusion" OR "text-to-image" OR "GPT" OR "DALL-E" OR "image generation" OR "text generation"'

    YOUR_EMAIL = "yassmine123fanid@gmail.com"

    openalex_df = collect_openalex_data(
        query=generative_ai_query,
        start_year=2019,
        end_year=2024,
        max_results=2000,
        email=YOUR_EMAIL
    )

    output_file = f'data/openalex_generative_ai_data_{datetime.datetime.now().strftime("%Y%m%d")}.csv'
    # Create the target directory first; otherwise to_csv fails on a fresh
    # checkout and the freshly collected data is lost.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    openalex_df.to_csv(output_file, index=False)
    print(f"Données sauvegardées dans {output_file}")
    print("\nRésumé des données collectées:")
    print(f"Nombre total d'articles: {len(openalex_df)}")

    if not openalex_df.empty:
        # Number of collected articles per publication year, oldest first.
        years_count = openalex_df['year'].value_counts().sort_index()
        print("\nDistribution par année:")
        for year, count in years_count.items():
            print(f"  {year}: {count} articles")

        # Five rows with the highest citation counts.
        top_cited = openalex_df.nlargest(5, 'cited_by_count')
        print("\nTop 5 des articles les plus cités:")
        for _, row in top_cited.iterrows():
            print(f"  {row['title']} - Citations: {row['cited_by_count']}")

In [None]:
# Number of missing values in each column of the collected data.
openalex_df.isna().sum()

In [None]:
# Reload a previously saved export from disk. The date stamp in the file name
# is hard-coded, so this only works if a file with exactly that name exists.
openalex_df = pd.read_csv("data/openalex_generative_ai_data_20250516.csv")