In [None]:
import requests
import csv
import os
import time
import shutil
from google.colab import drive  


# Output locations: the CSV that receives paper metadata and the folder
# that receives downloaded PDFs.
csv_filename = "diabetes_papers.csv"
folder_name = "PDFs"
# Create the PDF folder up front; exist_ok=True makes re-running this cell safe.
os.makedirs(folder_name, exist_ok=True)

# Semantic Scholar Graph API bulk paper-search endpoint queried by the fetcher.
url = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"

# Fetch papers from the bulk search endpoint and append them to a CSV file
def _value_or_na(value):
    """Return *value* unchanged, or 'N/A' when it is None."""
    return 'N/A' if value is None else value


def _paper_to_csv_row(paper):
    """Flatten one paper record from the search response into a CSV row dict.

    A key that is present with a JSON null value comes back from
    ``dict.get(key, default)`` as None, not as the default, so every field is
    normalised explicitly instead of relying on the ``get`` default.
    """
    external_ids = paper.get('externalIds') or {}
    open_access_pdf = paper.get('openAccessPdf') or {}
    publication_types = paper.get('publicationTypes') or ['N/A']
    return {
        'Title': _value_or_na(paper.get('title')),
        'Publication Type': ', '.join(publication_types),
        'Publication Date': _value_or_na(paper.get('publicationDate')),
        'Abstract': _value_or_na(paper.get('abstract')),
        'Full Text URL': _value_or_na(paper.get('url')),
        'Paper ID': _value_or_na(paper.get('paperId')),
        'Corpus ID': _value_or_na(paper.get('corpusId')),
        'DOI': _value_or_na(external_ids.get('DOI')),
        # isOpenAccess may legitimately be False, so only None maps to 'N/A'.
        'Is Open Access': _value_or_na(paper.get('isOpenAccess')),
        # An empty URL is as unusable as a missing one.
        'Open Access PDF': open_access_pdf.get('url') or 'N/A',
    }


def fetch_and_save_papers(query, year, csv_filename, max_papers=1000, max_retries=5):
    """Search for journal articles and append their metadata to a CSV file.

    Requests are sent to the module-level ``url`` endpoint. Rows are appended
    to ``csv_filename``; the header is written only when the file is empty.

    Args:
        query: Search query string sent to the API.
        year: Publication year filter (converted to ``str`` for the request).
        csv_filename: Path of the CSV file to append to.
        max_papers: Maximum number of rows to write during this call.
        max_retries: Number of consecutive failed attempts (network errors,
            unparsable responses or HTTP 429) tolerated before giving up.

    Returns:
        The number of papers written to the CSV during this call.
    """
    papers_fetched = 0

    params = {
        'query': query,
        'publicationTypes': 'JournalArticle',
        'year': str(year),
        'fields': 'title,publicationTypes,publicationDate,url,abstract,paperId,corpusId,externalIds,isOpenAccess,openAccessPdf',
        # NOTE(review): kept from the original request; confirm that the bulk
        # endpoint actually honours a page-size limit.
        'limit': 100,
    }

    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=[
            'Title', 'Publication Type', 'Publication Date', 'Abstract',
            'Full Text URL', 'Paper ID', 'Corpus ID', 'DOI',
            'Is Open Access', 'Open Access PDF'
        ])
        # In append mode the position is the current file size, so 0 means
        # the file is new or empty and still needs its header.
        if csvfile.tell() == 0:
            writer.writeheader()

        # The bulk search endpoint paginates with a continuation token: each
        # response carries the token to send with the next request.
        token = None
        failures = 0

        while papers_fetched < max_papers:
            if token:
                params['token'] = token

            # Only the network call and JSON decoding are retried; rows are
            # built outside the try so a page is never re-fetched (and its
            # rows duplicated) because of a problem in one record.
            try:
                response = requests.get(url, params=params, timeout=10)
                data = response.json() if response.status_code == 200 else None
            except (requests.RequestException, ValueError) as e:
                print(f"Erreur de requête: {e}")
                failures += 1
                if failures > max_retries:
                    print(f"Abandon après {max_retries} nouvelles tentatives.")
                    break
                time.sleep(5)
                continue

            if response.status_code == 429:
                failures += 1
                if failures > max_retries:
                    print(f"Abandon après {max_retries} nouvelles tentatives.")
                    break
                print("Trop de requêtes. Pause de 60 secondes...")
                time.sleep(60)
                continue

            if response.status_code != 200:
                print(f"Erreur {response.status_code}: {response.text}")
                break

            failures = 0
            papers = data.get('data') or []
            if not papers:
                print(f"Plus de résultats pour {year}.")
                break

            # Never write more than the remaining budget for this call.
            for paper in papers[:max_papers - papers_fetched]:
                writer.writerow(_paper_to_csv_row(paper))
                papers_fetched += 1

            token = data.get('token')
            if not token:
                print(f"Fin des résultats pour {year}.")
                break
            time.sleep(1)

    print(f"{papers_fetched} articles récupérés pour {year}.")
    return papers_fetched

# Download the open-access PDFs listed in the CSV
def download_open_access_pdfs(csv_filename, max_pdfs=200):
    """Download the open-access PDFs referenced in a papers CSV.

    Reads the 'Open Access PDF' and 'Paper ID' columns of ``csv_filename`` and
    saves each PDF as ``<Paper ID>.pdf`` inside the module-level
    ``folder_name`` directory. Files that already exist are not downloaded
    again but still count towards ``max_pdfs``.

    Args:
        csv_filename: Path of the CSV file to read.
        max_pdfs: Maximum number of PDFs (already present or newly
            downloaded) to account for before stopping.

    Returns:
        The number of PDFs counted (already present plus newly downloaded).
    """
    papers_downloaded = 0
    # newline='' lets the csv module itself handle line breaks that appear
    # inside quoted fields.
    with open(csv_filename, 'r', newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            if papers_downloaded >= max_pdfs:
                break

            pdf_url = row.get('Open Access PDF')
            paper_id = row.get('Paper ID')
            if not pdf_url or pdf_url == 'N/A':
                continue
            # Without a usable ID the file would be saved as ".pdf" or "N/A.pdf".
            if not paper_id or paper_id == 'N/A':
                continue

            pdf_path = os.path.join(folder_name, f"{paper_id}.pdf")
            if os.path.exists(pdf_path):
                papers_downloaded += 1
                continue

            # Deliberately broad: this is a best-effort loop over arbitrary
            # URLs, and one bad entry must not abort the whole run.
            try:
                response = requests.get(pdf_url, timeout=15)
                if response.status_code != 200:
                    print(f"❌ Échec du téléchargement (code {response.status_code}): {pdf_url}")
                elif b'%PDF' not in response.content[:1024]:
                    # A 200 response can still be an HTML page rather than a PDF.
                    print(f"❌ Contenu non-PDF ignoré: {pdf_url}")
                else:
                    os.makedirs(folder_name, exist_ok=True)
                    # Write to a temporary name and rename afterwards, so an
                    # interrupted write never leaves a truncated .pdf that a
                    # later run would count as already downloaded.
                    tmp_path = pdf_path + '.part'
                    with open(tmp_path, 'wb') as f:
                        f.write(response.content)
                    os.replace(tmp_path, pdf_path)
                    print(f"✅ Téléchargé: {paper_id}.pdf")
                    papers_downloaded += 1
            except Exception as e:
                print(f"❌ Erreur pour {paper_id}: {e}")
            # Short pause between requests, whether or not the download worked.
            time.sleep(0.5)

    print(f"✅ {papers_downloaded} PDFs téléchargés.")
    return papers_downloaded

# === Exécution du pipeline ===
