In [None]:
import requests
from dotenv import load_dotenv
import os
import pandas as pd

# Load environment variables
load_dotenv()

# Scopus Search endpoint, plus the API key read from the environment
BASE_URL = "https://api.elsevier.com/content/search/scopus"
SCOPUS_API_KEY = os.environ.get("SCOPUS_API_KEY")

# Target number of papers to collect for each publication year
papers_per_year = 600

# Subject-area codes to search; they are OR-ed together into one query string
_subject_area_codes = (
    "ENGI", "COMP", "MEDI", "CHEM", "BUSI", "BIOC", "DECI", "MATE", "PHYS",
    "ENVI", "AGRI", "ENER", "SOCI", "VETE", "NEUR", "ECON", "EART", "MATH",
    "MULT", "IMMU", "PHAR", "DENT", "CENG", "NURS", "HEAL", "PSYC", "ARTS",
)
subject_query = " OR ".join(f"SUBJAREA({code})" for code in _subject_area_codes)

# Number of papers requested per page, and how long (in seconds) to wait for
# the API before giving up on a request.
PAGE_SIZE = 25
REQUEST_TIMEOUT = 30

# List to store all results (one dict per Scopus entry, across all years)
all_results = []

# Loop through each year
for year in range(2018, 2024):  # Years 2018 to 2023 inclusive
    start = 0  # Reset the starting index for each year
    retrieved_papers = 0  # Track how many papers have been retrieved for the current year

    while retrieved_papers < papers_per_year:
        remaining = papers_per_year - retrieved_papers

        # Query parameters for the current year and pagination
        params = {
            # Parenthesise the OR-chain so the year filter applies to every
            # subject area regardless of operator precedence.
            "query": f"({subject_query}) AND PUBYEAR = {year}",
            # Never ask for more than is still needed for this year.
            "count": min(PAGE_SIZE, remaining),
            "start": start,  # Starting index for pagination
        }

        # Send GET request; a timeout keeps one stalled connection from
        # hanging the whole run, and network failures end this year's loop
        # instead of discarding everything collected so far.
        try:
            response = requests.get(
                BASE_URL,
                headers={"X-ELS-APIKey": SCOPUS_API_KEY},
                params=params,
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            print(f"Error: request for year {year} failed - {exc}")
            break

        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            break  # Exit the loop if there's an error

        try:
            data = response.json()  # Parse JSON response
        except ValueError as exc:
            print(f"Error: could not parse response for year {year} - {exc}")
            break

        entries = data.get("search-results", {}).get("entry", [])  # Extract article entries

        # NOTE(review): Scopus is known to report an empty result set as a
        # single placeholder entry carrying an "error" key rather than an
        # empty list; drop such placeholders so they are neither stored nor
        # counted as papers.
        entries = [entry for entry in entries if "error" not in entry]

        if not entries:
            # No more results for this year; break the loop
            print(f"No more papers found for year {year}.")
            break

        # Guard against the API returning more than requested.
        entries = entries[:remaining]

        # Add results to the main list
        all_results.extend(entries)

        # Update counters
        retrieved_papers += len(entries)
        start += len(entries)  # Move to the next page

        print(f"Retrieved {retrieved_papers} papers for year {year} so far.")

    print(f"Finished retrieving {retrieved_papers} papers for year {year}.")

# Create a DataFrame with all results
df = pd.DataFrame(all_results)

print(df.head())


In [None]:
# Scopus entry fields to keep for the CSV export
selected_columns = ['dc:title', 'prism:publicationName', 'prism:coverDate']
# reindex (rather than df[...]) so that a field missing from the results --
# e.g. when every request failed and df is empty -- becomes an empty column
# instead of raising KeyError.
df_selected = df.reindex(columns=selected_columns)

def save_to_csv(df_selected, filename="scopus_data.csv"):
    """Write ``df_selected`` to ``filename`` as UTF-8 CSV and report it.

    The DataFrame index is not written. A confirmation line naming the
    output file is printed once the write completes.
    """
    export_options = {"index": False, "encoding": "utf-8"}
    df_selected.to_csv(filename, **export_options)
    confirmation = f"Data saved to {filename}"
    print(confirmation)

# Save the selected columns (not the full DataFrame) to a CSV file,
# using save_to_csv's default filename
save_to_csv(df_selected)