In [None]:
import requests
from tqdm import tqdm

# Harvest OpenAlex "works" records matching a search term, one publication
# year at a time, and accumulate the raw result dictionaries in `results`.

# Base URL for the OpenAlex API
base_url = "https://api.openalex.org/works"

# Parameters
search_term = "soccer"
# NOTE(review): `type` shadows the Python builtin for the rest of the session.
# The name is kept because other cells may reference it -- consider renaming.
type = "article"
results = []
years = list(range(2000, 2025))  # 2000 through 2024 inclusive
# (start, end) month-day pairs applied to every year; currently a single
# window covering the whole calendar year.
months = [("01-01", "12-31")]

# Loop through each year and fetch every matching work for each date window.
for year in tqdm(years):
    for start_date, end_date in months:
        # Remember where this window starts so its own count can be reported.
        period_start = len(results)

        # Cursor paging: basic `page=` paging stops at the first 10,000
        # results of a query, which silently truncates large windows.
        cursor = "*"
        page = 0
        while cursor:
            page += 1
            params = {
                "mailto": "objzuhkelcedujpspo@nbmbb.com",
                "per-page": 200,  # largest page size, fewest round trips
                "cursor": cursor,
                "filter": (
                    f"default.search:{search_term},type:{type},"
                    f"from_publication_date:{year}-{start_date},"
                    f"to_publication_date:{year}-{end_date}"
                ),
            }

            # Passing `params` lets requests URL-encode the values; the timeout
            # keeps one stalled connection from hanging the whole harvest.
            try:
                response = requests.get(base_url, params=params, timeout=60)
            except requests.RequestException as exc:
                print(
                    f"Error: Unable to fetch page {page} for "
                    f"{year}-{start_date} to {year}-{end_date} ({exc})"
                )
                break

            # Check if the request was successful
            if response.status_code != 200:
                print(f"Error: Unable to fetch page {page} for {year}-{start_date} to {year}-{end_date}")
                break

            data = response.json()
            batch = data.get('results')

            # Stop when the API returns no more results for this window
            if not batch:
                break

            results.extend(batch)
            # A missing or null next_cursor means the last page was reached.
            cursor = (data.get('meta') or {}).get('next_cursor')

        # Report the count for this window only, not the running total.
        print(f"Total results retrieved for {year}-{start_date} to {year}-{end_date}: {len(results) - period_start}")

print(f"Total results retrieved: {len(results)}")

In [None]:
import csv

# Write the harvested `results` (a list of dicts built in an earlier cell) to
# a CSV file, skipping records whose title or DOI has already been written.
csv_file = '../../data/data_years.csv'

# Sets to track seen titles and DOIs
seen_titles = set()
seen_dois = set()

# Open the CSV file for writing
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    if results:
        # Dynamically generate the header from the keys of the first result
        header = list(results[0].keys())
        writer.writerow(header)

        for result in results:
            # The value may be None when the key is present but null, so the
            # `.get` default alone does not guarantee a usable string.
            title = result.get('title', 'No title')
            doi = result.get('doi', 'No doi')

            # Check for duplicates. Only real (non-empty) values count:
            # otherwise the first record with a null DOI or title would make
            # every later record with a null value look like a duplicate.
            duplicate_title = bool(title) and title in seen_titles
            duplicate_doi = bool(doi) and doi in seen_dois
            if duplicate_title or duplicate_doi:
                print(f"Warning: Duplicate entry found for '{title}' with DOI '{doi}'")
                continue  # Skip this entry if it's a duplicate

            # Add title and DOI to seen sets (missing values are not tracked)
            if title:
                seen_titles.add(title)
            if doi:
                seen_dois.add(doi)

            # Write all values in the result dictionary, in header order;
            # keys absent from this record are written as 'No data'.
            writer.writerow([result.get(key, 'No data') for key in header])

print(f"Data has been written to {csv_file}")
