<a href="https://colab.research.google.com/github/dloiacono/ai/blob/main/download_iss_pdfs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import drive
import requests
import os
import json

# Mount Google Drive at /gdrive; the output folder configured further down
# ('/gdrive/My Drive/AI/publiiss') lives under this mount point.
drive.mount('/gdrive')

# Function to download PDF from a URL
def download_pdf(url, folder, uuid, timeout=60):
    """Download the document at *url* and save it as ``<folder>/<uuid>.pdf``.

    The request is completed and its HTTP status checked *before* the target
    file is opened, so a failed request neither leaves an empty file behind
    nor stores an HTML error page under a ``.pdf`` name.

    Args:
        url: Address of the PDF to fetch.
        folder: Existing directory in which the file is written.
        uuid: Item identifier, used as the file's base name.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-2xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    filename = os.path.join(folder, f"{uuid}.pdf")
    with open(filename, 'wb') as f:
        f.write(response.content)

# Function to download metadata for an item
def download_metadata(uuid, folder, timeout=60):
    """Fetch the metadata of item *uuid* and save it as ``<folder>/<uuid>.json``.

    The metadata is requested from the ``GetMetadata`` endpoint, decoded as
    JSON and re-serialized with a 4-space indent.

    Args:
        uuid: Identifier of the item whose metadata is requested.
        folder: Existing directory in which the JSON file is written.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    url = f"https://publ.iss.it/ITA/Items/GetMetadata?uuid={uuid}"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    metadata = response.json()

    with open(os.path.join(folder, f"{uuid}.json"), 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=4)

# Function to download PDFs and metadata for a given year
def download_pdfs_and_metadata_for_year(year, folder, page_size=10, timeout=60):
    """Download every PDF and its metadata for the items published in *year*.

    The search endpoint is queried page by page. For each returned item whose
    first ``bitstream`` entry equals the string ``'true'`` (presumably the
    flag for "has a PDF attached" -- confirm against the API), the PDF and
    then the metadata are downloaded into *folder*. Paging stops at the first
    page that holds fewer than *page_size* items.

    Args:
        year: Publication year used as the search filter value.
        folder: Existing directory that receives the ``.pdf``/``.json`` files.
        page_size: Number of results requested per page.
        timeout: Seconds to wait for the search request before giving up.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout
            or a non-2xx HTTP status of the search request.
    """
    page = 1
    while True:
        # Construct URL for the given year and page
        url = (
            "https://publ.iss.it/ITA/Items/GetSearchResults"
            f"?qualifier=year&value={year}"
            f"&take={page_size}&skip={page_size * (page - 1)}"
            f"&page={page}&pageSize={page_size}"
        )

        response = requests.get(url, timeout=timeout)
        # Do not try to decode an error page as a result list.
        response.raise_for_status()
        items = response.json().get('docs', [])

        # Keep only the items flagged with bitstream[0] == 'true'
        uuids = [
            item['uuid']
            for item in items
            if item.get('bitstream') and item['bitstream'][0] == 'true'
        ]

        # Download each PDF and metadata
        for uuid in uuids:
            pdf_url = f"https://publ.iss.it/ITA/Items/GetPDF?uuid={uuid}"
            download_pdf(pdf_url, folder, uuid)
            print(f"Downloaded for {year} (Page {page}):", pdf_url)
            download_metadata(uuid, folder)
            print(f"Downloaded metadata for {year} (Page {page}):", uuid)

        # A short (or empty) page means there is nothing left to fetch
        if len(items) < page_size:
            break

        page += 1

# Destination folder for the PDFs and metadata (under the /gdrive mount point)
folder = '/gdrive/My Drive/AI/publiiss'

# Create the folder if it is missing; exist_ok=True replaces the separate
# existence check and so avoids the check-then-create race
os.makedirs(folder, exist_ok=True)

# Walk the years from 2024 down to 2020 (inclusive), newest first.
# NOTE: the start year is hard-coded, not derived from the current date.
for year in range(2024, 2019, -1):
    download_pdfs_and_metadata_for_year(year, folder)

print("All PDFs and metadata downloaded successfully!")


[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Downloaded metadata for 2022 (Page 90): b5120052-d0d6-4b42-85a8-92e6f8e65bac
Downloaded for 2022 (Page 90): https://publ.iss.it/ITA/Items/GetPDF?uuid=4a8bcd1a-24c8-4b50-a158-ec246b919d0d
Downloaded metadata for 2022 (Page 90): 4a8bcd1a-24c8-4b50-a158-ec246b919d0d
Downloaded for 2022 (Page 90): https://publ.iss.it/ITA/Items/GetPDF?uuid=9b12761d-6f5f-4123-bc67-7ec95dc125b1
Downloaded metadata for 2022 (Page 90): 9b12761d-6f5f-4123-bc67-7ec95dc125b1
Downloaded for 2022 (Page 90): https://publ.iss.it/ITA/Items/GetPDF?uuid=dd693888-dd24-4cff-b0e2-62b83b93ff40
Downloaded metadata for 2022 (Page 90): dd693888-dd24-4cff-b0e2-62b83b93ff40
Downloaded for 2022 (Page 90): https://publ.iss.it/ITA/Items/GetPDF?uuid=c4b35608-49ef-4d55-a122-3881025cf77e
Downloaded metadata for 2022 (Page 90): c4b35608-49ef-4d55-a122-3881025cf77e
Downloaded for 2022 (Page 90): https://publ.iss.it/ITA/Items/GetPDF?uuid=4f6c4e4d-250e-4d19-bdd9-a3da461f2b1f
D