<a href="https://colab.research.google.com/github/dloiacono/ai/blob/main/ai_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import requests
import os
import json

# Mount Google Drive at /gdrive so the download folder configured below
# (a path under /gdrive) lives on Drive. Requires the Colab runtime,
# since `drive` comes from google.colab.
drive.mount('/gdrive')

# Function to download PDF from a URL
def download_pdf(url, folder, uuid):
    """Download the PDF at ``url`` and save it as ``<folder>/<uuid>.pdf``.

    The request is made (and its status checked) before the output file is
    opened, so a failed request neither leaves an empty ``.pdf`` behind nor
    saves an HTTP error page as if it were a PDF.

    Args:
        url: Address to fetch the PDF from.
        folder: Existing directory to write the file into.
        uuid: Item identifier, used as the file name (without extension).

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    filename = os.path.join(folder, f"{uuid}.pdf")
    with open(filename, 'wb') as f:
        f.write(response.content)

# Function to download metadata for an item
def download_metadata(uuid, folder):
    """Fetch the metadata of item ``uuid`` and save it as ``<folder>/<uuid>.json``.

    The metadata is requested from the ``GetMetadata`` endpoint of
    publ.iss.it, decoded as JSON and written back out with a 4-space indent.

    Args:
        uuid: Item identifier; used both in the query and as the file name.
        folder: Existing directory to write the file into.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    url = f"https://publ.iss.it/ITA/Items/GetMetadata?uuid={uuid}"
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    metadata = response.json()

    # Explicit encoding keeps the output independent of the platform default.
    with open(os.path.join(folder, f"{uuid}.json"), 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=4)

# Function to download PDFs and metadata for a given year
def download_pdfs_and_metadata_for_year(year, folder):
    """Download the PDF and metadata of every search result for ``year``.

    Pages through the ``GetSearchResults`` endpoint of publ.iss.it ten items
    at a time. For each returned item whose ``bitstream`` list starts with
    the string ``'true'``, the PDF and the metadata are saved into ``folder``
    via ``download_pdf`` and ``download_metadata``. Paging stops at the first
    page that returns fewer items than the page size.

    Args:
        year: Publication year to search for.
        folder: Existing directory the files are written into.

    Raises:
        requests.RequestException: If the search request fails, times out,
            or returns an HTTP error status.
    """
    page_size = 10
    page = 1
    while True:
        # Construct URL for the given year and page
        skip = page_size * (page - 1)
        url = (
            "https://publ.iss.it/ITA/Items/GetSearchResults"
            f"?qualifier=year&value={year}&take={page_size}&skip={skip}"
            f"&page={page}&pageSize={page_size}"
        )

        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        items = response.json().get('docs', [])

        # Download each PDF and metadata
        for item in items:
            bitstream = item.get('bitstream')
            # Only items flagged with a leading 'true' are downloaded;
            # presumably this marks items that have a PDF attached (the
            # API's meaning of the flag is not visible here).
            if not bitstream or bitstream[0] != 'true':
                continue

            uuid = item['uuid']
            pdf_url = f"https://publ.iss.it/ITA/Items/GetPDF?uuid={uuid}"
            download_pdf(pdf_url, folder, uuid)
            print(f"Downloaded for {year} (Page {page}):", pdf_url)
            download_metadata(uuid, folder)
            print(f"Downloaded metadata for {year} (Page {page}):", uuid)

        # A short page means the results are exhausted.
        if len(items) < page_size:
            break

        page += 1

# Set folder path where you want to save the PDFs and metadata
folder = '/gdrive/My Drive/AI/publiiss'

# Create the folder (and any missing parents). exist_ok=True avoids the
# check-then-create race and makes re-running the cell a no-op.
os.makedirs(folder, exist_ok=True)

# Download PDFs and metadata year by year, newest first: 2024 down to 2020.
# range() excludes its stop value, hence 2019. The start year is fixed, not
# derived from today's date.
for year in range(2024, 2019, -1):
    download_pdfs_and_metadata_for_year(year, folder)

print("All PDFs and metadata downloaded successfully!")


Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
Downloaded for 2024 (Page 1): https://publ.iss.it/ITA/Items/GetPDF?uuid=84213128-a695-40b7-9616-cf7a5129a4b2
Downloaded metadata for 2024 (Page 1): 84213128-a695-40b7-9616-cf7a5129a4b2
Downloaded for 2024 (Page 1): https://publ.iss.it/ITA/Items/GetPDF?uuid=325546f4-4947-4650-92fa-483edd60326e
Downloaded metadata for 2024 (Page 1): 325546f4-4947-4650-92fa-483edd60326e
Downloaded for 2024 (Page 1): https://publ.iss.it/ITA/Items/GetPDF?uuid=9d2bbe48-d029-466d-a416-c2d63b7d119c
Downloaded metadata for 2024 (Page 1): 9d2bbe48-d029-466d-a416-c2d63b7d119c
Downloaded for 2024 (Page 1): https://publ.iss.it/ITA/Items/GetPDF?uuid=cd0ca97c-1fed-41f7-a3db-60031ec3835d
Downloaded metadata for 2024 (Page 1): cd0ca97c-1fed-41f7-a3db-60031ec3835d
Downloaded for 2024 (Page 1): https://publ.iss.it/ITA/Items/GetPDF?uuid=0de7b95b-2886-4ddd-bbfe-be1399798ebb
Downloaded metadata for 2024 (Page 1