In [1]:
import requests
import os

In [2]:
# Capture the working directory at the time this cell runs; the bare
# expression on the last line echoes the value as the cell output.
base_path = os.getcwd()
base_path

'/home/daniel/git_code/emestrada_parse'

In [9]:
def download_pdf(url, save_path, timeout=30):
    """
    Downloads a PDF file from the given URL and saves it to the specified path.

    Nothing is requested if ``save_path`` already exists. The body is streamed
    into a temporary ``<save_path>.part`` file and only renamed to
    ``save_path`` once the download has completed, so an interrupted transfer
    never leaves a truncated file behind that would later be mistaken for a
    finished download.

    Errors are reported with ``print`` and never propagated; HTTP 404
    responses are ignored silently.

    :param url: URL of the PDF file to download
    :param save_path: Path where the PDF file should be saved
    :param timeout: Timeout for the request in seconds (default is 30)
    :return: None
    """
    # Skip the network round-trip entirely when the file is already on disk
    if os.path.isfile(save_path):
        return

    # Defined before the try block so the handlers below can always refer to it,
    # even when requests.get() itself fails (DNS error, refused connection, ...)
    response = None
    partial_path = f"{save_path}.part"

    try:
        # Send a GET request to the URL
        response = requests.get(url, stream=True, timeout=timeout)

        # Raise an exception for bad status codes
        response.raise_for_status()

        # Stream the content into the temporary file
        with open(partial_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    file.write(chunk)

        # Publish the finished file under its final name
        os.replace(partial_path, save_path)
        print(f"PDF {save_path} downloaded successfully!")

    except Exception as e:
        # A missing remote file is expected while probing URLs: stay quiet
        if response is not None and response.status_code == 404:
            pass
        else:
            print(f"Error downloading {url}: {e}")

    finally:
        # A streamed response keeps its connection open until it is closed
        if response is not None:
            response.close()
        # Remove whatever is left of an incomplete download
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [4]:
def check_remote_file_exists(url, timeout=10):
    """
    Check if a remote file exists at the given URL.

    A HEAD request (following redirects) is tried first. If the server answers
    405 (Method Not Allowed), a streamed GET request is used instead so that
    only the status line and headers are fetched, not the file body.

    Request errors are reported with ``print`` and treated as "does not exist".

    :param url: URL of the file to check
    :param timeout: Timeout for each request in seconds (default is 10)
    :return: True if the server answers with a 2xx status, False otherwise
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)

        # Check if the request was successful (status code 2xx)
        if response.status_code // 100 == 2:
            return True

        # For some servers, HEAD request might not be supported
        # In that case, try a GET request with stream=True
        if response.status_code == 405:  # Method Not Allowed
            response = requests.get(url, stream=True, timeout=timeout)
            try:
                return response.status_code // 100 == 2
            finally:
                # The body of a streamed response is never read here, so the
                # connection stays open until the response is closed explicitly
                response.close()

        return False

    except requests.RequestException as e:
        print(f"Error checking remote file: {e}")
        return False

## List of topics and their topic number

In [5]:
# Maps the topic code used in the remote file names (FIS-<code>-<year>.pdf)
# to the topic title used for the local folder and file names.
topics = {
    'T1': 'Campo gravitatorio',
    'T2': 'Campo eléctrico y magnético',
    'T3': 'Ondas',
    'T4': 'Óptica geométrica',
    'T5': 'Física cuántica y nuclear',
}

## Create the folder structure to store the downloaded files

In [6]:
# Parent folder that holds one subfolder per topic
physics_dir = os.path.join(base_path, 'Física')
os.makedirs(physics_dir, exist_ok=True)

# Create the subfolders using full paths instead of changing directory, so a
# failure halfway through cannot leave the kernel inside 'Física'.
# (The loop variable is deliberately not called `dir`, which would shadow the
# builtin for the rest of the notebook.)
for topic_name in topics.values():
    os.makedirs(os.path.join(physics_dir, topic_name), exist_ok=True)

# Leave the working directory at base_path, exactly as this cell always did
os.chdir(base_path)

## Check folders with content
Inspect whether each remote upload folder (`<year>/<month>`) on the server contains files to download

In [7]:
# Cache file listing the remote '<year>/<month>' folders that contain files.
# Built from base_path so the existence check, the read and the write all
# refer to the same file regardless of the current working directory.
remote_folders_file = os.path.join(base_path, 'remote_folders.txt')
base_url = 'https://www.emestrada.org/wp-content/uploads/'

# if the remote folders were previously tested
# load them from file
if os.path.isfile(remote_folders_file):
    with open(remote_folders_file, 'r') as f:
        # skip blank lines: an empty entry cannot be split into year/month later
        remote_folders = [line.strip() for line in f if line.strip()]

else:
    # create an empty list to store remote folders containing files
    remote_folders = []

    # upload folders: try years from 2024 down to 2020
    for year_url in range(2024, 2019, -1):

        # for every year, inspect the month subfolder (zero-padded: 01..12)
        for month in range(1, 13):
            folder = f'{year_url}/{month:02d}'

            # exam years: probe the T1 file for 2024 down to 2016
            for year in range(2024, 2015, -1):
                url = f'{base_url}{folder}/FIS-T1-{year}.pdf'

                # one existing file is enough to mark the folder as non-empty
                if check_remote_file_exists(url):
                    print(f'Subfolder {folder} has files!')
                    remote_folders.append(folder)
                    break

    # save the remote folders list to a file
    with open(remote_folders_file, 'w') as f:
        f.writelines(f'{folder}\n' for folder in remote_folders)

## Download the pdf files

In [8]:
# Example URL:
#   https://www.emestrada.org/wp-content/uploads/2022/06/FIS-T1-2022.pdf
# Pattern:
#   base_url + {year_folder}/{month}/FIS-{T}-{year}.pdf

base_url = 'https://www.emestrada.org/wp-content/uploads/'

for folder in remote_folders:

    year_folder, month = folder.split('/')

    for T, topic in topics.items():

        # Destination folder of this topic. Full paths are used instead of
        # os.chdir so the kernel's working directory is left untouched.
        topic_dir = os.path.join(base_path, 'Física', topic)

        for year in range(2016, 2025):
            # check if the file already exists
            save_path = os.path.join(topic_dir, f'{year} - {topic}.pdf')
            if os.path.isfile(save_path):
                continue

            # form the url
            url = f'{base_url}{year_folder}/{month}/FIS-{T}-{year}.pdf'

            # download the file
            download_pdf(url, save_path)

Error downloading the PDF: 404 Client Error: Not Found for url: https://www.emestrada.org/wp-content/uploads/2024/06/FIS-T1-2016.pdf
Error downloading the PDF: 404 Client Error: Not Found for url: https://www.emestrada.org/wp-content/uploads/2024/06/FIS-T1-2017.pdf
Error downloading the PDF: 404 Client Error: Not Found for url: https://www.emestrada.org/wp-content/uploads/2024/06/FIS-T1-2018.pdf
Error downloading the PDF: 404 Client Error: Not Found for url: https://www.emestrada.org/wp-content/uploads/2024/06/FIS-T1-2019.pdf
Error downloading the PDF: 404 Client Error: Not Found for url: https://www.emestrada.org/wp-content/uploads/2024/06/FIS-T1-2020.pdf
Error downloading the PDF: 404 Client Error: Not Found for url: https://www.emestrada.org/wp-content/uploads/2024/06/FIS-T1-2021.pdf
Error downloading the PDF: 404 Client Error: Not Found for url: https://www.emestrada.org/wp-content/uploads/2024/06/FIS-T1-2022.pdf
Error downloading the PDF: 404 Client Error: Not Found for url: https