## Download PDF files from a website

In [94]:
import requests
import os

In [95]:
# Record the current working directory; later cells build paths from it.
base_path = os.getcwd()
# Bare expression: shows the value as the cell's output.
base_path

'/home/daniel/git_code/fisica_crawler'

In [3]:
def download_pdf(url, save_path):
    """
    Downloads a PDF file from the given URL and saves it to the specified path.

    If a file already exists at ``save_path`` the function returns without
    sending any request. The body is streamed to a temporary ``.part`` file
    and only renamed to ``save_path`` once it is complete, so an interrupted
    download never leaves a truncated PDF that a later run would skip.

    HTTP 404 responses are ignored silently; every other failure is reported
    with ``print`` and is not propagated to the caller.

    :param url: URL of the PDF file to download
    :param save_path: Path where the PDF file should be saved
    :return: None
    """
    # Check before touching the network so re-runs do not re-request files.
    if os.path.isfile(save_path):
        return

    tmp_path = f"{save_path}.part"
    # Pre-bind so the handlers below can inspect it even when the request
    # itself fails (connection error, invalid URL, timeout, ...).
    response = None
    try:
        # A timeout keeps a stalled server from blocking the run forever.
        response = requests.get(url, stream=True, timeout=30)

        # Raise an exception for bad status codes
        response.raise_for_status()

        with open(tmp_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    file.write(chunk)

        # Publish the file only after the whole body has been written.
        os.replace(tmp_path, save_path)
        print(f"PDF {save_path} downloaded successfully!")

    except Exception as e:
        # Best-effort cleanup of a partially written temporary file.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

        # Missing files (404) are expected for some links and stay silent.
        not_found = response is not None and response.status_code == 404
        if not not_found:
            print(f"Error downloading {url}: {e}")

    finally:
        # Streamed responses hold their connection until explicitly closed.
        if response is not None:
            response.close()

## Download the PDF files

### Read the links from file

In [96]:
# Switch the working directory to base_path so relative paths resolve against it.
os.chdir(base_path)

In [97]:
# Read one URL per line. Surrounding whitespace is stripped and blank lines
# are skipped, so an empty entry (e.g. a trailing newline at the end of the
# file) never ends up in the list of links.
with open('physics_pdf_links.txt', 'r', encoding='utf-8') as file:
    pdf_links = [link.strip() for link in file if link.strip()]

In [98]:
# Preview the first five links.
pdf_links[:5]

['https://www.emestrada.org/wp-content/uploads/2024/12/FIS-2024-JULIO.pdf',
 'https://www.emestrada.org/wp-content/uploads/2024/06/FIS-2024-JUNIO.pdf',
 'https://www.emestrada.org/wp-content/uploads/2023/11/FIS-2023-JULIO.pdf',
 'https://www.emestrada.org/wp-content/uploads/2023/06/FIS-2023-JUNIO.pdf',
 'https://www.emestrada.org/wp-content/uploads/2022/11/FIS-2022-JULIO.pdf']

In [99]:
def extract_year_exam(text):
    """
    Parses the year and the exam label out of a PDF link or file name.

    Only the part after the last '/' is considered. The '.pdf' and 'FIS-'
    substrings are removed and the remainder is split on '-': the first
    piece is the year (with any '.' removed) and the next piece is the exam.
    When the name contains 'RESERVA', the exam label is made of the next two
    pieces joined by a space (e.g. 'Reserva 4').

    :param text: The text containing the year and exam information
    :return: A tuple ``(year, exam)`` of strings, with the exam title-cased
    """
    file_name = text.rsplit('/', 1)[-1]
    # Removing 'FIS-' is a no-op when the prefix is absent.
    stem = file_name.replace('.pdf', '').replace('FIS-', '')
    pieces = stem.split('-')

    year = pieces[0].replace('.', '')

    if 'RESERVA' not in stem:
        return year, pieces[1].title()

    # RESERVA names carry an extra numeric piece: 'RESERVA-4' -> 'Reserva 4'.
    return year, ' '.join(pieces[1:3]).title()

In [100]:
# Sanity check: show the (year, exam) pair parsed from every link.
for pdf_url in pdf_links:
    print(extract_year_exam(pdf_url))

('2024', 'Julio')
('2024', 'Junio')
('2023', 'Julio')
('2023', 'Junio')
('2022', 'Julio')
('2022', 'Junio')
('2021', 'Julio')
('2021', 'Junio')
('2020', 'Septiembre')
('2020', 'Junio')
('2019', 'Septiembre')
('2019', 'Junio')
('2018', 'Septiembre')
('2018', 'Junio')
('2017', 'Septiembre')
('2017', 'Junio')
('2016', 'Septiembre')
('2016', 'Junio')
('2024', 'Reserva 4')
('2024', 'Reserva 3')
('2024', 'Reserva 2')
('2024', 'Reserva 1')
('2023', 'Reserva 4')
('2023', 'Reserva 3')
('2023', 'Reserva 2')
('2023', 'Reserva 1')
('2022', 'Reserva 2')
('2022', 'Reserva 1')
('2022', 'Reserva 3')
('2022', 'Reserva 4')
('2021', 'Reserva 4')
('2022', 'Reserva 3')
('2021', 'Reserva 1')
('2021', 'Reserva 2')
('2020', 'Reserva 4')
('2020', 'Reserva 3')
('2020', 'Reserva 2')
('2020', 'Reserva 1')
('2019', 'Reserva 4')
('2019', 'Reserva 3')
('2019', 'Reserva 2')
('2019', 'Reserva 1')
('2018', 'Reserva 4')
('2018', 'Reserva 3')
('2018', 'Reserva 2')
('2018', 'Reserva 1')
('2017', 'Reserva 4')
('2017', 'Res

In [101]:
# Group the links by exam year: integer year -> list of URLs, in file order.
exams = {}
for pdf_url in pdf_links:
    exam_year = int(extract_year_exam(pdf_url)[0])
    # setdefault creates the year's list the first time that year is seen.
    exams.setdefault(exam_year, []).append(pdf_url)

In [102]:
# Inspect the links grouped under the year 2021.
exams[2021]

['https://www.emestrada.org/wp-content/uploads/2021/12/FIS-2021-JULIO.pdf',
 'https://www.emestrada.org/wp-content/uploads/2021/06/FIS-2021-JUNIO.pdf',
 'https://www.emestrada.org/wp-content/uploads/2021/12/FIS-2021-RESERVA-4.pdf',
 'https://www.emestrada.org/wp-content/uploads/2021/12/FIS-2021-RESERVA-1.pdf',
 'https://www.emestrada.org/wp-content/uploads/2021/12/FIS-2021-RESERVA-2.pdf']

In [104]:
for year, links in exams.items():
    # Make a subdirectory for each year and work inside it, so the files are
    # saved (and reported) under their short relative names.
    year_dir = os.path.join(base_path, f"{year} - Fisica")
    os.makedirs(year_dir, exist_ok=True)
    os.chdir(year_dir)
    try:
        # Download each PDF file
        for link in links:
            # Use separate names so the outer loop variable `year` is not
            # overwritten by the value parsed from the link.
            year_text, exam = extract_year_exam(link)
            filename = f"{year_text} - {exam}.pdf"

            download_pdf(link, filename)
    finally:
        # Always return to the project root so the process is not left
        # inside a year directory, even if something above raises.
        os.chdir(base_path)

PDF 2024 - Julio.pdf downloaded successfully!
PDF 2024 - Junio.pdf downloaded successfully!
PDF 2024 - Reserva 4.pdf downloaded successfully!
PDF 2024 - Reserva 3.pdf downloaded successfully!
PDF 2024 - Reserva 2.pdf downloaded successfully!
PDF 2024 - Reserva 1.pdf downloaded successfully!
PDF 2023 - Julio.pdf downloaded successfully!
PDF 2023 - Junio.pdf downloaded successfully!
PDF 2023 - Reserva 4.pdf downloaded successfully!
PDF 2023 - Reserva 3.pdf downloaded successfully!
PDF 2023 - Reserva 2.pdf downloaded successfully!
PDF 2023 - Reserva 1.pdf downloaded successfully!
PDF 2022 - Julio.pdf downloaded successfully!
PDF 2022 - Junio.pdf downloaded successfully!
PDF 2022 - Reserva 2.pdf downloaded successfully!
PDF 2022 - Reserva 1.pdf downloaded successfully!
PDF 2022 - Reserva 3.pdf downloaded successfully!
PDF 2022 - Reserva 4.pdf downloaded successfully!
PDF 2021 - Julio.pdf downloaded successfully!
PDF 2021 - Junio.pdf downloaded successfully!
PDF 2021 - Reserva 4.pdf downloa