Web scraping from an exams website:  
https://www.emestrada.org/#fisica

In [13]:
import requests
from bs4 import BeautifulSoup

URL to scrape

In [20]:
# Page whose links are scraped below. The '#fisica' part is a URL fragment,
# which HTTP clients do not transmit to the server.
URL_EXAMS = 'https://www.emestrada.org/#fisica'

In [54]:
def scrape_links_from_web(url, file_extension = None, timeout = 30):
    """
    Scrape the links of a webpage and filter them.

    The page at ``url`` is downloaded and every ``<a>`` tag that carries an
    ``href`` attribute is collected. The collected links are then filtered in
    one of two mutually exclusive ways:

    * ``file_extension`` given: keep only the links that end with that
      extension. The physics keyword filter is NOT applied in this mode.
    * ``file_extension`` omitted: keep only the links that contain
      'fisica-andalucia' or 'examen-selectividad-fisica'.

    Parameters:
        url (str): The URL of the webpage to scrape.
        file_extension (str, optional): Suffix the links must end with
                                        (e.g., '.pdf'). If None or empty, the
                                        physics keyword filter is used instead.
        timeout (float, optional): Seconds to wait for the server before giving
                                   up. Defaults to 30.

    Returns:
        list: The matching ``href`` values, in page order and exactly as
              written in the HTML (relative links are not resolved). An empty
              list is returned when nothing matches or when the page could not
              be retrieved.

    Notes:
        - If the request fails or the HTTP status code is not 200, an error
          message is printed and an empty list is returned, so callers can
          always iterate over or index-check the result.
    """

    # Without a timeout a stalled server would block the notebook forever.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print(f'Failed to retrieve {url}: {error}')
        return []

    if response.status_code != 200:
        print(f'Failed to retrieve {url}. HTTP status code: {response.status_code}')
        return []

    # Parse the HTML content and collect every anchor target
    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = [anchor['href'] for anchor in soup.find_all('a', href=True)]

    if file_extension:
        # Extension mode: the keyword filter is intentionally skipped
        return [href for href in hrefs if href.endswith(file_extension)]

    # Keyword mode: keep only links pointing to physics exam pages
    return [
        href for href in hrefs
        if 'fisica-andalucia' in href or 'examen-selectividad-fisica' in href
    ]

In [55]:
# No file extension is passed, so the keyword filter inside the function applies.
physics_links = scrape_links_from_web(URL_EXAMS)

In [49]:
# NOTE(review): this cell performs the same export as the following code cell,
# only under a different file name, and its execution count is older than the
# scrape cell's -- it looks like a leftover; confirm whether 'links.txt' is
# still needed.

# One link per line
separated_links = '\n'.join(physics_links)

# write() is the call for a single string; writelines() expects an iterable of
# lines and would walk the string character by character.
with open('links.txt', 'w', encoding='utf-8') as file:
    file.write(separated_links)

### Store the links in a text file

In [56]:
# One link per line
separated_links = '\n'.join(physics_links)

# write() is the call for a single string; writelines() expects an iterable of
# lines and would walk the string character by character.
with open('physics_links.txt', 'w', encoding='utf-8') as file:
    file.write(separated_links)

## Get the PDF link from each exam page

In [57]:
pdf_links = []

for exam in physics_links:
    # The scraper can come back empty-handed (failed request, or a page
    # without any PDF link); `or []` also covers a None result.
    exam_pdfs = scrape_links_from_web(exam, file_extension='.pdf') or []
    if not exam_pdfs:
        print(f'No PDF link found for {exam}')
        continue

    # Only the first PDF link of each exam page is kept
    pdf_links.append(exam_pdfs[0])

# Save the PDF links to a file, one per line
with open('physics_pdf_links.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(pdf_links))

In [58]:
# Show a heading followed by a blank line, then one PDF link per line
print('PDF Links:', end='\n\n')
print('\n'.join(map(str, pdf_links)))


PDF Links:

https://www.emestrada.org/wp-content/uploads/2024/12/FIS-2024-JULIO.pdf
https://www.emestrada.org/wp-content/uploads/2024/06/FIS-2024-JUNIO.pdf
https://www.emestrada.org/wp-content/uploads/2023/11/FIS-2023-JULIO.pdf
https://www.emestrada.org/wp-content/uploads/2023/06/FIS-2023-JUNIO.pdf
https://www.emestrada.org/wp-content/uploads/2022/11/FIS-2022-JULIO.pdf
https://www.emestrada.org/wp-content/uploads/2022/06/FIS-2022-JUNIO.pdf
https://www.emestrada.org/wp-content/uploads/2021/12/FIS-2021-JULIO.pdf
https://www.emestrada.org/wp-content/uploads/2021/06/FIS-2021-JUNIO.pdf
https://www.emestrada.org/wp-content/uploads/2020/10/FIS-2020-SEPTIEMBRE.pdf
https://www.emestrada.org/wp-content/uploads/2020/07/2020-JUNIO.pdf
https://www.emestrada.org/wp-content/uploads/2020/01/2019-SEPTIEMBRE.pdf
https://www.emestrada.org/wp-content/uploads/2020/01/2019-JUNIO.pdf
https://www.emestrada.org/wp-content/uploads/2020/02/2018.-SEPTIEMBRE.pdf
https://www.emestrada.org/wp-content/uploads/2020/02