![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Importing Libraries

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import random
import time
from IPython.display import clear_output

# Functions

In [None]:
def scrape_links_names(base_url, total_expected_products, total_expected_pages):
    """
    Scrape recipe links and names from a paginated listing.

    Pages are requested sequentially from ``{base_url}1/`` up to
    ``{base_url}{total_expected_pages}/`` with a randomized pause between
    requests. A page that cannot be downloaded is skipped and reported in
    the final summary instead of aborting the whole run.

    Args:
        base_url (str): The URL prefix to which the page number is appended.
        total_expected_products (int): The total number of products you expect
            to scrape. Only used to compute the progress percentage.
        total_expected_pages (int): The number of listing pages to request.

    Returns:
        pd.DataFrame: A DataFrame with ``title`` and ``link`` columns, one row
        per scraped recipe. The columns are present even when no recipe was
        found.
    """
    items = []
    failed_pages = []
    start_time = time.time()

    for current_page_number in range(1, total_expected_pages + 1):
        # Construct the current page URL
        current_page = f"{base_url}{current_page_number}/"

        try:
            # The timeout keeps one stalled connection from hanging the run;
            # raise_for_status stops error pages from being parsed as listings.
            response = requests.get(current_page, timeout=30)
            response.raise_for_status()
        except requests.RequestException:
            failed_pages.append(current_page_number)
        else:
            soup = BeautifulSoup(response.content, "html.parser")

            # Loop through each recipe card and extract its link and name
            for recipe_element in soup.select(".td_module_3"):
                title_anchor = recipe_element.select_one(".entry-title a")
                recipe_link = None if title_anchor is None else title_anchor.get("href")
                if recipe_link is None:
                    # Malformed card (no title anchor or no href): skip it
                    # rather than crash and lose everything scraped so far.
                    continue

                items.append({
                    "title": title_anchor.get_text(),
                    "link": recipe_link,
                })

        # Calculate and display progress
        total_downloaded = len(items)
        if total_expected_products:
            percentage_downloaded = (total_downloaded / total_expected_products) * 100
        else:
            # Avoid ZeroDivisionError when no expected total is supplied.
            percentage_downloaded = 0.0
        current_time = time.time() - start_time
        clear_output(wait=True)
        print(f'Recipes Downloaded: {total_downloaded}/{total_expected_products} ({percentage_downloaded:.2f}%)')
        print(f'Running Time: {int(current_time // 60):02d}:{int(current_time % 60):02d}')
        if failed_pages:
            print(f'Pages failed so far: {len(failed_pages)}')

        # Sleep to avoid overloading the server; pointless after the last page
        if current_page_number < total_expected_pages:
            sleep_time = 5
            random_sleep_timer = random.randint(int(sleep_time * 0.5), int(sleep_time * 1.5))
            print('Sleeping for', random_sleep_timer, 's...')
            time.sleep(random_sleep_timer)

    # Printing Download Summary
    clear_output(wait=True)
    total_time = time.time() - start_time
    print("Product Data download complete. Total:", len(items))
    print(f'Total Running Time: {int(total_time // 60):02d}:{int(total_time % 60):02d}')
    if failed_pages:
        print("Pages that could not be downloaded:", failed_pages)

    # Fix the column set so an empty scrape still yields 'title' and 'link'
    return pd.DataFrame(items, columns=["title", "link"])


# Scraping The Recipes

In [None]:
# Listing URL prefix; the scraper appends "<page number>/" to it.
base_url = "https://www.teleculinaria.pt/receitas/page/"

# Listing size when this was written: 30 items per page, 17 on the last page.
total_expected_pages = 347
total_expected_products = 10_397

In [None]:
# Scrape every listing page; keyword arguments make each value's role explicit.
teleculinaria_recipes = scrape_links_names(
    base_url=base_url,
    total_expected_products=total_expected_products,
    total_expected_pages=total_expected_pages,
)

# Displaying the DataFrame

In [None]:
# Bare expression: in a notebook cell this renders the scraped DataFrame as output.
teleculinaria_recipes

In [None]:
def drop_video_urls(df):
    """
    Drop rows whose link points to a video.

    Args:
        df (pd.DataFrame): A DataFrame with a ``link`` column of URLs.

    Returns:
        pd.DataFrame: The rows of ``df`` whose ``link`` does not contain the
        substring ``video``. Rows with a missing link are kept. The original
        index labels are preserved.
    """
    # na=False makes a missing link count as "not a video"; without it the
    # mask holds NaN and the ~ negation raises TypeError.
    # regex=False matches the literal substring rather than a pattern.
    is_video = df['link'].str.contains('video', regex=False, na=False)
    return df[~is_video]

In [None]:
# Remove the video entries, keeping only regular recipe links.
teleculinaria_recipes = teleculinaria_recipes.pipe(drop_video_urls)

In [None]:
# Bare expression: in a notebook cell this renders the filtered DataFrame as output.
teleculinaria_recipes

In [None]:
# Persist the link table without the index column.
recipe_links_path = '../data/raw/recipe_links.csv'
teleculinaria_recipes.to_csv(recipe_links_path, index=False)