![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Importing Libraries

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import random
import time
from IPython.display import clear_output

# Loading CSV File

In [None]:
# Load the table of recipe links that feeds the scraper.
recipe_links_path = '../data/raw/recipe_links.csv'
recipe_links = pd.read_csv(recipe_links_path)

In [None]:
# Bare expression: as the last statement of a notebook cell it renders the loaded DataFrame.
recipe_links

# Full Scraping for Features

In [None]:
def _text_or_default(element, default=''):
    """Return the raw text of a BeautifulSoup element, or ``default`` if it is missing."""
    return element.text if element is not None else default


def _percentage(done, total):
    """Return ``done / total`` as a percentage, or 0.0 when ``total`` is zero."""
    return done / total * 100 if total else 0.0


def _format_duration(soup, kind):
    """Build an ``'<hours>h <minutes>min'`` string for one WPRM time block.

    ``kind`` is the time name used inside the CSS classes (``'prep_time'`` or
    ``'total_time'``). A part that is absent from the page is reported as ``0``.
    """
    values = []
    for unit in ('hours', 'minutes'):
        css_class = (
            f'wprm-recipe-details wprm-recipe-details-{unit} '
            f'wprm-recipe-{kind} wprm-recipe-{kind}-{unit}'
        )
        values.append(_text_or_default(soup.find('span', class_=css_class), '0'))
    hours, minutes = values
    return f'{hours}h {minutes}min'


def _parse_ingredients(soup):
    """Return one ``(amount, unit, name)`` tuple per ingredient list item."""
    parsed = []
    for item in soup.find_all('li', class_='wprm-recipe-ingredient'):
        amount_element = item.find('span', class_='wprm-recipe-ingredient-amount')
        unit_element = item.find('span', class_='wprm-recipe-ingredient-unit')
        name_element = item.find('a')
        faded_notes_element = item.find(
            'span',
            class_='wprm-recipe-ingredient-notes wprm-recipe-ingredient-notes-faded',
        )

        amount = _text_or_default(amount_element, '0')
        if faded_notes_element is not None:
            # When faded notes are present, the linked text is used as the unit
            # and the notes text as the ingredient name.
            unit = _text_or_default(name_element, '-')
            name = faded_notes_element.text
        else:
            unit = _text_or_default(unit_element, '-')
            name = _text_or_default(name_element, '-')
        parsed.append((amount, unit, name))
    return parsed


def scrape_recipe_features(df, total_expected_recipes, *, request_timeout=30,
                           loops_before_sleep=100, sleep_time=45):
    """
    Scrapes recipe features from the URLs listed in a DataFrame.

    Each recipe page is fetched and parsed, and ONE ROW PER INGREDIENT is
    emitted; recipe-level fields (title, rating, times, ...) are repeated on
    every ingredient row of that recipe. A page on which no ingredient list
    items are found therefore contributes no rows.

    URLs that fail (redirect loops, timeouts, connection errors, HTTP error
    statuses) are reported and skipped; they are not counted as downloaded.

    Parameters:
        df (pd.DataFrame): DataFrame with a 'link' column (recipe URL) and a
            'title' column (recipe title).
        total_expected_recipes (int): Total number of recipes to download;
            only used for progress reporting.
        request_timeout (float): Seconds to wait for each HTTP request before
            giving up on that URL.
        loops_before_sleep (int): Number of successfully processed recipes
            between two long pauses.
        sleep_time (int): Nominal pause length in seconds; the actual pause is
            drawn at random between 50% and 150% of this value.

    Returns:
        pd.DataFrame: The scraped features, one row per ingredient.
    """
    start_time = time.time()
    loops_counter = 0
    scraped_data = []
    search_url = 0  # number of recipe pages fetched successfully

    for _, row in df.iterrows():
        url = row['link']
        title = row['title']

        # Only the network call sits inside the try block, so parsing errors
        # are never mistaken for request failures.
        try:
            response = requests.get(url, timeout=request_timeout)
            # Turn 4xx/5xx answers into exceptions so that error pages are
            # skipped instead of being parsed as if they were recipes.
            response.raise_for_status()
        except requests.exceptions.TooManyRedirects:
            print(f"Too many redirects for URL: {url}. Skipping...")
            continue
        except requests.exceptions.RequestException as e:
            print(f"Error accessing URL: {url}. Error: {e}")
            continue

        search_url += 1
        soup = BeautifulSoup(response.content, 'html.parser')

        rating = _text_or_default(soup.find('span', class_="wprm-recipe-rating-average"))
        rating_final = f'{rating} out of 5'

        difficulty = _text_or_default(
            soup.find('span', class_='wprm-recipe-de_dificuldade wprm-block-text-normal'))
        cost = _text_or_default(
            soup.find('span', class_='wprm-recipe-_da_refeio wprm-block-text-normal'))

        # Always (re)assign the image URL so that a page without a featured
        # image neither crashes nor inherits the previous recipe's image.
        img_element = soup.select_one('.td-post-featured-image img.entry-thumb')
        img_url = img_element.get('src', '') if img_element is not None else ''

        prep_time = _format_duration(soup, 'prep_time')
        recipe_total_time = _format_duration(soup, 'total_time')

        meal_class_element = soup.find("span", class_="wprm-recipe-course wprm-block-text-normal")
        meal_class = meal_class_element.get_text(strip=True) if meal_class_element is not None else ''

        servings_element = soup.find('span', class_='wprm-recipe-servings-adjustable-tooltip')
        servings = servings_element.get_text(strip=True) if servings_element is not None else "0"

        preparation = [
            step.text
            for step in soup.find_all("div", class_="wprm-recipe-instruction-text")
        ]

        for amount, unit, name in _parse_ingredients(soup):
            scraped_data.append({
                'Meal Class': meal_class,
                'Difficulty': difficulty,
                'Cost': cost,
                'Rating': rating_final,
                'Title': title,
                'Prep Time': prep_time,
                'Total Time': recipe_total_time,
                'Servings': servings,
                'Ingredient_Amount': amount,
                'Ingredient_Unit': unit,
                'Ingredient_Name': name,
                'Preparations': preparation,
                'Recipe Link': url,
                'Image URL': img_url,
            })

        # Progress report for the notebook output area.
        percentage_downloaded = _percentage(search_url, total_expected_recipes)
        elapsed = time.time() - start_time
        print(f'Time: {int(elapsed // 60):02d}:{int(elapsed % 60):02d}.')
        print(f'Scraping URL {url}.')
        print(f'Recipes Downloaded: {search_url}/{total_expected_recipes} ({percentage_downloaded:.2f}%).')

        # Pause for a randomised interval every `loops_before_sleep` recipes
        # to avoid hammering the server.
        if loops_counter < loops_before_sleep:
            loops_counter += 1
        else:
            random_sleep_timer = random.randint(int(sleep_time * 0.5), int(sleep_time * 1.5))
            print(f'Sleeping for {random_sleep_timer} s...', end='\r')
            time.sleep(random_sleep_timer)
            loops_counter = 0  # Reset the counter after sleeping
        clear_output(wait=True)

    combined_df = pd.DataFrame(scraped_data)

    # Printing Download Summary
    running_time = time.time() - start_time
    downloaded_percentage = _percentage(search_url, total_expected_recipes)
    print(f'Recipe download complete. Total: ({search_url}/{total_expected_recipes} {downloaded_percentage:.2f}%)')
    print(f'Total Running Time: {int(running_time // 60):02d}:{int(running_time % 60):02d}')

    return combined_df

In [None]:
# Every row of the links table is one recipe page to fetch, so the row count
# is the denominator used for progress reporting.
total_expected_recipes = recipe_links.shape[0]
recipes_clean = scrape_recipe_features(
    df=recipe_links,
    total_expected_recipes=total_expected_recipes,
)

In [None]:
# Bare expression: as the last statement of a notebook cell it renders the scraped DataFrame.
recipes_clean

# Saving CSV File

In [None]:
# Persist the scraped table without the DataFrame index column.
recipes_clean_path = '../data/raw/recipes_clean.csv'
recipes_clean.to_csv(recipes_clean_path, index=False)