In [36]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import openai

### Data Scraping

In [37]:
query = 'chocolate chip cookie'

query = query.replace(' ', '+')
url_base = 'https://www.allrecipes.com/search?q='
url_query = url_base + query + '&&offset='

In [38]:
#get all links

links_list = []

for page_number in range(5):
  url = url_query + str(page_number * 24) #24 results per page
  response = requests.get(url)
  soup = BeautifulSoup(response.text, 'html.parser')
  search_results = soup.find('div', {'id':'search-results_1-0'})

  for a in search_results.find_all('a', href=True):
    href = a['href']
    if 'search?chocolate' not in href and not href.startswith('/account/profile#/collections'): #remove header
      links_list.append(href)

links_df = pd.DataFrame({'links': links_list})
links_df.head()

Unnamed: 0,links
0,https://www.allrecipes.com/recipe/10813/best-c...
1,https://www.allrecipes.com/recipe/25037/best-b...
2,https://www.allrecipes.com/recipe/9827/chocola...
3,https://www.allrecipes.com/recipe/10740/pumpki...
4,https://www.allrecipes.com/recipe/24445/chewy-...


In [39]:
# scrape recipes information function

def scrape_info(url):
    #initialize
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    #name
    recipe_name = soup.find("h1", {"id": "article-heading_1-0"}).text.strip()

    #rating
    recipe_rating_elem = soup.find("div", {"id": "mntl-recipe-review-bar__rating_1-0"})
    recipe_rating = recipe_rating_elem.text.strip() if recipe_rating_elem else None

    #rating count
    rating_count_elem = soup.find("div", {"id": "mntl-recipe-review-bar__rating-count_1-0"})
    rating_count = rating_count_elem.text.strip() if rating_count_elem else None
    if rating_count:
        rating_count = int(rating_count.replace(',', '').replace('(', '').replace(')', ''))

    #servings
    servings_element = soup.find('div', class_='mntl-recipe-details__label', text='Servings:')
    recipe_servings = servings_element.find_next('div', class_='mntl-recipe-details__value').text.strip() if servings_element else None

    #ingredients
    ingredients = {}
    ingredients_list = soup.find("ul", class_="mntl-structured-ingredients__list")
    if ingredients_list:
        ingredient_items = ingredients_list.find_all("li", class_="mntl-structured-ingredients__list-item")
        for item in ingredient_items:
            quantity = item.find("span", {"data-ingredient-quantity": True})
            unit = item.find("span", {"data-ingredient-unit": True})
            name = item.find("span", {"data-ingredient-name": True})
            if name:
                ingredient_name = name.text.strip()
                unit_name = unit.text.strip() if unit else None
                quantity_value = quantity.text.strip() if quantity else None

                ingredients[ingredient_name] = {
                    'unit': unit_name,
                    'quantity': quantity_value
                }

    return {
        'name': recipe_name,
        'rating': recipe_rating,
        'rating_count': rating_count,
        'servings': recipe_servings,
        'ingredients': ingredients
    }

In [40]:
df = pd.DataFrame(columns=['name', 'rating', 'rating_count', 'servings'])

for index, row in links_df.iterrows():
  url = row['links']
  recipe_info = scrape_info(url)
  df = df.append(recipe_info, ignore_index = True)

  servings_element = soup.find('div', class_='mntl-recipe-details__label', text='Servings:')
  df = df.append(recipe_info, ignore_index = True)
  servings_element = soup.find('div', class_='mntl-recipe-details__label', text='Servings:')
  df = df.append(recipe_info, ignore_index = True)
  servings_element = soup.find('div', class_='mntl-recipe-details__label', text='Servings:')
  df = df.append(recipe_info, ignore_index = True)
  servings_element = soup.find('div', class_='mntl-recipe-details__label', text='Servings:')
  df = df.append(recipe_info, ignore_index = True)
  servings_element = soup.find('div', class_='mntl-recipe-details__label', text='Servings:')
  df = df.append(recipe_info, ignore_index = True)
  servings_element = soup.find('div', class_='mntl-recipe-details__label', text='Servings:')
  df = df.append(recipe_info, ignore_index = True)
  servings_element = soup.find('div', class_='mntl-recipe-details__label', text='Servings:')
  df = df.append(recipe_info, ignore_index 

### Data Cleaning

In [41]:
#total number of recipes
total_recipes = len(df)

#number of recipes without a rating
recipes_without_rating = len(df[df['rating'].isna()])

#number of recipes without servings info
recipes_without_servings = len(df[df['servings'].isna()])

print(f"Total number of recipes: {total_recipes}")
print(f"Number of recipes without a rating: {recipes_without_rating}")
print(f"Number of recipes without a serving: {recipes_without_servings}")

Total number of recipes: 120
Number of recipes without a rating: 9
Number of recipes without a serving: 9


In [42]:
filtered_df = df[df['rating'].isna()]
filtered_df

Unnamed: 0,name,rating,rating_count,servings,ingredients
7,Our 25 Best Chocolate Chip Cookie Recipes of A...,,,,{}
8,I Tested 5 Reader Favorite Chocolate Chip Cook...,,,,{}
19,We Tried NYC's Best Chocolate Chip Cookies & W...,,,,{}
22,15 Oatmeal Chocolate Chip Cookie Recipes,,,,{}
33,Who Invented the Chocolate Chip Cookie?,,,,{}
67,Who Created the Chocolate Chip Cookie?,,,,{}
75,50 Desserts That Start With a Bag of Chocolate...,,,,{}
87,Travis Kelce’s Mom’s Chocolate Chip Cookies Ar...,,,,{}
106,We Tried 6 Brands of Chocolate Chips and These...,,,,{}


In [43]:
#drop rows where the 'rating' column is NaN
df = df.dropna(subset=['rating'])

In [44]:
keep_words = ['cookie', 'cookies', 'I', 'II', 'III', 'IV', 'V']

#create a regex pattern to match any of the keep words at the end of the string
pattern = '|'.join(map(re.escape, keep_words))
pattern = f'({pattern})$'

#use the regex pattern to filter the DataFrame, and drop the filtered rows
df = df[df['name'].str.contains(pattern, case=False, regex=True)]
df.reset_index(drop=True, inplace=True)

  df = df[df['name'].str.contains(pattern, case=False, regex=True)]


In [45]:
total_recipes = len(df)
print(f"New total number of recipes: {total_recipes}")

New total number of recipes: 79


In [46]:
df.to_csv('recipe_data2.csv', index=False)

In [47]:
backup_df = df.copy()

In [48]:
#df = backup_df.copy()

In [49]:
df.head()

Unnamed: 0,name,rating,rating_count,servings,ingredients
0,Best Chocolate Chip Cookies,4.6,18878,48,"{'butter, softened': {'unit': 'cup', 'quantity..."
1,"Best Big, Fat, Chewy Chocolate Chip Cookie",4.6,12408,18,"{'all-purpose flour': {'unit': 'cups', 'quanti..."
2,Chocolate Chocolate Chip Cookies,4.7,4221,48,"{'white sugar': {'unit': 'cups', 'quantity': '..."
3,Pumpkin Chocolate Chip Cookies,4.6,1845,24,"{'canned pumpkin': {'unit': 'cup', 'quantity':..."
4,Chewy Chocolate Chip Oatmeal Cookies,4.5,5695,42,"{'butter, softened': {'unit': 'cup', 'quantity..."


### Data Processing

In [50]:
#convert 'rating' and 'rating_count' to numeric
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')  # 'coerce' will replace non-numeric values with NaN
df['rating_count'] = pd.to_numeric(df['rating_count'], errors='coerce')

In [51]:
#calculate the bayesian average for each recipe (takes into account the number of reviews)
mean_rating = df['rating'].mean()
df['weighted_score'] = round((df['rating'] * df['rating_count'] + 2 * mean_rating) / (df['rating_count'] + 2),2)

#sort recipes by weighted score
df = df.sort_values(by='weighted_score', ascending=False).reset_index(drop=True)

In [52]:
#define a dictionary for fraction replacements
replacement_dict = {"½": 0.5, "⅓": 0.33, "¼": 0.25, "⅕": 0.20, "⅔": 0.67, "¾": 0.75, "⅛": 0.13, "1/2": 0.5}

def replace_numbers_with_float(text):
    parts = text.split()
    total = 0.0
    for part in parts:
        if part in replacement_dict:
            total += replacement_dict[part]
        else:
            try:
                total += float(part)
            except ValueError:
                # If it's not a number or fraction, skip it
                pass
    return total if total != int(total) else (int(total) if int(total) == total else total)

In [53]:
#replace quantities fractions with floats
df['ingredients_clean'] = df['ingredients'].apply(lambda x: {k: {'unit': v['unit'], 'quantity': replace_numbers_with_float(v['quantity'])} for k, v in x.items()})

In [54]:
#flatten ingredients list
#df['ingredients_clean_flattened'] = df['ingredients_clean'].apply(lambda nested_ingredients: [f"{ingredient} {details['quantity']} {details['unit']}" for ingredient, details in nested_ingredients.items()])

In [55]:
# keep relevant columns
df_clean = df.drop(['rating', 'rating_count', 'ingredients'], axis = 1)
df_clean.drop(['servings'], axis = 1, inplace = True)  #drop servings too to simplify our model

#rename
df_clean.rename(columns={'ingredients_clean': 'ingredients','weighted_score': 'rating'},inplace=True, errors='raise')

In [56]:
df_clean.head()

Unnamed: 0,name,rating,ingredients
0,Chocolate Chip Pretzel Cookies,4.74,"{'all purpose flour': {'unit': 'cups', 'quanti..."
1,Giant Chocolate Chip Cookie,4.7,"{'butter, softened': {'unit': 'cup', 'quantity..."
2,Chocolate Chip Cookies V,4.7,"{'all-purpose flour': {'unit': 'cups', 'quanti..."
3,Pumpkin Oatmeal Chocolate Chip Cookies,4.7,"{'butter, softened': {'unit': 'cups', 'quantit..."
4,Chocolate Chocolate Chip Cookies II,4.7,"{'all-purpose flour': {'unit': 'cups', 'quanti..."


### Model

In [64]:
#filter recipes with rating above 4.5
receitas_filt = [receita for index, receita in df_clean.iterrows() if receita["rating"] > 4.5]

#Creating prompt
prompt = "Now, create the best cookie recipe possible, incorporating the best attributes of the recipes above. Specify the quantities of each ingredient.:\n\n"
for receita in receitas_filt:
    prompt += f"Receita: {', '.join(receita['ingredients'])}\nAvaliação: {receita['rating']} estrelas\n\n"


#request from API
response = openai.Completion.create(
    engine="text-davinci-003",
    prompt=prompt,
    max_tokens=500  # Ajuste conforme necessário
)

#Extract recipe
new_recipe = response['choices'][0]['text']
print(new_recipe)


Melhor Receita de Biscoito Possível:

Ingredientes:

- 2 xícaras de farinha de trigo
- 1/2 colher de chá de bicarbonato de sódio
- 1/4 colher de chá de sal
- 1/2 xícara de manteiga, amolecida
- 1/2 xícara de açúcar branco
- 1 xícara de açúcar mascavo
- 2 ovos
- 1 colher de chá de extrato de baunilha
- 1 1/2 xícaras de chips de chocolate semi-amargo
- 3/4 xícaras de nozes picadas
- 1/4 xícara de pretzels

Preparo:

Pré-aqueça o forno a 375°F. Forre uma assadeira grande com papel-manteiga ou Reynolds® Parchment Paper.

Em um recipiente, misture a farinha, bicarbonato de sódio e sal. Reserve.

Em uma tigela grande, misture a manteiga, açúcares e baunilha. Bata até obter uma mistura homogênea. Acrescente o ovo, batendo até obter uma mistura bem homogênea.

Adicione lentamente a farinha à mistura de manteiga, misturando bem após cada adição. Por último, adicione os chips de chocolate, nozes e pretzels. Misture até obter uma mistura homogênea.

Coloque pequenas colheradas da massa na assade