# PostgreSQL Database

In [1]:
import psycopg2
import os

In [2]:
# Read the database password from the environment so the secret never appears
# in the notebook; raises KeyError if DB_PASSWORD is not set.
DB_PASSWORD = os.environ['DB_PASSWORD']

In [None]:
# Connection to the local "cookit" database. On this machine the PostgreSQL
# role is the author's account name rather than the usual default "postgres",
# so the role can be overridden with the PGUSER environment variable instead
# of editing the notebook.
conn_local = psycopg2.connect(
    database="cookit",
    user=os.environ.get("PGUSER", "lillykaemmerling"),
    host="localhost",
    port="5432",
)

In [None]:
# Cursor on the local connection; the test queries at the end of the notebook use it.
cur_local  = conn_local.cursor()

In [3]:
# Connection to the remote (amazonaws.com-hosted) database. Only the password
# is taken from the environment.
# NOTE(review): host, database name and user are hard-coded here -- consider
# moving them to environment variables as well.
conn = psycopg2.connect(
    host="ec2-3-230-61-252.compute-1.amazonaws.com",
    port="5432",
    database="d1hsr1c7nk56dl",
    user="iadkkqrgljveni",
    password=DB_PASSWORD,
)

In [4]:
# Cursor on the remote connection; the CREATE TABLE and insert_into_db cells use it.
cur = conn.cursor()

In [None]:
# Create the RECIPES table. IF NOT EXISTS keeps this cell safe to re-run:
# without it, a second execution raises DuplicateTable and leaves the
# connection's transaction aborted until it is rolled back.
cur.execute(
    '''CREATE TABLE IF NOT EXISTS RECIPES
    (ID INT PRIMARY KEY     NOT NULL,
    TITLE           TEXT    NOT NULL,
    DIFFICULTY      TEXT,
    PREPTIME        TEXT,
    LINK TEXT  NOT NULL,
    PICTURE_URL     TEXT,
    INGREDIENTS     TEXT[]         NOT NULL,
    CUISINE         TEXT[]     NOT NULL);''')

In [None]:
# psycopg2 runs statements inside a transaction, so the change made through
# `cur` only becomes permanent once the connection commits.
conn.commit()

# Scraping lewagon recipes 

In [5]:
from bs4 import BeautifulSoup
import requests
import csv

## Scrape recipe overview

In [6]:
def scrape_from_internet(pages):
    """Download the recipe overview pages and return their combined HTML.

    Args:
        pages: Exclusive upper bound of the page numbers to request; pages
            1 .. pages - 1 are fetched (pass 64 to fetch all 63 pages).

    Returns:
        str: The HTML of every fetched page, concatenated in page order.
        Responses that were reached through a redirect are left out.
    """
    BASIC_URI = "https://recipes.lewagon.com/"
    html_content = bytearray()

    # Pagination --> i is the page number
    for i in range(1, pages):
        url = f"{BASIC_URI}?search[query]=&page={i}"

        # The timeout keeps a stalled server from hanging the notebook forever.
        response = requests.get(url, timeout=30)

        # An empty history means the page was served directly (no redirect).
        if not response.history:
            html_content += response.content

    # Decode the bytes into real text. str(bytearray) would return the
    # "bytearray(b'...')" repr instead, turning newlines and non-ASCII
    # characters into backslash escapes that leak into the parsed fields.
    # NOTE(review): assumes the site serves UTF-8 -- confirm.
    return html_content.decode("utf-8", errors="replace")

In [7]:
def parse(html):
    """Turn overview-page HTML into a list of recipe summaries.

    Note: a later cell defines another ``parse`` (for individual recipe
    pages); once that cell runs it replaces this function in the kernel.

    Args:
        html: HTML of one or more overview pages.

    Returns:
        list[dict]: One dict per recipe card with the keys "name",
        "difficulty", "prep_time", "link" and "picture_url".
    """
    card_class = 'col-12 col-sm-6 col-md-4 col-lg-3'
    name_class = 'text-dark text-truncate w-100 font-weight-bold mb-0 recipe-name'

    def summarize(card):
        # Fields are read in the same order as the keys appear in the result.
        return {
            "name": card.find('p', class_=name_class).text,
            "difficulty": card.find('span', class_='recipe-difficulty').text,
            "prep_time": card.find('span', class_='recipe-cooktime').text,
            # data-href holds the address of the individual recipe page
            "link": card.find('div', class_='recipe my-3').attrs['data-href'],
            # src holds the address of the recipe picture
            "picture_url": card.find('img', class_='recipe-img').attrs['src'],
        }

    document = BeautifulSoup(html, 'html.parser')
    return [summarize(card) for card in document.find_all('div', class_=card_class)]

In [None]:
# scrape_from_internet stops before its argument, so 64 fetches overview pages 1-63.
overview_html = scrape_from_internet(64)
recipes = parse(overview_html)

In [None]:
def write_csv(recipes, path="../raw_data/recipes.csv"):
    """Dump the parsed recipe overview to a CSV file.

    Args:
        recipes: Non-empty list of dicts that share the same keys; the keys
            of the first dict become the header row.
        path: Destination file. Defaults to ``../raw_data/recipes.csv``.

    Raises:
        IndexError: If ``recipes`` is empty. Raised before the file is
            opened, so an existing file is not truncated.
    """
    if not recipes:
        raise IndexError("write_csv needs at least one recipe to derive the CSV header")

    # The csv module expects files opened with newline=""; without it every
    # row is followed by a blank line on Windows. The explicit encoding keeps
    # non-ASCII recipe titles independent of the platform default.
    with open(path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(recipes[0].keys()))
        writer.writeheader()
        writer.writerows(recipes)

In [None]:
# Persist the scraped overview so later cells can reload it from disk.
write_csv(recipes)

Now we have an overview of all 750 recipes in a CSV file.
As a next step, we need to go through this CSV (or the original list of dictionaries) and scrape every individual recipe link in order to access the ingredients and cuisine.

## Scrape individual recipes 

In [8]:
import pandas as pd

In [9]:
def scrape_recipe(url):
    """Download a single recipe page and return its HTML as text.

    Args:
        url: Address of the individual recipe page.

    Returns:
        str: The decoded HTML, or an empty string when the response was
        reached through a redirect.
    """
    # The timeout keeps a stalled server from hanging the notebook forever.
    response = requests.get(url, timeout=30)

    # A non-empty history means the request was redirected; skip such pages.
    if response.history:
        return ""

    # Decode the bytes into real text. str(bytearray) would return the
    # "bytearray(b'...')" repr instead, turning newlines and non-ASCII
    # characters into backslash escapes that leak into the parsed fields.
    # NOTE(review): assumes the site serves UTF-8 -- confirm.
    return response.content.decode("utf-8", errors="replace")

In [10]:
def parse(recipe_html):
    """Extract ingredients and cuisines from one recipe page.

    Note: this redefines the earlier overview-page ``parse``; after this cell
    runs, ``parse`` refers to this function.

    Args:
        recipe_html: HTML of an individual recipe page.

    Returns:
        dict: ``{"ingredients": [...], "cuisine": [...]}``. The cuisine list
        is ``["No cuisine specified"]`` when the page has no cuisine badge.
    """
    page = BeautifulSoup(recipe_html, 'html.parser')

    # Each ingredient block carries its description in a <p class="mb-0">.
    ingredient_texts = [
        block.find('p', class_='mb-0').text
        for block in page.find_all('div', class_='ingredient')
    ]

    # Cuisines are rendered as success badges; fall back to a placeholder.
    badges = page.find_all('span', class_="badge badge-success")
    if badges:
        cuisine_names = [badge.text for badge in badges]
    else:
        cuisine_names = ["No cuisine specified"]

    return {"ingredients": ingredient_texts, "cuisine": cuisine_names}

In [11]:
# Try the detail scraper on a single recipe page.
sample_html = scrape_recipe("https://recipes.lewagon.com/recipes/970")
recipe = parse(sample_html)

In [12]:
# Show the parsed dict (keys: "ingredients", "cuisine").
recipe

{'ingredients': ['1 package (15 ounces) refrigerated pie crust (2 crusts)',
  "1 can (10 1/2 ounces) Campbell's Chicken Gravy",
  '3 cups cooked cut-up vegetables *',
  '2 ounces cans (4.5 each) Swanson Premium White Chunk Chicken Breast in Water, drained'],
 'cuisine': ['No cuisine specified']}

In [13]:
# The ingredient descriptions as a list of strings.
recipe["ingredients"]

['1 package (15 ounces) refrigerated pie crust (2 crusts)',
 "1 can (10 1/2 ounces) Campbell's Chicken Gravy",
 '3 cups cooked cut-up vegetables *',
 '2 ounces cans (4.5 each) Swanson Premium White Chunk Chicken Breast in Water, drained']

In [14]:
# Number of ingredients; the second insert_into_db stores this as NUMBER_OF_INGREDIENTS.
len(recipe["ingredients"])

4

TODO: 
+ Insert the scraped details into the csv/db row where this url is present 
+ Additionally, split amount and ingredient 

In [15]:
# Reload the overview from disk as a DataFrame. This rebinds `recipes`, which
# earlier in the notebook held the list of dicts returned by parse().
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests
# the file on disk was saved with an index column -- confirm how it was written.
recipes = pd.read_csv("../raw_data/recipes.csv")

In [16]:
# Preview the first rows of the reloaded overview.
recipes.head()

Unnamed: 0,name,difficulty,prep_time,link,picture_url
0,10 Minute Brownies,Moderate,45 min,https://recipes.lewagon.com/recipes/668,https://spoonacular.com/recipeImages/631830-55...
1,15 Minute Healthy Dark Chocolate Truffles,Very hard,45 min,https://recipes.lewagon.com/recipes/956,https://spoonacular.com/recipeImages/631841-55...
2,4 Ingredient Chicken Pot Pie,Very easy,45 min,https://recipes.lewagon.com/recipes/970,https://spoonacular.com/recipeImages/631868-55...
3,4 Ingredient Raw Peanut Butter Chocolate Cups,Moderate,45 min,https://recipes.lewagon.com/recipes/748,https://spoonacular.com/recipeImages/792705-55...
4,5-Minute Blueberry Chia Jam,Moderate,45 min,https://recipes.lewagon.com/recipes/523,https://spoonacular.com/recipeImages/631880-55...


In [17]:
# Convert the first row's prep time to an int. Only the first two characters
# are used, so "45 min" -> 45, but a three-digit value would be cut short.
int(recipes.loc[0, "prep_time"][:2])

45

## Insert Recipes into recipe database 

In [None]:
def insert_into_db(recipes):
    """Scrape every recipe of the overview frame and insert it into RECIPES.

    Note: a later cell defines another ``insert_into_db`` that targets
    COOKIT_RECIPES and replaces this function once it runs.

    Uses the notebook-level ``cur`` / ``conn`` and the ``scrape_recipe`` /
    ``parse`` helpers.

    Args:
        recipes: DataFrame with the columns "name", "difficulty",
            "prep_time", "link" and "picture_url"; the index value of each
            row is used as the primary key.
    """
    overview_columns = ("name", "difficulty", "prep_time", "link", "picture_url")

    # The statement is the same for every row, so it is built once.
    query = """INSERT INTO RECIPES (ID, TITLE, DIFFICULTY, PREPTIME, LINK, PICTURE_URL, INGREDIENTS, CUISINE)
        VALUES(%s, %s, %s, %s, %s, %s, %s, %s)"""

    for primary_key, row in recipes.iterrows():
        # Read the overview fields first, then fetch the detail page.
        title, difficulty, prep_time, url, picture_url = (
            row[column] for column in overview_columns
        )
        details = parse(scrape_recipe(url))

        values = (
            primary_key,
            title,
            difficulty,
            prep_time,
            url,
            picture_url,
            details["ingredients"],
            details["cuisine"],
        )
        cur.execute(query, values)
        # One commit per inserted row.
        conn.commit()

In [None]:
# Scrapes one detail page per row of `recipes` and inserts it into RECIPES.
insert_into_db(recipes)

## Insert recipes into final db in different format

In [18]:
def _prep_minutes(prep_time):
    """Return the leading whole number of a prep-time label such as "45 min"."""
    digits = []
    for char in str(prep_time).strip():
        if char not in "0123456789":
            break
        digits.append(char)
    # int("") raises ValueError when the label does not start with a digit.
    return int("".join(digits))


def insert_into_db(recipes, id_offset=4123):
    """Scrape every recipe of the overview frame and insert it into COOKIT_RECIPES.

    This redefines the earlier ``insert_into_db`` (which targets RECIPES).
    Uses the notebook-level ``cur`` / ``conn`` and the ``scrape_recipe`` /
    ``parse`` helpers.

    Args:
        recipes: DataFrame with the columns "name", "difficulty",
            "prep_time", "link" and "picture_url".
        id_offset: Value added to each row's index to form the primary key.
            Defaults to 4123, the offset this notebook originally hard-coded.

    Raises:
        ValueError: If a prep-time label does not start with a number.
    """
    query = """INSERT INTO COOKIT_RECIPES (ID, TITLE, DIFFICULTY, PREPTIME, NUMBER_OF_INGREDIENTS, 
        INGREDIENTS, CUISINE, CALORIES, LINK, PICTURE_URL, INSTRUCTIONS)
        VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""

    # Placeholders for information the scraped site does not provide.
    instructions = "Please follow the link for instructions"
    calories = "No information available"

    for index, row in recipes.iterrows():
        primary_key = index + id_offset
        title = row["name"]
        difficulty = row["difficulty"]
        # Parse the whole leading number: slicing the first two characters
        # turned e.g. "120 min" into 12.
        prep_time = _prep_minutes(row["prep_time"])
        url = row["link"]
        picture_url = row["picture_url"]

        recipe = parse(scrape_recipe(url))

        ingredients = recipe["ingredients"]
        number_of_ingredients = len(ingredients)
        # This table stores a single cuisine: the first one found on the page.
        cuisine = recipe["cuisine"][0]

        cur.execute(
            query,
            (primary_key, title, difficulty, prep_time, number_of_ingredients,
             ingredients, cuisine, calories, url, picture_url, instructions),
        )
        # One commit per inserted row.
        conn.commit()

In [None]:
# Scrapes one detail page per row of `recipes` and inserts it into COOKIT_RECIPES.
insert_into_db(recipes)

## Testing querying local db 

In [None]:
# Look up rows of recipes_new in the local database whose title contains
# "Brownie" (LIKE is case-sensitive).
cur_local.execute('''SELECT *
FROM recipes_new
WHERE title LIKE '%Brownie%';''')

In [None]:
# Fetch every row returned by the query just executed on cur_local.
rows = cur_local.fetchall()

In [None]:
# Print the seventh column (index 6) of every fetched row.
# NOTE(review): presumably the ingredients column -- confirm against the
# column order of recipes_new.
for record in rows:
    print(record[6])

In [None]:
# Look up rows of recipes_new whose ingredients contain "apple".
# NOTE(review): if `ingredients` is a TEXT[] column (as in the RECIPES table
# created above), LIKE cannot be applied to it directly -- confirm the schema
# of recipes_new.
cur_local.execute('''SELECT *
FROM recipes_new
WHERE ingredients LIKE '%apple%';''')