# Final notebook: upload recipe data into the database

In [1]:
import ast
import csv
import os
import re

import pandas as pd
import psycopg2
import requests
from bs4 import BeautifulSoup

## Create table in database

In [2]:
# Read the database password from the environment so it is never stored in the
# notebook. Raises KeyError when DB_PASSWORD is not set.
DB_PASSWORD = os.environ['DB_PASSWORD']

In [3]:
# Open the PostgreSQL connection shared by the cells below. Only the password is
# taken from the environment; the remaining settings are fixed here.
conn = psycopg2.connect(
    database="d1hsr1c7nk56dl",
    user="iadkkqrgljveni",
    password=DB_PASSWORD,
    host="ec2-3-230-61-252.compute-1.amazonaws.com",
    port="5432",
)

In [4]:
# Module-level cursor; the insert_into_db helpers further down execute their
# INSERT statements through it.
cur = conn.cursor()

In [5]:
# Create the recipes table. IF NOT EXISTS keeps this cell safe to re-run: a plain
# CREATE TABLE raises once the table exists and leaves the transaction aborted.
cur.execute(
    '''CREATE TABLE IF NOT EXISTS COOKIT_RECIPES
    (ID INT PRIMARY KEY     NOT NULL,
    TITLE           TEXT    NOT NULL,
    DIFFICULTY      TEXT,
    PREPTIME        INT,
    NUMBER_OF_INGREDIENTS INT,
    INGREDIENTS     TEXT[]     NOT NULL,
    CUISINE         TEXT     NOT NULL,
    CALORIES        TEXT,
    LINK            TEXT       NOT NULL,
    PICTURE_URL     TEXT       NOT NULL,
    INSTRUCTIONS    TEXT);''')

conn.commit()

## Upload 750 recipes from lewagon

In [6]:
def scrape_recipe(url, timeout=30):
    """Download a recipe page and return its HTML as text.

    Args:
        url: Address of the recipe page.
        timeout: Seconds passed to ``requests.get`` as its timeout
            (default 30), so a stalled server cannot block the caller forever.

    Returns:
        The decoded page HTML. An empty string is returned when the request
        was redirected (``response.history`` is non-empty); redirected
        responses are deliberately not used.
    """
    response = requests.get(url, timeout=timeout)

    # A non-empty history means at least one redirect happened.
    if response.history:
        return ""

    # Return decoded text. Calling str() on the raw bytes would yield the repr
    # "bytearray(b'...')" with escaped newlines, quotes and non-ASCII bytes.
    return response.text

In [7]:
def parse(recipe_html):
    """Extract the ingredient descriptions and cuisine labels from a recipe page.

    Args:
        recipe_html: HTML of a single recipe page.

    Returns:
        dict with two keys:
        ``"ingredients"`` - list of ingredient description strings;
        ``"cuisine"`` - list of cuisine labels, or the single placeholder
        ``"No cuisine specified"`` when the page carries none.
    """
    page = BeautifulSoup(recipe_html, 'html.parser')

    # Ingredients: every <div class="ingredient"> holds its description in a
    # nested <p class="mb-0">.
    ingredient_texts = [
        block.find('p', class_='mb-0').text
        for block in page.find_all('div', class_='ingredient')
    ]

    # Cuisines: read from <span class="badge badge-success"> elements.
    badge_tags = page.find_all('span', class_="badge badge-success")
    if badge_tags:
        cuisine_labels = [badge.text for badge in badge_tags]
    else:
        cuisine_labels = ["No cuisine specified"]

    return {"ingredients": ingredient_texts, "cuisine": cuisine_labels}

In [8]:
# Load the Le Wagon recipe list (columns: name, difficulty, prep_time, link, picture_url).
recipes = pd.read_csv("../raw_data/recipes.csv")

In [9]:
# Show the last rows as a quick check that the CSV loaded.
recipes.tail()

Unnamed: 0,name,difficulty,prep_time,link,picture_url
745,Yogurt Marinated Lamb Skewers,Easy,45 min,https://recipes.lewagon.com/recipes/649,https://spoonacular.com/recipeImages/665550-55...
746,Yorkshire Pudding,Moderate,45 min,https://recipes.lewagon.com/recipes/779,https://spoonacular.com/recipeImages/665573-55...
747,Zucchini Chicken Omelette,Easy,45 min,https://recipes.lewagon.com/recipes/923,https://spoonacular.com/recipeImages/665734-55...
748,Zucchini Flutes Piped With Basil Ricotta Mousse,Very hard,45 min,https://recipes.lewagon.com/recipes/863,https://spoonacular.com/recipeImages/665744-55...
749,Zucchini Ribbon and Ricotta Pizza,Very easy,45 min,https://recipes.lewagon.com/recipes/469,https://spoonacular.com/recipeImages/665779-55...


In [10]:
#Based on loaded dataframe
def insert_into_db(recipes, id_offset=0):
    """Scrape each Le Wagon recipe page and insert one row per recipe.

    For every row, the page at ``row["link"]`` is fetched with
    ``scrape_recipe`` and parsed with ``parse``; the result is written to
    COOKIT_RECIPES through the module-level ``cur`` / ``conn``. Each row is
    committed on its own, so rows inserted before a failure stay in the table.

    NOTE: later cells in this notebook define other functions with the same
    name; whichever definition ran last is the one that gets called.

    Args:
        recipes: DataFrame with the columns ``name``, ``difficulty``,
            ``prep_time`` (text starting with a number, e.g. ``"45 min"``),
            ``link`` and ``picture_url``.
        id_offset: Added to each row's index label to form the ID.
            The default 0 keeps the original numbering.

    Raises:
        ValueError: If a ``prep_time`` value does not start with a number.
        Exception: Any error from the insert is re-raised after the open
            transaction has been rolled back.
    """
    query = """INSERT INTO COOKIT_RECIPES (ID, TITLE, DIFFICULTY, PREPTIME, NUMBER_OF_INGREDIENTS,
        INGREDIENTS, CUISINE, CALORIES, LINK, PICTURE_URL, INSTRUCTIONS)
        VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""

    for index, row in recipes.iterrows():
        primary_key = int(index) + id_offset
        title = row["name"]
        difficulty = row["difficulty"]

        # Read the whole leading number. Slicing the first two characters
        # would turn "120 min" into 12.
        leading_number = re.match(r"\s*(\d+)", str(row["prep_time"]))
        if leading_number is None:
            raise ValueError(
                "Cannot read a prep time from {!r} (row {})".format(row["prep_time"], index)
            )
        prep_time = int(leading_number.group(1))

        url = row["link"]
        picture_url = row["picture_url"]
        instructions = "Please follow the link for instructions"

        recipe = parse(scrape_recipe(url))

        ingredients = recipe["ingredients"]
        number_of_ingredients = len(ingredients)
        # Only the first cuisine label is stored; CUISINE is a single TEXT column.
        cuisine = recipe["cuisine"][0]

        calories = "No information available"

        try:
            cur.execute(query, (primary_key, title, difficulty, prep_time, number_of_ingredients,
                                ingredients, cuisine, calories, url, picture_url, instructions))
            conn.commit()
        except Exception:
            # Without a rollback the connection stays in an aborted transaction
            # and every later statement fails.
            conn.rollback()
            raise

In [11]:
# Scrape and insert every Le Wagon recipe (one HTTP request per row).
insert_into_db(recipes)

## Insert BBC Recipes

In [19]:
# Load the BBC Food recipe export.
recipes_1 = pd.read_csv("../raw_data/recipe_list_1015_bbcfood_4231.csv")

In [20]:
# Count missing values per column before filling them.
recipes_1.isnull().sum()

title                      0
total_time                 0
yields                     0
ingredients                0
instructions               1
image                      0
calories                   0
url                        0
cuisine                 4231
dietary_restrictions    4231
num_of_ingredients         0
dtype: int64

In [21]:
# Fill the NaN values of cuisine and dietary restrictions.
# NOTE: fillna without a column subset replaces every NaN in the frame, so the
# single missing `instructions` value also becomes "Not specified".
recipes_1.fillna("Not specified", inplace=True)

In [22]:
# Add a difficulty column derived from total_time:
# below 15 -> "easy", below 45 -> "medium", everything else -> "hard".
# Boolean masks label the whole column at once instead of one .loc write per
# row; the later assignment wins, so the narrower "easy" mask is applied last.
recipes_1["difficulty"] = "hard"
recipes_1.loc[recipes_1["total_time"] < 45, "difficulty"] = "medium"
recipes_1.loc[recipes_1["total_time"] < 15, "difficulty"] = "easy"

In [23]:
# TODO: replace calories with "not specified" or similar.
# Drops the calories column read from the CSV; the next cell adds it back as a
# constant placeholder.
recipes_1.drop(columns="calories", inplace=True)

In [24]:
# Every BBC row gets the same calories placeholder string.
recipes_1["calories"] = "No information available"

In [25]:
# Check the final columns and confirm no nulls remain before inserting.
recipes_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4231 entries, 0 to 4230
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   title                 4231 non-null   object
 1   total_time            4231 non-null   int64 
 2   yields                4231 non-null   object
 3   ingredients           4231 non-null   object
 4   instructions          4231 non-null   object
 5   image                 4231 non-null   object
 6   url                   4231 non-null   object
 7   cuisine               4231 non-null   object
 8   dietary_restrictions  4231 non-null   object
 9   num_of_ingredients    4231 non-null   int64 
 10  difficulty            4231 non-null   object
 11  calories              4231 non-null   object
dtypes: int64(2), object(10)
memory usage: 396.8+ KB


In [26]:
#Based on loaded dataframe
def insert_into_db(recipes, id_offset=750+4123):
    """Insert the rows of an already-cleaned recipe frame into COOKIT_RECIPES.

    Writes through the module-level ``cur`` / ``conn``. Each row is committed
    on its own, so rows inserted before a failure stay in the table.

    NOTE: this notebook defines ``insert_into_db`` more than once; whichever
    definition ran last is the one that gets called.

    Args:
        recipes: DataFrame with the columns ``title``, ``difficulty``,
            ``total_time``, ``num_of_ingredients``, ``ingredients`` (a Python
            list written out as text, e.g. ``"['1 egg', 'salt']"``),
            ``cuisine``, ``calories``, ``url``, ``image`` and ``instructions``.
        id_offset: Added to each row's index label to form the ID. The default
            is the original hard-coded ``750+4123`` (presumably chosen to sit
            past the IDs of the other two sources - confirm before changing).

    Raises:
        Exception: Any error from the insert is re-raised after the open
            transaction has been rolled back.
    """
    query = """INSERT INTO COOKIT_RECIPES (ID, TITLE, DIFFICULTY, PREPTIME, NUMBER_OF_INGREDIENTS,
        INGREDIENTS, CUISINE, CALORIES, LINK, PICTURE_URL, INSTRUCTIONS)
        VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""

    for index, row in recipes.iterrows():
        primary_key = int(index) + id_offset
        title = row["title"]
        difficulty = row["difficulty"]
        # int() guarantees plain Python integers for the database driver.
        prep_time = int(row["total_time"])
        number_of_ingredients = int(row["num_of_ingredients"])

        # Read the value from `row`: recipes.iloc[index] would treat the index
        # label as a position and break on any non-default index.
        raw_ingredients = row["ingredients"]
        try:
            # Parse the list literal properly so apostrophes inside an
            # ingredient and double-quoted items survive intact.
            parsed = ast.literal_eval(raw_ingredients)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            ingredients = [str(item).strip() for item in parsed]
        else:
            # Fallback for text that is not a valid literal: the original
            # quote-splitting heuristic.
            ingredients = [
                part.replace("'", "").strip()
                for part in raw_ingredients[1:-1].split("',")
            ]

        cuisine = row["cuisine"]
        calories = row["calories"]
        url = row["url"]
        picture_url = row["image"]
        instructions = row["instructions"]

        try:
            cur.execute(query, (primary_key, title, difficulty, prep_time, number_of_ingredients,
                                ingredients, cuisine, calories, url, picture_url, instructions))
            conn.commit()
        except Exception:
            # Without a rollback the connection stays in an aborted transaction
            # and every later statement fails.
            conn.rollback()
            raise

In [27]:
# Insert the BBC rows; their IDs are the row index plus 750+4123.
insert_into_db(recipes_1)

## Insert Jamie Oliver Recipes

In [12]:
# Load the Jamie Oliver recipe export.
recipes_2 = pd.read_csv("../raw_data/recipe_list_2010_JamieOliver.csv")

In [13]:
# Count missing values per column before filling them.
recipes_2.isnull().sum()

title                      0
total_time                 0
yields                     0
ingredients                0
instructions               0
image                      0
calories                   0
url                        0
cuisine                 4122
dietary_restrictions    4122
num_of_ingredients         0
dtype: int64

In [14]:
# Fill the NaN values of cuisine and dietary restrictions (the only columns with
# missing values in this file). fillna without a column subset would also
# replace NaN in any other column.
recipes_2.fillna("Not specified", inplace=True)

In [15]:
# Add a difficulty column derived from total_time:
# below 15 -> "easy", below 45 -> "medium", everything else -> "hard".
# Boolean masks label the whole column at once instead of one .loc write per
# row; the later assignment wins, so the narrower "easy" mask is applied last.
recipes_2["difficulty"] = "hard"
recipes_2.loc[recipes_2["total_time"] < 45, "difficulty"] = "medium"
recipes_2.loc[recipes_2["total_time"] < 15, "difficulty"] = "easy"

In [16]:
# Check the final columns and confirm no nulls remain before inserting.
recipes_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4122 entries, 0 to 4121
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   title                 4122 non-null   object
 1   total_time            4122 non-null   int64 
 2   yields                4122 non-null   object
 3   ingredients           4122 non-null   object
 4   instructions          4122 non-null   object
 5   image                 4122 non-null   object
 6   calories              4122 non-null   object
 7   url                   4122 non-null   object
 8   cuisine               4122 non-null   object
 9   dietary_restrictions  4122 non-null   object
 10  num_of_ingredients    4122 non-null   int64 
 11  difficulty            4122 non-null   object
dtypes: int64(2), object(10)
memory usage: 386.6+ KB


In [17]:
#Based on loaded dataframe
def insert_into_db(recipes, id_offset=750):
    """Insert the rows of an already-cleaned recipe frame into COOKIT_RECIPES.

    Writes through the module-level ``cur`` / ``conn``. Each row is committed
    on its own, so rows inserted before a failure stay in the table.

    NOTE: this notebook defines ``insert_into_db`` more than once; whichever
    definition ran last is the one that gets called.

    Args:
        recipes: DataFrame with the columns ``title``, ``difficulty``,
            ``total_time``, ``num_of_ingredients``, ``ingredients`` (a Python
            list written out as text, e.g. ``"['1 egg', 'salt']"``),
            ``cuisine``, ``calories``, ``url``, ``image`` and ``instructions``.
        id_offset: Added to each row's index label to form the ID. The default
            is the original hard-coded 750 (presumably the number of Le Wagon
            rows inserted first - confirm before changing).

    Raises:
        Exception: Any error from the insert is re-raised after the open
            transaction has been rolled back.
    """
    query = """INSERT INTO COOKIT_RECIPES (ID, TITLE, DIFFICULTY, PREPTIME, NUMBER_OF_INGREDIENTS,
        INGREDIENTS, CUISINE, CALORIES, LINK, PICTURE_URL, INSTRUCTIONS)
        VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""

    for index, row in recipes.iterrows():
        primary_key = int(index) + id_offset
        title = row["title"]
        difficulty = row["difficulty"]
        # int() guarantees plain Python integers for the database driver.
        prep_time = int(row["total_time"])
        number_of_ingredients = int(row["num_of_ingredients"])

        # Read the value from `row`: recipes.iloc[index] would treat the index
        # label as a position and break on any non-default index.
        raw_ingredients = row["ingredients"]
        try:
            # Parse the list literal properly so apostrophes inside an
            # ingredient and double-quoted items survive intact.
            parsed = ast.literal_eval(raw_ingredients)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            ingredients = [str(item).strip() for item in parsed]
        else:
            # Fallback for text that is not a valid literal: the original
            # quote-splitting heuristic.
            ingredients = [
                part.replace("'", "").strip()
                for part in raw_ingredients[1:-1].split("',")
            ]

        cuisine = row["cuisine"]
        calories = row["calories"]
        url = row["url"]
        picture_url = row["image"]
        instructions = row["instructions"]

        try:
            cur.execute(query, (primary_key, title, difficulty, prep_time, number_of_ingredients,
                                ingredients, cuisine, calories, url, picture_url, instructions))
            conn.commit()
        except Exception:
            # Without a rollback the connection stays in an aborted transaction
            # and every later statement fails.
            conn.rollback()
            raise

In [18]:
# Insert the Jamie Oliver rows; their IDs are the row index plus 750.
insert_into_db(recipes_2)

## Check size of db

In [28]:
# Read the whole table back to verify the upload.
# NOTE: this rebinds `recipes`, which earlier held the Le Wagon CSV frame.
recipes = pd.read_sql("""SELECT * FROM COOKIT_RECIPES""", conn)

In [29]:
# Total rows in the table; the three sources add up to 750 + 4122 + 4231 = 9103.
len(recipes)

9103