# Recipe Data Loading, Cleaning, and Feature Engineering

In [None]:
# Standard imports
import json
import os
from os import path
import pathlib
import pickle
import platform
import string
import time
import zipfile

#Other models
import gensim
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import tabulate
import tensorflow as tf
from textblob import TextBlob
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForQuestionAnswering
)

# Scikit-learn imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    precision_recall_curve,
    recall_score,
    f1_score,
    accuracy_score,
)
# cosine_similarity is defined in the pairwise submodule; it is not
# re-exported from sklearn.metrics itself.
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB

# NLTK specific setup (often done once)
nltk.download('stopwords')
from nltk.corpus import stopwords

# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


  from .autonotebook import tqdm as notebook_tqdm
2024-03-15 00:10:51.983593: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-15 00:10:52.292670: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-03-15 00:10:52.292705: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-03-15 00:10:52.367581: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-15 00:10:5

## 2. Load Raw Data

Loading the raw recipe data. The data is spread across multiple JSON files (`recipes_raw_nosource_*.json`). We will read each file and concatenate them into a single Pandas DataFrame.

The dataset used for this project is provided by eightportion. It has a collection of 125,000 recipes with over 40% containing images. 


In [None]:
# Read each of the three raw JSON files into its own DataFrame
df1_transposed, df2_transposed, df3_transposed = (
    pd.read_json(json_path)
    for json_path in (
        "recipes_raw_nosource_ar.json",
        "recipes_raw_nosource_epi.json",
        "recipes_raw_nosource_fn.json",
    )
)

# Swap rows and columns of every frame (the later cells select
# 'title', 'ingredients', ... as columns of the transposed frames)
df1, df2, df3 = (
    raw_frame.transpose()
    for raw_frame in (df1_transposed, df2_transposed, df3_transposed)
)

In [None]:
# Give the three source frames the same columns in the same order
standard_order = ['title', 'ingredients','instructions', 'picture_link']
df1, df2, df3 = (source[standard_order] for source in (df1, df2, df3))

# Stack all sources into one dataset; ignore_index renumbers the rows
recipe_data = pd.concat([df1, df2, df3], ignore_index=True)

# Keep a copy of the merged raw data on disk
recipe_data.to_csv('recipe_data.csv')


## 3. Initial Data Exploration

Performing a preliminary check on the loaded data to understand its structure, size, data types, and identify any immediate issues like missing values.

In [None]:
# Shape of the merged data as (number of rows, number of columns)
print('The shape of the recipe dataset is:', recipe_data.shape)

The shape of the recipe dataset is: (125164, 4)


In [None]:
# Count the missing (NaN) values in every column
recipe_data.isna().sum()


title             569
ingredients       517
instructions      691
picture_link    42571
dtype: int64

In [None]:
# Keep only the recipes that have instructions, then discard the
# picture links column
recipe_f = (
    recipe_data
    .dropna(subset=['instructions'])
    .drop(columns=['picture_link'])
)

# Report the new shape and the remaining null values per column
print('The shape of the recipe dataset is:', recipe_f.shape)
recipe_f.isna().sum()

The shape of the recipe dataset is: (124473, 3)


title           0
ingredients     0
instructions    0
dtype: int64

## 4. Data Cleaning

This section focuses on cleaning the dataset to make it suitable for analysis and modeling. After dropping the rows without instructions, the dataset contains 124,473 recipes and no null values. The next cleaning steps are:

- Checking if all the recipes have relevant content in them 

- Removing irrelevant content from the recipes, name and instructions 

In [None]:
# Preview the first 50 rows to eyeball the content of each column
recipe_f.head(50)

Unnamed: 0,title,ingredients,instructions
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ..."
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ..."
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...
5,Banana Banana Bread,"[2 cups all-purpose flour ADVERTISEMENT, 1 tea...",Preheat oven to 350 degrees F (175 degrees C)....
6,Chef John's Fisherman's Pie,"[For potato crust: ADVERTISEMENT, 3 russet pot...",Bring a large saucepan of salted water and to ...
7,Mom's Zucchini Bread,"[3 cups all-purpose flour ADVERTISEMENT, 1 tea...",Grease and flour two 8 x 4 inch pans. Preheat ...
8,The Best Rolled Sugar Cookies,"[1 1/2 cups butter, softened ADVERTISEMENT, 2 ...","In a large bowl, cream together butter and sug..."
9,Singapore Chili Crabs,"[Sauce: ADVERTISEMENT, 1/2 cup ketchup ADVERTI...","Whisk ketchup, chicken broth, egg, soy sauce, ..."


In [None]:
# Indexing rows with columns that only contain numbers or punctuation
import string


def _digits_or_punct_only(cell):
    """Return True if every element of `cell` is a digit string or occurs in string.punctuation.

    Iterating a string yields its characters, iterating a list yields its
    items. An empty cell passes trivially because all() of nothing is True.
    """
    return all(part.isdigit() or part in string.punctuation for part in cell)


# Index labels of the offending rows, collected separately per column
nc_ingred_index = [
    label
    for label, cell in zip(recipe_f.index, recipe_f['ingredients'])
    if _digits_or_punct_only(cell)
]
nc_title_index = [
    label
    for label, cell in zip(recipe_f.index, recipe_f['title'])
    if _digits_or_punct_only(cell)
]
nc_instr_index = [
    label
    for label, cell in zip(recipe_f.index, recipe_f['instructions'])
    if _digits_or_punct_only(cell)
]

# Checking number of rows in each category that are only punc/nums
index_list = [nc_ingred_index, nc_title_index, nc_instr_index]
[len(labels) for labels in index_list]

[1520, 0, 39]

In [None]:
# Recipes without usable instructions or ingredients cannot be used, so
# merge the per-column label lists into one set (duplicates collapse)
# and drop those rows from the dataframe
from functools import reduce
from operator import add

inds_to_drop = {label for labels in index_list for label in labels}
print(len(inds_to_drop))

# Renumber the rows afterwards so the index is contiguous again
recipe_f = recipe_f.drop(index=inds_to_drop).reset_index(drop=True)
recipe_f.shape

1551


(122922, 3)

In [None]:
#Pruning the recipes
#Collecting the index labels of recipes whose instructions are shorter than 30 characters
instr_30less = [index for i, index in zip(recipe_f['instructions'], recipe_f.index) if len(i) < 30]

#Taking a look at the recipes
# instr_30less holds index labels, so select with .loc (label-based).
# .iloc is positional and only gives the same rows while the index is
# the freshly reset 0..n-1 range.
instr_30 = recipe_f.loc[instr_30less]
instr_30.head(20)

Unnamed: 0,title,ingredients,instructions
18316,Terrific Trail Mix,"[1 cup combination diced dried fruit, such as ...",Mix all. Makes 4 cups.\n
21254,Ginger-Cabbage Salad,"[3/4 cup pickled ginger ADVERTISEMENT, 4 cups ...",Combine all.\n
28814,Cherry Vanilla,"[6 ounces 7UP® Cherry ADVERTISEMENT, 1 fluid o...",Serve over 1 cup ice.\n
31146,Shirley Temple from 7UP,"[8 ounces 7UP® ADVERTISEMENT, 1 ounce Rose's® ...",Serve over 1 cup ice.\n
31161,Lucky 7,"[6 ounces 7UP® ADVERTISEMENT, 1 ounce vodka (s...",Serve over 1 cup ice.\n
31163,Electric 7UP,"[8 ounces 7UP® ADVERTISEMENT, 2 ounces gin ADV...",Serve over 1 cup ice.\n
35030,7UP Pom Spritzer,[2 1/2 ounces Rose's Cocktail Infusions® Pomeg...,Serve over 1 cup ice.\n
35032,7 and 7 and 7,"[3 ounces 7UP® ADVERTISEMENT, 3 fluid ounces H...",Serve over 1 cup ice.\n
52284,Royal Icing,"[Using electric mixer, beat 3 1/4 cups powdere...",N/A N/A\nN/A
55149,Very Veggie,"[1 cup kale, 1 cup spinach, 1 cup parsley, 1/4...",Juice.\nJuice.


In [None]:
# Remove the recipes with too-short instructions and renumber the rows
recipe_f = (
    recipe_f
    .drop(index=instr_30less)
    .reset_index(drop=True)
)

# Report the shape after the drop
print('The shape of the recipe dataset is:', recipe_f.shape)

The shape of the recipe dataset is: (122880, 3)


In [None]:
#Doing same for ingredients.
#Collecting the index labels of recipes whose ingredients value has fewer than 2 entries
# (len() is applied to the whole ingredients value, which appears to be a
# list here, so this counts list items rather than characters)
ing_20less = [index for i, index in zip(recipe_f['ingredients'], recipe_f.index) if len(i) < 2]

#Taking a look at the recipes
# ing_20less holds index labels, so select with .loc (label-based)
# instead of the positional .iloc
ing_20 = recipe_f.loc[ing_20less]
ing_20.head(25)

#Only dropping the N/A one out of these, because all the others might make sense.

Unnamed: 0,title,ingredients,instructions
40146,Light-and-Crisp Whole-Wheat Bread Crumbs,[4 slices whole-wheat sandwich bread (about 1 ...,Preheat the oven to 350°F. Place the bread in ...
40301,Basic Simple Syrup,[1 cup sugar],1. Combine the sugar with 1/2 cup water in a s...
41174,Grill-Smoked Salmon,"[3 whole gutted salmon, each about 2 1/2 pound...",Build a fire in a smoker/grill for indirect he...
41655,Plain Toasted Croutons,[4 or more thick slices of home-style white br...,Preheat the oven to 350°F. Remove the crusts f...
41743,Hard-Boiled Eggs,[4 large eggs],"Put eggs into a 1-quart saucepan, then add eno..."
41806,Radish Flowers,[20 small radishes with leafy tops],"Trim radish tops, leaving 2 inches of stems wi..."
41910,To Temper Chocolate,[3 ounces bittersweet or milk chocolate],Have ready a 3- to 4-quart saucepan and a meta...
42209,Tempering Chocolate,"[Bittersweet, semisweet, milk, or white chocol...","Tempering determines the final gloss, hardness..."
42912,Chocolate Disks and Lace,[8 ounces bittersweet (not unsweetened) or sem...,Line three cookie sheets with waxed paper. Tra...
43053,Khao Neeo,[3 cups Thai long-grain sticky (glutinous) rice],In a large bowl (large enough to hold at least...


In [None]:
# Inspect the ingredients of the row at position 45747.
# NOTE(review): .iloc is positional while drop(index=...) below is
# label-based; both refer to the same row only while the index is the
# contiguous 0..n-1 range produced by the previous reset_index.
recipe_f.iloc[45747]['ingredients']
# Dropping it because it has N/A in its ingredients.
# errors='ignore' means no error is raised if label 45747 is not present.
# The index is not reset afterwards, so it keeps a gap at 45747.
recipe_f = recipe_f.drop(index=45747, errors='ignore')
recipe_f.shape

(122879, 3)

- We'll remove the word `ADVERTISEMENT` from the ingredients.

- We'll then try to separate the ingredients from their measurements, so that the column becomes a cleaner list.

In [None]:
#Removing ADVERTISEMENT from ingredients
ingredients = []
for ing_list in recipe_f['ingredients']:
    # Strip the marker text and the whitespace it leaves behind
    clean_ings = [ing.replace('ADVERTISEMENT', '').strip() for ing in ing_list]
    # Drop every entry that ended up empty. list.remove('') would only
    # delete the first one and leave any further blanks in the list.
    ingredients.append([ing for ing in clean_ings if ing])
# Assigning a plain list replaces the column values in row order
recipe_f['ingredients'] = ingredients

In [None]:
# Check that ADVERTISEMENT no longer appears in the ingredients column
recipe_f.head()

Unnamed: 0,title,ingredients,instructions
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves, 2...","Place the chicken, butter, soup, and onion in ..."
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ..."
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar, 1/2 cup ketchup, ...",Preheat oven to 350 degrees F (175 degrees C)....
3,Best Chocolate Chip Cookies,"[1 cup butter, softened, 1 cup white sugar, 1 ...",Preheat oven to 350 degrees F (175 degrees C)....
4,Homemade Mac and Cheese Casserole,"[8 ounces whole wheat rotini pasta, 3 cups fre...",Preheat oven to 350 degrees F. Line a 2-quart ...


In [None]:

# Load the pre-trained spaCy model
nlp = spacy.load('en_core_web_sm')

# Named-entity labels that mark amounts and measurements
_QUANTITY_ENTITY_LABELS = frozenset({'QUANTITY', 'CARDINAL'})


def clean_ingredients(ingredients):
    """Strip quantity and measurement tokens from each ingredient string.

    Args:
        ingredients: iterable of ingredient strings for one recipe.

    Returns:
        A list with one string per input ingredient, made of the remaining
        tokens joined by single spaces.
    """
    cleaned_ingredients = []
    # nlp.pipe processes the texts as a batch instead of one call per string
    for doc in nlp.pipe(ingredients):
        food_items = [
            token.text
            for token in doc
            # 'QUANTITY' is an entity label, not a part-of-speech tag, so it
            # has to be checked on ent_type_; numerals are tagged 'NUM' in pos_.
            if token.pos_ != 'NUM'
            and token.ent_type_ not in _QUANTITY_ENTITY_LABELS
        ]
        cleaned_ingredients.append(' '.join(food_items))
    return cleaned_ingredients


# Apply the clean_ingredients function to each element of the 'ingredients'
# column of recipe_f.
# NOTE(review): this runs the spaCy model over every ingredient of every
# recipe and can take a long time on the full dataset.
recipe_f['ing_clean'] = recipe_f['ingredients'].apply(clean_ingredients)

# Display the DataFrame to verify the changes
recipe_f.head()


KeyboardInterrupt



## 5. Feature Engineering

Creating new features from the existing data to enhance the recommendation process. This includes:
* Estimating cooking time (if applicable).
* Classifying recipes as vegetarian or non-vegetarian.
* Identifying the cuisine type.
* Calculating the number of ingredients.
* Combining relevant text fields (e.g., title + ingredients + instructions) into a single field for easier processing later.

We use a Hugging Face question-answering pipeline (`deepset/roberta-base-squad2`) to extract the cooking time from the instructions, and a keyword list of animal products to classify recipes as vegetarian or non-vegetarian.

In [None]:
# Number of entries in each recipe's ingredient list
recipe_f['ingredient_count'] = recipe_f['ingredients'].apply(len)

### Adding cooking times

In [None]:
#Adding the cooking time
def extract_cooking_time(df):
    """Add a 'cooking_time' column with the QA model's answer for each recipe.

    A question-answering pipeline is asked for the cooking time, using each
    recipe's instructions as the context. The raw answer text is stored
    without any further parsing.

    Args:
        df: DataFrame with an 'instructions' column.

    Returns:
        The same DataFrame object, modified in place with the new
        'cooking_time' column.
    """
    # Initialize the question-answering pipeline. It is bound to a local
    # name so it cannot be confused with the notebook-level spaCy `nlp`.
    model_name = "deepset/roberta-base-squad2"
    qa_pipeline = pipeline('question-answering', model=model_name, tokenizer=model_name)

    question = 'How long is the cooking time in minutes?Answer only in minutes. Convert hours into minutes.'

    cooking_times = []  # One answer per recipe, in row order

    # Only the instructions are used as context, so iterate that column
    # directly instead of building a full row object for every recipe.
    for instructions in df['instructions']:
        QA_input = {
            'question': question,
            'context': f"{instructions}."
        }

        # Get predictions
        res = qa_pipeline(QA_input)

        # Extract the answer (cooking time) and append to the list
        cooking_times.append(res['answer'])

    # Add the cooking times as a new column to the DataFrame
    df['cooking_time'] = cooking_times

    return df

#Adding the cooking times
recipe_f = extract_cooking_time(recipe_f)

In [None]:
#Converting time into same format
import re

# Spelled-out numbers that are converted to digits before parsing
_WORD_TO_NUMBER = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}
# Whole words only, so "done" or "often" are left untouched
_NUMBER_WORD_RE = re.compile(r'\b(' + '|'.join(_WORD_TO_NUMBER) + r')\b')

# "1 1/2", "1/2", "1.5" or "2"
_AMOUNT = r'\d+\s+\d+/[1-9]\d*|\d+/[1-9]\d*|\d+(?:\.\d+)?'
_UNIT = r'hours?|hrs?|minutes?|mins?'
# "40 to 60 minutes", "2-3 hours"
_RANGE_RE = re.compile(
    r'(' + _AMOUNT + r')\s*(?:to|-)\s*(' + _AMOUNT + r')[\s-]*(' + _UNIT + r')\b'
)
# "45 minutes", "1-hour", "1 1/2 hrs"
_SINGLE_RE = re.compile(r'(' + _AMOUNT + r')[\s-]*(' + _UNIT + r')\b')


def _parse_amount(amount):
    """Convert an amount such as '2', '1.5', '1/2' or '1 1/2' to a number."""
    total = 0
    for piece in amount.split():
        if '/' in piece:
            numerator, denominator = piece.split('/')
            total += int(numerator) / int(denominator)
        elif '.' in piece:
            total += float(piece)
        else:
            total += int(piece)
    return total


def _minutes_per(unit):
    """Return how many minutes one `unit` ('hour', 'hrs', 'minutes', ...) is."""
    return 60 if unit.startswith('h') else 1


def time_to_minutes(time_str):
    """Convert a free-text duration into a number of minutes.

    Handles single durations ("45 minutes", "2 hours", "1 1/2 hours"),
    ranges ("40 to 60 minutes" gives the average, 50.0), an hours part
    directly followed by a minutes part ("1 hour 30 minutes" gives 90) and
    spelled-out numbers from one to ten ("two hours").

    Args:
        time_str: the text to parse.

    Returns:
        The duration in minutes (int or float), or None when `time_str` is
        not a string or no duration can be recognised.
    """
    if not isinstance(time_str, str):
        return None

    text = _NUMBER_WORD_RE.sub(
        lambda match: str(_WORD_TO_NUMBER[match.group(1)]),
        time_str.lower(),
    )

    # A range sharing one unit: take the average
    range_match = _RANGE_RE.search(text)
    if range_match:
        low, high, unit = range_match.groups()
        return (_parse_amount(low) + _parse_amount(high)) / 2 * _minutes_per(unit)

    durations = [
        (match, _parse_amount(match.group(1)) * _minutes_per(match.group(2)))
        for match in _SINGLE_RE.finditer(text)
    ]
    if durations:
        first_match, first_minutes = durations[0]
        if len(durations) > 1:
            second_match, second_minutes = durations[1]
            joiner = text[first_match.end():second_match.start()].strip(' ,')
            first_is_hours = first_match.group(2).startswith('h')
            second_is_hours = second_match.group(2).startswith('h')
            # "1 hour 30 minutes" / "1 hour and 30 minutes": add both parts
            if joiner in ('', 'and') and first_is_hours and not second_is_hours:
                return first_minutes + second_minutes
            # "30 minutes to 1 hour": a range whose ends carry their own unit
            if joiner in ('to', '-'):
                return (first_minutes + second_minutes) / 2
        return first_minutes

    # No "<number> <unit>" pair was found: fall back to looser heuristics
    numbers = [int(num) for num in re.findall(r'\d+', text)]
    if 'hour' in text:
        # Assume 1 hour if no number is found (e.g. "about an hour")
        return (numbers[0] if numbers else 1) * 60
    if not numbers:
        return None
    if 'minute' in text:
        return numbers[0]
    # NOTE(review): a bare number without a unit is treated as hours, which
    # is what the previous implementation did; confirm this is intended.
    return numbers[0] * 60


# Convert every extracted answer with time_to_minutes and store the result
# in a new 'cooking_time_min' column
recipe_f['cooking_time_min'] = recipe_f['cooking_time'].apply(time_to_minutes)
    


### Adding veg and non-veg as a column

In [None]:
#VEG or NON_VEG
import re

# Keywords whose presence in an ingredient marks the recipe as non-vegetarian
non_veg = [
    'Beef',
    'Pork',
    'Lamb',
    'Veal',
    'Goat',
    'Bison',
    'Rabbit',
    'Chicken',
    'Turkey',
    'Duck',
    'Goose',
    'Quail',
    'Pheasant',
    'Salmon',
    'Tuna',
    'Cod',
    'Trout',
    'Mackerel',
    'Herring',
    'Sardines',
    'Shrimp',
    'Crab',
    'Lobster',
    'Oysters',
    'Mussels',
    'Squid',
    'Octopus',
    'Gelatin',
    'bone', 
    'skin',
    'Lard',
    'fat',
    'Tallow',
    'mutton',
    'beef',
    'Rennet',
    'Fish sauce',
    'fish',
    'Oyster',
    'Oyster sauce',
    'Anchovies',
    'Caviar',
    'snail',
    'Escargot',
    'Sausage',
    'Bacon',
    'ham',
    'breast',
    'thigh',
    'turkey',
    'meat'
]

# One case-insensitive pattern that finds any keyword as a whole word
# (optionally with a plural ending) inside an ingredient string. Whole-word
# matching keeps 'ham' from matching "graham" and 'bone' from "boneless".
# NOTE(review): generic keywords such as 'fat' or 'skin' also match
# ingredients like "low-fat yogurt"; consider pruning the list.
_non_veg_pattern = re.compile(
    r'\b(?:' + '|'.join(re.escape(term.lower()) for term in non_veg) + r')(?:e?s)?\b',
    re.IGNORECASE,
)

# Each ingredient is a full phrase (e.g. "2 pounds lean ground beef"), so
# search inside it; comparing the keyword with the whole phrase for equality
# would almost never match.
recipe_f['type'] = recipe_f['ingredients'].apply(
    lambda x: 'non-veg' if any(_non_veg_pattern.search(ingredient) for ingredient in x) else 'veg'
)


### Adding the combined text column for the corpus

In [None]:
def _as_text(value):
    """Join a list into one space-separated string; return anything else unchanged."""
    if isinstance(value, list):
        return ' '.join(value)
    return value


# Flatten the ingredients and instructions columns into plain-text columns
recipe_f['ingredients_str'] = recipe_f['ingredients'].apply(_as_text)
recipe_f['instructions_str'] = recipe_f['instructions'].apply(_as_text)


def combine_columns(row):
    """Return the row's title, ingredients text and instructions text as one string."""
    pieces = (row['title'], row['ingredients_str'], row['instructions_str'])
    return ' '.join(str(piece) for piece in pieces)


# Build the combined text for every recipe (row-wise application)
recipe_f['Combined'] = recipe_f.apply(combine_columns, axis=1)


In [None]:
# Clean_text Function
import string
import re

def clean_text(documents):
    """Return a cleaned copy of every string in `documents`.

    For each document, in this order: ASCII punctuation is removed, runs of
    digits are removed, newlines become spaces, surrounding whitespace is
    stripped and repeated spaces are collapsed into one.

    Args:
        documents: iterable of strings.

    Returns:
        A list of cleaned strings in the same order as the input.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)

    def _scrub(text):
        text = text.translate(punctuation_table)
        text = re.sub(r'\d+', '', text)
        text = text.replace('\n', ' ').strip()
        return re.sub(' +', ' ', text)

    return [_scrub(document) for document in documents]

In [None]:
# Clean the combined text of every recipe; the result is a plain list of strings
recipe_vec = clean_text(recipe_f['Combined'])

In [None]:
# Spot-check one cleaned document
recipe_vec[2]

'Brown Sugar Meatloaf cup packed brown sugar cup ketchup pounds lean ground beef cup milk eggs teaspoons salt teaspoon ground black pepper small onion chopped teaspoon ground ginger cup finely crushed saltine cracker crumbs Preheat oven to degrees F degrees C Lightly grease a x inch loaf pan Press the brown sugar in the bottom of the prepared loaf pan and spread the ketchup over the sugar In a mixing bowl mix thoroughly all remaining ingredients and shape into a loaf Place on top of the ketchup Bake in preheated oven for hour or until juices are clear'

## 6. Save Processed Data

Saving the cleaned, enriched, and preprocessed DataFrame to a file. This file will serve as the input for the next stage: building and evaluating the recipe recommendation model.

In [None]:
# Write the processed recipes to CSV; index=False leaves the row index out of the file
recipe_f.to_csv('recipe_after_addingdataandcleaning.csv', index=False)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=5cb9474c-6462-4da0-a2dd-0ed9f5cafe8e' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>