## Data Preprocessing

In [None]:
# Importing the libraries needed
import pandas as pd
import numpy as np
import ast
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the raw recipes CSV into a DataFrame
recipes = pd.read_csv('data/RAW_recipes.csv')
# Preview the first rows to confirm the file parsed as expected
recipes.head()

In [None]:
# Column dtypes and non-null counts
recipes.info()

In [None]:
# Summary statistics for the numeric columns
recipes.describe()

In [None]:
# Number of missing values in each column
recipes.isna().sum()

In [None]:
# Going to drop nulls as I want description to be able to display to the user
# NOTE: dropna() removes rows with a null in ANY column, and the index is not
# reset afterwards, so index labels may have gaps from here on
recipes = recipes.dropna()
# Confirm no nulls remain
recipes.isna().sum()

In [None]:
# Tags will be used as content features, so take a look at all of them
# They are stored as the string form of a list, so parse each into a real list first
parsed_tags = recipes['tags'].apply(ast.literal_eval)
recipes['tags'] = parsed_tags
# One row per (recipe, tag) pair
tags = parsed_tags.explode()
tags

In [None]:
# Number of distinct tags (a missing value, if any, counts as one)
tags.nunique(dropna=False)

In [None]:
# How often each tag appears across all recipes
tag_counts = tags.value_counts()
# Keep only the tags that appear more than 10,000 times
is_common_tag = tag_counts.gt(10000)
tags_filtered = tag_counts.loc[is_common_tag]
tags_filtered

In [None]:
# Nutrition information (calories (#), total fat (PDV), sugar (PDV), sodium (PDV), protein (PDV), saturated fat (PDV), total carbohydrates (PDV))
# Currently a string, need to convert to list then make into separate columns
nutrition_cols = ['calories', 'fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates']
nutrition_lists = recipes['nutrition'].apply(ast.literal_eval)
# Build the frame once from the parsed lists instead of creating a pd.Series per row
# (apply(pd.Series) is very slow on large frames). Reusing recipes' index keeps the
# assignment aligned row-for-row even though the index has gaps after dropna().
recipes[nutrition_cols] = pd.DataFrame(nutrition_lists.tolist(), index=recipes.index, columns=nutrition_cols)
recipes = recipes.drop(columns=['nutrition'])
recipes.head()

In [None]:
# Want to look at how specific steps are, can I use this as a content feature without heavy NLP?
# Currently a string, need to convert to list
recipes['steps'] = recipes['steps'].apply(ast.literal_eval)
# Look at the first remaining recipe by position: rows were dropped earlier without
# resetting the index, so the label 0 is not guaranteed to exist
recipes['steps'].iloc[0]

In [None]:
# I want to use ingredients as a content feature as well to search for recipes
# Currently a string, need to convert to list
recipes['ingredients'] = recipes['ingredients'].apply(ast.literal_eval)
# Look at the first remaining recipe by position: rows were dropped earlier without
# resetting the index, so the label 0 is not guaranteed to exist
recipes['ingredients'].iloc[0]

In [None]:
# Looking at counts, lots of one off ingredients will limit based on count, same as tags
# One row per (recipe, ingredient) pair
ingredients = recipes['ingredients'].explode()
# Number of occurrences of each distinct ingredient, most frequent first
ing_counts = ingredients.value_counts()
ing_counts

In [None]:
# Keep only the ingredients that appear more than 2000 times
is_common_ing = ing_counts.gt(2000)
ing_filetered = ing_counts.loc[is_common_ing]
ing_filetered

In [None]:
# Something to consider for future steps: some ingredients are very similar and could be standardised with NLP
# Is this something I actually want to standardise? chicken broth vs. chicken breast?
# Need to consider this during the ingredient search
# Print every common ingredient whose name mentions chicken
for ing in ing_filetered.index:
    if 'chicken' not in ing:
        continue
    print(ing)

In [None]:
# Function to flag which of the given values are present in a row
# Will help split out ingredients and tags into separate columns
def check_values(row, values):
    """Return a boolean Series, indexed by `values`, marking which values occur in `row`.

    Args:
        row: Container (e.g. a list of tags or ingredients) tested with `in`.
        values: Iterable of values to look for; each becomes an index label.

    Returns:
        pd.Series mapping each value to True if it is in `row`, else False.
    """
    flags = {}
    for value in values:
        flags[value] = value in row
    return pd.Series(flags)

In [None]:
# One boolean column per filtered ingredient: True where the recipe lists it
common_ingredients = list(ing_filetered.index)
new_cols_ing = recipes['ingredients'].apply(check_values, args=(common_ingredients,))
new_cols_ing

In [None]:
# One boolean column per filtered tag: True where the recipe carries it
common_tags = list(tags_filtered.index)
new_cols_tag = recipes['tags'].apply(check_values, args=(common_tags,))
new_cols_tag

In [None]:
# Bringing together ingredients and tags into one dataframe (side by side, column-wise)
tags_ings = pd.concat([new_cols_ing, new_cols_tag], axis=1)
tags_ings.head()

In [None]:
# Bringing together the features I want to use in the content based filtering and removing the columns I don't want
non_feature_cols = ['contributor_id', 'submitted', 'tags', 'steps', 'description',
                    'ingredients', 'name', 'n_steps', 'n_ingredients', 'minutes']
# Remaining recipe columns on the left, boolean tag/ingredient columns on the right
recipes_feat = pd.concat([recipes.drop(columns=non_feature_cols), tags_ings], axis=1)
recipes_feat.head()

In [None]:
# Some names appear both as a tag and as an ingredient, which yields repeated column names
# List every column name that repeats an earlier one
is_repeat = recipes_feat.columns.duplicated()
duplicate_columns = list(recipes_feat.columns[is_repeat])
duplicate_columns

In [None]:
# Sanity check for duplicate column names: any name with a count above 1 is duplicated
recipes_feat.columns.value_counts()

In [None]:
# Keep only the first occurrence of each column name
first_occurrence = ~recipes_feat.columns.duplicated()
recipes_feat_clean = recipes_feat.loc[:, first_occurrence]
# Every name should now have a count of 1
recipes_feat_clean.columns.value_counts()

In [None]:
# Save the de-duplicated feature matrix as a parquet file
recipes_feat_clean.to_parquet('data/recipes_feat.parquet')

In [None]:
# For size limitations, keep the raw tags and ingredients separate from the content features
# Drop everything that is not needed for tag/ingredient lookups
not_needed_for_lookup = [
    'name', 'steps', 'description', 'contributor_id', 'submitted',
    'n_ingredients', 'n_steps', 'minutes',
    'calories', 'fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates',
]
recipes_ingtag = recipes.drop(columns=not_needed_for_lookup)
recipes_ingtag.head()

In [None]:
# Save the tags/ingredients frame as a parquet file
recipes_ingtag.to_parquet('data/recipes_ingtag.parquet')

In [None]:
# Create a dataframe with steps and description (everything else not needed for display is dropped)
not_needed_for_display = [
    'contributor_id', 'submitted', 'n_ingredients', 'n_steps', 'minutes',
    'calories', 'fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates',
    'tags',
]
recipes_steps = recipes.drop(columns=not_needed_for_display)
recipes_steps.head()

In [None]:
# Save the steps/description frame as a parquet file
recipes_steps.to_parquet('data/recipes_steps.parquet')

## Testing Functions for Streamlit

In [None]:
# Work on a copy so the experiments below do not modify recipes_ingtag
recipes_test = recipes_ingtag.copy()
recipes_test.head()

In [None]:
# Need a function to check that every selected tag is present in a row
# We will prompt the user for specific tags, and return recipes that contain all of those tags
def all_tags_present(item_tags, selected):
    """Return True if every entry of `selected` is found in `item_tags`.

    Args:
        item_tags: Container of tags for one recipe, tested with `in`.
        selected: Iterable of tags that must all be present.

    Returns:
        True when no selected tag is missing (including when `selected` is
        empty), otherwise False.
    """
    for wanted in selected:
        if wanted not in item_tags:
            return False
    return True

In [None]:
# Flag the recipes that carry every selected tag, then keep only those rows
recipes_test['tag_match'] = recipes_test['tags'].apply(all_tags_present, selected=['poultry', '30-minutes-or-less'])
# .copy() makes the filtered frame independent of the original, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning
recipes_test = recipes_test[recipes_test['tag_match'] == True].copy()

In [None]:
# Ingredient selection, will handle input with streamlit
# Hard-coded example search terms used by the test cells below
ing_selected = ['chicken', 'lentils']

In [None]:
# Need a function that returns True if all selected ingredients are found in the matched row
def check_ingredients_df(row, selected=None):
    """Check whether a recipe's ingredients mention every selected ingredient.

    Matching is a case-insensitive substring search over the joined
    ingredient names, so 'chicken' matches both 'chicken broth' and
    'chicken breast'. A single trailing 's' is removed from each search term
    so that simple plurals ('lentils') also match the singular ('lentil').

    Args:
        row: Iterable of ingredient names for one recipe.
        selected: Iterable of ingredient search terms. When omitted, the
            notebook-level `ing_selected` list is used.

    Returns:
        True if every search term occurs in the recipe's ingredient text,
        otherwise False. Blank search terms are always considered matched.
    """
    if selected is None:
        selected = ing_selected

    # Join all ingredients into a single lowercase string
    ingredients_str = ' '.join(str(ing).lower() for ing in row)

    for item in selected:
        # Lowercase the term too, otherwise 'Chicken' could never match
        term = str(item).strip().lower()
        # Drop only ONE trailing 's'. str.strip('s') would also eat leading
        # characters ('salt' -> 'alt'), which then falsely matches 'malt'.
        if len(term) > 1 and term.endswith('s'):
            term = term[:-1]
        if term not in ingredients_str:
            return False

    # Every term was found
    return True


In [None]:
# Flag the recipes whose ingredients mention every selected ingredient
ing_flags = recipes_test['ingredients'].apply(check_ingredients_df)
recipes_test['ing_match'] = ing_flags
# Show only the matching recipes
recipes_test.loc[recipes_test['ing_match'].eq(True)]

In [None]:
# How do we want to display the steps to the user?
# Preview the columns available for display
recipes_steps.head()

In [None]:

# Pull out a single example recipe by id and look at its steps and name
is_selected = recipes_steps['id'] == 66735
rec = recipes_steps.loc[is_selected]
(rec['steps'].values[0], rec['name'].values[0])

In [None]:
# Turn the recipe name into a URL slug. split() + join collapses runs of
# whitespace and trims the ends, so the slug never contains repeated,
# leading or trailing hyphens (replace(' ', '-') emits one hyphen per space)
name = '-'.join(rec['name'].values[0].split())
name

In [None]:
# Can use some clever formatted strings to display the link (because they all follow the same format)
# Pattern built here: https://www.food.com/recipe/<hyphenated-name>-<id>
link = f"https://www.food.com/recipe/{name}-{rec['id'].values[0]}"
link

In [None]:
# We can use the feature matrix to find similar recipes to the one selected
target_id = 66735  # recipe to find neighbours for
n_recs = 5         # number of recommendations to return

rec_feat = recipes_feat_clean[recipes_feat_clean['id'] == target_id]
rec_feat = rec_feat.drop(columns=['id']).values.reshape(1, -1)

# Using cosine similarity to compare the selected recipe against every recipe
# NOTE(review): the nutrition columns are not scaled, so they may dominate the
# boolean tag/ingredient columns in the similarity score - consider scaling
cosine_sim = cosine_similarity(rec_feat, recipes_feat_clean.drop(columns=['id']))

# Drop the selected recipe by id rather than by assuming it sorts first:
# another recipe can tie with it on similarity, in which case slicing [1:6]
# could keep the selected recipe and discard a genuine recommendation
sim_scores = [
    (recipe_id, score)
    for recipe_id, score in zip(recipes_feat_clean['id'].values, cosine_sim[0])
    if recipe_id != target_id
]
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:n_recs]
rec_indices = [recipe_id for recipe_id, _ in sim_scores]

# isin() returns rows in recipes_steps order, so re-order them to follow the
# similarity ranking (most similar first)
recs = recipes_steps[recipes_steps['id'].isin(rec_indices)][['id', 'name', 'description']]
rank = {recipe_id: position for position, recipe_id in enumerate(rec_indices)}
recs = recs.iloc[np.argsort(recs['id'].map(rank).values, kind='stable')]
recs
