## Food.com Dataset Reccomender Binary Matrix
Below is data exploration on the foolwoing dataset https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions?select=RAW_recipes.csv 

This dataset consists of 180K+ recipes and 700K+ recipe reviews covering 18 years of user interactions and uploads on Food.com 

In [2]:
import numpy as np
import zipfile
from tabulate import tabulate
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

#from surprise import SVD
#from surprise import Dataset
#from surprise import Reader
#from surprise.model_selection import train_test_split
#from surprise import accuracy
#import tensorflow.keras as tf

%matplotlib inline


## Loading the Dataset
From the zipped dataset we extract the recipes and the interactions (reviews).
Below is how we got the data from the original very large dataset.

In [3]:
# zip_file_path = "archive.zip"
# csv_file_name1 = 'RAW_recipes.csv'
# csv_file_name2 = 'RAW_interactions.csv' 

# # Initialize two DataFrames to store the data from the two CSV files
# df1 = None
# df2 = None

# # Open the zip file and read the first CSV file into the first DataFrame
# with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
#     with zip_file.open(csv_file_name1) as csv_file_in_zip:
#         df_recipes = pd.read_csv(csv_file_in_zip)

# # Open the zip file again and read the second CSV file into the second DataFrame
# with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
#     with zip_file.open(csv_file_name2) as csv_file_in_zip:
#         df_reviews = pd.read_csv(csv_file_in_zip)

# #We renamed the id column to match to id in reviews so we can merge on this later
# df_recipes.rename(columns={'id': 'recipe_id'}, inplace=True)




In [4]:
# #We combine both the dataframes

# combined_df = pd.merge(df_reviews, df_recipes, on='recipe_id', how='inner')


## Data Selection

Below we search the tags that indicate the recipe is for a cocktail. This cuts downs are dataset from 180K recipes to around 4K, significiantly reducing our file size. We have used this exported csv as our base dataset so we don't have to rely on git lfs.

In [5]:
# combined_df = combined_df[combined_df['tags'].str.contains('cocktails', case=False, na=False)]

# combined_df.rename(columns={'id': 'recipe_id'}, inplace=True)

# #We drop columns that we don't need
# columns_to_drop = ['date', 'review','minutes','nutrition','contributor_id', 'description','submitted']

# combined_df = combined_df.drop(columns=columns_to_drop)

# combined_df.to_csv("cocktail_dataset.csv")
# #Exporting a refined dataset to be used in mongoDB

# to_keep = ['recipe_id','rating','name','n_steps','ingredients','steps','n_ingredients']

# mongo_csv = combined_df[to_keep].drop_duplicates(subset=['name'])


# mongo_csv.to_csv("cocktail_data_mongo.csv", index=False)




## Data Exploration

We are exploring the cocktail counts

In [6]:
 combined_df = pd.read_csv("cocktail_dataset.csv")

 cocktail_review_counts = combined_df.groupby('recipe_id')['rating'].count().reset_index()
 cocktail_review_counts.columns = ['recipe_id', 'review_count']
 # Sort the DataFrame by review_count in descending order
 # Sort the review counts DataFrame by review_count in descending order
 cocktail_review_counts_sorted = cocktail_review_counts.sort_values(by='review_count', ascending=False)

 # Merge the review counts DataFrame with the recipes DataFrame to get cocktail names
 table_data = cocktail_review_counts_sorted.merge(combined_df[['recipe_id', 'name']], on='recipe_id', how='inner')

 # Display the table
 table_data = table_data.drop_duplicates(subset='recipe_id')

 table = tabulate(table_data, headers=['Recipe ID','Review Count','Cocktail Name'], tablefmt='pretty', showindex=False)
 print(table)

 print("Average review count is:",cocktail_review_counts_sorted['review_count'].mean())



+-----------+--------------+------------------------------------------------------------------+
| Recipe ID | Review Count |                          Cocktail Name                           |
+-----------+--------------+------------------------------------------------------------------+
|   66947   |     114      |            refreshing mojito  by the pitcher mojitos             |
|   18458   |      87      |                      dana s homemade kahlua                      |
|  234344   |      79      |                 dr  pat s hot toddy cold remedy                  |
|   59148   |      48      |                           ya ya punch                            |
|  176535   |      44      |            kate s afternoon wine cooler  zwt   france            |
|  225347   |      44      |                       best beer margarita                        |
|   29570   |      43      |                    no sugar added iced coffee                    |
|   15420   |      43      |            

## Data Preperation for Matrix

We Strip the regex characters from the ingredients column.

We also need to remove the duplicates since one cocktail can have many reviews, and since we combined the reviews and coctail dataframes, there will be duplicates.

The duplicates were useful for data exploration, but now exploration is done we need only unique values of the cocktails for our prediction matrix.

In [7]:
# Only process if the first item in 'ingredients' column is a string
if isinstance(combined_df['ingredients'].iloc[0], str):
    # Remove [ and ] 
    combined_df['ingredients'] = combined_df['ingredients'].str.replace('[', '').str.replace(']', '')
    # Splitting the ingredients string by commas
    combined_df['ingredients'] = combined_df['ingredients'].str.split(',')


# Drop duplicates based on the 'name' column and assign the result back to combined_df
combined_df = combined_df.drop_duplicates(subset=['name'])

  combined_df['ingredients'] = combined_df['ingredients'].str.replace('[', '').str.replace(']', '')


In [8]:
#Checking if there are any duplicates before processing for the matrix

duplicate_names = combined_df[combined_df['name'].duplicated(keep=False)]
print(duplicate_names)


Empty DataFrame
Columns: [Unnamed: 0, user_id, recipe_id, rating, name, tags, n_steps, steps, ingredients, n_ingredients]
Index: []


In [9]:
# Explode the 'ingredients' column
exploded_df = combined_df.explode('ingredients')

# Find unique ingredients
unique_ingredients = exploded_df['ingredients'].unique()

ingredient_counts = exploded_df['ingredients'].value_counts()

exploded_df['ingredients'] = exploded_df['ingredients'].str.replace('','')

print(ingredient_counts)



 'ice'                           646
 'vodka'                         526
 'sugar'                         487
 'pineapple juice'               468
 'orange juice'                  436
                                ... 
 'sauterne'                        1
 'amaretti cookie'                 1
'chocolate vodka'                  1
'sloes'                            1
 'sugar-free chocolate syrup'      1
Name: ingredients, Length: 2157, dtype: int64


# Cocktail Similarity
## Next We will prepare the features of the data
**Feature Extraction**
The ingredients list will be the primary feature for our content-based filtering.
**One-hot encoding**
This converts our categorical data into a numerical format that machine learning algorithms can understand and process.



In [24]:
# Instantiate the binarizer
mlb = MultiLabelBinarizer()

# Filtering out non-iterable items
filtered_ingredients = [ingredients if isinstance(ingredients, (list, tuple)) else [] for ingredients in combined_df['ingredients']]

# Apply MultiLabelBinarizer on the filtered data
binary_matrix = mlb.fit_transform(filtered_ingredients)

# Convert the binary matrix into a DataFrame for better visualization and manipulation
df_binary = pd.DataFrame(binary_matrix, columns=mlb.classes_)


# Concatenate DataFrames
combined_df = pd.concat([combined_df, df_binary], axis=1)

# Drop rows where 'recipe_id' is NaN
combined_df.dropna(subset=['recipe_id'], inplace=True)

# Convert 'recipe_id' to integer
combined_df['recipe_id'] = combined_df['recipe_id'].astype(int)

# If you also want to convert 'user_id' to integer
combined_df['user_id'] = combined_df['user_id'].astype(int)

combined_df = combined_df.reset_index(drop=True)

import pickle
import gzip

with gzip.open('compressed_combined_df.pkl.gz', 'wb') as f:
    pickle.dump(combined_df, f, protocol=pickle.HIGHEST_PROTOCOL)





combined_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,recipe_id,rating,name,tags,n_steps,steps,ingredients,n_ingredients,...,'white vermouth','white wine','white zinfandel wine','whole cloves','whole milk','wild strawberry gelatin','wine','x-rated fusion liqueur','yellow cake batter','zinfandel'
0,84.0,296027,182985,4.0,watermelon martini,"['60-minutes-or-less', 'time-to-make', 'course...",4.0,['place all ingredients in a shaker and shake ...,"['vodka', 'lime juice', 'triple sec', 'wate...",6.0,...,0,0,0,0,0,0,0,0,0,0
1,691.0,452940,367080,4.0,apple blossom,"['15-minutes-or-less', 'time-to-make', 'course...",3.0,"['pour the brandy , apple juice , and lemon ju...","['brandy', 'apple juice', 'lemon juice', 'l...",4.0,...,0,0,0,0,0,0,0,0,0,0
2,1734.0,303545,259553,5.0,witch s brew,"['30-minutes-or-less', 'time-to-make', 'course...",6.0,['combine champagne and orange juice in large ...,"['sparkling wine', 'orange juice', 'orange s...",8.0,...,0,0,0,0,0,0,0,0,0,0
3,2342.0,538524,280085,5.0,masquerade cocktail,"['15-minutes-or-less', 'time-to-make', 'course...",2.0,"['half fill a cocktail shaker full of ice , ad...","['citrus-infused vodka', 'apple schnapps', '...",6.0,...,0,0,0,0,0,0,0,0,0,0
4,2514.0,276837,56516,2.0,peppermint patty hot chocolate,"['15-minutes-or-less', 'time-to-make', 'course...",7.0,"['combine milk , syrup , and sugar in a sauce ...","['chocolate syrup', 'sugar', 'milk', 'peppe...",5.0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
import h5py

similarity_matrix = cosine_similarity(df_binary)
with h5py.File("compressed_similarity_matrix.h5", "w") as hf:
    hf.create_dataset("similarity_matrix", data=similarity_matrix, compression="gzip", compression_opts=9)



In [None]:
def get_similar_cocktails(input_value, N=5):
    """
    Fetch similar cocktails based on a given cocktail name or ID.
    
    Args:
    - input_value (str or int): Name or ID of the cocktail.
    - N (int): Number of similar cocktails to return. Default is 5.

    Returns:
    - list: Names of top N similar cocktails.
    """
    
    # Determine if input is name or ID
    if isinstance(input_value, str):
        if input_value not in combined_df['name'].values:
            raise ValueError(f"No cocktail named {input_value} found in the dataset.")
        cocktail_index = combined_df[combined_df['name'] == input_value].index[0]
    elif isinstance(input_value, int):  # Assuming ID is an integer
        if input_value not in combined_df['recipe_id'].values:
            raise ValueError(f"No cocktail with ID {input_value} found in the dataset.")
        cocktail_index = combined_df[combined_df['recipe_id'] == input_value].index[0]
    else:
        raise ValueError("Input value must be either a name (string) or an ID (integer).")
    
    # Fetch and enumerate similarity scores for the given cocktail
    similar_scores = list(enumerate(similarity_matrix[cocktail_index]))
    
    # Sort the scores
    sorted_similar_scores = sorted(similar_scores, key=lambda x: x[1], reverse=True)
    
    # Return the top N cocktail names excluding the input cocktail itself
    return [combined_df.iloc[i[0]]['name'] for i in sorted_similar_scores[1:N+1]]

get_similar_cocktails(98221)


['frozen mixed berry margarita',
 'world s best margaritas',
 'truth serum  margaritas',
 'margaritas made with beer',
 'easy pitcher of margaritas']

## Web Scraping

The provided code demonstrates an automated process for extracting the first image from Google Image search results based on a list of cocktail-related queries.

---

### Libraries and Initialization:

**Selenium** is used for web automation. It opens a Chrome browser and interacts with web pages.  
**Requests** and **io** libraries facilitate the downloading of images from the internet.  
**PIL** from the Pillow library aids in image manipulation.

---

### Main Functions:

**1. `get_first_image_from_google`:**  
This function searches Google Images with a given query.  
It clicks on the first image thumbnail to view it in full.  
The direct URL of the image is then extracted and returned.

**2. `download_image`:**  
Given an image URL, this function fetches the image using the requests library.  
The image is then saved to the local disk using the Pillow library.

---

### Execution:

**Queries Creation:**  
The code prepares a list of queries named `queries`, where each query is generated by appending "food.com cocktail" to each 'name' from the `combined_df` DataFrame.

**Image Extraction:**  
The code demonstrates two potential methods:  
- Looping through the entire list of queries to download images for each one.
- Looping through a limited number (e.g., first 5) of queries from the list.

In this script, the second method is active. The code performs an image search for the first n cocktails from the list, fetches their first images, and saves them with a numerical filename (0.jpg, 1.jpg, etc.).

After all operations, the automated browser session (`wd`) is closed using `wd.quit()`.




In [None]:
import os

# Set the path to your folder
folder_path = "C:\\Users\\samue\\OneDrive - UTS (1)\\2023\\Spring\\Software Innovation Studio\\Cocktail Images\\"

# List all files with .jpg extension
jpg_files = [f for f in os.listdir(folder_path)]

for jpg_file in jpg_files:
    # Construct the full path of the file
    old_file_path = os.path.join(folder_path, jpg_file)

    # Generate the new filename by removing .jpg from the name but keeping the extension
    new_file_name = jpg_file.replace('.jpg', '') + '.jpg'

    # Construct the new full path for the renamed file
    new_file_path = os.path.join(folder_path, new_file_name)

    # Rename the file
    os.rename(old_file_path, new_file_path)

print("Renaming complete!")



# Assuming your filenames without the .jpg extension correspond to the IDs
# Extract IDs from filenames
file_ids = [int(f.split('.')[0]) for f in jpg_files if f.split('.')[0].isdigit()]


# Check if each ID in the DataFrame has a corresponding file
combined_df['has_file'] = combined_df['recipe_id'].isin(file_ids)

# Filter the DataFrame to get rows where 'has_file' is False
missing_files_df = combined_df[combined_df['has_file'] == False]


print(missing_files_df)


Renaming complete!
      Unnamed: 0     user_id  recipe_id  rating                        name  \
29       13718.0      120647     203720     5.0                x rated rita   
79       27928.0  2001887945     293079     5.0         irish creme liqueur   
92       30111.0       37779      25693     5.0    passionate white sangria   
94       31089.0       89831     163173     5.0  virgin strawberry daiquiri   
128      40372.0      552613     284312     4.0              sesame delight   
...          ...         ...        ...     ...                         ...   
4265   1118424.0     1388317     174763     5.0              real bellini s   
4267   1119100.0  1800145703      43942     0.0                  grape wine   
4278   1122064.0      383346     227907     5.0                  watermelon   
4286   1124953.0       63553     210296     5.0        swampwater   spirits   
4298   1129595.0      383346     270685     3.0              chocolate rose   

                                

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
import io
from PIL import Image
import time


# See README for setup of chromedriver

PATH = r"C:\Users\samue\OneDrive\Documents\GitHub\software-innovation-studio\modelGeneration\chromedriver.exe"
wd = webdriver.Chrome(PATH)

def get_first_image_from_google(wd, query):
    # Format the URL with the given query
    url = f"https://www.google.com/search?q={query}&tbm=isch"
    wd.get(url)
    
    try:
        # Find the first image thumbnail and click it
        thumbnail = wd.find_element(By.CLASS_NAME, "Q4LuWd")
        thumbnail.click()
        time.sleep(2.5)  # Wait for the image to load

        # Extract the image URL
        image = wd.find_element(By.CLASS_NAME, "r48jcc")
        if image.get_attribute('src') and 'http' in image.get_attribute('src'):
            return image.get_attribute('src')
    except Exception as e:
        print(f"Error: {e}")

    return None

def download_image(download_path, url, file_name_without_extension):
    try:
        image_content = requests.get(url).content
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file)

        # Identify the format
        img_format = image.format
        
        if img_format not in ["JPEG", "PNG"]:
            raise ValueError(f"Unsupported image format: {img_format}")
        
        # Construct file name with correct extension
        file_name = f"{file_name_without_extension}.{img_format.lower()}"

        file_path = download_path + file_name
        with open(file_path, "wb") as f:
            image.save(f, img_format)

        print("Success:", file_name)
    except Exception as e:
        print('FAILED -', e)


print(combined_df.count())

queries = [f"{row['name']} food.com cocktail" for _, row in combined_df.iterrows()]

# # This Loops through entire dataset of cocktails
# for i in range(len(queries)):
#     url = get_first_image_from_google(wd, queries[i])
#     if url:
#         download_image("", url, f"{row['recipe_id']}.jpg")

#Just does n amount
n = 5
for _, row in missing_files_df.iterrows():
    query = f"{row['name']} food.com cocktail"
    url = get_first_image_from_google(wd, query)
    if url:
        download_image("C:\\Users\\samue\\OneDrive - UTS (1)\\2023\\Spring\\Software Innovation Studio\\Cocktail Images\\", url, f"{row['recipe_id']}.jpg")


wd.quit()


Unnamed: 0                  4308
user_id                     4308
recipe_id                   4308
rating                      4308
name                        4308
                            ... 
'wine'                      1280
'x-rated fusion liqueur'    1280
'yellow cake batter'        1280
'zinfandel'                 1280
has_file                    4308
Length: 2168, dtype: int64
FAILED - cannot identify image file <_io.BytesIO object at 0x00000271B5845EE0>
FAILED - cannot identify image file <_io.BytesIO object at 0x00000271B20F4F40>
FAILED - cannot identify image file <_io.BytesIO object at 0x00000271B5845EE0>
Success: 111316.jpg.png
FAILED - cannot identify image file <_io.BytesIO object at 0x00000271C65E8D10>
FAILED - Unsupported image format: WEBP
FAILED - cannot identify image file <_io.BytesIO object at 0x00000271C65E8D10>
FAILED - cannot identify image file <_io.BytesIO object at 0x00000271B5845EE0>
FAILED - cannot identify image file <_io.BytesIO object at 0x00000271B2F