In [9]:
import numpy as np
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

#from surprise import SVD
#from surprise import Dataset
#from surprise import Reader
#from surprise.model_selection import train_test_split
#from surprise import accuracy



%matplotlib inline

import tensorflow.keras as tf

In [1]:
import pandas as pd
import zipfile

# Specify the path to the zip file and the names of the CSV files within the zip
zip_file_path = "cocktail-data/dataset.zip"
csv_file_name1 = 'RAW_recipes.csv'
csv_file_name2 = 'RAW_interactions.csv'  # Replace with the actual name of your second CSV file

# Initialize two DataFrames to store the data from the two CSV files
df1 = None
df2 = None

# Open the zip file and read the first CSV file into the first DataFrame
with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    with zip_file.open(csv_file_name1) as csv_file_in_zip:
        df_recipes = pd.read_csv(csv_file_in_zip)

# Open the zip file again and read the second CSV file into the second DataFrame
with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    with zip_file.open(csv_file_name2) as csv_file_in_zip:
        df_reviews = pd.read_csv(csv_file_in_zip)

# Now you have two DataFrames, df1 and df2, containing data from the two CSV files
# You can merge them if needed

df_recipes.rename(columns={'id': 'recipe_id'}, inplace=True)




In [2]:
df_recipes.head()

(df_reviews.head())

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [3]:
combined_df = pd.merge(df_reviews, df_recipes, on='recipe_id', how='inner')
combined_df = combined_df.drop_duplicates(subset=['name'])




In [4]:
combined_df = combined_df[combined_df['tags'].str.contains('cocktails', case=False, na=False)]

combined_df.rename(columns={'id': 'recipe_id'}, inplace=True)

columns_to_drop = ['date', 'review','minutes','nutrition','contributor_id', 'description','submitted']

combined_df = combined_df.drop(columns=columns_to_drop)

to_keep = ['recipe_id','rating','name','n_steps','ingredients','steps','n_ingredients']

mongo_csv = combined_df[to_keep]

mongo_csv.to_csv("cocktail_data_mongo.csv", index=False)




In [5]:
cocktail_review_counts = combined_df.groupby('recipe_id')['rating'].count().reset_index()
cocktail_review_counts.columns = ['recipe_id', 'review_count']
from tabulate import tabulate
# Sort the DataFrame by review_count in descending order
# Sort the review counts DataFrame by review_count in descending order
cocktail_review_counts_sorted = cocktail_review_counts.sort_values(by='review_count', ascending=False)

# Merge the review counts DataFrame with the recipes DataFrame to get cocktail names
table_data = cocktail_review_counts_sorted.merge(df_recipes[['recipe_id', 'name']], on='recipe_id', how='inner')

# Display the table
table = tabulate(table_data, headers=['Recipe ID','Review Count','Cocktail Name'], tablefmt='pretty', showindex=False)
print(table)

print(cocktail_review_counts_sorted['review_count'].mean())



+-----------+--------------+------------------------------------------------------------------+
| Recipe ID | Review Count |                          Cocktail Name                           |
+-----------+--------------+------------------------------------------------------------------+
|   3450    |      1       |                              punch                               |
|  304710   |      1       |                           tko shooter                            |
|  303739   |      1       |                  harley davidson shot  shooter                   |
|  303753   |      1       |                       baileys mocha frapp                        |
|  303800   |      1       |                        creamy margaritas                         |
|  303875   |      1       |                            sour kirby                            |
|  303878   |      1       |                     caribbean queen cocktail                     |
|  303968   |      1       |            

In [6]:

# Only process if the first item in 'ingredients' column is a string
if isinstance(combined_df['ingredients'].iloc[0], str):
    # Remove [ and ] 
    combined_df['ingredients'] = combined_df['ingredients'].str.replace('[', '').str.replace(']', '')
    # Splitting the ingredients string by commas
    combined_df['ingredients'] = combined_df['ingredients'].str.split(',')
    
if isinstance(combined_df['ingredients'].iloc[0], float):
    combined_df['ingredients'] = combined_df['ingredients'].apply(lambda x: [item.strip() for item in x])


combined_df.head()
#ingredient_frequency = combined_df.explode('ingredients')['ingredients'].value_counts()
#print(ingredient_frequency)

combined_df = combined_df.loc[:,~combined_df.columns.duplicated(keep='first')]



#ingredient_frequency.to_csv("ingredient_frequency.csv", index=False)



  combined_df['ingredients'] = combined_df['ingredients'].str.replace('[', '').str.replace(']', '')


In [7]:
duplicate_names = combined_df[combined_df['name'].duplicated(keep=False)]
print(duplicate_names)


Empty DataFrame
Columns: [user_id, recipe_id, rating, name, tags, n_steps, steps, ingredients, n_ingredients]
Index: []


In [18]:
# Explode the 'ingredients' column
exploded_df = combined_df.explode('ingredients')

# Find unique ingredients
unique_ingredients = exploded_df['ingredients'].unique()

ingredient_counts = exploded_df['ingredients'].value_counts()

exploded_df['ingredients'] = exploded_df['ingredients'].str.replace('','')


print(ingredient_counts)



 'ice'                           646
 'vodka'                         526
 'sugar'                         487
 'pineapple juice'               468
 'orange juice'                  435
                                ... 
 'sauterne'                        1
 'amaretti cookie'                 1
'chocolate vodka'                  1
'sloes'                            1
 'sugar-free chocolate syrup'      1
Name: ingredients, Length: 2156, dtype: int64


# Cocktail Similarity
## Next We will prepare the features of the data
**Feature Extraction**
The ingredients list will be the primary feature for our content-based filtering.
**One-hot encoding**
This converts our categorical data into a numerical format that machine learning algorithms can understand and process.



In [10]:
# Instantiate the binarizer
mlb = MultiLabelBinarizer()

# Filtering out non-iterable items
filtered_ingredients = [ingredients if isinstance(ingredients, (list, tuple)) else [] for ingredients in combined_df['ingredients']]

# Apply MultiLabelBinarizer on the filtered data
binary_matrix = mlb.fit_transform(filtered_ingredients)

# Convert the binary matrix into a DataFrame for better visualization and manipulation
df_binary = pd.DataFrame(binary_matrix, columns=mlb.classes_)


combined_df = pd.concat([combined_df, df_binary], axis=1)



combined_df.head()

Unnamed: 0,user_id,recipe_id,rating,name,tags,n_steps,steps,ingredients,n_ingredients,"""captain morgan's spiced rum""",...,'white vermouth','white wine','white zinfandel wine','whole cloves','whole milk','wild strawberry gelatin','wine','x-rated fusion liqueur','yellow cake batter','zinfandel'
84,296027.0,182985.0,4.0,watermelon martini,"['60-minutes-or-less', 'time-to-make', 'course...",4.0,['place all ingredients in a shaker and shake ...,"['vodka', 'lime juice', 'triple sec', 'wate...",6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
691,452940.0,367080.0,4.0,apple blossom,"['15-minutes-or-less', 'time-to-make', 'course...",3.0,"['pour the brandy , apple juice , and lemon ju...","['brandy', 'apple juice', 'lemon juice', 'l...",4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1734,303545.0,259553.0,5.0,witch s brew,"['30-minutes-or-less', 'time-to-make', 'course...",6.0,['combine champagne and orange juice in large ...,"['sparkling wine', 'orange juice', 'orange s...",8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2342,538524.0,280085.0,5.0,masquerade cocktail,"['15-minutes-or-less', 'time-to-make', 'course...",2.0,"['half fill a cocktail shaker full of ice , ad...","['citrus-infused vodka', 'apple schnapps', '...",6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2514,276837.0,56516.0,2.0,peppermint patty hot chocolate,"['15-minutes-or-less', 'time-to-make', 'course...",7.0,"['combine milk , syrup , and sugar in a sauce ...","['chocolate syrup', 'sugar', 'milk', 'peppe...",5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
similarity_matrix = cosine_similarity(df_binary)


In [14]:
def get_similar_cocktails(cocktail_name, N=5):
    # Fetch the index corresponding to the given cocktail name
    cocktail_index = combined_df[combined_df['name'] == cocktail_name].index[0]
    
    # Fetch and enumerate similarity scores for the given cocktail
    similar_scores = list(enumerate(similarity_matrix[cocktail_index]))
    
    # Sort the scores
    sorted_similar_scores = sorted(similar_scores, key=lambda x: x[1], reverse=True)
    
    # Return the top N cocktail names excluding the input cocktail itself
    return [combined_df.iloc[i[0]]['name'] for i in sorted_similar_scores[1:N+1]]

get_similar_cocktails("witch s brew")


['strawberries romanoff in a glass',
 'amaretto sour jello shots',
 'alice in wonderland',
 'leap year cocktail',
 'jamaican shake']