# **Basket and Recipes**
This script tries to match each basket in the dataset with the corresponding cuisine style

### **Import** section

In [39]:
from multiprocessing import Pool
from random import sample
import numpy as np
import pandas as pd
import itertools
import json
import ast
from statistics import mean 

---
### **Natural Language Toolkit**
#### Used for matching items in the basket dataset and recipes dataset

In [40]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import re

# Fetch the NLTK resources this notebook relies on: the stop-word lists read
# by stopwords.words() and the 'punkt' tokenizer models (needed by
# word_tokenize).
nltk.download('stopwords')
nltk.download('punkt')

# Single stemmer instance shared by every itemParser call.
ps = PorterStemmer()

# This function is used to convert a string into another similar string
# By removing plurals, stop words, punctuation, and so on...
def itemParser(s):
    """Normalise a raw item/ingredient name so that similar names compare equal.

    Steps, in order: drop newlines, remove parenthesised text, remove any word
    immediately followed by a trademark sign, strip punctuation and digits,
    lowercase, drop English stop words and stem the remaining tokens with the
    module-level stemmer ``ps``.

    Args:
        s: raw item string.

    Returns:
        The surviving stems joined by single spaces (may be the empty string).
    """
    s = s.replace("\n", "")

    # These two removals must run BEFORE punctuation is stripped: once the
    # parentheses and the trademark sign are gone the patterns can never match.
    s = re.sub(r'\([^)]*\)', '', s)
    s = re.sub(r'\w*\u2122', '', s)

    s = re.sub(r'[^\w\s]', '', s)
    s = re.sub(r"(\d)", "", s)
    s = s.lower()

    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(s)

    filtered_sentence = [ps.stem(w) for w in word_tokens if w not in stop_words]
    s = ' '.join(filtered_sentence)

    return s

[nltk_data] Downloading package stopwords to /home/elia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/elia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


---
### **Utility** functions

In [41]:
def jaccard_sim(s1, s2):
    """Return the Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2| of two sets.

    Args:
        s1: first set.
        s2: second set.

    Returns:
        A float in [0, 1]. Two empty sets yield 0.0 instead of raising
        ZeroDivisionError, so callers that keep only positive similarities
        simply treat the pair as "no match".
    """
    union_size = len(s1.union(s2))
    if union_size == 0:
        return 0.0
    return len(s1.intersection(s2)) / union_size

In [42]:
def getRecipeByID(rec, idr):
    """Look up a recipe by id and return ``(ingredients, cuisine)``.

    Args:
        rec: DataFrame with ``id``, ``new_ingredients`` and ``cuisine`` columns.
        idr: value to match against the ``id`` column.

    Returns:
        A tuple of the first matching row's ``new_ingredients`` string
        evaluated as a Python literal, and its ``cuisine`` value.

    Raises:
        IndexError: if no row has the requested id.
    """
    matching_rows = rec[rec.id.eq(idr)]
    raw_ingredients = matching_rows.new_ingredients.values[0]
    cuisine = matching_rows.cuisine.values[0]
    return (ast.literal_eval(raw_ingredients), cuisine)

---

### **DataSet** Reading

In [43]:
# Every line of the CSV is one basket: split it on commas, normalise each
# item with itemParser and keep the distinct results as a set.
baskets = []
with open('../Data/sintetic/test01.csv', 'r') as f:
    for line in f:
        baskets.append({itemParser(item) for item in line.split(",")})

In order to make the computation faster, we take a sample (10%) of the whole recipes dataset

In [44]:
# Fraction of the recipes dataset kept in the sample.
keep = 0.1
recipes = pd.read_csv('../Data/train_norm.csv')
# A fixed seed makes the sample identical on every Restart & Run All.
# NOTE(review): the matching section parses `recipes`, not `sample_recipes`;
# confirm which of the two is meant to be matched against.
sample_recipes = recipes.sample(n=int(len(recipes) * keep), random_state=42)

---
### **Parsing** Functions

In [45]:
def parse_ingredients(string_vector):
    """Turn the string form of an ingredient list into a set of ingredients.

    The string is first read as a Python literal, which copes with items
    quoted with double quotes (e.g. names containing an apostrophe), items
    containing commas, and the empty list. If it is not a valid list/tuple/set
    literal, the original slicing-based parser is used instead.

    Args:
        string_vector: text such as ``"['salt', 'black pepper']"``.

    Returns:
        A set with one entry per ingredient.
    """
    try:
        parsed = ast.literal_eval(string_vector)
        if isinstance(parsed, (list, tuple, set)):
            return set(parsed)
    except (ValueError, SyntaxError, TypeError):
        # Not a usable literal: fall through to the legacy parser below.
        pass

    # Legacy parser: compact and remove inner quote symbols
    string_vector = string_vector.replace("', '", ',')
    # Remove brackets, first and last quote symbols
    string_vector = string_vector[2:-2]
    # Tokenize and make a set
    return set(string_vector.split(","))

In [46]:
def parse_recipes_dataset(recipes):
    """Convert the recipes DataFrame into a list of plain tuples.

    Args:
        recipes: DataFrame with ``cuisine``, ``id`` and ``new_ingredients``
            columns.

    Returns:
        A list with one ``(cuisine, id, ingredients)`` tuple per row, where
        ``ingredients`` is the result of ``parse_ingredients`` on the row's
        ``new_ingredients`` value.
    """
    return [
        (row.cuisine, row.id, parse_ingredients(row.new_ingredients))
        for row in recipes.itertuples(index=True, name='Pandas')
    ]

---

### **Matching**

In [47]:
def match(basket, recipes):
    """Collect every recipe that shares at least one item with the basket.

    Args:
        basket: set of parsed basket items.
        recipes: iterable of ``(cuisine, id, ingredients)`` entries, with
            ``ingredients`` being a set.

    Returns:
        ``(basket_as_list, valid)`` where ``valid`` holds one
        ``(similarity, cuisine, id)`` tuple per recipe whose Jaccard
        similarity with the basket is strictly positive, in input order.
    """
    valid = []

    for entry in recipes:
        score = jaccard_sim(entry[2], basket)
        # Recipes with nothing in common with the basket are skipped
        if not score > 0:
            continue
        valid.append((float(score), entry[0], entry[1]))

    return (list(basket), valid)

In [48]:
# In order to work properly, the dataset must be parsed (each recipe's
# ingredient string is converted to a set by parse_ingredients).
# NOTE(review): this parses the full `recipes` frame, not `sample_recipes`,
# although the surrounding text talks about a 10% sample — confirm intent.
parsed_dataset = parse_recipes_dataset(recipes)

In [49]:
# Single-argument wrapper so the function can be handed to Pool.map
def matching(basket):
    """Match one basket against the module-level ``parsed_dataset``."""
    return match(basket=basket, recipes=parsed_dataset)

# Number of worker processes used for the matching
cores = 6
with Pool(processes=cores) as pool:
    return_values = pool.map(matching, baskets)

# One (basket, matches) entry per basket, in the same order as `baskets`
best_assoc = list(return_values)

---
### **Cleaning and Saving**

In [50]:
# Size of the largest basket (0 when there are no baskets)
max_basket_size = max(map(len, baskets), default=0)
print("Max Basket Size: " + str(max_basket_size))

Max Basket Size: 27


In [51]:
# One row per basket: the basket items and its list of matching recipes.
# Naming the columns in the constructor also works when best_assoc is empty,
# where assigning `.columns` afterwards would raise a length-mismatch error.
output = pd.DataFrame(best_assoc, columns=["Basket", "Recipes"])

In [52]:
# Show the basket -> matching recipes table (matches are still unsorted here).
display(output)

Unnamed: 0,Basket,Recipes
0,"[pip, rollsbun, yogurt]",[]
1,"[pip, herb, soda]","[(0.08333333333333333, cajun_creole, 27976), (..."
2,"[milk, liquor, veget, domest, water, root, yog...","[(0.058823529411764705, southern_us, 25693), (..."
3,"[candi, misc, butter]","[(0.07142857142857142, filipino, 20130), (0.04..."
4,"[milk, fruitveget, herb, misc]","[(0.07142857142857142, southern_us, 25693), (0..."
...,...,...
9995,"[root, domest, candi, frozen]","[(0.16666666666666666, french, 20195), (0.125,..."
9996,"[fruit, chocol, veget, redblush, curd, misc, b...","[(0.043478260869565216, filipino, 20130), (0.0..."
9997,"[uhtmilk, whippedsour, butter]","[(0.07142857142857142, filipino, 20130), (0.04..."
9998,"[butter, curd, candi]","[(0.07142857142857142, filipino, 20130), (0.04..."


In [53]:
def save_assoc_raw(assoc_list, fname):
    """Write the association list to ``fname`` as JSON.

    Args:
        assoc_list: JSON-serialisable object (tuples are written as arrays).
        fname: path of the file to create or overwrite.
    """
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(fname, 'w') as out_file:
        json.dump(assoc_list, out_file)

In [54]:
def save_assoc_pickle(pd_assoc_list, fname):
    """Pickle ``pd_assoc_list`` to ``fname`` through its ``to_pickle`` method."""
    pd_assoc_list.to_pickle(fname)

In [55]:
def load_assoc_raw(fname):
    """Read back an association list stored as JSON.

    Args:
        fname: path of the JSON file to read.

    Returns:
        The decoded content. JSON has no tuple type, so anything saved as a
        tuple comes back as a list.
    """
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(fname, 'r') as in_file:
        return json.load(in_file)

In [56]:
def load_assoc_pickle(fname):
    """Load and return the object pickled at ``fname`` via ``pd.read_pickle``.

    NOTE(review): unpickling can execute arbitrary code, so only use this on
    files produced by this project.
    """
    return pd.read_pickle(fname)

In [40]:
# Persist the unsorted associations as JSON, then read them back.
save_assoc_raw(best_assoc, '../Data/raw_assoc_basket_recipes.txt')
raw = load_assoc_raw('../Data/raw_assoc_basket_recipes.txt')

In [27]:
# Persist the association DataFrame as a pickle, then read it back.
# NOTE(review): the variable name `pickle` would shadow the standard-library
# module if that module is ever imported in this notebook; consider renaming.
save_assoc_pickle(output, '../Data/basket_recipes.pkl')
pickle = load_assoc_pickle('../Data/basket_recipes.pkl')

In [42]:
# NOTE(review): `test2` is not defined in any visible cell, so this raises
# NameError on a fresh kernel unless it is defined elsewhere — confirm which
# frame was meant to be shown.
display(test2)

Unnamed: 0,Basket,Recipes
0,"[margarin, citru fruit, readi soup, semifinish...","[(0.05, irish, 31027), (0.1, french, 18643), (..."
1,"[yogurt, tropic fruit, coff]","[(0.125, french, 36148), (0.1111111111111111, ..."
2,[whole milk],"[(0.14285714285714285, mexican, 25164), (0.058..."
3,"[yogurt, pip fruit, meat spread, cream chee]",[]
4,"[long life bakeri product, conden milk, veget,...","[(0.1, mexican, 25164), (0.05, moroccan, 27858..."
...,...,...
19665,"[napkin, salti snack, whippedsour cream, hambu...","[(0.03571428571428571, filipino, 20130), (0.02..."
19666,[cook chocol],[]
19667,"[veget, rum, cling filmbag, citru fruit, butte...","[(0.047619047619047616, filipino, 20130), (0.0..."
19668,"[bottl beer, bottl water, soda, semifinish bread]","[(0.07142857142857142, indian, 18452), (0.0555..."


---

### **Sorting Recipes by Similarity**

In [57]:
# Same associations as best_assoc, with each basket's matches ordered from
# the highest to the lowest similarity (ties keep their original order).
sorted_assoc = [
    (association[0], sorted(association[1], key=lambda x: x[0], reverse=True))
    for association in best_assoc
]
    

In [58]:
# Rebuild the association table from the similarity-sorted matches.
# Naming the columns in the constructor also works when sorted_assoc is empty,
# where assigning `.columns` afterwards would raise a length-mismatch error.
output = pd.DataFrame(sorted_assoc, columns=["Basket", "Recipes"])
display(output)

Unnamed: 0,Basket,Recipes
0,"[pip, rollsbun, yogurt]",[]
1,"[pip, herb, soda]","[(0.2, southern_us, 46178), (0.166666666666666..."
2,"[milk, liquor, veget, domest, water, root, yog...","[(0.23076923076923078, russian, 37983), (0.222..."
3,"[candi, misc, butter]","[(0.3333333333333333, indian, 41124), (0.33333..."
4,"[milk, fruitveget, herb, misc]","[(0.25, french, 21158), (0.2222222222222222, i..."
...,...,...
9995,"[root, domest, candi, frozen]","[(0.16666666666666666, french, 20195), (0.125,..."
9996,"[fruit, chocol, veget, redblush, curd, misc, b...","[(0.15384615384615385, brazilian, 46423), (0.1..."
9997,"[uhtmilk, whippedsour, butter]","[(0.3333333333333333, indian, 41124), (0.33333..."
9998,"[butter, curd, candi]","[(0.3333333333333333, indian, 41124), (0.33333..."


In [21]:
# Load the clustered baskets from a pickle; the cells below read its
# "Cluster" column.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
clustered = pd.read_pickle("../Data/Clustered_Basket.pkl")

In [23]:
# Attach the cluster label of each basket; the assignment aligns the values
# with `output` on the row index.
output["K_Means"] = pd.Series(clustered["Cluster"])
# NOTE(review): DB_Scan is disabled here, yet the cell that builds `final`
# reads a fourth column of `output` and labels it DB_Scan — confirm whether
# this line should be re-enabled.
#output["DB_Scan"] = pd.Series(clustered["DB_Scan"])

In [24]:
# Show the association table together with the cluster labels.
display(output)

Unnamed: 0,Basket,Recipes,K_Means
0,"[semifinish bread, citru fruit, margarin, read...","[(0.14285714285714285, mexican, 3500), (0.1428...",2
1,"[tropic fruit, coff, yogurt]","[(0.125, french, 36148), (0.1111111111111111, ...",3
2,[whole milk],"[(0.5, indian, 32030), (0.5, indian, 9488), (0...",7
3,"[yogurt, pip fruit, meat spread, cream chee]",[],3
4,"[veget, whole milk, conden milk, long life bak...","[(0.2, indian, 32030), (0.2, indian, 9488), (0...",9
...,...,...,...
9830,"[whole milk, redblush wine, chocol, whippedsou...","[(0.19047619047619047, southern_us, 38721), (0...",9
9831,[cook chocol],[],2
9832,"[frozen dessert, butter, veget, citru fruit, r...","[(0.15384615384615385, italian, 6539), (0.1428...",4
9833,"[bottl beer, bottl water, semifinish bread, soda]","[(0.16666666666666666, southern_us, 46178), (0...",0


In [158]:
# For every basket pick the best cuisine: each cuisine is scored with
# (number of matching recipes) * (mean similarity of those recipes) and the
# highest score wins; on ties the cuisine seen first is kept.
final = []
for o in output.values:
    basket = o[0]

    # Deliberately not called `recipes`: that name is the recipes DataFrame
    # and is still needed by later cells (e.g. getRecipeByID).
    matched_recipes = o[1]

    if len(matched_recipes) > 0:
        # cuisine -> similarities of all matching recipes of that cuisine
        similarities_by_cuisine = {}
        for recipe in matched_recipes:
            similarities_by_cuisine.setdefault(recipe[1], []).append(recipe[0])

        scores = {
            name: len(values) * mean(values)
            for name, values in similarities_by_cuisine.items()
        }
        cuisine, score = max(scores.items(), key=lambda item: item[1])

    else:
        cuisine = None
        score = 0

    kmeans = o[2]
    # The DB_Scan column is optional: when `output` has no fourth column the
    # label is recorded as None instead of raising IndexError.
    dbs = o[3] if len(o) > 3 else None

    final.append([basket, cuisine, score, kmeans, dbs])

In [159]:
# One row per basket with its winning cuisine, score and cluster labels.
# Naming the columns in the constructor also works when `final` is empty,
# where assigning `.columns` afterwards would raise a length-mismatch error.
best_recipes = pd.DataFrame(
    final,
    columns=["Basket", "Cuisine", "Confidence", "K_Means", "DB_Scan"],
)
display(best_recipes)

Unnamed: 0,Basket,Cuisine,Confidence,K_Means,DB_Scan
0,"[semifinish bread, readi soup, margarin, citru...",southern_us,4.545730,3,0
1,"[tropic fruit, yogurt, coff]",french,0.125000,0,0
2,[whole milk],southern_us,19.588293,2,1
3,"[pip fruit, cream chee, meat spread, yogurt]",,0.000000,0,0
4,"[conden milk, whole milk, long life bakeri pro...",southern_us,15.074193,9,1
...,...,...,...,...,...
9830,"[whole milk, beef, hamburg meat, citru fruit, ...",southern_us,72.196639,1,1
9831,[cook chocol],,0.000000,3,0
9832,"[citru fruit, veget, cling filmbag, rum, butte...",southern_us,74.624349,14,2
9833,"[semifinish bread, bottl water, bottl beer, soda]",southern_us,0.752915,5,0


In [170]:
# Persist the per-basket best-cuisine table.
best_recipes.to_pickle("../Data/Best_recipes.pkl")

---
### **Analysis**

In [171]:
#sorted_assoc = load_assoc_raw('../Data/raw_assoc_basket_recipes.txt')

In [176]:
# Count the baskets for every (K_Means cluster, cuisine) pair. count() fills
# each remaining column with the number of non-null values in the group.
# NOTE(review): with pandas' default groupby settings, rows whose Cuisine is
# missing (baskets without any match) are presumably left out — confirm.
grouped_recipes = best_recipes.groupby(["K_Means", "Cuisine"]).count().reset_index()

display(grouped_recipes)

Unnamed: 0,K_Means,Cuisine,Basket,Confidence,DB_Scan
0,0,cajun_creole,4,4,4
1,0,chinese,34,34,34
2,0,filipino,2,2,2
3,0,french,65,65,65
4,0,indian,55,55,55
...,...,...,...,...,...
121,18,italian,10,10,10
122,18,southern_us,75,75,75
123,19,indian,4,4,4
124,19,italian,7,7,7


In [183]:
# Keep only the first three columns (K_Means, Cuisine and the Basket count).
grouped_recipes = grouped_recipes.iloc[:,:3]

In [173]:
# cluster -> list of [cuisine, basket count] pairs
x = {}
for row in grouped_recipes.values:
    x.setdefault(row[0], []).append([row[1], row[2]])

In [174]:
# Print, for every cluster, the cuisine with the largest basket count
# (the first one wins on ties).
for cluster, cuisines in x.items():
    top_cuisine, top_count = None, -1
    for name, count in cuisines:
        if count > top_count:
            top_cuisine, top_count = name, count
    print([top_cuisine, top_count])

['southern_us', 121]
['southern_us', 180]
['southern_us', 950]
['mexican', 587]
['southern_us', 316]
['southern_us', 596]
['italian', 86]
['chinese', 216]
['southern_us', 35]
['southern_us', 263]
['southern_us', 33]
['italian', 58]
['southern_us', 114]
['southern_us', 40]
['chinese', 25]
['chinese', 85]
['southern_us', 122]
['italian', 13]
['southern_us', 75]
['southern_us', 40]


In [43]:
# Baskets matched by exactly one recipe, paired with that recipe's
# (cuisine, id).
single_recipes = [
    (entry[0], (entry[1][0][1], entry[1][0][2]))
    for entry in sorted_assoc
    if len(entry[1]) == 1
]

In [44]:
# Show the baskets that matched exactly one recipe.
single_recipes

[(['rollsbun', 'bottl water', 'frankfurt'], ('french', 13111)),
 (['dessert', 'frankfurt'], ('french', 13111)),
 (['rollsbun', 'frankfurt'], ('french', 13111)),
 (['rollsbun', 'frankfurt'], ('french', 13111)),
 (['rollsbun', 'frankfurt', 'fruitveget juic', 'bottl water', 'hygien articl'],
  ('french', 13111)),
 (['rollsbun', 'frankfurt'], ('french', 13111)),
 (['frankfurt', 'yogurt'], ('french', 13111)),
 (['frankfurt'], ('french', 13111)),
 (['rollsbun', 'frankfurt'], ('french', 13111)),
 (['rollsbun', 'frankfurt'], ('french', 13111)),
 (['bottl beer', 'frankfurt'], ('french', 13111)),
 (['frozen fruit', 'deterg'], ('jamaican', 34911)),
 (['frozen meal', 'shop bag', 'semifinish bread', 'frankfurt'],
  ('french', 13111)),
 (['rollsbun', 'pip fruit', 'bottl water', 'frankfurt'], ('french', 13111)),
 (['hamburg meat', 'frankfurt'], ('french', 13111)),
 (['fruitveget juic', 'rollsbun', 'frankfurt'], ('french', 13111)),
 (['frankfurt'], ('french', 13111)),
 (['domest egg', 'frankfurt'], ('

In [45]:
# Inspect recipe 13111, which appears repeatedly in the single-match list.
# NOTE(review): requires `recipes` to still be the recipes DataFrame when
# this cell runs.
getRecipeByID(recipes, 13111)

(['pork loin',
  'champagn',
  'frankfurt',
  'garlic clove',
  'sauerkraut',
  'brat',
  'knockwurst',
  'ground black pepper',
  'salt pork',
  'onion'],
 'french')