# **Basket and Recipes**
This notebook matches each basket in the groceries dataset with the recipes that share ingredients with it, so that the basket can be associated with a cuisine style.

### **Import** section

In [2]:
from multiprocessing import Pool
from random import sample
import numpy as np
import pandas as pd
import itertools
import json
import ast

---
### **Natural Language Toolkit**
#### Used for matching items in the basket dataset and recipes dataset

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import re

# Download the NLTK resources used by itemParser (stop-word list and tokenizer models).
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)

# Single stemmer instance shared by every itemParser call.
ps = PorterStemmer()

# Lazily-built cache of the English stop words, shared by every itemParser call.
_ITEM_PARSER_STOP_WORDS = None


def itemParser(s):
    """Normalize an item name so that similar names map to the same string.

    The text is stripped of newlines, parenthesised remarks, words tagged with
    a trademark sign, punctuation and digits. It is then lower-cased, English
    stop words are dropped and every remaining token is replaced by its
    Porter stem (which also removes plurals).

    Parameters
    ----------
    s : str
        Raw item name, possibly ending with a newline.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (may be empty).
    """
    global _ITEM_PARSER_STOP_WORDS
    if _ITEM_PARSER_STOP_WORDS is None:
        # Loading the stop-word list is loop-invariant: do it once, not per item.
        _ITEM_PARSER_STOP_WORDS = set(stopwords.words('english'))

    s = s.replace("\n", "")
    # These two substitutions must run BEFORE punctuation is stripped: once the
    # parentheses and the trademark sign are gone they can no longer match.
    s = re.sub(r'\([^)]*\)', '', s)
    s = re.sub(r'\w*\u2122', '', s)
    s = re.sub(r'[^\w\s]', '', s)
    s = re.sub(r"(\d)", "", s)
    s = s.lower()

    word_tokens = word_tokenize(s)
    filtered_sentence = [ps.stem(w) for w in word_tokens if w not in _ITEM_PARSER_STOP_WORDS]
    return ' '.join(filtered_sentence)

[nltk_data] Downloading package stopwords to /home/elia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/elia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


---
### **Utility** function

In [4]:
def jaccard_sim(s1, s2):
    """Return the Jaccard similarity of two sets.

    The similarity is the size of the intersection divided by the size of the
    union, a value between 0 and 1.

    Parameters
    ----------
    s1 : set
        First set.
    s2 : set
        Second set.

    Returns
    -------
    float
        ``len(s1 & s2) / len(s1 | s2)``, or ``0.0`` when both sets are empty
        (instead of raising ``ZeroDivisionError``).
    """
    union = s1.union(s2)
    if not union:
        # Two empty sets share nothing: report no similarity rather than dividing by zero.
        return 0.0
    return len(s1.intersection(s2)) / len(union)

In [5]:
def getRecipeByID(rec, idr):
    """Return ``(ingredients, cuisine)`` for the recipe whose id is ``idr``.

    ``rec`` is a DataFrame with ``id``, ``new_ingredients`` and ``cuisine``
    columns. The ``new_ingredients`` cell holds the textual form of a Python
    literal, which is evaluated with ``ast.literal_eval``. Only the first
    matching row is used; an ``IndexError`` is raised when no row matches.
    """
    row = rec[rec.id == idr]
    ingredients = ast.literal_eval(row.new_ingredients.values[0])
    cuisine = row.cuisine.values[0]
    return (ingredients, cuisine)

---

### **DataSet** Reading

In [4]:
# Each line of the file is one basket: a comma-separated list of item names.
# Every item is normalized with itemParser and the basket is stored as a set.
baskets = []
with open('./norm-dataset/groceries.txt', 'r') as f:
    for line in f:
        baskets.append({itemParser(item) for item in line.split(",")})

In order to make the computation faster, we take a sample (10%) of the whole recipes dataset

In [6]:
# Fraction of the recipes dataset kept in the sample.
keep = 0.1
# Fixed seed, so that the same rows are sampled on every run of the notebook.
sample_seed = 42
recipes = pd.read_csv('./data/train-clean.csv')
sample_recipes = recipes.sample(n=int(len(recipes)*keep), random_state=sample_seed)

---
### **Parsing** Functions

In [7]:
def parse_ingredients(string_vector):
    """Convert the textual form of an ingredient list into a set of ingredients.

    The input is expected to look like ``"['pork loin', 'onion']"``. It is
    evaluated with ``ast.literal_eval`` (the same approach getRecipeByID uses
    on this column), so double-quoted items, items containing commas and the
    empty list ``"[]"`` are handled correctly.

    Parameters
    ----------
    string_vector : str
        String representation of a list of ingredient names.

    Returns
    -------
    set
        The distinct ingredients; an empty set for ``"[]"``.
    """
    try:
        parsed = ast.literal_eval(string_vector)
        if isinstance(parsed, (list, tuple, set, frozenset)):
            return set(parsed)
    except (ValueError, SyntaxError, TypeError):
        # Not a valid (hashable) literal: fall through to the legacy parsing below.
        pass

    # Legacy fallback: compact the inner quote symbols, remove the brackets
    # together with the first and last quote symbols, then split on commas.
    compact = string_vector.replace("', '", ',')
    return set(compact[2:-2].split(","))

In [8]:
def parse_recipes_dataset(recipes):
    """Turn the recipes DataFrame into a list of ``(cuisine, id, ingredients)`` tuples.

    One tuple is produced per row, in row order. ``ingredients`` is the set
    obtained by passing the row's ``new_ingredients`` value to parse_ingredients.
    """
    return [
        (row.cuisine, row.id, parse_ingredients(row.new_ingredients))
        for row in recipes.itertuples(index=True, name='Pandas')
    ]

---

### **Matching**

In [9]:
def match(basket, recipes):
    """Find every recipe that shares at least one ingredient with ``basket``.

    ``recipes`` is an iterable of ``(cuisine, id, ingredients)`` tuples, where
    ``ingredients`` is a set. Each recipe is scored against the basket with
    jaccard_sim and kept only when the score is positive.

    Returns a pair ``(basket_as_list, valid)`` where ``valid`` is a list of
    ``(similarity, cuisine, id)`` tuples in the order the recipes were given.
    """
    # Score every recipe lazily; recipe[2] is its ingredient set.
    scored = ((jaccard_sim(recipe[2], basket), recipe) for recipe in recipes)

    # Keep the recipes with a positive similarity, as (similarity, cuisine, id).
    valid = [
        (float(score), recipe[0], recipe[1])
        for score, recipe in scored
        if score > 0
    ]

    return (list(basket), valid)

In [10]:
# The recipes must be parsed before matching: each ingredients string is converted
# into a set, which is what jaccard_sim operates on.
# NOTE(review): this parses the full `recipes` frame, not `sample_recipes` (the 10%
# sample built earlier) -- confirm which of the two is intended.
parsed_dataset = parse_recipes_dataset(recipes)

In [10]:
def matching(basket):
    """Single-argument wrapper around match, bound to the parsed recipes, for Pool.map."""
    return match(basket, parsed_dataset)

# Number of worker processes used to match the baskets in parallel.
cores = 6
with Pool(cores) as pool:
    # One (basket, valid_recipes) pair per basket, in the same order as `baskets`.
    return_values = pool.map(matching, baskets)
    best_assoc = list(return_values)

---
### **Cleaning and Saving**

In [11]:
# Number of items in the largest basket (0 when there are no baskets).
max_basket_size = max((len(items) for items in baskets), default=0)
print("Max Basket Size: " + str(max_basket_size))

Max Basket Size: 32


In [12]:
# One row per basket: the basket items and the recipes matched to them.
# The column names are given to the constructor so that an empty association list
# yields an empty two-column frame instead of a length-mismatch error.
output = pd.DataFrame(best_assoc, columns=["Basket", "Recipes"])

In [13]:
# Preview the basket/recipes association table.
display(output)

Unnamed: 0,Basket,Recipes
0,"[citru fruit, semifinish bread, readi soup, ma...","[(0.05, irish, 31027), (0.1, french, 18643), (..."
1,"[yogurt, tropic fruit, coff]","[(0.125, french, 36148), (0.1111111111111111, ..."
2,[whole milk],"[(0.14285714285714285, mexican, 25164), (0.058..."
3,"[meat spread, yogurt, pip fruit, cream chee]",[]
4,"[conden milk, veget, whole milk, long life bak...","[(0.1, mexican, 25164), (0.05, moroccan, 27858..."
...,...,...
19665,"[chocol, citru fruit, whippedsour cream, beef,...","[(0.03571428571428571, filipino, 20130), (0.02..."
19666,[cook chocol],[]
19667,"[citru fruit, frozen dessert, yogurt, veget, c...","[(0.047619047619047616, filipino, 20130), (0.0..."
19668,"[bottl beer, semifinish bread, soda, bottl water]","[(0.07142857142857142, indian, 18452), (0.0555..."


In [12]:
def save_assoc_raw(assoc_list, fname):
    """Write ``assoc_list`` to the file ``fname`` as JSON.

    Parameters
    ----------
    assoc_list : list
        JSON-serializable association list (tuples are stored as JSON arrays).
    fname : str
        Path of the file to write; an existing file is overwritten.

    Raises
    ------
    TypeError
        If ``assoc_list`` contains a value that cannot be serialized to JSON.
        In that case the target file is left untouched.
    """
    # Serialize first: a serialization error is then raised before the target
    # file is opened (and truncated).
    payload = json.dumps(assoc_list)
    # The context manager closes the file; no explicit close() is needed.
    with open(fname, 'w') as out_file:
        out_file.write(payload)

In [13]:
def save_assoc_pickle(pd_assoc_list, fname):
    """Pickle ``pd_assoc_list`` to the file ``fname`` through its ``to_pickle`` method.

    In this notebook ``pd_assoc_list`` is the DataFrame of basket/recipes associations.
    """
    pd_assoc_list.to_pickle(fname)

In [14]:
def load_assoc_raw(fname):
    """Read the JSON file ``fname`` and return its decoded content."""
    # The context manager closes the file once the content has been decoded.
    with open(fname, 'r') as in_file:
        return json.load(in_file)

In [15]:
def load_assoc_pickle(fname):
    """Return the object stored in the pickle file ``fname`` (read with ``pd.read_pickle``).

    NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
    """
    return pd.read_pickle(fname)

In [40]:
# Persist the raw (unsorted) associations as JSON and read them back.
# JSON has no tuple type, so the tuples of best_assoc come back as lists in test1.
save_assoc_raw(best_assoc, './data/raw_assoc_basket-recipes.txt')
test1 = load_assoc_raw('./data/raw_assoc_basket-recipes.txt')

In [18]:
# Persist the association table as a pickle file and read it back.
save_assoc_pickle(output, './data/basket-recipes.pkl')
test2 = load_assoc_pickle('./data/basket-recipes.pkl')

In [42]:
# Show the frame that was read back from the pickle file.
display(test2)

Unnamed: 0,Basket,Recipes
0,"[margarin, citru fruit, readi soup, semifinish...","[(0.05, irish, 31027), (0.1, french, 18643), (..."
1,"[yogurt, tropic fruit, coff]","[(0.125, french, 36148), (0.1111111111111111, ..."
2,[whole milk],"[(0.14285714285714285, mexican, 25164), (0.058..."
3,"[yogurt, pip fruit, meat spread, cream chee]",[]
4,"[long life bakeri product, conden milk, veget,...","[(0.1, mexican, 25164), (0.05, moroccan, 27858..."
...,...,...
19665,"[napkin, salti snack, whippedsour cream, hambu...","[(0.03571428571428571, filipino, 20130), (0.02..."
19666,[cook chocol],[]
19667,"[veget, rum, cling filmbag, citru fruit, butte...","[(0.047619047619047616, filipino, 20130), (0.0..."
19668,"[bottl beer, bottl water, soda, semifinish bread]","[(0.07142857142857142, indian, 18452), (0.0555..."


---

### **Sorting Recipes by Similarity**

In [26]:
# For every basket, order its matched recipes by decreasing similarity
# (the similarity is the first field of each recipe tuple).
sorted_assoc = [
    (association[0], sorted(association[1], key=lambda x: x[0], reverse=True))
    for association in best_assoc
]
    

In [34]:
# Association table with each basket's recipes ordered by decreasing similarity.
# The column names are given to the constructor so that an empty association list
# yields an empty two-column frame instead of a length-mismatch error.
output = pd.DataFrame(sorted_assoc, columns=["Basket", "Recipes"])
save_assoc_pickle(output, './data/basket-recipes - sorted.pkl')
display(output)

Unnamed: 0,Basket,Recipes
0,"[citru fruit, semifinish bread, readi soup, ma...","[(0.14285714285714285, mexican, 3500), (0.1428..."
1,"[yogurt, tropic fruit, coff]","[(0.125, french, 36148), (0.1111111111111111, ..."
2,[whole milk],"[(0.5, indian, 32030), (0.5, indian, 9488), (0..."
3,"[meat spread, yogurt, pip fruit, cream chee]",[]
4,"[conden milk, veget, whole milk, long life bak...","[(0.2, indian, 32030), (0.2, indian, 9488), (0..."
...,...,...
19665,"[chocol, citru fruit, whippedsour cream, beef,...","[(0.19047619047619047, southern_us, 38721), (0..."
19666,[cook chocol],[]
19667,"[citru fruit, frozen dessert, yogurt, veget, c...","[(0.15384615384615385, italian, 6539), (0.1428..."
19668,"[bottl beer, semifinish bread, soda, bottl water]","[(0.16666666666666666, southern_us, 46178), (0..."


---
### **Analysis**

In [23]:
# The JSON file holds the associations as saved from best_assoc, i.e. unsorted.
# Each basket's recipes are therefore re-sorted by decreasing similarity after
# loading, so that `sorted_assoc` really is sorted.
sorted_assoc = [
    (basket_items, sorted(recipe_matches, key=lambda x: x[0], reverse=True))
    for basket_items, recipe_matches in load_assoc_raw('./data/raw_assoc_basket-recipes.txt')
]

In [25]:
# Baskets matched by exactly one recipe, stored as (basket, (cuisine, recipe_id)).
# Each match is a (similarity, cuisine, id) triple, hence the indices 1 and 2.
single_recipes = [
    (entry[0], (entry[1][0][1], entry[1][0][2]))
    for entry in sorted_assoc
    if len(entry[1]) == 1
]

In [26]:
# Inspect the baskets that matched exactly one recipe.
single_recipes

[(['frankfurt', 'bottl water', 'rollsbun'], ('french', 13111)),
 (['frankfurt', 'dessert'], ('french', 13111)),
 (['frankfurt', 'rollsbun'], ('french', 13111)),
 (['frankfurt', 'rollsbun'], ('french', 13111)),
 (['frankfurt', 'hygien articl', 'bottl water', 'rollsbun', 'fruitveget juic'],
  ('french', 13111)),
 (['frankfurt', 'rollsbun'], ('french', 13111)),
 (['yogurt', 'frankfurt'], ('french', 13111)),
 (['frankfurt'], ('french', 13111)),
 (['frankfurt', 'rollsbun'], ('french', 13111)),
 (['frankfurt', 'rollsbun'], ('french', 13111)),
 (['bottl beer', 'frankfurt'], ('french', 13111)),
 (['deterg', 'frozen fruit'], ('jamaican', 34911)),
 (['frozen meal', 'frankfurt', 'shop bag', 'semifinish bread'],
  ('french', 13111)),
 (['pip fruit', 'frankfurt', 'bottl water', 'rollsbun'], ('french', 13111)),
 (['hamburg meat', 'frankfurt'], ('french', 13111)),
 (['frankfurt', 'rollsbun', 'fruitveget juic'], ('french', 13111)),
 (['frankfurt'], ('french', 13111)),
 (['frankfurt', 'domest egg'], ('

In [70]:
# Look up the ingredients and cuisine of recipe 13111, which appears in the single-recipe matches.
getRecipeByID(recipes, 13111)

(['pork loin',
  'champagn',
  'frankfurt',
  'garlic clove',
  'sauerkraut',
  'brat',
  'knockwurst',
  'ground black pepper',
  'salt pork',
  'onion'],
 'french')