# **Basket and Recipes**
This notebook matches each basket in the groceries dataset with the most similar recipes (by Jaccard similarity over normalized item names), and therefore with their cuisine styles.

### **Import** section

In [170]:
import heapq
import itertools
import json
from multiprocessing import Pool
from random import sample

import numpy as np
import pandas as pd

---
### **Natural Language Toolkit**
#### Used for matching items in the basket dataset and recipes dataset

In [171]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import re

# Fetch the NLTK resources needed below: the stop-word lists read by itemParser
# and the 'punkt' data (presumably what word_tokenize relies on - confirm).
nltk.download('stopwords')
nltk.download('punkt')

# Porter stemmer instance shared by every itemParser call
ps = PorterStemmer()

def itemParser(s):
    """Return a normalised, stemmed version of the item name ``s``.

    The steps are applied in this order:

    1. drop newline characters;
    2. drop parenthesised qualifiers such as ``"(appetizer)"``;
    3. drop any word immediately followed by the trademark sign;
    4. drop the remaining punctuation and all digits, then lower-case;
    5. tokenize, discard English stop words and Porter-stem every token.

    Steps 2 and 3 have to run before the punctuation is stripped: once the
    parentheses and the trademark sign have been deleted, their patterns can
    no longer match anything.

    NOTE(review): if the recipe ingredients were normalised elsewhere with a
    different step order, make sure both sides use the same one.

    Parameters
    ----------
    s : str
        Raw item or ingredient name.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (possibly ``""``).
    """
    s = s.replace("\n", "")
    # Structural clean-up first, while the delimiters are still present
    s = re.sub(r'\([^)]*\)', '', s)
    s = re.sub(r'\w*\u2122', '', s)
    # Now it is safe to remove every remaining punctuation mark and digit
    s = re.sub(r'[^\w\s]', '', s)
    s = re.sub(r"\d", "", s)
    s = s.lower()

    # The stop-word list is loaded once and cached on the function object
    # instead of being re-read from the corpus on every call.
    stop_words = getattr(itemParser, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        itemParser._stop_words = stop_words

    return ' '.join(ps.stem(w) for w in word_tokenize(s) if w not in stop_words)

[nltk_data] Downloading package stopwords to /home/elia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/elia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


---
### **Utility** function

In [172]:
def jaccard_sim(s1, s2):
    """Return the Jaccard similarity of two sets.

    The similarity is ``len(s1 & s2) / len(s1 | s2)``, a float in ``[0, 1]``.

    Parameters
    ----------
    s1 : set
    s2 : set (any iterable accepted by ``set.union`` / ``set.intersection``)

    Returns
    -------
    float
        ``0.0`` when both sets are empty (nothing is shared), instead of
        raising ``ZeroDivisionError``.
    """
    union_size = len(s1.union(s2))
    if union_size == 0:
        # Two empty sets: define the similarity as 0 rather than dividing by 0
        return 0.0
    return len(s1.intersection(s2)) / union_size

---

### **DataSet** Reading

In [173]:
# One basket per CSV line: every comma-separated item is normalised with
# itemParser, and duplicates collapse because each basket is stored as a set.
baskets = []
with open('./data/groceries.csv', 'r') as f:
    baskets.extend(
        {itemParser(raw_item) for raw_item in row.split(",")}
        for row in f
    )

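To make the computation faster, we take a random sample (10%) of the whole recipes dataset. Note that the matching step below currently parses the full `recipes` frame rather than `sample_recipes`.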

In [174]:
# Fraction of the recipes dataset kept in the sample
keep = 0.1
# Fixed seed so that the sample is the same on every "Restart & Run All"
sample_seed = 42
recipes = pd.read_csv('./data/train-clean.csv')
sample_recipes = recipes.sample(n=int(len(recipes)*keep), random_state=sample_seed)

---
### **Parsing** Functions

In [175]:
def parse_ingredients(string_vector):
    """Turn a string such as ``"['salt', 'olive oil']"`` into a set of names.

    The ``', '`` separators between quoted entries are collapsed to plain
    commas, the first two and last two characters (opening bracket/quote and
    closing quote/bracket) are cut off, and the remainder is split on commas.
    """
    # Collapse the quote-comma-quote separators into single commas
    compact = ",".join(string_vector.split("', '"))
    # Strip the leading "['" and the trailing "']"
    inner = compact[2:-2]
    return set(inner.split(","))

In [176]:
def parse_recipes_dataset(recipes):
    """Convert the recipes frame into a list of ``(cuisine, id, ingredients)``.

    ``recipes`` must expose the columns ``cuisine``, ``id`` and
    ``new_ingredients``; the latter is parsed into a set with
    ``parse_ingredients``. One tuple is produced per row, in row order.
    """
    return [
        (row.cuisine, row.id, parse_ingredients(row.new_ingredients))
        for row in recipes.itertuples(index=True, name='Pandas')
    ]

---

### **Matching**

In [201]:
def match(basket, recipes):
    """Return the (up to) four recipes most similar to ``basket``.

    Parameters
    ----------
    basket : set
        Normalised item names of one basket.
    recipes : iterable
        Records whose first three fields are ``(cuisine, ID, ingredients)``,
        ``ingredients`` being a set.

    Returns
    -------
    tuple
        ``(basket_as_list, "None")`` when no recipe shares an item with the
        basket. Otherwise the 5-tuple ``(basket_as_list, r1, r2, r3, r4)``
        where every ``r`` is ``(similarity, cuisine, ID)``, ordered by
        decreasing Jaccard similarity (ties keep the input order) and padded
        with ``None`` when fewer than four recipes match.

    NOTE(review): the two return shapes differ in length and the no-match
    marker is the string ``"None"``, not ``None``; it is kept as-is because
    consumers of the saved data may rely on it.
    """
    # Recipes with a strictly positive similarity
    valid = []

    for recipe in recipes:
        cuisine, ID, ingredients = recipe[0], recipe[1], recipe[2]

        # The similarity is positive only if the two sets share an item, so
        # disjoint recipes are skipped without computing any union. This also
        # avoids a division by zero when both sets are empty.
        if ingredients.isdisjoint(basket):
            continue

        valid.append((float(jaccard_sim(ingredients, basket)), cuisine, ID))

    basket = list(basket)

    if not valid:
        return (basket, "None")

    # Same result as sorted(valid, key=..., reverse=True)[:4], without
    # sorting the whole candidate list.
    best = heapq.nlargest(4, valid, key=lambda x: x[0])
    # Pad so that the result always carries four recipe slots
    best.extend([None] * (4 - len(best)))

    return (basket, *best)

In [202]:
# In order to work properly, the dataset must be parsed (string ingredients converted to sets)
# NOTE(review): this parses the full `recipes` frame; `sample_recipes` is built
# earlier but never used in the visible cells - confirm which one is intended.
parsed_dataset = parse_recipes_dataset(recipes)

In [203]:
# Single-argument wrapper around match, as required by Pool.map
def matching(basket):
    """Match one basket against the module-level ``parsed_dataset``."""
    return match(basket, parsed_dataset)

cores = 6
with Pool(cores) as p:
    # One result per basket, in the same order as `baskets`
    return_values = p.map(matching, baskets)
    best_assoc = list(return_values)

---
### **Cleaning and Saving**

In [204]:
# Size of the largest basket (0 when there are no baskets at all)
max_basket_size = max((len(basket_items) for basket_items in baskets), default=0)
print("Max Basket Size: " + str(max_basket_size))

Max Basket Size: 32


In [205]:
# One row per basket: the basket itself followed by its four best recipe slots
output_columns = ["Basket", "Cuisine 1", "Cuisine 2", "Cuisine 3", "Cuisine 4"]
output = pd.DataFrame(best_assoc)
output.columns = output_columns

In [206]:
# Render the basket/recipe association table
display(output)

Unnamed: 0,Basket,Cuisine 1,Cuisine 2,Cuisine 3,Cuisine 4
0,"[margarin, semifinish bread, citru fruit, read...","(0.14285714285714285, mexican, 3500)","(0.14285714285714285, british, 48044)","(0.14285714285714285, brazilian, 6677)","(0.14285714285714285, chinese, 28154)"
1,"[tropic fruit, yogurt, coffe]","(0.25, french, 42120)","(0.25, greek, 39296)","(0.25, cajun_creole, 19588)","(0.25, vietnamese, 39799)"
2,[whole milk],"(0.5, indian, 32030)","(0.5, indian, 9488)","(0.5, indian, 30060)","(0.5, indian, 199)"
3,"[yogurt, pip fruit, meat spread, cream chees]","(0.2, brazilian, 42490)","(0.16666666666666666, british, 11757)","(0.16666666666666666, italian, 6809)","(0.16666666666666666, mexican, 1835)"
4,"[long life bakeri product, whole milk, condens...","(0.2857142857142857, indian, 37190)","(0.25, filipino, 9049)","(0.25, brazilian, 42981)","(0.2222222222222222, jamaican, 29610)"
...,...,...,...,...,...
9830,"[chocol, hygien articl, redblush wine, chicken...","(0.19047619047619047, southern_us, 38721)","(0.15, french, 16622)","(0.15, mexican, 8541)","(0.14285714285714285, southern_us, 41903)"
9831,[cook chocol],,,,
9832,"[yogurt, chicken, butter, domest egg, frozen d...","(0.15384615384615385, italian, 6539)","(0.14285714285714285, mexican, 6181)","(0.14285714285714285, southern_us, 45257)","(0.14285714285714285, greek, 6406)"
9833,"[bottl water, bottl beer, soda, semifinish bread]","(0.16666666666666666, southern_us, 46178)","(0.14285714285714285, southern_us, 11876)","(0.1111111111111111, southern_us, 27994)","(0.1, indian, 11494)"


In [207]:
def save_assoc_raw(assoc_list):
    """Write ``assoc_list`` as JSON to ``./data/raw_assoc_basket-recipes.txt``.

    The file is opened in write mode, so any previous content is replaced.
    """
    with open('./data/raw_assoc_basket-recipes.txt', 'w') as out_file:
        out_file.write(json.dumps(assoc_list))

In [208]:
def save_assoc_pickle(pd_assoc_list):
    """Pickle ``pd_assoc_list`` (a pandas object) to ``./data/basket-receipts.pkl``."""
    target_path = "./data/basket-receipts.pkl"
    pd_assoc_list.to_pickle(target_path)

In [209]:
def load_assoc_raw():
    """Read back the JSON document stored in ``./data/raw_assoc_basket-recipes.txt``."""
    with open('./data/raw_assoc_basket-recipes.txt', 'r') as in_file:
        raw_text = in_file.read()
    return json.loads(raw_text)

In [210]:
def load_assoc_pickle():
    """Load the object pickled in ``./data/basket-receipts.pkl``.

    Unpickling can execute arbitrary code: only load files produced by this
    notebook (or another trusted source).
    """
    source_path = "./data/basket-receipts.pkl"
    return pd.read_pickle(source_path)

In [211]:
# Round trip through the JSON file: write the associations, then read them back
save_assoc_raw(best_assoc)
test1 = load_assoc_raw()

In [212]:
# Round trip through the pickle file: store the table, then load it again
save_assoc_pickle(output)
test2 = load_assoc_pickle()

In [213]:
# Render the table reloaded from the pickle file
display(test2)

Unnamed: 0,Basket,Cuisine 1,Cuisine 2,Cuisine 3,Cuisine 4
0,"[margarin, semifinish bread, citru fruit, read...","(0.14285714285714285, mexican, 3500)","(0.14285714285714285, british, 48044)","(0.14285714285714285, brazilian, 6677)","(0.14285714285714285, chinese, 28154)"
1,"[tropic fruit, yogurt, coffe]","(0.25, french, 42120)","(0.25, greek, 39296)","(0.25, cajun_creole, 19588)","(0.25, vietnamese, 39799)"
2,[whole milk],"(0.5, indian, 32030)","(0.5, indian, 9488)","(0.5, indian, 30060)","(0.5, indian, 199)"
3,"[yogurt, pip fruit, meat spread, cream chees]","(0.2, brazilian, 42490)","(0.16666666666666666, british, 11757)","(0.16666666666666666, italian, 6809)","(0.16666666666666666, mexican, 1835)"
4,"[long life bakeri product, whole milk, condens...","(0.2857142857142857, indian, 37190)","(0.25, filipino, 9049)","(0.25, brazilian, 42981)","(0.2222222222222222, jamaican, 29610)"
...,...,...,...,...,...
9830,"[chocol, hygien articl, redblush wine, chicken...","(0.19047619047619047, southern_us, 38721)","(0.15, french, 16622)","(0.15, mexican, 8541)","(0.14285714285714285, southern_us, 41903)"
9831,[cook chocol],,,,
9832,"[yogurt, chicken, butter, domest egg, frozen d...","(0.15384615384615385, italian, 6539)","(0.14285714285714285, mexican, 6181)","(0.14285714285714285, southern_us, 45257)","(0.14285714285714285, greek, 6406)"
9833,"[bottl water, bottl beer, soda, semifinish bread]","(0.16666666666666666, southern_us, 46178)","(0.14285714285714285, southern_us, 11876)","(0.1111111111111111, southern_us, 27994)","(0.1, indian, 11494)"


---