In [14]:
import pandas as pd
import sqlite3
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz
import joblib

In [2]:
# Open the recipe database and print the names of the tables it contains.
sqlite_db = '../database/recipes_db.sqlite'
conn = sqlite3.connect(sqlite_db)
c = conn.cursor()
c.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_names = c.fetchall()
print(table_names)

[('recipes',)]


In [3]:
# Load the whole `recipes` table into a DataFrame indexed by its `index` column.
# try/finally releases the cursor and the connection even if the read raises,
# so a failed query does not leave the database handle open.
try:
    recipes = pd.read_sql('SELECT * FROM recipes', con=conn, index_col='index')
finally:
    c.close()
    conn.close()

In [4]:
# Preview the raw ingredient text: one string per recipe, ingredients separated by '\n'.
recipes.ingredients

index
0      400g Rigatoni\n1 Butternut Squash\n100g of Par...
1      2 Large Butternut Squash\n1 Red Chilli\n70g Fl...
2      1kg Potatoes\n2 Onions\n2 Cloves of Garlic\n16...
3      500g Parsnips \n500g Carrots  \n2 Sweet Potato...
4      1 kg Chicken Wings\n1 Tbsp of Baking Powder\n3...
                             ...                        
409    3 Avocados - £3.00\nRed Chillies - £0.60\n2 Pa...
410    2 Bags of Cherry Tomatoes - £1.59\n1 Red Chill...
411    3 x 400 Gram Tins of Chopped Tomatoes - £1.20\...
412    1 Red Chilli - £0.60\n4 Spring Onions - £0.49\...
413    1 Loaf of Sourdough - £1.50\n1 Pint of Whole M...
Name: ingredients, Length: 413, dtype: object

In [5]:
def remove_numbers(s):
    """Return `s` with every digit character removed.

    All other characters, including whitespace, are kept in their original order.
    """
    kept_chars = []
    for ch in s:
        if ch.isdigit():
            continue
        kept_chars.append(ch)
    return ''.join(kept_chars)

In [6]:
def remove_single(s):
    """Drop one-character tokens from `s`.

    The string is split on any whitespace and the surviving tokens are
    re-joined with single spaces.
    """
    long_tokens = filter(lambda token: len(token) > 1, s.split())
    return ' '.join(long_tokens)

In [7]:
# Start from NLTK's English stop-word list; the recipe-specific words in
# `other_stop` are appended to it below.
stop = stopwords.words('english')
# Recipe-specific words (units, packaging, preparation adjectives, colours, ...)
# that should be filtered out in addition to the generic English stop words.
other_stop = [
    'kg',
    'grams',
    'cans',
    'of',
    'bags',
    'tbsp',
    'tsp',
    'pinch',
    'pint',
    'pack',
    'packs',
    'fresh',
    'large',
    'small',
    'gram',
    'tins',
    'knob',
    'chopped',
    'smoked',
    'extra',
    'very',
    'lazy',
    'crushed',
    'ml',
    'litre',
    'bunch',
    'dried',
    'teaspoon',
    'tablespoon',
    'tablespoons',
    'bag',
    'half',
    'red',
    'green',
    'yellow',
    'pink',
    'orange',
    # NOTE(review): remove_stop compares one space-separated word at a time,
    # so this multi-word entry can never match there -- confirm whether it
    # should be split into individual words.
    'total cost  covers absolutely everything',
    # The comma after 'tbsps' was missing, which silently concatenated it with
    # the next literal into the single entry "tbspsmob's".
    'tbsps',
    "mob's",
    'creamy',
    # Misspelling of 'crushed'; presumably matches a typo in the data -- confirm.
    'curshed',
    'flat',
    'jar',
    'tin',
    'pitted',
    'bottle',
    'seasoning',
    'only',
    "n'",
    'toasted',
    'giant',
    'sliced',
    'diced',
    'minced',
    'knobs',
    'method',
    'frozen',
    'skinless',
    'boneless',
    'loaf',
    'grated',
    'nice',
    'good',
]
# Merge the recipe-specific words into the stop-word list (in place).
stop += other_stop

# Punctuation and currency characters stripped from ingredient lines.
symbols = list('.£-!():,')

In [8]:
def remove_stop(s, fuzzy=False, stop_words=None, threshold=50):
    """Remove stop words from a space-separated string.

    Parameters
    ----------
    s : str
        Text to filter; it is split on single spaces.
    fuzzy : bool, default False
        If False, a word is dropped only when it is exactly equal to a stop
        word. If True, a word is dropped when its ``fuzz.ratio`` similarity to
        any stop word is greater than ``threshold``.
    stop_words : collection of str, optional
        Stop words to filter against. Defaults to the module-level ``stop``.
    threshold : int, default 50
        Similarity score above which a word counts as a stop word in fuzzy
        mode.

    Returns
    -------
    str
        The remaining words joined with single spaces.
    """
    if stop_words is None:
        stop_words = stop
    words = s.split(' ')
    if fuzzy:
        # Score each word against each stop word. The previous code passed the
        # two whole lists to fuzz.ratio, so the per-word loop had no effect on
        # the comparison.
        kept = [
            word for word in words
            if not any(fuzz.ratio(word, st) > threshold for st in stop_words)
        ]
    else:
        kept = [word for word in words if word not in stop_words]
    return ' '.join(kept)


def remove_symbol(s):
    """Delete every occurrence of each entry in `symbols` from `s`.

    The result is stripped of leading and trailing whitespace.
    """
    cleaned = s
    for symbol in symbols:
        cleaned = cleaned.replace(symbol, '')
    return cleaned.strip()

In [9]:
def extract_entities(s):
    """Split a multi-line string into cleaned, non-empty entity strings.

    The text is lower-cased and split on newlines. Each line then has its
    digits, one-character tokens, stop words and symbols removed, in that
    order. Lines that end up empty are discarded.
    """
    # NOTE(review): stop words are filtered before symbols are stripped, so a
    # stop word still attached to punctuation (e.g. 'chopped,') is not
    # removed -- confirm whether that ordering is intended.
    entities = []
    for line in s.lower().split('\n'):
        cleaned = remove_symbol(remove_stop(remove_single(remove_numbers(line))))
        if cleaned != '':
            entities.append(cleaned)
    return entities

In [10]:
# Turn each recipe's raw ingredient text into its list of cleaned entity strings.
ing_entities = recipes['ingredients'].map(extract_entities)

In [16]:
# Persist the per-recipe ingredient entities to disk with joblib.
joblib.dump(ing_entities, '../database/ing_parts.z')

['../database/ing_parts.z']

In [108]:
# Extract entities again as plain Python lists (one inner list per recipe),
# from both the `ingredients` and the `dish` columns.
ing_entities = recipes['ingredients'].map(extract_entities).to_list()
title_entities = recipes['dish'].map(extract_entities).to_list()

In [109]:
# Flatten the per-recipe lists into single flat lists of entity strings.
unpacked_ing_entities = [
    entity for recipe_entities in ing_entities for entity in recipe_entities
]
unpacked_title_entities = [
    entity for recipe_entities in title_entities for entity in recipe_entities
]

# Only the ingredient entities are carried forward; adding the title
# entities is currently disabled.
unpacked_entities = unpacked_ing_entities #+ unpacked_title_entities

In [110]:
# Drop exact duplicates. The result is sorted because the iteration order of
# a set of strings can differ between interpreter runs (hash randomization),
# and remove_duplicates keeps whichever near-duplicate it meets first; an
# unsorted set would therefore make the final entity list change from run to run.
unpacked_entities = sorted(set(unpacked_entities))

In [111]:
# unpacked_title_entities

In [112]:
# Number of distinct entity strings left after exact-match deduplication.
len(unpacked_entities)

1033

In [113]:
def remove_duplicates(unpacked_entities, threshold=50):
    """Greedily drop entities that are near-duplicates of ones already kept.

    Entities are visited in order. An entity is kept only if its
    ``fuzz.ratio`` similarity to every previously kept entity is at most
    ``threshold``, so the result depends on the input order.

    Parameters
    ----------
    unpacked_entities : iterable of str
        Candidate entity strings.
    threshold : int, default 50
        Similarity score above which two entities count as duplicates.

    Returns
    -------
    list of str
        The kept entities, in the order they were first seen.
    """
    defuzzed = []
    for candidate in unpacked_entities:
        # any() stops at the first sufficiently similar kept entity.
        is_duplicate = any(
            fuzz.ratio(candidate, kept) > threshold for kept in defuzzed
        )
        if not is_duplicate:
            defuzzed.append(candidate)
    return defuzzed


def shorten(ent, max_words=4):
    """Keep only the rows of `ent` that have fewer than `max_words` words.

    Words are counted by splitting on single spaces. The bound is exclusive:
    with the default of 4, rows of up to three words are kept.
    """
    kept = []
    for row in ent:
        # Splitting on ' ' always yields one more piece than there are spaces.
        word_count = row.count(' ') + 1
        if word_count < max_words:
            kept.append(row)
    return kept

In [114]:
# Collapse near-duplicate entities, then keep only the short ones.
deduplicated = remove_duplicates(unpacked_entities)
defuzzed = shorten(deduplicated)

In [115]:
# Terminate every entity with a newline so writelines() puts one per line.
# The endswith() guard keeps this cell idempotent: re-running it does not
# stack additional newlines onto entries that already have one.
defuzzed = [i if i.endswith('\n') else i + '\n' for i in defuzzed]

In [116]:
# Write the entity list, one entry per line, to the bot's food vocabulary file.
# The encoding is set explicitly so non-ASCII ingredient names are written the
# same way on every platform instead of depending on the locale default.
file = '../bot/data/food.txt'
with open(file, 'w', encoding='utf-8') as f:
    f.writelines(defuzzed)