In [1]:
import json

input_data = json.load(open('../dataset/cooking_train.json'))
train_data, test_data = input_data[:29774], input_data[29774:]

In [2]:
import spacy
from tqdm import tqdm

nlp = spacy.load('en')
ing_tokens = dict()

for cook in tqdm(train_data):
    for ing in cook['ingredients']:
        ing = ing.lower()
        for t in ing.split():
            t_ = nlp(t)[0]
            if not t_.pos_.startswith('N'):
                continue
            ing_tokens[t_.lemma_] = ing_tokens[t_.lemma_] + 1 if t_.lemma_ in ing_tokens else 1

100%|██████████| 29774/29774 [00:00<00:00, 97714.29it/s]


In [3]:
for c in train_data:
    c['ingredients'] = [sorted(ing.lower.split(), key=lambda x: ing_tokens.get(x, 0))[-1] for ing in c['ingredients']]

In [4]:
from itertools import combinations
two_dict = dict()

for c in tqdm(train_data):
    for s in combinations(c['ingredients'], 2):
        s = ' '.join(sorted(s))
        if s not in two_dict:
            two_dict[s] = dict()
        two_dict[s][c['cuisine']] = two_dict[s][c['cuisine']] + 1 if c['cuisine'] in two_dict[s] else 1

three_dict = dict()

for c in tqdm(train_data):
    for s in combinations(c['ingredients'], 3):
        s = ' '.join(sorted(s))
        if s not in three_dict:
            three_dict[s] = dict()
        three_dict[s][c['cuisine']] = three_dict[s][c['cuisine']] + 1 if c['cuisine'] in three_dict[s] else 1
        
four_dict = dict()

for c in tqdm(train_data):
    for s in combinations(c['ingredients'], 4):
        s = ' '.join(sorted(s))
        if s not in four_dict:
            four_dict[s] = dict()
        four_dict[s][c['cuisine']] = four_dict[s][c['cuisine']] + 1 if c['cuisine'] in four_dict[s] else 1

100%|██████████| 29774/29774 [00:02<00:00, 11921.40it/s]
100%|██████████| 29774/29774 [00:10<00:00, 2728.19it/s]
100%|██████████| 29774/29774 [00:41<00:00, 722.86it/s]


In [5]:
for t in tqdm(two_dict):
    in_sum = sum(two_dict[t].values())
    for c in two_dict[t]:
        two_dict[t][c] = two_dict[t][c] / in_sum

for t in tqdm(three_dict):
    in_sum = sum(three_dict[t].values())
    for c in three_dict[t]:
        three_dict[t][c] = three_dict[t][c] / in_sum
        
for t in tqdm(four_dict):
    in_sum = sum(four_dict[t].values())
    for c in four_dict[t]:
        four_dict[t][c] = four_dict[t][c] / in_sum

100%|██████████| 74619/74619 [00:00<00:00, 390440.51it/s]
100%|██████████| 953674/953674 [00:01<00:00, 748830.36it/s]
100%|██████████| 6255313/6255313 [00:07<00:00, 843040.97it/s]


In [15]:
def predict(ings):
    ings_ = list()
    for ing in ings:
        try:
            ings_.append(sorted(ing.lower().split(), key=lambda x: ing_tokens[x])[-1])
        except KeyError:
            continue
            
    ings = ings_
        
    if len(ings) < 2:
        return 'unknown'

    predictions = dict()
    
    
    for s in combinations(ings, 2):
        try:
            s = ' '.join(sorted(s))
            two = two_dict[s]
            for c in two:
                predictions[c] = predictions[c] + two[c] * 0.1 if c in predictions else two[c] * 0.1
        except KeyError:
            continue
    
    for s in combinations(ings, 3):
        try:
            s = ' '.join(sorted(s))
            three = three_dict[s]
            for c in three:
                predictions[c] = predictions[c] + three[c] * 1 if c in predictions else three[c] * 1
        except KeyError:
            continue
            
    for s in combinations(ings, 4):
        try:
            s = ' '.join(sorted(s))
            four = four_dict[s]
            for c in four:
                predictions[c] = predictions[c] + four[c] * 2 if c in predictions else four[c] * 2
        except KeyError:
            continue
    
    if len(predictions) == 0:
        return 'unknown'

    return sorted(predictions, key=lambda x: predictions[x])[-1]

In [16]:
pds = list()

for c in tqdm(test_data):
    pds.append(predict(c['ingredients']))

100%|██████████| 10000/10000 [00:28<00:00, 345.15it/s]


In [17]:
[r == p for r, p in zip([c['cuisine'] for c in test_data], pds)].count(True)

6651

In [18]:
pds.count('unknown')

12

In [11]:
sorted(ing_tokens, key=lambda x: ing_tokens[x], reverse=True)

['pepper',
 'salt',
 'oil',
 'garlic',
 'ground',
 'sauce',
 'sugar',
 'onions',
 'cheese',
 'chicken',
 'olive',
 'water',
 'flour',
 'butter',
 'tomatoes',
 'powder',
 'cloves',
 'juice',
 'onion',
 'eggs',
 'cream',
 'rice',
 'cilantro',
 'lemon',
 'milk',
 'vegetable',
 'ginger',
 'corn',
 'vinegar',
 'lime',
 'soy',
 'cumin',
 'broth',
 'wine',
 'chili',
 'bell',
 'parsley',
 'sesame',
 'beans',
 'kosher',
 'carrots',
 'beef',
 'basil',
 'baking',
 'parmesan',
 'seeds',
 'paste',
 'chilies',
 'oregano',
 'cinnamon',
 'boneless',
 'tomato',
 'egg',
 'potatoes',
 'pork',
 'thyme',
 'shrimp',
 'bread',
 'chile',
 'vanilla',
 'skinless',
 'coconut',
 'tortillas',
 'sodium',
 'celery',
 'cayenne',
 'coriander',
 'mushrooms',
 'bay',
 'breasts',
 'leaf',
 'spray',
 'flakes',
 'starch',
 'stock',
 'scallions',
 'cheddar',
 'fish',
 'jalapeno',
 'mustard',
 'orange',
 'curry',
 'shallots',
 'sea',
 'light',
 'fat',
 'honey',
 'mozzarella',
 'spinach',
 'olives',
 'peppers',
 'salsa',
 'no

In [14]:
train_data

[{'cuisine': 'greek',
  'id': 10259,
  'ingredients': ['o', 'n', 'i', 'o', 'n', 's']},
 {'cuisine': 'southern_us',
  'id': 25693,
  'ingredients': ['o', 'n', 'i', 'o', 'n', 's']},
 {'cuisine': 'filipino',
  'id': 20130,
  'ingredients': ['o', 'n', 'i', 'o', 'n', 's']},
 {'cuisine': 'indian',
  'id': 22213,
  'ingredients': ['o', 'n', 'i', 'o', 'n', 's']},
 {'cuisine': 'indian',
  'id': 13162,
  'ingredients': ['o', 'n', 'i', 'o', 'n', 's']},
 {'cuisine': 'jamaican',
  'id': 6602,
  'ingredients': ['o', 'n', 'i', 'o', 'n', 's']},
 {'cuisine': 'spanish',
  'id': 42779,
  'ingredients': ['o', 'n', 'i', 'o', 'n', 's']},
 {'cuisine': 'italian',
  'id': 3735,
  'ingredients': ['o', 'n', 'i', 'o', 'n', 's']},
 {'cuisine': 'mexican',
  'id': 16903,
  'ingredients': ['o', 'n', 'i', 'o', 'n', 's']},
 {'cuisine': 'italian',
  'id': 12734,
  'ingredients': ['o', 'n', 'i', 'o', 'n', 's']},
 {'cuisine': 'italian',
  'id': 5875,
  'ingredients': ['o', 'n', 'i', 'o', 'n', 's']},
 {'cuisine': 'chinese'