In [1]:
from collections import defaultdict
import re
import pickle
import os
import random
import pickle
import numpy as np

## Shard doordash data by category

This is basically a manual map-reduce. First, for each shard, we group the entries by restaurant category (map); then, for each category, we gather its entries across all shards (reduce).

In [191]:
def get_by_category(shard):
    """Split one raw shard into one pickle file per restaurant category.

    Reads ``/home/doordash/doordash_{shard}.pkl``, groups its entries by a
    normalised category name and writes each group, as a list of
    ``(title, description)`` tuples, to
    ``/home/doordash-sharded/doordash_{category}_{shard}``.

    Args:
        shard: shard identifier substituted into the input and output paths.

    Raises:
        KeyError: if an entry lacks ``title``, ``metadata``, ``section`` or
            ``description``.
    """
    def _slug(raw_category):
        # Missing / empty categories get placeholder names so they still map
        # to a usable file name.
        if raw_category is None:
            raw_category = 'None'
        if raw_category == '':
            raw_category = 'blank'
        slug = raw_category.replace('/', '')
        slug = re.sub(r"\s+", '-', slug)
        slug = re.sub(r"\&", "and", slug)
        return slug.lower()

    datafile = '/home/doordash/doordash_{}.pkl'.format(shard)
    with open(datafile, 'rb') as f:
        print("Opened {}".format(datafile))
        # NOTE(review): pickle.load executes arbitrary code; only use on
        # trusted shard files.
        records = pickle.load(f)
        # 'section' is read (so a missing key still fails loudly) but is not
        # written to the output.
        rows = [
            (
                record['title'],
                record['metadata']['section'],
                record['metadata']['description'],
                record['metadata'].get('restaurant_category', None),
            )
            for record in records
        ]
    print("Loaded all entries")

    category_to_entries = defaultdict(list)
    for title, _section, description, raw_category in rows:
        category_to_entries[_slug(raw_category)].append((title, description))
    print("Made dictionary")

    for category, entries in category_to_entries.items():
        print(category, len(entries))
        out_path = '/home/doordash-sharded/doordash_{}_{}'.format(category, shard)
        with open(out_path, 'wb') as f:
            pickle.dump(entries, f)

In [None]:
# Map step: split each of the 16 raw shards (0-15) into per-category files.
for i in range(0, 16):
    get_by_category(i)

Opened /home/doordash/doordash_0.pkl
Loaded all entries
Made dictionary
fast-food 25435
desserts 34530
burgers 23040
chicken 28650
american 99329
lunch 11183
barbecue 7126
breakfast 36399
seafood 15768
pizza 69580
none 48873
sandwiches 41953
mexican 64465
korean 3084
vegan 3053
salad 15018
asian 120993
peruvian 8071
chinese 32692
sushi 31786
pickup 39001
tapas 1026
caribbean 2777
british 1613
indian 13451
blank 11118
good-for-groups 1480
alcohol-stores 14
greek 8226
mediterranean 4882
japanese 11253
italian 40602
coffee 16509
soup 1843
thai 11470
grocery 4511
halal 2434
latin-american 8510
vietnamese 5536
middle-eastern 6052
healthy 4136
late-night 1020
food-and-drink 3222
steak 3041
coffee-and-tea 7328
ramen 1146
catering 1958
comfort-food 1851
european 1837
alcohol 1418
delis 26099
gluten-free 933
poke 2610
convenience 3922
vegetarian 3799
noodles 2196
kosher 691
gastropubs 927
other 4501
irish 1229
smoothie 820
pho 407
poutineries 812
ethiopian 500
african 976
burmese 59
pasta 1466


Opened /home/doordash/doordash_7.pkl
Loaded all entries
Made dictionary
sandwiches 42253
chinese 34200
mediterranean 6556
pasta 1997
pizza 58188
japanese 14064
none 13754
indian 27144
american 107588
asian 129474
vegan 2387
italian 36543
coffee 18704
comfort-food 2774
breakfast 36692
korean 3606
burgers 21598
greek 8816
poutineries 1614
blank 14539
peruvian 11095
desserts 35925
grocery 4670
caribbean 2654
pakistani 1839
salad 9942
vegetarian 4236
pickup 41038
catering 1829
mexican 57728
tapas 1685
barbecue 8532
middle-eastern 5656
sushi 33893
delis 14447
thai 10421
healthy 2804
vietnamese 6447
bakery 1083
fast-food 33266
alcohol 826
poke 1202
cafes 557
soup 2129
good-for-groups 1602
latin-american 14299
ramen 1959
alcohol-stores 44
irish 769
seafood 18513
noodles 2404
chicken 26562
halal 2112
french 997
pho 525
takeout 882
african 792
steak 5091
australian 98
lunch 8183
coffee-and-tea 8335
belgian 253
food-and-drink 4057
spanish 1107
smoothie 771
gluten-free 1955
gastropubs 1196
europe

In [122]:
# Reduce step: group the per-shard files by category, then merge every group
# into a single pickle named after the category.
category_to_filenames = defaultdict(list)
rootdir = '/home/doordash-sharded'
# Sorted so that the order of the merged items is reproducible across runs.
for filename in sorted(os.listdir(rootdir)):
    # File names look like 'doordash_{category}_{shard}'.  Peel the prefix off
    # the left and the shard off the right so that a category which itself
    # contains '_' no longer breaks a plain three-way split.
    _, _, rest = filename.partition("_")
    category, sep, shard = rest.rpartition("_")
    if not sep:
        # Not a sharded category file; ignore it.
        continue
    category_to_filenames[category].append(
        os.path.join(rootdir, filename))

# The cells that follow read the merged files from this directory (the
# previous '/home/doordash-by-section' target was never read back).
out_dir = '/home/doordash-by-category'
os.makedirs(out_dir, exist_ok=True)
for category, filenames in category_to_filenames.items():
    all_items = []
    for filename in filenames:
        with open(filename, 'rb') as f:
            all_items.extend(pickle.load(f))
    with open(os.path.join(out_dir, category), 'wb') as f:
        pickle.dump(all_items, f)

In [123]:
from collections import Counter

# Count how many items each merged category file holds.
category_dir = '/home/doordash-by-category'
category_items_counter = Counter()
for filename in os.listdir(category_dir):
    path = os.path.join(category_dir, filename)
    if os.path.isdir(path):
        # Sub-folders (e.g. 'title-tokenized', created further down) are not
        # category pickles; opening them would raise.
        continue
    with open(path, 'rb') as f:
        all_items = pickle.load(f)
    category_items_counter[filename] = len(all_items)

In [83]:
# Categories ranked by their total item count.
print(category_items_counter.most_common())

[('asian', 1888513), ('american', 1562955), ('pizza', 961731), ('mexican', 959707), ('chinese', 642722), ('sandwiches', 642297), ('pickup', 626122), ('breakfast', 600278), ('italian', 555576), ('desserts', 555059), ('none', 506663), ('fast-food', 472846), ('sushi', 450214), ('chicken', 446070), ('burgers', 362240), ('indian', 343348), ('delis', 296270), ('coffee', 276429), ('seafood', 275364), ('japanese', 206353), ('blank', 200247), ('salad', 186696), ('thai', 185039), ('lunch', 160028), ('latin-american', 158533), ('peruvian', 155822), ('barbecue', 129460), ('greek', 128941), ('vietnamese', 119869), ('middle-eastern', 103537), ('mediterranean', 91260), ('coffee-and-tea', 87063), ('other', 75369), ('steak', 66718), ('convenience', 65039), ('vegetarian', 58239), ('grocery', 52181), ('food-and-drink', 51103), ('korean', 50348), ('caribbean', 46851), ('healthy', 46057), ('vegan', 40141), ('soup', 37458), ('halal', 35615), ('comfort-food', 34467), ('noodles', 33164), ('catering', 33024), 

In [11]:
# Same ranking as the previous cell; the output recorded below shows different
# counts, so it comes from a separate run of the counter.
print(category_items_counter.most_common())

[('asian', 360092), ('american', 305837), ('pizza', 195718), ('mexican', 179656), ('none', 155214), ('chinese', 120431), ('italian', 119166), ('sandwiches', 117053), ('pickup', 116152), ('breakfast', 106645), ('desserts', 102488), ('sushi', 80808), ('delis', 80458), ('chicken', 77188), ('fast-food', 74240), ('burgers', 66679), ('coffee', 55320), ('seafood', 49422), ('indian', 48293), ('blank', 39346), ('japanese', 36586), ('salad', 35476), ('thai', 34292), ('peruvian', 29712), ('lunch', 29052), ('latin-american', 26860), ('greek', 25872), ('barbecue', 24154), ('middle-eastern', 21886), ('vietnamese', 21710), ('coffee-and-tea', 20588), ('mediterranean', 17410), ('grocery', 14014), ('steak', 12980), ('other', 12247), ('korean', 11624), ('vegetarian', 11565), ('food-and-drink', 9993), ('healthy', 9942), ('convenience', 9244), ('vegan', 8234), ('caribbean', 7901), ('poke', 7443), ('halal', 7082), ('catering', 6599), ('comfort-food', 6114), ('soup', 6049), ('noodles', 5997), ('spanish', 485

# Take sharded data and clean

In [161]:
# Load every item of one category and pull out the titles for inspection.
cuisine = 'salad'
with open(os.path.join('/home/doordash-by-category', cuisine), 'rb') as f:
    all_items = pickle.load(f)
    all_titles = [item[0] for item in all_items]

In [162]:
# Scratch check of two candidate title clean-ups: dropping bracketed text and
# spelling out '&'.  Only the last expression is displayed by the notebook.
# The first pattern is now a raw string; as a plain string its '\(' and '\['
# are invalid escape sequences.
re.sub(r"[\(\[].*?[\)\]]", "", 'Bacon & Sausage (2 Each)')
re.sub(r"\&", "and", "Bacon & Sausage (2 Each)")

'Bacon and Sausage (2 Each)'

In [163]:
# Eyeball ten random items (random.choices samples with replacement).
random.choices(all_items, k=10)

[('Pork & Beef', ''),
 ('Chicken Cordon Bleu', ''),
 ('Cherry PepsiÂ®', '20 oz.: 260 cal., 2 liter: 160 cal. per serving.'),
 ('Gyro - Grinder 8" (Deluxe)',
  'Deluxe served with fries and pickles. Lettuce, tomato, tzatziki sauce, red onion.'),
 ('Chicken Club',
  'Hand-filleted marinated chicken breast, green leaf lettuce, tomatoes, hickory-smoked bacon, fresh avocado, and mayo, served on toasted sourdough'),
 ('Sprite Caffeine-Free Lemon-Lime Soda, 2L', ''),
 ('Cheese Cake', ''),
 ("Fish 'n Chips",
  'Two crispy whitefish fillets & French fries served with tartar sauce.'),
 ('Golden Fried Chicken',
  'Fresh, filleted chicken breast, hand-breaded with house-made seasoned flour and buttermilk, cooked to a juicy golden brown, topped with creamy, spiced red pepper sauce, lettuce, tomatoes and pickles'),
 ('Build Your Own Pizza (Large - 8 Slices)', '')]

In [87]:
def check_valid(title):
    """Return True when ``title`` holds only letters, whitespace and hyphens.

    Whitespace and '-' are stripped out first; what remains must be non-empty
    and entirely alphabetic (``str.isalpha``), so titles with digits, '&',
    quotes, brackets and the like are rejected.
    """
    letters_only = "".join(title.split()).replace("-", "")
    return letters_only.isalpha()

In [88]:
def format_valid(title):
    """Normalise a title: hyphens become spaces and whitespace is collapsed.

    Args:
        title: raw title string.

    Returns:
        The title with every '-' turned into a space, every run of whitespace
        reduced to a single ' ', and no leading or trailing whitespace.
    """
    # Replace hyphens *before* collapsing whitespace.  Doing it afterwards
    # left runs of spaces for "a - b" and stray edge spaces for "-a".
    title = title.replace("-", " ")
    return " ".join(title.split())

In [89]:
# Keep only the titles accepted by check_valid, normalised with format_valid.
letter_items = [format_valid(x[0]) for x in all_items if check_valid(x[0])]

In [90]:
# Sanity check: number of cleaned titles that are just a single space.
len([x for x in letter_items if x==" "])

0

In [116]:
# How many titles survived cleaning vs. how many items were loaded.
# NOTE(review): the recorded output has more letter_items than all_items,
# which suggests the two came from different loads (out-of-order execution).
len(letter_items), len(all_items)

(1864961, 1248)

In [117]:
# Spot-check ten cleaned titles (sampled with replacement).
random.choices(letter_items, k=10)

['Iced Tea',
 'Americano',
 'NY Style Cheesecake',
 'Arizona Green Tea',
 'IZZE Blackberry',
 'Diet Coke',
 'Fanta',
 'Hazelnut Coffee Tote',
 'Grilled Chicken Plate',
 'Extra Small Southern Style Sweet Tea']

## Take sharded data and tokenize

In [93]:
# Tokenizer shared by all tokenization cells below.
# NOTE(review): this loads the *cased* multilingual checkpoint but passes
# do_lower_case=True (the decoded samples at the end of the notebook are all
# lower case).  Confirm the mismatch is intentional; to my knowledge the cased
# checkpoints are meant to be used with do_lower_case=False.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased',
    do_lower_case=True)

##### Demonstrate with a particular cuisine

In [152]:
# Load the merged items of a single category for the tokenization demo.
cuisine = 'vegan'
cuisine_path = os.path.join('/home/doordash-by-category', cuisine)
with open(cuisine_path, 'rb') as f:
    all_items = pickle.load(f)

In [153]:
# Pair each title with its description and peek at the first 100 pairs.
titles = [(e[0], e[1]) for e in all_items]
print(titles[:100])

[('Pepperoni', 'Blended san marzano tomatoes, pepperoni, mozzarella.'), ('Margherita', 'Blended san marzano tomatoes, sliced garlic, basil, mozzarella.'), ('Chiky Parm', 'Mozzarella, blended San Marzano tomatoes, oregano, breaded chicken, grated parm, fresh basil'), ('Truffle Shuffle', 'Mushrooms, red onion, mozzarella, ricotta, truffle oil, arugula.'), ('Rosemary', 'Sliced garlic, rosemary, mozzarella, ricotta, black pepper.'), ('Gringo Starr', 'Kale, pork belly, red onion, mozarella.'), ('Marinara', 'Blended san marzano tomatoes, sliced garlic, oregano.'), ('Calzone', 'Check our instagram for the calzone special! @babybluepizzapdx'), ('Special', 'Check our Instagram for specials! @babybluepizzapdx'), ('Vegan Ranch', 'House made vegan ranch'), ('Pepperoni', ''), ('Pork Belly', ''), ('Ricotta', ''), ('Mama Lils Peppers', ''), ('Arugula', ''), ('Cherry Tomatoes', ''), ('Caprese Salad', 'House made mozzarella balls, arugula, cherry tomatoes, fresh basil, hemp seeds, aged balsamic'), ('La

In [133]:
# Smoke test: tokenize the first five entries of `titles` without padding.
# NOTE(review): `titles` holds (title, description) tuples here, so each entry
# is presumably encoded as a sentence pair - confirm that is intended.
max_seq_length=64
tokenized_output = tokenizer.batch_encode_plus(
    titles[:5], max_length=max_seq_length,
    pad_to_max_length=False)
print(tokenized_output)

{'input_ids': [[101, 10114, 20758, 187, 40333, 48272, 42230, 63840, 10112, 99380, 102, 108193, 10206, 119, 108193, 95002, 10115, 119, 175, 40846, 10136, 118, 13961, 119, 32650, 39903, 119, 24194, 84445, 10114, 20758, 187, 40333, 48272, 10162, 58511, 117, 25470, 25096, 10162, 10680, 19457, 13420, 15050, 12720, 10171, 117, 15045, 10347, 15008, 117, 34202, 60995, 10107, 117, 187, 15794, 60020, 10107, 117, 34657, 31001, 187, 102], [101, 10824, 90039, 14325, 11263, 76484, 10854, 31862, 99380, 102, 108193, 10206, 119, 108193, 95002, 10115, 119, 175, 40846, 10136, 118, 13961, 119, 32650, 39903, 119, 10824, 90039, 28780, 26127, 10336, 14325, 11263, 76484, 10107, 117, 23912, 10133, 117, 25470, 25096, 10162, 48445, 10806, 117, 32461, 85137, 10350, 11135, 24207, 108821, 117, 47243, 64791, 10157, 109341, 117, 64782, 36018, 10355, 10343, 10113, 24109, 102], [101, 26824, 10931, 10133, 11135, 24207, 108821, 111, 93742, 20493, 10390, 102, 108193, 10206, 119, 108193, 95002, 10115, 119, 175, 40846, 1013

###### Getting max seq length

In [134]:
# Survey token lengths: for every category file, tokenize a random sample of
# up to 1000 items and record the length of each encoded sequence.
cuisine_to_lens = {}
for cuisine in os.listdir('/home/doordash-by-category'):
    if os.path.isdir('/home/doordash-by-category/{}'.format(cuisine)):
        # Sub-folders hold tokenizer output, not category items.
        continue
    print(cuisine)
    with open(os.path.join('/home/doordash-by-category', cuisine),
              'rb') as f:
        all_items = pickle.load(f)
    # Draw the sample directly instead of shuffling the whole list in place
    # (which also reordered all_items through the alias).
    titles = random.sample(all_items, min(1000, len(all_items)))
    # No length limit on purpose: the point is to measure untruncated lengths.
    # The former `max_seq_length=128` keyword is not a tokenizer argument and
    # had no effect (recorded maxima exceed 128), so it has been dropped.
    tokenized_output = tokenizer.batch_encode_plus(
        titles,
        pad_to_max_length=False)
    sample_lens = [len(input_ids) for input_ids in tokenized_output['input_ids']]
    cuisine_to_lens[cuisine] = sample_lens

sandwiches
deals
ethiopian
pickup
soup
smoothie
good-for-groups
alcohol-stores
beauty
exclusive
tapas
thai
german
cafes
pho
alcohol
belgian
indian
mediterranean
seafood
vietnamese
catering
mexican
greek
comfort-food
irish
european
ramen
heritage
bakery
newly-added
french
pakistani
takeout
desserts
halal
chicken
bubble-tea
drinks
kwb
fondue
argentine
steak
pasta
cheese
italian
late-night
japanese
south-asian
chinese
peruvian
asian
breakfast
coffee-and-tea
southern
healthy
other
cajun
brazilian
lunch
russian
none
pizza
delis
korean
barbecue
vegetarian
grocery
caribbean
noodles
filipino
british
gastropubs
burmese
sushi
middle-eastern
vegan
blank
latin-american
convenience
poke
poutineries
coffee
kosher
convenience-exclusive
gluten-free
african
burgers
salad
fast-food
food-and-drink
american
australian
spanish


In [115]:
# Per category: (longest, mean) encoded length over the sampled items.
{c: (max(l), np.mean(l)) for c, l in cuisine_to_lens.items()}

{'sandwiches': (131, 31.248),
 'deals': (99, 21.128),
 'ethiopian': (137, 26.055),
 'pickup': (210, 23.709),
 'soup': (178, 31.886),
 'smoothie': (157, 22.142),
 'good-for-groups': (75, 28.893),
 'alcohol-stores': (62, 19.620512820512822),
 'beauty': (104, 27.714285714285715),
 'exclusive': (20, 8.80327868852459),
 'tapas': (117, 26.046),
 'thai': (83, 24.614),
 'german': (112, 23.103),
 'cafes': (253, 24.692),
 'pho': (216, 26.508),
 'alcohol': (155, 29.564),
 'belgian': (92, 26.506214689265537),
 'indian': (117, 23.84),
 'mediterranean': (146, 22.413),
 'seafood': (108, 22.666),
 'vietnamese': (126, 23.376),
 'catering': (178, 25.499),
 'mexican': (116, 23.16),
 'greek': (95, 22.193),
 'comfort-food': (143, 26.209),
 'irish': (83, 26.58),
 'european': (97, 21.793),
 'ramen': (132, 22.13),
 'heritage': (51, 23.4375),
 'bakery': (177, 21.29),
 'newly-added': (90, 24.66793893129771),
 'french': (135, 24.36),
 'pakistani': (115, 21.059),
 'takeout': (90, 19.303),
 'desserts': (303, 29.32

### Saving tokenization

In [137]:
# Output folder for the saved token arrays and the fixed length passed to the
# tokenizer for padding.  The folder must already exist.
rootdir = '/home/doordash-by-category/title-tokenized/bert-base-multilingual-cased'
max_seq_length = 64
assert os.path.exists(rootdir)

In [138]:
# File-name template for one saved array: category, sequence length and the
# tokenizer output key (e.g. input_ids).
file_pattern = "titles_{cuisine}_{max_seq_len}_{input_key}"

In [139]:
# When True, the tokenization loop keeps only titles accepted by check_valid
# and normalises them with format_valid.
do_clean = True

def format_valid(title):
    """Normalise a title: hyphens become spaces and whitespace is collapsed.

    Args:
        title: raw title string.

    Returns:
        The title with every '-' turned into a space, every run of whitespace
        reduced to a single ' ', and no leading or trailing whitespace.
    """
    # Replace hyphens *before* collapsing whitespace.  Doing it afterwards
    # left runs of spaces for "a - b" and stray edge spaces for "-a".
    title = title.replace("-", " ")
    return " ".join(title.split())

def check_valid(title):
    """Return True when ``title`` holds only letters, whitespace and hyphens.

    Whitespace and '-' are stripped out first; what remains must be non-empty
    and entirely alphabetic (``str.isalpha``), so titles with digits, '&',
    quotes, brackets and the like are rejected.
    """
    letters_only = "".join(title.split()).replace("-", "")
    return letters_only.isalpha()

In [172]:
# Two alternative label schemes.  Each maps a target label to the list of
# category file names whose items carry that label.  Previously both were
# assigned to `targets_to_categories`, so the first one was silently
# overwritten; they now have their own names and the choice is explicit.

# Scheme 1: cuisine classification.
cuisine_targets_to_categories = {
    # Label to list of filenames corresponding to that label
    'mexican': ['mexican'],
    'chinese': ['chinese'],
    'american': ['american', 'burgers'],
    'italian': ['italian', 'pasta'],
    'thai': ['thai'],
    'indian': ['indian'],
    'japanese': ['japanese', 'ramen', 'sushi'],
    'other': ['african', 'alcohol', 'argentine', 'australian',
              'bakery', 'belgian', 'brazilian', 'burmese', 'desserts',
              'drinks', 'ethiopian', 'filipino', 'french',
              'german', 'greek', 'korean', 'vietnamese', 'poke']
}

# Scheme 2: healthy vs. unhealthy.
health_targets_to_categories = {
    'healthy': ['healthy', 'vegetarian', 'salad', 'vegan'],
    'unhealthy': [
        "desserts",
        "fondue",
        "alcohol",
        #"barbecue",
        "bakery",
        "bubble-tea",
        "comfort-food",
        "fast-food",
        "southern"
    ]
}

# Active scheme used by the tokenization and data-loading cells below.
targets_to_categories = health_targets_to_categories
# Flat list of every category file the active scheme needs.
all_categories = [item for sublist in targets_to_categories.values() for item in sublist]

In [173]:
# Confirm the sequence length that the saved arrays will use.
print(max_seq_length)

64


In [174]:
# Tokenize the (optionally cleaned) titles of every selected category and
# store each tokenizer output array as a raw int64 memmap file under rootdir.
file_pattern = "titles_{cuisine}_{max_seq_len}_{input_key}"
# category -> number of titles written; saved later because reopening a raw
# memmap requires knowing its shape.
lens = {}
for cuisine in all_categories:# os.listdir('/home/doordash-by-category'):
    if os.path.isdir('/home/doordash-by-category/{}'.format(cuisine)):
        # Not a real cuisine, for example bert-base-multilingual-cased is a folder containing tokenizations
        continue
    with open(os.path.join('/home/doordash-by-category', cuisine), 'rb') as f:
        all_items = pickle.load(f)
    # Only the title (first tuple element) is tokenized here.
    titles = [e[0] for e in all_items]
    if do_clean:
        # Drop titles rejected by check_valid, normalise the rest.
        titles = [format_valid(t) for t in titles if check_valid(t)]
        # titles = [(format_valid(t[0]), t[1]) for t in titles if check_valid(t[0])]
    lens[cuisine] = len(titles)
    # Disabled shortcut: skip categories whose arrays already exist on disk.
#     if os.path.exists(os.path.join(
#         rootdir, file_pattern.format(
#             cuisine=cuisine, max_seq_len=max_seq_length, 
#             input_key="input_ids"))):
#         # Tokenization already exists in the given folder
#         continue

    print(cuisine, len(titles))
    # Pad every sequence to max_seq_length so the result is rectangular.
    tokenized_output = tokenizer.batch_encode_plus(
        titles,
        max_length=max_seq_length,
        pad_to_max_length=True)
    
    print("Done tokenizing")
    # One file per tokenizer output key.
    for input_key, arr in tokenized_output.items():
        filename = os.path.join(rootdir,
                                file_pattern.format(cuisine=cuisine,
                                      max_seq_len=max_seq_length,
                                      input_key=input_key))
        arr = np.array(arr)
        # Guards against ragged output before writing.
        assert arr.shape == (len(titles), max_seq_length)
        # mode='w+' creates or overwrites the file.
        fp = np.memmap(filename, dtype='int64', mode='w+',
                      shape=arr.shape)
        fp[:] = arr[:]
        # Drop the memmap reference once the data has been copied in.
        del fp

healthy 32306
Done tokenizing
vegetarian 38449
Done tokenizing
salad 119619
Done tokenizing
vegan 30102
Done tokenizing
desserts 371291
Done tokenizing
fondue 305
Done tokenizing
alcohol 8215
Done tokenizing
bakery 10314
Done tokenizing
bubble-tea 2841
Done tokenizing
comfort-food 25081
Done tokenizing
fast-food 335453
Done tokenizing
southern 905
Done tokenizing


In [183]:
with open(os.path.join(rootdir, 'lens.pkl'), 'wb') as f:
    # Persist the per-category row counts: the loaders below need them to
    # give np.memmap the shape of each saved array.
    pickle.dump(lens, f)

# Dataloader

In [184]:
import torch
import itertools
from collections import Counter, OrderedDict, defaultdict
from torch.utils.data import DataLoader

In [185]:
class IterableTitles(torch.utils.data.IterableDataset):
    """Endless random stream of pre-tokenized titles for one category.

    The three arrays ``input_ids``, ``attention_mask`` and ``token_type_ids``
    are read from int64 memmap files named
    ``titles_{cuisine}_{max_seq_length}_{key}`` inside ``root_dir``.

    Args:
        cuisine: category name used in the file names and as key of
            ``len_dict``.
        root_dir: directory that holds the memmap files.
        max_seq_length: number of columns of every stored array.
        len_dict: mapping from category name to its number of stored rows.
    """

    def __init__(self, cuisine, root_dir, max_seq_length, len_dict):
        # The one-argument form `super(IterableTitles)` builds an unbound
        # super object and initialises *that*, never the parent class.
        super().__init__()
        self.cuisine = cuisine
        self.max_seq_length = max_seq_length
        # '{{}}' leaves one '{}' placeholder for the tokenizer output key.
        self.datafile_pattern = os.path.join(
            root_dir,
            'titles_{}_{}_{{}}'.format(cuisine, max_seq_length))
        self.shape = (len_dict[cuisine], max_seq_length)
        self.length = len_dict[cuisine]

    def __len__(self):
        """Number of stored rows (iteration itself never terminates)."""
        return self.length

    def __iter__(self):
        """Yield ``(input_ids, token_type_ids, attention_mask)`` rows forever.

        Rows are drawn uniformly at random with replacement.
        """
        input_key_to_memmap = {}
        for input_key in ['input_ids',
                          'attention_mask',
                          'token_type_ids']:
            datafile = self.datafile_pattern.format(input_key)
            print("LOADING MEMEMAP", datafile)
            # NOTE(review): 'r+' maps the file read-write although it is only
            # read here; confirm downstream tensor conversion tolerates a
            # read-only array before switching to 'r'.
            input_key_to_memmap[input_key] = np.memmap(
                datafile, dtype='int64', mode='r+',
                shape=self.shape)
        num_rows = self.shape[0]
        while True:
            i = random.randrange(num_rows)
            yield (
                input_key_to_memmap['input_ids'][i],
                input_key_to_memmap['token_type_ids'][i],
                input_key_to_memmap['attention_mask'][i])


class MultiStreamDataLoader:
    """Endless batch generator that balances targets and mixes categories.

    For every sample in a batch a target is drawn uniformly; a category of
    that target is then drawn with probability proportional to its number of
    stored titles, and one title is pulled from that category's stream.

    Args:
        targets_to_categories: mapping from target label to the list of
            category names that belong to it.
        batch_size: number of samples per yielded batch.
        rootdir: directory holding ``lens.pkl`` and the memmap files read by
            ``IterableTitles``.
        max_seq_length: sequence length of the stored arrays (defaults to 64,
            the value that used to be hard-coded).
    """

    def __init__(self, targets_to_categories, batch_size, rootdir,
                 max_seq_length=64):
        self.targets = list(sorted(targets_to_categories.keys()))
        self.targets_to_categories = targets_to_categories
        self.batch_size = batch_size
        self.max_seq_length = max_seq_length

        self.categories_to_target = OrderedDict()
        self.targets_to_categories_weights = {}
        self.categories_to_dataset = {}
        self.categories_to_dataset_iter = {}
        with open(os.path.join(rootdir, 'lens.pkl'), 'rb') as f:
            self.categories_to_len = pickle.load(f)

        for t, list_of_c in targets_to_categories.items():
            for c in list_of_c:
                self.categories_to_target[c] = t
                dataset = IterableTitles(
                    c, rootdir, max_seq_length, self.categories_to_len)
                self.categories_to_dataset[c] = dataset
                # batch_size=None disables the DataLoader's own batching, so
                # each next() returns a single sample.
                self.categories_to_dataset_iter[c] = iter(
                    DataLoader(dataset, batch_size=None))
            # Sampling weight of a category = number of titles it holds.
            self.targets_to_categories_weights[t] = [
                self.categories_to_len[c] for c in list_of_c]

        # Count only the categories actually in use; lens.pkl may also list
        # categories that are not part of targets_to_categories.
        self.total_samples = sum(
            self.categories_to_len[c] for c in self.categories_to_target)

    def __len__(self):
        """Number of batches in one nominal pass over all used titles."""
        return self.total_samples // self.batch_size

    def __iter__(self):
        """Yield batches forever.

        Each batch is ``(input_ids, token_type_ids, attention_mask, labels,
        category_labels)``: three stacked tensors, a tensor of target indices
        into ``self.targets``, and a list of category names.  Samples inside
        a batch are grouped by category, not shuffled.
        """
        target_to_label = {t: i for i, t in enumerate(self.targets)}
        while True:
            target_choices = random.choices(self.targets, k=self.batch_size)
            category_choices = [
                random.choices(
                    self.targets_to_categories[t],
                    weights=self.targets_to_categories_weights[t],
                    k=1)[0]
                for t in target_choices
            ]

            buffer = []
            labels = []
            category_labels = []
            # Pull all samples of one category together.
            for category, num_sample in Counter(category_choices).items():
                stream = self.categories_to_dataset_iter[category]
                label = target_to_label[self.categories_to_target[category]]
                category_labels.extend([category] * num_sample)
                labels.extend([label] * num_sample)
                buffer.extend([next(stream) for _ in range(num_sample)])
            yield (torch.stack([b[0] for b in buffer]),
                   torch.stack([b[1] for b in buffer]),
                   torch.stack([b[2] for b in buffer]),
                   torch.tensor(labels),
                   category_labels
                  )

In [186]:
# Build the loader over the active label scheme; it reads lens.pkl from
# rootdir and yields batches of 800 samples.
ds = MultiStreamDataLoader(targets_to_categories,
                           batch_size=800,
                           rootdir=rootdir)

In [187]:
# Sorted target names; a label index in a batch is a position in this list.
ds.targets

['healthy', 'unhealthy']

In [188]:
# ds.targets_to_categories_weights

In [189]:
# Total number of titles behind each target (sum of its category counts).
[(t, sum(w)) for t, w in ds.targets_to_categories_weights.items()]

[('healthy', 220476), ('unhealthy', 754405)]

In [190]:
# Inspect five batches: print the label balance, then every sample as
# "<target> (<category>) : <decoded title>".
for batch_num, batch in enumerate(ds):
    if batch_num == 5:
        break
    input_ids, labels, categories = batch[0], batch[3], batch[4]
    print(Counter(labels.numpy()))
    for i, ids in enumerate(input_ids):
        tokens = tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
        print(
            ds.targets[labels[i]], "({})".format(categories[i]),
            ":",
            tokenizer.convert_tokens_to_string(tokens),
        )
    print("=====")

LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_fast-food_64_input_ids
LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_fast-food_64_attention_mask
LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_fast-food_64_token_type_ids
LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_desserts_64_input_ids
LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_desserts_64_attention_mask
LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_desserts_64_token_type_ids
LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_salad_64_input_ids
LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_salad_64_attention_mask
LOADING MEMEMAP /home/doordash-by-category

unhealthy (desserts) : ibc root beer
unhealthy (desserts) : boneless wings
unhealthy (desserts) : homesteader breakfast
unhealthy (desserts) : filled
unhealthy (desserts) : bottled water
unhealthy (desserts) : black sesame sensation
unhealthy (desserts) : applewood smoked blt
unhealthy (desserts) : bottled orange juice
unhealthy (desserts) : prairie farm milk
unhealthy (desserts) : traditional truffles
unhealthy (desserts) : jelly belly pink grapefruit
unhealthy (desserts) : strawberry hand mixed classic shake
unhealthy (desserts) : orange juice
unhealthy (desserts) : glaciers
unhealthy (desserts) : spinach
unhealthy (desserts) : oatmeal bowl
unhealthy (desserts) : signature diet cherry limeade
unhealthy (desserts) : chocolate espresso bean
unhealthy (desserts) : ketchup
unhealthy (desserts) : soft european croissant
unhealthy (desserts) : nachos
unhealthy (desserts) : texas cheese fries full
unhealthy (desserts) : filet of fish
unhealthy (desserts) : wieners sandwich
unhealthy (desser

healthy (salad) : chicken salad
healthy (salad) : meatballs thin crust pizza
healthy (salad) : super farmhouse boy
healthy (salad) : falafel pita
healthy (salad) : tea english breakfast
healthy (salad) : bacon cheeseburger
healthy (salad) : crispy korean glazed tender
healthy (salad) : lunch eggplant parmigiana sandwich
healthy (salad) : okroshka
healthy (salad) : vegetable calzone
healthy (salad) : all meat pizza small
healthy (salad) : tuna sandwich
healthy (salad) : extra crackers
healthy (salad) : bbq salmon rice bowl
healthy (salad) : malts
healthy (salad) : mountain dew
healthy (salad) : hot wings
healthy (salad) : half and half bowl
healthy (salad) : cajun burger
healthy (salad) : mexican coke
healthy (salad) : eighteen wings
healthy (salad) : super deluxe big boy
healthy (salad) : fresh mozzarella
healthy (salad) : bison burger
healthy (salad) : ahi tuna filet
healthy (salad) : krabby patty
healthy (salad) : chicken fingers
healthy (salad) : cauliflower buffalo wings
healthy (s

LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_southern_64_input_ids
LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_southern_64_attention_mask
LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_southern_64_token_type_ids
LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_bubble-tea_64_input_ids
LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_bubble-tea_64_attention_mask
LOADING MEMEMAP /home/doordash-by-category/title-tokenized/bert-base-multilingual-cased/titles_bubble-tea_64_token_type_ids
Counter({1: 402, 0: 398})
healthy (salad) : vanilla latte
healthy (salad) : gnocchi babbo
healthy (salad) : express health salad
healthy (salad) : ten chicken souvlaki sticks
healthy (salad) : marshmallow rice crispy treat
healthy (salad) : bbq chicken
healthy (salad) : c

healthy (healthy) : sanpellegrino limonata
healthy (healthy) : side of shrimp
healthy (healthy) : avocado
healthy (healthy) : peppermint patty
healthy (healthy) : medium wasabi ponzu salmon
healthy (healthy) : strawberry
healthy (healthy) : perrier
healthy (healthy) : broccoli
healthy (healthy) : samosa chaat
healthy (healthy) : aloha
healthy (healthy) : diet pepsi
healthy (healthy) : gatorade fruit punch
healthy (healthy) : greenie in a bottle
healthy (healthy) : spicy yellow thai curry quinoa bowl
healthy (healthy) : albacore tuna executive box
healthy (healthy) : side brown rice
healthy (healthy) : the cowboy
healthy (healthy) : green machine
unhealthy (desserts) : hammonds sea salt caramel milk individual bar
unhealthy (desserts) : triple chocolate chunks cookie
unhealthy (desserts) : lo mein
unhealthy (desserts) : ultimate smokehouse combo
unhealthy (desserts) : roger that
unhealthy (desserts) : margarita grilled chicken
unhealthy (desserts) : boneless wings
unhealthy (desserts) :

unhealthy (fast-food) : patty melt beef
unhealthy (fast-food) : unsweetened tea
unhealthy (fast-food) : unsweet iced tea
unhealthy (fast-food) : caramel latte
unhealthy (fast-food) : v energy drink
unhealthy (fast-food) : sugarfree vanilla cappuccino
unhealthy (fast-food) : nonfat premium hot chocolate
unhealthy (fast-food) : caramel cappuccino
unhealthy (fast-food) : caramel macchiato
unhealthy (fast-food) : iced latte
unhealthy (fast-food) : hot tea
unhealthy (fast-food) : decaffeinated coffee
unhealthy (fast-food) : house salsa
unhealthy (fast-food) : iced nf sugar free van latte
unhealthy (fast-food) : big mac
unhealthy (fast-food) : fountain drinks
unhealthy (fast-food) : mega panda
unhealthy (fast-food) : crispy chicken salad
unhealthy (fast-food) : caramel hot chocolate
unhealthy (fast-food) : unsweetened tea
unhealthy (fast-food) : chicken burrito combo
unhealthy (fast-food) : value taco
unhealthy (fast-food) : crispy chicken garden salad
unhealthy (fast-food) : quarter pounder

unhealthy (desserts) : chicharron de pollo
unhealthy (desserts) : dipped pretzel dark chocolate
unhealthy (desserts) : garlic parmesan pretzel
unhealthy (desserts) : mango iced tea
unhealthy (desserts) : classic broccoli chicken alfredo
unhealthy (desserts) : pops chocolate chip
unhealthy (desserts) : skillet queso
unhealthy (desserts) : golden tofu
unhealthy (desserts) : shrimp fajitas
unhealthy (desserts) : coffee
unhealthy (desserts) : salted caramel chocolate chip pint
unhealthy (desserts) : kids chicken corn dog
unhealthy (desserts) : nestle water
unhealthy (desserts) : omelet with cheese
unhealthy (desserts) : shrimp fajitas
unhealthy (desserts) : tempura shrimp
unhealthy (desserts) : ibc root beer
unhealthy (desserts) : sour plum tea
unhealthy (desserts) : gulab jamun
unhealthy (desserts) : milk chocolate ballerina
unhealthy (desserts) : old fashioned long nut rolls
unhealthy (desserts) : double glazed baby back ribs
unhealthy (desserts) : duroloco
unhealthy (desserts) : turtle 

healthy (salad) : onion rings
healthy (salad) : shrimp lovers dream
healthy (salad) : monster energy
healthy (salad) : chicken club wrap
healthy (salad) : tuna salad
healthy (salad) : ahi tuna filet
healthy (salad) : spaghetti
healthy (salad) : veggie burrito
healthy (salad) : extra virgin olive oil
healthy (salad) : meatball sub
healthy (salad) : shrimp parmigiana
healthy (salad) : patty melt
healthy (salad) : hot chocolate
healthy (salad) : make your own salad
healthy (salad) : omelete
healthy (salad) : buttercream frosted flower cookie
healthy (salad) : lemonade
healthy (salad) : lettuce iceburg
healthy (salad) : taco salad
healthy (salad) : veggie deluxe pizza
healthy (salad) : garden salad
healthy (salad) : club melt
healthy (salad) : zuppa di clams dinner
healthy (salad) : grilled cheese
healthy (salad) : individually baked apple pie vanilla
healthy (salad) : green beans
healthy (salad) : sedona taquito plate
healthy (salad) : veggie burger with cheese
healthy (salad) : pound of 

unhealthy (fast-food) : iced latte
unhealthy (fast-food) : french fries
unhealthy (fast-food) : chicken california combo
unhealthy (fast-food) : nonfat french vanilla latte
unhealthy (fast-food) : cheeseburger
unhealthy (fast-food) : caramel cappuccino
unhealthy (fast-food) : iced sugar free vanilla coffee
unhealthy (fast-food) : cinnamon pull aparts
unhealthy (fast-food) : chocolate milk
unhealthy (fast-food) : chicken
unhealthy (fast-food) : caramel latte
unhealthy (fast-food) : mango pineapple smoothie
unhealthy (fast-food) : nonfat caramel cappuccino
unhealthy (fast-food) : chicken and dumplings
unhealthy (fast-food) : oreo mcflurry
unhealthy (fast-food) : americano
unhealthy (fast-food) : caramel latte
unhealthy (fast-food) : french vanilla latte
unhealthy (fast-food) : nonfat premium hot chocolate
unhealthy (fast-food) : bacon egg cheese mcgriddle meal
unhealthy (fast-food) : pepperoni steak
unhealthy (fast-food) : big mac
unhealthy (fast-food) : double filet o fish meal
unhealth

healthy (salad) : caesar salad
healthy (salad) : ceviche
healthy (salad) : heath crunch cookie
healthy (salad) : spinach deluxe
healthy (salad) : chicken teriyaki onigiri
healthy (salad) : bacon wrapped shrimp
healthy (salad) : fried beef ravioli
healthy (salad) : buffalo wings
healthy (salad) : eggplant parm
healthy (salad) : french fries
healthy (salad) : deluxe big boy
healthy (salad) : chicken wings
healthy (salad) : chicken tender hoagie
healthy (salad) : veggie light
healthy (salad) : motheroni whole pan style pizza
healthy (salad) : baked apples
healthy (salad) : deluxe pizza bowl
healthy (salad) : chocolate malt
healthy (salad) : chicken quesadilla
healthy (salad) : mediterranean salad
healthy (salad) : linguini pesto
healthy (salad) : fried chicken sandwich
healthy (salad) : the hound
healthy (salad) : oven baked chicken wings
healthy (salad) : vanilla shake
healthy (salad) : vegetarian etouffee
healthy (salad) : veggie kebab
healthy (salad) : lemon
healthy (salad) : sausage c

unhealthy (desserts) : lunch combo bacon ranch chicken quesadillas
unhealthy (desserts) : cappuccino
unhealthy (desserts) : pistachio
unhealthy (desserts) : onion rings
unhealthy (desserts) : matcha jasmine mt
unhealthy (desserts) : strawberry cupcake
unhealthy (desserts) : chocolate chip scone
unhealthy (desserts) : enstrom copper milk almond toffee tin
unhealthy (desserts) : cronut
unhealthy (desserts) : matcha green tea milk frappe
unhealthy (desserts) : side caesar salad
unhealthy (desserts) : milk chocolate lab truffle
unhealthy (desserts) : molletes
unhealthy (desserts) : juice
unhealthy (desserts) : boneless wings
unhealthy (desserts) : cheese tortellini with salmon
unhealthy (desserts) : chicken tikka plait
unhealthy (desserts) : sweet cream ice cream
unhealthy (desserts) : ranchero benedict
unhealthy (desserts) : small cheese
unhealthy (desserts) : guaca monkey sandwich
unhealthy (desserts) : chicken parmesan
unhealthy (desserts) : churros works
unhealthy (desserts) : egg sala

unhealthy (fast-food) : caramel mocha
unhealthy (fast-food) : nonfat caramel cappuccino
unhealthy (fast-food) : mix by sprite tropic berry
unhealthy (fast-food) : turkey cheddar melt
unhealthy (fast-food) : chocolate cake
unhealthy (fast-food) : blt
unhealthy (fast-food) : chicken quesadilla snacker
unhealthy (fast-food) : milk
unhealthy (fast-food) : egg mcmuffin meal
unhealthy (fast-food) : double quarter pounder with cheese
unhealthy (fast-food) : triple chicken box
unhealthy (fast-food) : iced caramel latte
unhealthy (fast-food) : senior southern style sweet tea
unhealthy (fast-food) : sweet iced tea
unhealthy (fast-food) : quarter pounder with cheese
unhealthy (fast-food) : mix by sprite tropic berry
unhealthy (fast-food) : nonfat vanilla cappuccino
unhealthy (fast-food) : big king xl meal
unhealthy (fast-food) : big breakfast with hotcakes
unhealthy (fast-food) : double big mac meal
unhealthy (fast-food) : hazelnut latte
unhealthy (fast-food) : iced latte
unhealthy (fast-food) : 

unhealthy (fast-food) : large mozzarella sticks
unhealthy (fast-food) : latte
unhealthy (fast-food) : hash brown
unhealthy (fast-food) : bbq bacon crispy chicken sandwich meal
unhealthy (fast-food) : nonfat vanilla cappuccino
unhealthy (fast-food) : handcrafted guac
unhealthy (fast-food) : box of decaf coffee
unhealthy (fast-food) : double hamburger
unhealthy (fast-food) : shrimp
unhealthy (fast-food) : strawberry banana smoothie
unhealthy (fast-food) : mcchicken biscuit meal
unhealthy (fast-food) : caramel cappuccino
unhealthy (fast-food) : nonfat french vanilla latte
unhealthy (fast-food) : vanilla cappuccino
unhealthy (fast-food) : can soda
unhealthy (fast-food) : hamburger happy meal
unhealthy (fast-food) : low fat turkey
unhealthy (fast-food) : black raspberry chocolate chip
unhealthy (fast-food) : premium roast coffee
unhealthy (fast-food) : grilled chicken salad
unhealthy (fast-food) : jacked up value brc burrito
unhealthy (fast-food) : beyond taco
unhealthy (fast-food) : iced c

healthy (salad) : buffalo garlic knots
healthy (salad) : portabella char
healthy (salad) : coconut curry chicken hot plate
healthy (salad) : the riviera sandwich
healthy (salad) : mushroom burger
healthy (salad) : chicken noodle
healthy (salad) : carrot juice freshly squeezed
healthy (salad) : chicken yaki udon
healthy (salad) : coke
healthy (salad) : buffalo chicken strips
healthy (salad) : cannoli
healthy (salad) : fried calamari
healthy (salad) : shish tawouk
healthy (salad) : mont blanc
healthy (salad) : steak hoagie
healthy (salad) : avocado
healthy (salad) : double chocolate brownie
healthy (salad) : coffee shake
healthy (salad) : greek yogurt parfait
healthy (salad) : iced tea
healthy (salad) : carbonara station
healthy (salad) : quattro formaggi
healthy (salad) : bottled water
healthy (salad) : the pastrami pizza slice
healthy (salad) : chicken cheesesteak
healthy (salad) : your panini
healthy (salad) : philly cheesesteak
healthy (salad) : pecanberry salad
healthy (salad) : chi

unhealthy (desserts) : pretzel pocket
unhealthy (desserts) : chicken piccata
unhealthy (desserts) : basket of fries
unhealthy (desserts) : white spinach queso
unhealthy (desserts) : classic broccoli blackened shrimp alfredo
unhealthy (desserts) : side steamed broccoli
unhealthy (desserts) : lunch chicken fajitas
unhealthy (desserts) : side steamed broccoli
unhealthy (desserts) : spring spinach scramble
unhealthy (desserts) : spicy shrimp tacos
unhealthy (desserts) : kids combo
unhealthy (desserts) : grilled steak sandwich
unhealthy (desserts) : apple fritter
unhealthy (desserts) : case of water
unhealthy (desserts) : toasted coconut pint
unhealthy (desserts) : the jaffa
unhealthy (desserts) : elote entero
healthy (vegetarian) : apple mango avocado salad
healthy (vegetarian) : dal mash handi
healthy (vegetarian) : chicken bacon ranch queso
healthy (vegetarian) : garden fresh salad
healthy (vegetarian) : payasam
healthy (vegetarian) : double chocolate brownie
healthy (vegetarian) : diet 

In [70]:
# Display the current value of `rootdir` (assigned outside this cell).
rootdir

'/home/doordash-by-dish-no-and/title-tokenized/bert-base-multilingual-cased'

# Old

In [None]:
# Cuisine-level grouping: label -> list of filenames corresponding to that label.
# Kept under its own name: it used to be assigned to `targets_to_categories`
# and was overwritten by the dish-type mapping a few lines later, so it never
# took effect.
cuisine_targets_to_categories = {
    'mexican': ['mexican'],
    'chinese': ['chinese'],
    'american': ['american', 'burgers'],
    'italian': ['italian', 'pasta'],
    'thai': ['thai'],
    'indian': ['indian'],
    'japanese': ['japanese', 'ramen', 'sushi'],
    'other': [
        'african', 'argentine', 'australian',
        'bakery', 'belgian', 'brazilian', 'burmese',  # 'desserts' is commented out
        'drinks', 'ethiopian', 'filipino', 'french',  # 'alcohol' is commented out
        'german', 'greek', 'korean', 'vietnamese', 'poke',
    ],
}

# Dish-type grouping: label -> list of filenames corresponding to that label.
# Here every label maps to the single filename that shares its name.
targets_to_categories = {
    'burger': ['burger'],
    'other': ['other'],
    'pizza': ['pizza'],
    'salad': ['salad'],
    'sandwich': ['sandwich'],
    'soup': ['soup'],
    'sushi': ['sushi'],
}