In [1]:
# !pip install torch torchvision torchaudio
# pip install "numpy<2"
# !pip install transformers
# import sys
# !{sys.executable} -m pip install transformers
# !{sys.executable} -m pip install "numpy<2"
# !{sys.executable} -m pip install torch torchvision torchaudio
# !{sys.executable} -m pip install pandas

In [2]:
import torch
import sys
import pandas as pd
from transformers import pipeline
import re
from tqdm import tqdm  # optional progress bar
from collections import Counter 
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Data Prep

- Pull in data from CSV 
- Filter for categories that have had at least 10 sale days from Jan - Apr
  - Excluding May through June for new sellers
- Remove every non-letter character (numbers, punctuation) from the item description and convert it to lowercase
- Tokenize each description (split on whitespace)


In [37]:
# Load the cleaned hospital sales transactions and preview the first rows
df = pd.read_csv(filepath_or_buffer='../data/Hospital_data_cleaned.csv')
df.head()

Unnamed: 0,HOSPITAL,PMT_GRP,SALES_DT,SALES_TM,CHECK_ID,TRANS_ID,CATEGORY,ITEM_NM,GROSS_REV,RUNNING_CAT_REV,RUNNING_ITM_REV,RUNNING_HOSP_CAT_REV,RUNNING_HOSP_ITM_REV,CAT_REV_RANK,ITM_REV_RANK,HOSP_CAT_RANK,HOSP_ITM_RANK,FREQ_SELLER
0,HOSPITAL B,UNCATEGORIZED,2024-02-14,12:45:47,26937172,48028188,MARKET ST. DELI,TUNA SALAD,7.99,83152.25,7077.01,38876.16,3098.66,8,62,16,165,1
1,HOSPITAL B,UNCATEGORIZED,2024-01-19,07:07:35,27572139,48028288,ARISESHINEMRNMKT,SCRAMBLED EGGS,1.99,323487.36,55855.59,196469.96,28542.44,4,5,6,9,1
2,HOSPITAL B,UNCATEGORIZED,2024-01-18,12:57:02,27056269,48028249,GRILL & CO,LARGE FRY CUP,2.99,477168.19,74821.96,252197.78,56962.57,2,4,2,2,1
3,HOSPITAL B,UNCATEGORIZED,2024-02-21,12:39:45,28129406,48028230,PRODCLASS200,SODA COKE ZERO 20OZ BOTTLE,2.19,478517.01,13302.48,260574.97,7646.98,1,29,1,54,1
4,HOSPITAL B,UNCATEGORIZED,2024-02-02,13:26:18,26665167,48028219,LA CUCINA,PEPPERONI PIZZA,7.99,93398.59,29854.26,58561.29,16359.46,7,12,12,22,1


In [39]:

# Step 1: One row per distinct item name, so each name is tokenized only once
unique_item_names = df['ITEM_NM'].unique()
items_df = pd.DataFrame({'ITEM_NM': unique_item_names})

# Step 2: Define clean and tokenize function
def clean_and_tokenize(text):
    """Lowercase ``text``, strip every character that is not a-z or whitespace,
    and return the remaining whitespace-separated words as a list.

    Characters are deleted, not replaced by a space, so "20OZ" becomes "oz".
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return letters_only.split()

def clean(text):
    """Return ``text`` lowercased with every character other than a-z and
    whitespace removed. Whitespace is left untouched (no collapsing or trimming).
    """
    lowered = text.lower()
    return re.sub(r'[^a-z\s]', '', lowered)
# Step 3: Build the token columns
# Token lists per unique item: these are the inputs to the classification model
items_df['tokens'] = items_df['ITEM_NM'].apply(lambda name: clean_and_tokenize(name))
# Cleaned (un-split) text on the transaction frame, used to join output back to input
df['tokens'] = df['ITEM_NM'].apply(lambda name: clean(name))
# Preview the tokenized items
items_df.head()

Unnamed: 0,ITEM_NM,tokens
0,TUNA SALAD,"[tuna, salad]"
1,SCRAMBLED EGGS,"[scrambled, eggs]"
2,LARGE FRY CUP,"[large, fry, cup]"
3,SODA COKE ZERO 20OZ BOTTLE,"[soda, coke, zero, oz, bottle]"
4,PEPPERONI PIZZA,"[pepperoni, pizza]"


# Models

Uses Facebook's pre-trained BART language model (`facebook/bart-large-mnli`) for zero-shot classification

Model is able to categorize without having a predefined training set

We run two separate classifications (one per label set) because a single item can be, for example, both healthy and a drink

Each individual word is assigned its highest-scoring label; an item's category is then the label that occurs most often among its words (majority rule)

### Model 1 - tagging Drinks vs Food

### Model 2 - tagging Healthy vs Unhealthy

#### Creating NLP Function

Function takes in

- label list (food/drink/etc)
- classifier model (NLP pipeline)
- unique tokens (model efficiency)
- batch size (model efficiency)

In [8]:

def NLP(labels_list, classifier_input, tokenized_list, batch_size_num):
    """Classify every distinct token and return the top label for each one.

    Parameters
    ----------
    labels_list : list of str
        Candidate labels handed to the classifier.
    classifier_input : callable
        Invoked as ``classifier_input(batch, labels_list)`` where ``batch`` is a
        list of tokens. It must return one dict per token (or a single dict)
        with the keys ``'sequence'``, ``'labels'`` and ``'scores'``.
        NOTE(review): the first entry of ``labels``/``scores`` is taken as the
        winner, which assumes the classifier returns them best-first (the
        Hugging Face zero-shot pipeline documents this) -- confirm if a
        different classifier is plugged in.
    tokenized_list : iterable of iterable of str
        Token lists, e.g. one list per item name. Duplicates are classified once.
    batch_size_num : int
        Number of tokens passed to the classifier per call.

    Returns
    -------
    pandas.DataFrame
        Columns ``token``, ``label``, ``score``; one row per distinct token.
        The columns are present even when there are no tokens, so callers can
        always index ``['token']`` / ``['label']``.
    """
    # Deduplicate, then sort: a bare set has a hash-dependent order, which would
    # change batch composition and row order from one kernel run to the next.
    unique_tokens = sorted({token for tokens in tokenized_list for token in tokens})

    results = []
    for start in tqdm(range(0, len(unique_tokens), batch_size_num)):
        batch = unique_tokens[start:start + batch_size_num]
        outputs = classifier_input(batch, labels_list)
        if isinstance(outputs, dict):  # a single input may come back un-listed
            outputs = [outputs]
        for output in outputs:
            results.append({
                "token": output['sequence'],
                "label": output['labels'][0],
                "score": output['scores'][0]
            })

    # Explicit columns keep the schema stable for an empty result set.
    return pd.DataFrame(results, columns=["token", "label", "score"])


#### Running models for Food categories, then Healthy vs Unhealthy

In [10]:
# Inputs

# Candidate labels for the two zero-shot passes
labels_food = ["Beverage", "Meat","Dairy","Side","Fruit","Dessert","Snack"]
# Coarser alternative that was tried: ["Beverage", "Food"]
labels_healthy = ["Healthy","Unhealthy"]

# Model: use accelerator 0 (CUDA or Apple MPS) when one is available, otherwise
# fall back to CPU (-1). A hard-coded device=0 fails on machines with no accelerator.
_mps_backend = getattr(torch.backends, "mps", None)
_has_accelerator = torch.cuda.is_available() or (
    _mps_backend is not None and _mps_backend.is_available()
)
device = 0 if _has_accelerator else -1
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)

# Number of tokens sent to the classifier per call
batch_size = 16

# Run one classification pass per label set (slow: tens of minutes in the recorded run)
results_df_food = NLP(labels_food, classifier, items_df['tokens'], batch_size)
results_df_health = NLP(labels_healthy, classifier, items_df['tokens'], batch_size)


Device set to use mps:0
100%|███████████████████████████████████████████| 80/80 [08:52<00:00,  6.66s/it]
100%|███████████████████████████████████████████| 80/80 [35:38<00:00, 26.73s/it]


### Selecting Majority Rank

Attaching each category into our primary DataFrame

In [41]:
def get_majority_label(token_list, token_map):
    """Return the label that occurs most often among the tokens of one item.

    Tokens missing from ``token_map`` are ignored. When several labels tie, the
    one encountered first in ``token_list`` wins. Returns ``None`` when no
    token has a label.
    """
    votes = Counter(token_map[token] for token in token_list if token in token_map)
    if not votes:
        return None
    winner, _count = votes.most_common(1)[0]
    return winner

# Map each token to the top label the classifier assigned to it
token_to_label = dict(zip(results_df_food['token'], results_df_food['label']))
token_to_label_healthy = dict(zip(results_df_health['token'], results_df_health['label']))

# Majority vote over each item's tokens, once per label set.
# (The former trailing `.drop(columns='tokens')` was applied to a Series,
# where `columns` has no effect, so it has been removed.)
items_df['predicted_category'] = items_df['tokens'].apply(lambda x: get_majority_label(x, token_to_label))
items_df['health_category'] = items_df['tokens'].apply(lambda x: get_majority_label(x, token_to_label_healthy))

# Attach the item-level labels to every transaction.
# - Only the label columns are merged, so no `tokens_x` / `tokens_y` pair is created.
# - The helper `tokens` column and any labels from a previous run are dropped
#   first, which lets this cell be re-run without raising or duplicating columns.
# - `validate` guards against row duplication if `items_df` ever stops being
#   unique on ITEM_NM.
label_columns = ['predicted_category', 'health_category']
df = (
    df
    .drop(columns=['tokens', *label_columns], errors='ignore')
    .merge(
        items_df[['ITEM_NM', *label_columns]],
        on='ITEM_NM',
        how='left',
        validate='many_to_one'
    )
)


In [43]:
# Spot-check that the two category columns were attached to the transactions
df.head(n=5)

Unnamed: 0,HOSPITAL,PMT_GRP,SALES_DT,SALES_TM,CHECK_ID,TRANS_ID,CATEGORY,ITEM_NM,GROSS_REV,RUNNING_CAT_REV,RUNNING_ITM_REV,RUNNING_HOSP_CAT_REV,RUNNING_HOSP_ITM_REV,CAT_REV_RANK,ITM_REV_RANK,HOSP_CAT_RANK,HOSP_ITM_RANK,FREQ_SELLER,predicted_category,health_category
0,HOSPITAL B,UNCATEGORIZED,2024-02-14,12:45:47,26937172,48028188,MARKET ST. DELI,TUNA SALAD,7.99,83152.25,7077.01,38876.16,3098.66,8,62,16,165,1,Meat,Healthy
1,HOSPITAL B,UNCATEGORIZED,2024-01-19,07:07:35,27572139,48028288,ARISESHINEMRNMKT,SCRAMBLED EGGS,1.99,323487.36,55855.59,196469.96,28542.44,4,5,6,9,1,Side,Unhealthy
2,HOSPITAL B,UNCATEGORIZED,2024-01-18,12:57:02,27056269,48028249,GRILL & CO,LARGE FRY CUP,2.99,477168.19,74821.96,252197.78,56962.57,2,4,2,2,1,Side,Healthy
3,HOSPITAL B,UNCATEGORIZED,2024-02-21,12:39:45,28129406,48028230,PRODCLASS200,SODA COKE ZERO 20OZ BOTTLE,2.19,478517.01,13302.48,260574.97,7646.98,1,29,1,54,1,Beverage,Unhealthy
4,HOSPITAL B,UNCATEGORIZED,2024-02-02,13:26:18,26665167,48028219,LA CUCINA,PEPPERONI PIZZA,7.99,93398.59,29854.26,58561.29,16359.46,7,12,12,22,1,Side,Healthy


### Make Manual Edits

Review top 50 items and make adjustments as needed

In [45]:
# Transaction counts per (item, predicted category, health category), most
# frequent first, to review how the best-selling items were labelled.
review_columns = ['ITEM_NM', 'predicted_category', 'health_category', 'TRANS_ID']
(
    df[review_columns]
    .groupby(['ITEM_NM', 'predicted_category', 'health_category'])
    .count()
    .sort_values(by='TRANS_ID', ascending=False)
    .head(20)
)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,TRANS_ID
ITEM_NM,predicted_category,health_category,Unnamed: 3_level_1
SCRAMBLED EGGS,Side,Unhealthy,25388
BACON,Meat,Healthy,24677
BISCUIT,Side,Healthy,22743
CHICKEN TENDERS,Meat,Healthy,21405
LARGE FRY CUP,Side,Healthy,20705
SMASHED BURGER,Side,Unhealthy,16715
12OZ COFFEE,Side,Unhealthy,15495
HASH BROWNS,Side,Healthy,14636
SMALL FRY CUP,Side,Healthy,13886
SAUSAGE GRAVY,Meat,Healthy,12213


In [47]:


# Manual overrides: (regex on ITEM_NM, predicted category, health category).
# Order matters: np.select applies the FIRST matching rule to each row.
# Patterns are whole-word and case-insensitive, so plural forms are only
# covered where the pattern allows them (e.g. `tenders?`).
manual_rules = [
    (r'\btenders?\b',       'Meat',      'Unhealthy'),
    (r'\bscrambled eggs\b', 'Breakfast', 'Healthy'),
    (r'\bbiscuit\b',        'Side',      'Unhealthy'),
    (r'\bfry\b',            'Side',      'Unhealthy'),
    (r'\bburger\b',         'Meat',      'Unhealthy'),
    (r'\bgravy\b',          'Side',      'Unhealthy'),
    (r'\bzero\b',           'Beverage',  'Healthy'),
    (r'\bdiet\b',           'Beverage',  'Healthy'),
    (r'\bcake\b',           'Dessert',   'Unhealthy'),
    (r'\bcoffee\b',         'Beverage',  'Healthy'),
    (r'\bbacon\b',          'Meat',      'Unhealthy'),
]

# Unpack the rule table into the aligned lists np.select expects
conditions = [
    df['ITEM_NM'].str.contains(pattern, case=False, na=False)
    for pattern, _category, _health in manual_rules
]
predicted_values = [category for _pattern, category, _health in manual_rules]
health_values = [health for _pattern, _category, health in manual_rules]

# Rows that match no rule keep their model-assigned labels
df['predicted_category'] = np.select(conditions, predicted_values, default=df['predicted_category'])
df['health_category'] = np.select(conditions, health_values, default=df['health_category'])

# Re-check the most frequent items after the overrides
(
    df[['ITEM_NM', 'predicted_category', 'health_category', 'TRANS_ID']]
    .groupby(['ITEM_NM', 'predicted_category', 'health_category'])
    .count()
    .sort_values(by='TRANS_ID', ascending=False)
    .head(50)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,TRANS_ID
ITEM_NM,predicted_category,health_category,Unnamed: 3_level_1
SCRAMBLED EGGS,Breakfast,Healthy,25388
BACON,Meat,Unhealthy,24677
BISCUIT,Side,Unhealthy,22743
CHICKEN TENDERS,Meat,Unhealthy,21405
LARGE FRY CUP,Side,Unhealthy,20705
SMASHED BURGER,Meat,Unhealthy,16715
12OZ COFFEE,Beverage,Healthy,15495
HASH BROWNS,Side,Healthy,14636
SMALL FRY CUP,Side,Unhealthy,13886
SAUSAGE GRAVY,Side,Unhealthy,12213


In [49]:
# Persist the labelled transactions (no index column) for downstream use
df.to_csv(path_or_buf='../data/food_cluster.csv', index=False)