This notebook should smartly sample your train and test dataset

Expected input: 
- cleaned data (00)
- NER results (01b, optional)

Expected output: 
- train set/ test set csv
- remaining items to sample from

In [1]:
import utils as ut
import pandas as pd
from pandas.api.types import is_numeric_dtype
import os
import hjson as json
import numpy as np

import importlib
importlib.reload(ut)

  warn_incompatible_dep(
2023-02-20 19:10:53.642187: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-20 19:10:54.271320: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-02-20 19:10:54.271381: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


<module 'utils' from '/home/shared/code/08_protein_attribution/utils.py'>

In [2]:
# Read in our params file
f = open('input_params.hjson')
params = json.load(f)
f.close()

# SQL params
db_type = params['core']['db_type']
#(cia.cleansed_tier_1 = 'Beverages' or cia.cleansed_tier_2= 'Snacks/Bakery')
sql_code = params['core']['sql_code']

# Modelling params
itemname_col = params['core']['itemname_col']
spend_col = params['core']['spend_col']
cls_input_cols = params['core']['cls_input_cols']

ner_model_name = params['core']['ner_model_name']
cls_model_name = params['core']['cls_model_name']
model_type = 'classifier'
sample_n = params['nb_three']['sample_n']
test_samples = params['nb_three']['test_samples']
n_strata = params['nb_three']['n_strata']
ner_clust_cols = params['core']['ner_clust_cols']

# # Algorithm specific params
use_pretrained_model = params['core']['use_pretrained_model']
model_architecture = params['core']['model_architecture']
model_path = params['core']['model_path']


In [3]:
use_ner_results_as_clust_input = 1

if use_ner_results_as_clust_input:
    df = pd.read_csv(f'named_entity_recognition/{ner_model_name}/data/{ner_model_name}_preprocessed.csv')
else:
    df = pd.read_csv(f'{model_type}/{cls_model_name}/data/{cls_model_name}_preprocessed.csv')

df_full = df.copy(deep=True) # First, save a copy of the full dataset for reference
df

Unnamed: 0,lineitem_name,tier_1,tier_2,tier_3,tier_4,sales_amt_gross,item_for_selection,clust_input
0,Grits Large,Food,Breakfast,Breakfast Side,Grits,78552.240000,grits large,grits large
1,Large Boat,Food,Entree,Weighed/Build Your Own,Weighed/Build Your Own,190793.963333,large boat,large boat
2,Kitchen Fresh 1137 Italian Focaccia (7.7oz),Food,Entree,Sandwich/Wrap,Sandwich/Wrap,84461.820000,kitchen fresh 1137 italian focaccia 7 7oz,kitchen fresh 1137 italian focaccia 7 7oz
3,Adobo Chicken Bowl,Food,Entree,Noodle/Grain Bowl,Noodle/Grain Bowl,236852.690000,adobo chicken bowl,adobo chicken bowl
4,8oz Steel Cut Oatmeal (1,Food,Breakfast,Oatmeal/Cereal,Oatmeal,125298.810000,8oz steel cut oatmeal 1,8oz steel cut oatmeal 1
...,...,...,...,...,...,...,...,...
460910,Spicy Pepperjack Burger,Food,Entree,Burger,Burger,5.490000,spicy pepperjack burger,spicy pepperjack burger
460911,Charlotte SP White Egg Salad (6oz),Food,Entree,Other Entree,Other Entree,2.990000,charlotte sp white egg salad 6oz,charlotte sp white egg salad 6oz
460912,Crisper and Waffle Combo,Food,Breakfast,Griddle,Waffles,11.890000,crisper and waffle combo,crisper and waffle combo
460913,BFK - FIT Applewood Bacon Egg & Cheddar Flatbread,Food,Breakfast,Breakfast Sandwiches,Breakfast Sandwich/Wrap,0.000000,bfk fit applewood bacon egg cheddar flatbread,bfk fit applewood bacon egg cheddar flatbread


### Generate a train and test dataset

In [4]:
## First, let's generate a test set


def get_test_set(df, spend_col, sampling_technique):
    ## This function samples the df and creates (a) a test set and (b) the rest of the items

    # Get the relevant subset of columns
    df_train_test_selector = df[['item_for_selection', spend_col]]

    # group by item_for_selection to remove duplicate items.
    funcs = {col:'sum' if is_numeric_dtype(df_train_test_selector[col])
                                else 'first'
                    for col in df_train_test_selector.drop('item_for_selection', 1).columns}
    df_train_test_selector = df_train_test_selector.groupby('item_for_selection').agg(funcs).reset_index()

    # Get a sample of our dataset to be our test set.
    cols = list(df_train_test_selector)
    df_ss = df_train_test_selector.copy(deep=True)
    if sampling_technique == 'stratified':
        df_ss = df_ss[df_ss[spend_col] > df_ss[spend_col].quantile(.1)] # Generate the test set on the top 90% of data
        df_ss['bin']= pd.qcut(df_ss[spend_col].astype('float'),n_strata, labels=False)
        df_samples = df_ss.groupby('bin').sample(n=int(test_samples/n_strata), random_state=7)
    elif sampling_technique == 'random':
        df_ss = df_ss[df_ss[spend_col] > df_ss[spend_col].quantile(.1)] # Generate the test set on the top 90% of data
        df_samples = df_ss.sample(n=test_samples, random_state=7)
    df_samples = df_samples[cols] # drop all the extra cols we made

    # Generate a test set and a remaining_items set
    cols = list(df)
    df_merged = df.merge(df_samples['item_for_selection'], on='item_for_selection', how='left', indicator=True)

    df_test = df_merged[df_merged['_merge'] == 'both']
    df_test = df_test[cols] # drop all the extra cols we made

    df_not_sampled = df_merged[df_merged['_merge'] == 'left_only']
    df_not_sampled = df_not_sampled[cols] # drop all the extra cols we made
    df = df_not_sampled.copy()

    return df_test, df


# group item_for_selection
sampling_technique = 'random' # random or stratified
df_test, df = get_test_set(df, spend_col, sampling_technique)
df_remaining= df.copy(deep=True)

  for col in df_train_test_selector.drop('item_for_selection', 1).columns}


In [5]:
# ## Generate the test set
# cols = list(df)

# # Get a stratified sample of our dataset to be our test set.
# sampling_technique = 'random' # random or stratified
# df_ss = df.copy(deep=True)
# if sampling_technique == 'stratified':
#     df_ss = df_ss[df_ss[spend_col] > df_ss[spend_col].quantile(.1)] # Generate the test set on the top 90% of data
#     df_ss['bin']= pd.qcut(df_ss[spend_col].astype('float'),n_strata, labels=False)
#     df_stratified_samples = df_ss.groupby('bin').sample(n=int(test_samples/n_strata), random_state=7)
# elif sampling_technique == 'random':
#     df_stratified_samples = df_ss.sample(n=test_samples, random_state=7)
# df_stratified_samples = df_stratified_samples[cols] # drop all the extra cols we made

# # Drop the test set from the rest of the data to avoid a leak.
# df_merged = df.merge(df_stratified_samples['item_for_selection'], on='item_for_selection', how='left', indicator=True)
# df_not_sampled = df_merged[df_merged['_merge'] == 'left_only']
# df_not_sampled = df_not_sampled[cols] # drop all the extra cols we made
# df = df_not_sampled.copy()

In [6]:
## Generate the train set using k-means sampling. This prepares the data

def trim_df_for_sampling(df, max_size, spend_col):
# Limit our k-means clustering to a max size so we don't overflow 

    if df.shape[0] > max_size:
        print('Input df too large. Pruning...')
        pct = max_size/df.shape[0]
        df = df[df[spend_col] > df[spend_col].quantile(.1)] # Focus on the top 90% of data b/c it tends to be cleaner
        df = df.sample(n=max_size, random_state=7)
        print(f'Percent of data kept: {pct}')
    return df

max_size = 1000
df = trim_df_for_sampling(df, max_size, spend_col)
df

Input df too large. Pruning...
Percent of data kept: 0.0021697198240791167


Unnamed: 0,lineitem_name,tier_1,tier_2,tier_3,tier_4,sales_amt_gross,item_for_selection,clust_input
122446,WELL-BEING - Lunch Deli Promo,Food,Entree,Other Entree,Lunch Special,353.99,well being lunch deli promo,well being lunch deli promo
67135,Double Chalupa Beef,Food,Entree,Beef Entree,Beef Entree,826.16,double chalupa beef,double chalupa beef
245235,JO: Chicken Salad Snacker,Food,Entree,Salad,Salad,1731.53,jo chicken salad snacker,jo chicken salad snacker
166353,BEEF AND BROCCOLI,Food,Entree,Beef Entree,Beef Entree,98.45,beef and broccoli,beef and broccoli
295607,Flame Feature 1. Chicken Bacon S,Food,Entree,Poultry Entree,Poultry Entree,24.00,flame feature 1 chicken bacon s,flame feature 1 chicken bacon s
...,...,...,...,...,...,...,...,...
287298,Street Eats - Banh Mi Combo,Food,Entree,Lunch Combo,Lunch Combo,479.40,street eats banh mi combo,street eats banh mi combo
211326,3 Tenders& 3 Sauces & Cheesy Biscuit,Food,Entree,Chicken Tenders,Chicken Tenders,941.85,3 tenders 3 sauces cheesy biscuit,3 tenders 3 sauces cheesy biscuit
206825,panang tofu curry,Food,Entree,Vegetarian Entree,Vegetarian Entree,264.00,panang tofu curry,panang tofu curry
107131,Irish Breakfast Grilled Cheese,Food,Breakfast,Breakfast Sandwiches,Breakfast Sandwich/Wrap,354.67,irish breakfast grilled cheese,irish breakfast grilled cheese


In [7]:
'''
    Here, we're sampling twice. Depending on the columns selected in ner_clust_cols, you might find a lot of cases where the clust_input is empty. 
    Instead of ignoring these cases, we're going to cluster on the itemname_cols instead, so we can still have a represetnative sample of items 
    where the cluster columns are empty. This is particularly important when building models around relatively sparse features. For example, in a classifier on 
    dietary categories, most items won't contain any dietary-related tokens (e.g., diet, zero, zero sugar, probiotic). In those empty cases the classifier to just 
    label the item as a 'regular' item; the model needs to see enough 'regular' items to learn the proper mappings.
'''

def k_means_sampling(df, spend_col, sample_n, model_path, use_pretrained_model, model_architecture, use_cached_results):
    df = df[['item_for_selection', 'clust_input', spend_col]]

    out = []

    empty_ppt = (df['clust_input'] == '').sum()/len(df)
    CRS = ut.ClustResample(
        df = df[(df[spend_col] > 0) & (df['clust_input']!='')],
        input_col = 'clust_input',
        sample_n = round(sample_n*1-empty_ppt),
        model_path = model_path,
        use_pretrained_model=use_pretrained_model,
        model_architecture=model_architecture
    )

    if use_cached_results and os.path.isfile(f'{model_type}/{cls_model_name}/models/{cls_model_name}_bert_vecs_1_feather'):
        CRS.out = pd.read_feather(f'{model_type}/{cls_model_name}/models/{cls_model_name}_bert_vecs_1_feather') # Save the bert vecs so you don't have to run it everytime
    else:
        CRS.bert_vecs()
        CRS.kmeans()
        CRS.sample_centroids()
        CRS.format_for_classifier_labelling()
        CRS.out = CRS.out.drop(columns = [x for x in ner_clust_cols if x not in [itemname_col]])
        CRS.out.reset_index().to_feather(f'{model_type}/{cls_model_name}/models/{cls_model_name}_bert_vecs_1_feather')
    out.append(CRS.out)

    CRS = ut.ClustResample(
        df = df[(df[spend_col] > 0) & (df['clust_input']=='')],
        input_col = 'item_for_selection',
        sample_n = round(sample_n*empty_ppt),
        model_path = model_path,
        use_pretrained_model=use_pretrained_model,
        model_architecture=model_architecture
    )

    if len(CRS.df) > round(sample_n*empty_ppt) and  round(sample_n*empty_ppt) > 0:  
        if use_cached_results and os.path.isfile(f'{model_type}/{cls_model_name}/models/{cls_model_name}_bert_vecs_2_feather'):
            CRS.out = pd.read_feather(f'{model_type}/{cls_model_name}/models/{cls_model_name}_bert_vecs_2_feather') # Save the bert vecs so you don't have to run it everytime
        else:
            CRS.bert_vecs()
            CRS.kmeans()
            CRS.sample_centroids()
            CRS.format_for_classifier_labelling()
            CRS.out = CRS.out.drop(columns = [x for x in ner_clust_cols if x not in [itemname_col]])
            CRS.out.reset_index().to_feather(f'{model_type}/{cls_model_name}/models/{cls_model_name}_bert_vecs_2_feather')
    else:
        CRS.out = CRS.df
        
    out.append(CRS.out)

    out = pd.concat(out)
    return out

use_cached_results = 1
out = k_means_sampling(df, spend_col, sample_n, model_path, use_pretrained_model, model_architecture, use_cached_results)

### Join the train and test dataset, generate a remaining_items dataframe and write out

In [8]:
def remove_from_full_dataset(df, df_samples):
    # Generate a train set and a remaining_items set
    cols = list(df)
    df_merged = df.merge(df_samples['item_for_selection'], on='item_for_selection', how='left', indicator=True)

    df_test = df_merged[df_merged['_merge'] == 'both']
    df_test = df_test[cols] # drop all the extra cols we made

    df_not_sampled = df_merged[df_merged['_merge'] == 'left_only']
    df_not_sampled = df_not_sampled[cols] # drop all the extra cols we made
    df = df_not_sampled.copy()

    return df_test, df

out, df_remaining = remove_from_full_dataset(df_remaining, out)

In [9]:
out

Unnamed: 0,lineitem_name,tier_1,tier_2,tier_3,tier_4,sales_amt_gross,item_for_selection,clust_input
243,TRA - Pepperoni Pizza,Food,Entree,Pizza,Pizza,278335.94,tra pepperoni pizza,tra pepperoni pizza
978,MEX - Meat Taco Salad,Food,Entree,Salad,Salad,139089.16,mex meat taco salad,mex meat taco salad
3085,Baked Fish,Food,Entree,Seafood Entree,Seafood Entree,33076.53,baked fish,baked fish
4563,TERP-GKE-Teriyaki_Plate Chx Spicey,Food,Entree,Poultry Entree,Poultry Entree,117863.03,terp gke teriyaki plate chx spicey,terp gke teriyaki plate chx spicey
5669,Chicken Salad Kit,Food,Entree,Salad,Salad,791.70,chicken salad kit,chicken salad kit
...,...,...,...,...,...,...,...,...
429139,Grilled Chicken Gyro Salad,Food,Entree,Salad,Salad,11.98,grilled chicken gyro salad,grilled chicken gyro salad
430092,Today Only: Pork Souvlaki # Lemon Tahini Sauce,Food,Entree,Pork Entree,Pork Entree,14.50,today only pork souvlaki lemon tahini sauce,today only pork souvlaki lemon tahini sauce
439090,CC Vegan Couscous Falafel Salad/Végane Cous & Fal,Food,Entree,Salad,Salad,7.22,cc vegan couscous falafel salad végane cous fal,cc vegan couscous falafel salad végane cous fal
442554,AFC Sushi - California Roll SP,Food,Entree,Sushi,Sushi,29.94,afc sushi california roll sp,afc sushi california roll sp


In [10]:

key_cols = cls_input_cols+[itemname_col]+[spend_col]+['label','which_set']

# Join the train and test set
out['which_set'] = 'train'
df_test['which_set']='test'
df_test['label']=''
out = pd.concat([out, df_test])[key_cols].reset_index(drop=True)
out

Unnamed: 0,tier_1,tier_2,tier_3,tier_4,lineitem_name,lineitem_name.1,sales_amt_gross,label,which_set
0,Food,Entree,Pizza,Pizza,TRA - Pepperoni Pizza,TRA - Pepperoni Pizza,278335.94,,train
1,Food,Entree,Salad,Salad,MEX - Meat Taco Salad,MEX - Meat Taco Salad,139089.16,,train
2,Food,Entree,Seafood Entree,Seafood Entree,Baked Fish,Baked Fish,33076.53,,train
3,Food,Entree,Poultry Entree,Poultry Entree,TERP-GKE-Teriyaki_Plate Chx Spicey,TERP-GKE-Teriyaki_Plate Chx Spicey,117863.03,,train
4,Food,Entree,Salad,Salad,Chicken Salad Kit,Chicken Salad Kit,791.70,,train
...,...,...,...,...,...,...,...,...,...
113,Food,Entree,Tacos,Tacos,pork chile verde tacos (2ea),pork chile verde tacos (2ea),130.86,,test
114,Food,Entree,Tacos,Tacos,FW  Gringos  3 Tacos,FW  Gringos  3 Tacos,50.00,,test
115,Food,Breakfast,Breakfast Sandwiches,Breakfast Sandwich/Wrap,Egg and Cheddar Biscuit,Egg and Cheddar Biscuit,36.00,,test
116,Food,Entree,Poultry Entree,Poultry Entree,herb roasted halal chicken cobb,herb roasted halal chicken cobb,132.81,,test


In [11]:
# Save df_full, out, df_remaining
out.to_csv(f'{model_type}/{cls_model_name}/data/{cls_model_name}_label_set_round1_n{sample_n+test_samples}.csv',index = False)

if 'which_set' in key_cols:
    key_cols.remove('which_set')
    
for kc in key_cols:
    if kc not in list(df_full):
        df_full[kc]=''
df_full[key_cols].to_csv(f'{model_type}/{cls_model_name}/data/{cls_model_name}_full_item_catalog.csv',index = False)

# df_remaining = df_full.merge(out['item_input'], on='item_input', how='left', indicator=True)
# df_remaining = df_remaining[df_remaining['_merge']=='left_only']
for kc in key_cols:
    if kc not in list(df_remaining):
        df_remaining[kc]=''
df_remaining = df_remaining[key_cols]
df_remaining.to_csv(f'{model_type}/{cls_model_name}/data/{cls_model_name}_remaining_items.csv',index = False)


#### Validation tests

In [12]:
if df_full.shape[0]!= out.shape[0]+df_remaining.shape[0]:
    print('WARNING: THE SAMPLING ALGORITHM ISN"T WORKING AS EXPECTED. CHECK THE SIZES OF YOUR DATAFRAMES BEFORE CONTINUING')

In [13]:
df_full.shape[0]

460915

In [14]:
out.shape[0]

118

In [15]:
df_remaining.shape[0]

460797