In this notebook, we're going to train a classification model, evaluate it and generate new training samples for future iterations.

Expected input:
- Labelled data split into a train/test set
- Remaining items (all items not in the train/test set)
- Full set of data to run inference on

Expected output:
- Model files
- Additional params file (`additional_params_n.csv`): items selected for labelling, for further training
- Labelled full set of data

### Section 1:
First, let's load our parameters and train our model:

In [103]:
import pandas as pd
import numpy as np
from transformers import AutoConfig, TFAutoModelForSequenceClassification, TFAutoModel, AutoTokenizer
import tensorflow as tf
from scipy.special import softmax
from keras.callbacks import LearningRateScheduler
import utils as ut
import pickle
import glob
import hjson as json
from sklearn.metrics import f1_score, confusion_matrix
import re
import plotly.figure_factory as ff
import os
from datetime import datetime
from pathlib import Path

import importlib
importlib.reload(ut)


<module 'utils' from '/home/shared/code/08_protein_attribution/utils.py'>

In [104]:
# Read in our params file.
# The context manager closes the handle even if parsing raises.
with open('input_params.hjson') as f:
    params = json.load(f)

# Modelling params: everything train_or_load_model needs, gathered from the
# 'core' and 'nb_four' sections of the params file.
model_params = {
    'model_type':'classifier',
    'model_name':params['core']['cls_model_name'],
    'input_cols':params['core']['cls_input_cols'],
    'model_path': params['core']['model_path'],
    'patience': params['nb_four']['patience'],
    'learning_rate': params['nb_four']['learning_rate'],
    'batch_size': params['nb_four']['batch_size'],
    'model_architecture': params['core']['model_architecture'],
    'use_pretrained_model': params['core']['use_pretrained_model']
}
n_increments = params['nb_four']['n_increments']
# itemname_col = params['core']['itemname_col']

#### Section 1
Model training

In [105]:
# First, train your model

def train_or_load_model(model_params, fine_tune):
    """Fine-tune and save a new classifier, or load the most recently saved one.

    Parameters
    ----------
    model_params : dict
        Must contain 'model_architecture', 'model_name' and 'input_cols'.
        When fine-tuning, 'use_pretrained_model', 'model_path', 'patience',
        'learning_rate' and 'batch_size' are read as well.
    fine_tune : bool or int
        Truthy: train a model and save it under a timestamped folder in
        ``classifier/<model_name>/models/``.
        Falsy: load the most recently modified run from that folder.

    Returns
    -------
    tuple
        ``(train_results, model, tokenizer, classify)``. ``train_results`` is
        the dict with keys 'hparams', 'label_dic', 'conv_dic' and 'history',
        either freshly built or unpickled from the saved run.

    Raises
    ------
    FileNotFoundError
        If ``fine_tune`` is falsy and no saved run exists.
    """
    # Tokenize
    tokenizer = AutoTokenizer.from_pretrained(model_params['model_architecture'])

    # Load data and preprocess.
    # NOTE(review): this runs even when we only load a saved model; it is kept
    # unconditional because `classify` is returned and loadData may set state
    # on it - confirm before moving it into the fine-tune branch.
    classify = ut.Classify()
    train_df, _ = classify.loadData(model_params['model_name'],model_params['input_cols'],tokenizer)

    # Shuffle the training rows (unseeded, so the order differs between runs).
    train_df = train_df.sample(frac = 1).reset_index(drop = True)
    train_df,label_dic,conv_dic = classify.get_label_ids_dic(train_df,'label')
    train_df['input'] = train_df[model_params['input_cols']].fillna('').apply(lambda x: ' [SEP] '.join(x), axis = 1)

    X_train = classify.preprocess_inputs(train_df['input'].tolist(),tokenizer)[0]
    Y_train = train_df['label_id'].values
    class_wts = classify.get_class_wts(train_df)

    if fine_tune:
        # Fine tune the model
        model = classify.load_model(label_dic, model_params['model_architecture'], model_params['use_pretrained_model'], model_path = model_params['model_path'])
        print('training')
        history = classify.train_model(model,X_train,Y_train, class_wts, model_params['patience'], model_params['learning_rate'], model_params['batch_size'])
        train_results = {
            'hparams': model_params,
            'label_dic': label_dic,
            'conv_dic':conv_dic,
            'history': history.history
        }
        print(history)

        # Save this run of the model in its own timestamped folder
        current_date_time = datetime.now().strftime('%Y_%m_%d-%I_%M_%S_%p')
        run_dir = f"classifier/{model_params['model_name']}/models/{current_date_time}"
        os.makedirs(run_dir, exist_ok=True)

        model.save_pretrained(f"{run_dir}/model")
        # Use a context manager so the pickle is flushed and closed.
        with open(f"{run_dir}/train_results.pkl", 'wb') as results_file:
            pickle.dump(train_results, results_file)

    else:
        # Load the latest model (by modification time) and run inference with it
        dirpath=f"classifier/{model_params['model_name']}/models/"
        # Only consider run folders; a stray file must not be picked as "latest".
        run_dirs = sorted((p for p in Path(dirpath).iterdir() if p.is_dir()), key=os.path.getmtime)
        if not run_dirs:
            raise FileNotFoundError(f"No saved model runs found in {dirpath}; run once with fine_tune enabled.")

        latest_model_folder = str(run_dirs[-1])
        latest_model_path  = latest_model_folder+"/model"
        latest_model_params = latest_model_folder+"/train_results.pkl"
        # pickle.load can execute arbitrary code: only load files written by this notebook.
        with open(latest_model_params, 'rb') as f:
            train_results = pickle.load(f)

        model = TFAutoModelForSequenceClassification.from_pretrained(latest_model_path)
        # The tokenizer loaded above from model_params['model_architecture'] is
        # reused here, so this function no longer reads the global `params`.

    return train_results, model, tokenizer, classify

# A truthy fine_tune trains and saves a new model; a falsy value (as here)
# loads the most recently modified saved run instead.
fine_tune = 0
train_results, model, tokenizer, classify = train_or_load_model(model_params, fine_tune)


# #Load in our full catalog to run inference, concat the test df so that any made up examples are included
# item_df = pd.read_csv(f"classifier/{model_params['model_name']}/data/{model_params['model_name']}_full_item_catalog.csv")

# item_df_cols = list(item_df)
# item_df = pd.concat([item_df, test_answers_df[item_df_cols]]).drop_duplicates().reset_index(drop=True)

# # Run inference and evaluate the quality of predictions
# results = classify.model_predictions(item_df,tokenizer,model_params,model,label_dic)
# df_ready_for_eval = test_answers_df.merge(results[model_params['input_cols']+['prediction','rounded_confidence']], how='left', on=model_params['input_cols'], indicator=True, suffixes=('','_extra'))

# increment_results[str(i)]= {}
# increment_results[str(i)]['predictions']=results
# increment_results[str(i)]['train_results']=train_results
# increment_results[str(i)]['prediction_evaluation']={}
# increment_results[str(i)]['prediction_evaluation']['labels'] = df_ready_for_eval['label'].unique()
# increment_results[str(i)]['prediction_evaluation']['confusion_matrix']= confusion_matrix(df_ready_for_eval['label'],df_ready_for_eval['prediction'], labels= list(df_ready_for_eval['label'].unique()))
# cols= list(df_ready_for_eval['label'].unique())
# vals= f1_score(df_ready_for_eval['label'],df_ready_for_eval['prediction'], labels= list(df_ready_for_eval['label'].unique()), average=None)
# df_temp = pd.DataFrame(vals.reshape(1,len(cols)), columns = cols)
# df_temp['n_samples'] = train_df.shape[0]
# increment_results[str(i)]['prediction_evaluation']['performance']=df_temp.copy(deep=True)
    
# # Concat all our results in one df for future analysis
# to_concat = []
# for k, v in increment_results.items():
#     to_concat.append(v['prediction_evaluation']['performance'])
# increment_results['performance'] = pd.concat(to_concat)
    
# # Write out the test set and full model predictions
# results.to_csv(f"classifier/{model_params['model_name']}/data/{model_params['model_name']}_full_set_predictions.csv", index=False)
# cols_out = list(results)
# df_ready_for_eval[cols_out].to_csv(f"classifier/{model_params['model_name']}/data/{model_params['model_name']}_test_set_predictions.csv", index=False)
    




Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.

Some layers from the model checkpoint at classifier/protein_attribution/models/2023_02_19-05_23_18_AM/model were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification f

In [106]:
# Run inference on the full set
# A truthy flag reuses the cached predictions CSV; a falsy flag recomputes them.
skip_full_set_inference = 1

full_set_predictions_path = f"classifier/{model_params['model_name']}/data/{model_params['model_name']}_full_set_predictions.csv"

if skip_full_set_inference and os.path.exists(full_set_predictions_path):
    results = pd.read_csv(full_set_predictions_path)

else:
    if skip_full_set_inference:
        # Nothing cached yet (e.g. first run): fall back to inference instead of failing.
        print(f"No cached predictions at {full_set_predictions_path}; running inference instead.")
    item_df = pd.read_csv(f"classifier/{model_params['model_name']}/data/{model_params['model_name']}_full_item_catalog.csv")
    results = classify.model_predictions(item_df,tokenizer,train_results['hparams'],model,train_results['label_dic'])

    results.to_csv(full_set_predictions_path, index=False)
results

Unnamed: 0,tier_1,tier_2,tier_3,tier_4,lineitem_name,sales_amt_gross,label,prediction,rounded_confidence
0,Food,Breakfast,Breakfast Side,Grits,Grits Large,78552.240000,,AO,1.0
1,Food,Entree,Weighed/Build Your Own,Weighed/Build Your Own,Large Boat,190793.963333,,Supplies,1.0
2,Food,Entree,Sandwich/Wrap,Sandwich/Wrap,Kitchen Fresh 1137 Italian Focaccia (7.7oz),84461.820000,,AO,1.0
3,Food,Entree,Noodle/Grain Bowl,Noodle/Grain Bowl,Adobo Chicken Bowl,236852.690000,,Chicken,1.0
4,Food,Breakfast,Oatmeal/Cereal,Oatmeal,8oz Steel Cut Oatmeal (1,125298.810000,,AO,1.0
...,...,...,...,...,...,...,...,...,...
460910,Food,Entree,Burger,Burger,Spicy Pepperjack Burger,5.490000,,Beef,1.0
460911,Food,Entree,Other Entree,Other Entree,Charlotte SP White Egg Salad (6oz),2.990000,,Egg,1.0
460912,Food,Breakfast,Griddle,Waffles,Crisper and Waffle Combo,11.890000,,AO,1.0
460913,Food,Breakfast,Breakfast Sandwiches,Breakfast Sandwich/Wrap,BFK - FIT Applewood Bacon Egg & Cheddar Flatbread,0.000000,,Bacon,1.0


In [107]:
# Run inference on the test set
# loadData returns a (train, test) pair; only the test split is used here.
_, test_df = classify.loadData(model_params['model_name'],model_params['input_cols'],tokenizer)
# Predict with the hyper-parameters and label dictionary stored with the model run.
test_results = classify.model_predictions(test_df,tokenizer,train_results['hparams'],model,train_results['label_dic'])

# Persist the test-set predictions next to the other data files.
test_results.to_csv(f"classifier/{model_params['model_name']}/data/{model_params['model_name']}_test_set_predictions.csv", index=False)


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.



1.0    9083
0.9     135
0.8      60
0.7      54
0.6      42
0.5      12
0.3       4
0.4       1
Name: rounded_confidence, dtype: int64


### Section 2:

Visualize our test set results

In [108]:
# Call the scikit-learn function through its module so this cell can be re-run:
# the name `confusion_matrix` is rebound to the resulting array below (later
# cells read it), which hides the function imported at the top of the notebook.
from sklearn import metrics as sk_metrics

labels = list(test_results['label'].unique())
# Rows are true classes, columns are predicted classes, both ordered as `labels`.
confusion_matrix = sk_metrics.confusion_matrix(test_results['label'],test_results['prediction'], labels= labels)

fig = ff.create_annotated_heatmap(confusion_matrix, x=labels, y=labels)

fig.update_layout(
    xaxis = dict(categoryorder = 'category ascending', title = 'Predicted class'),
    yaxis = dict(categoryorder = 'category descending', title = 'True class'),
)
fig.show()

# Multiclass confusion matrix plot
# Note: horizontal is predicted class, vertical is true class

In [109]:
import plotly.express as px

# Per-class F1 on the test set, as a single-row frame with one column per label.
vals= f1_score(test_results['label'],test_results['prediction'], labels= labels, average=None)
df_current_perf = pd.DataFrame(vals.reshape(1,len(labels)), columns = labels)
df_current_perf['n_samples'] = test_results.shape[0]

# Append this run to the history stored in the params file. On a first run the
# file may not have a 'model_performance' entry yet, so default to no history.
df_historical_perf = pd.concat([pd.DataFrame(params.get('model_performance', {})), df_current_perf]).fillna(0).reset_index(drop=True)


# A trend line only makes sense once there is more than one run to compare.
if df_historical_perf.shape[0]>1:
    fig = px.line(df_historical_perf, x=df_historical_perf.index, y=labels)
    fig.show()

# Calculate the f1 score, compare it against historical data and plot it

In [110]:
# Write out model results for next time.
# The performance history (including this run's row) is stored under
# params['model_performance'], and the whole params dict is serialised over
# input_params.hjson, the same file the params were read from.
# NOTE(review): each run of the previous cell followed by this one appends
# another row to the stored history.
params['model_performance']= df_historical_perf.to_dict()

with open("input_params.hjson", "w") as outfile:
    outfile.write(json.dumps(params, indent=3))

### Section 3:

After we see our performance gains, let's generate more training samples to continuously improve our model.

In [111]:
## Setup collection methods to increase our training samples

additional_training_dict = {}
n_additional = 20
min_per_class = 5

# First, let's check if there are any classes that are under represented
# (fewer than min_per_class true samples in the test set).

# Row sums of the confusion matrix give the number of true samples per class.
cm = confusion_matrix.sum(axis=1)

# Map each under-represented class to the number of samples it is short by.
additional_training_dict['under_represented_classes'] = {
    class_name: min_per_class - n_true
    for n_true, class_name in zip(cm.tolist(), labels)
    if n_true < min_per_class
}

print('under-represented class name: # of samples to add')
print(additional_training_dict['under_represented_classes'])

under-represented class name: # of samples to add
{}


In [112]:
# Then, let's split up to n_additional new items across classes, in proportion
# to each true class's share of all misclassified test items.

cm_misclassified = confusion_matrix.copy()
# Zero the diagonal so only off-diagonal (misclassified) counts remain.
np.fill_diagonal(cm_misclassified, 0)
total_misclassified = cm_misclassified.sum()
if total_misclassified > 0:
    cm_misclassified = cm_misclassified.sum(axis=1)/total_misclassified
else:
    # Nothing was misclassified: avoid a 0/0 division (NaNs plus a RuntimeWarning).
    cm_misclassified = np.zeros(cm_misclassified.shape[0])

additional_training_dict['misclassified'] = {}
for i, j in zip(cm_misclassified, labels):
    if i>0:
        # int() truncates, so small shares become 0 and the total can be below n_additional.
        additional_training_dict['misclassified'][j]=int(i*n_additional)

print('misclassified_class_name: number of samples to add')
print(additional_training_dict['misclassified'])

misclassified_class_name: number of samples to add
{'Cheese': 0, 'AO': 10, 'Sausage': 0, 'Seafood': 0, 'Fish': 0, 'Meatball': 0, 'Chicken': 0, 'Veggie': 1, 'Beef': 1, 'Ham': 0, 'Pepperoni': 0, 'Turkey': 0, 'Supplies': 4, 'Bacon': 0, 'Pork': 0, 'Egg': 0}


In [113]:
import string
from pandas.api.types import is_numeric_dtype

## Lets generate a df of remaining items with suggested labels to select from

# This is the same as the function in 00
def preprocess_item_name(df, itemname_col):
    '''
    Add a normalised ``item_for_selection`` column to ``df`` and return ``df``.

    The column is built from ``df[itemname_col]``: missing names become '',
    every punctuation character is replaced by a space, the text is
    lower-cased and surrounding whitespace is stripped. It is used to group
    variations of the same item for train and test set selection.

    The frame is modified in place; the returned object is the same ``df``.

    Other notes:
    A separate ``item_input`` column (the raw name, fed to the model for
    inference) is not created here at the moment.
    Add in any extra preprocessing your dataset needs in this step.
    '''
    # Translation table: each punctuation character -> a single space.
    punctuation_to_space = {ord(char): ' ' for char in string.punctuation}

    item_names = df[itemname_col].fillna('')
    without_punctuation = item_names.str.translate(punctuation_to_space)
    lower_cased = without_punctuation.str.lower()
    df['item_for_selection'] = lower_cased.str.strip()

    #### Add in any preprocessing required here #####

    return df

def group_by_unique_items(df_r, params):
    """Collapse duplicate items into one row per ``item_for_selection``.

    Parameters
    ----------
    df_r : pandas.DataFrame
        Must contain 'item_for_selection' and the column named by
        ``params['core']['spend_col']``.
    params : dict
        Notebook parameters; only ``params['core']['spend_col']`` is read.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'item_for_selection' value. The spend column is
        summed when numeric, otherwise its first value per group is kept.
    """
    # Get the relevant subset of columns
    df_selection_pool = df_r[['item_for_selection', params['core']['spend_col']]]

    # Group by item_for_selection to remove duplicate items.
    # `columns=` replaces the positional axis argument, which pandas deprecated
    # and then removed in 2.0.
    value_cols = df_selection_pool.drop(columns='item_for_selection').columns
    funcs = {col: 'sum' if is_numeric_dtype(df_selection_pool[col]) else 'first'
             for col in value_cols}
    df_selection_pool = df_selection_pool.groupby('item_for_selection').agg(funcs).reset_index()
    return df_selection_pool

# Load the remaining-items file and build the pool of unique items to sample from.
df_remaining = pd.read_csv(f"classifier/{model_params['model_name']}/data/{model_params['model_name']}_remaining_items.csv")
# preprocess_item_name adds 'item_for_selection' in place, so df_r is the same object as df_remaining.
df_r= preprocess_item_name(df_remaining, params['core']['itemname_col'])
df_selection_pool = group_by_unique_items(df_r, params)

df_selection_pool



In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.



Unnamed: 0,item_for_selection,sales_amt_gross
0,,0.00
1,0 ounce,0.00
2,0 pound,0.00
3,0 pound lbs,0.00
4,0 pounds,0.00
...,...,...
421179,“so sous me” cookbook series doc schneider ...,54.25
421180,“stop food waste day”,241.11
421181,•\ttofu vegetable stir fry over rice,0.00
421182,○\tegg allergy,0.00


In [118]:
def add_predictions_to_remaining_items(results, params, df_selection_pool):
    """Keep the prediction rows whose normalised item name is in the selection pool.

    Parameters
    ----------
    results : pandas.DataFrame
        Model predictions; must contain the column named by
        ``params['core']['itemname_col']``. It is deep-copied, not modified.
    params : dict
        Notebook parameters; only ``params['core']['itemname_col']`` is read.
    df_selection_pool : pandas.DataFrame
        Must contain 'item_for_selection'.

    Returns
    -------
    pandas.DataFrame
        The rows of ``results``, with an added 'item_for_selection' column,
        whose 'item_for_selection' value also occurs in ``df_selection_pool``.
    """
    # Work on a copy: preprocess_item_name adds its column in place.
    results_pp = results.copy(deep=True)
    results_pp = preprocess_item_name(results_pp, params['core']['itemname_col'])
    results_merged = results_pp.merge(df_selection_pool['item_for_selection'], how='left', on='item_for_selection', indicator=True)
    # Keep matches only. A non-inplace drop returns a new frame rather than
    # mutating a slice of results_merged (the SettingWithCopy pattern).
    df_joined = results_merged.loc[results_merged['_merge']=='both'].drop(columns='_merge')
    return df_joined

# Attach the model predictions to the items that are still available for selection.
df_joined = add_predictions_to_remaining_items(results, params, df_selection_pool)

In [119]:
# Generate the labelling set and write it out

def _next_additional_params_index(existing_files):
    """Return the next free n for ``additional_params_<n>.csv`` (1 if none exist)."""
    indices = []
    for path in existing_files:
        # Labelled files (``..._<n>_labelled.csv``) do not match and are ignored.
        match = re.search(r'additional_params_(\d+)\.csv$', os.path.basename(path))
        if match:
            indices.append(int(match.group(1)))
    return max(indices, default=0) + 1


def sample_items_from_df(additional_training_dict, df_joined):
    """Pick the items to label next and write them to a new additional_params file.

    For every class in every collection method, the requested number of rows
    with that predicted class and the lowest 'rounded_confidence' is taken.
    All rows sharing a sampled 'item_for_selection' are then removed from the
    local pool so the same item cannot be sampled twice.

    Parameters
    ----------
    additional_training_dict : dict
        ``{method: {class_name: n_samples}}``. Rows sampled for the method
        'under_represented_classes' get which_set 'test', all others 'train'.
    df_joined : pandas.DataFrame
        Candidates with 'prediction', 'rounded_confidence' and
        'item_for_selection' columns. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        All sampled rows, including 'item_for_selection', 'which_set' and the
        'index' column produced by ``reset_index``.

    Notes
    -----
    Reads the notebook globals ``results`` (columns to write out) and
    ``model_params`` (output folder). The CSV is written to
    ``classifier/<model_name>/data/additional_params_<n>.csv`` with the next
    free n.
    """
    samples_to_merge = []

    # Generate our samples
    for key, class_counts in additional_training_dict.items():
        # Label samples based on why we're sampling them
        which_set = 'test' if key == 'under_represented_classes' else 'train'
        for k, v in class_counts.items():
            if v <= 0:
                # Nothing requested for this class: skip the full-table merge below.
                continue

            # Least-confident predictions first. assign() returns a new frame,
            # so no slice of df_joined is mutated.
            df_additional_samples = (
                df_joined[df_joined['prediction']==k]
                .sort_values(by='rounded_confidence')
                .head(v)
                .assign(which_set=which_set)
            )

            # Remove the sampled items from the pool so they can't be sampled again
            df_joined = df_joined.merge(df_additional_samples['item_for_selection'], how='left', indicator=True)
            df_joined = df_joined.loc[df_joined['_merge'] == 'left_only'].drop(columns='_merge')

            samples_to_merge.append(df_additional_samples)

    if not samples_to_merge:
        # pd.concat raises on an empty list: fall back to an empty frame with the expected columns.
        samples_to_merge.append(df_joined.head(0).assign(which_set='train'))

    df_out = pd.concat(samples_to_merge).reset_index()
    df_out_r = df_out.copy(deep=True)
    cols = list(results) + ['which_set']
    df_out = df_out[cols]

    # Number the file by the highest existing index. A plain string sort would
    # rank "_9" after "_10" and overwrite additional_params_10.csv.
    data_dir = f"classifier/{model_params['model_name']}/data"
    additional_sample_files = glob.glob(f"{data_dir}/additional_params*.csv")
    n = _next_additional_params_index(additional_sample_files)
    df_out.to_csv(f"{data_dir}/additional_params_{n}.csv", index=False)

    return df_out_r

# Sample the labelling set; this call also writes the additional_params CSV.
df_out_r = sample_items_from_df(additional_training_dict, df_joined)

# These are the selected items for additional labelling
df_out_r

Unnamed: 0,index,tier_1,tier_2,tier_3,tier_4,lineitem_name,sales_amt_gross,label,prediction,rounded_confidence,item_for_selection,which_set
0,235845,Food,Breakfast,Griddle,Pancakes,Blueberry corn pancakes with chi,65.94,,AO,0.4,blueberry corn pancakes with chi,train
1,274228,Food,Breakfast,Breakfast Side,Hash Browns,BFK - Supreme Hasbrowns,237.25,,AO,0.4,bfk supreme hasbrowns,train
2,44954,Food,Breakfast,Breakfast Side,Hash Browns,GA - BK - Hashbrown Potatoes,3860.52,,AO,0.4,ga bk hashbrown potatoes,train
3,241688,Food,Entree,Other Entree,Other Entree,Cranberry Molassess Lacquered Gr,179.1,,AO,0.4,cranberry molassess lacquered gr,train
4,154123,Food,Breakfast,Oatmeal/Cereal,Cereal,Cereal Cup Raisin Bran Crunch,805.92,,AO,0.4,cereal cup raisin bran crunch,train
5,380098,Food,Entree,Other Entree,Lunch Special,Porchetta Special,38.0,,AO,0.4,porchetta special,train
6,273552,Food,Entree,Other Entree,Prepackaged Meals,Meal Fresh N Ready Trian,99.75,,AO,0.4,meal fresh n ready trian,train
7,438533,Food,Entree,Misc Protein Entree,Misc Protein Entree,Grill Special: Habanero Peach Ch,84.0,,AO,0.4,grill special habanero peach ch,train
8,308400,Food,Entree,Misc Protein Entree,Misc Protein Entree,Boars Head Ichiban Teriy,365.39,,AO,0.4,boars head ichiban teriy,train
9,155064,Food,Entree,Misc Protein Entree,Misc Protein Entree,BBQ Tticket,4221.54,,AO,0.4,bbq tticket,train


In [120]:
# Finally, remove the selected items from the set of remaining items and write it out

# Anti-join: keep only the rows of df_joined whose item was not selected for labelling.
df_remaining_out = (
    df_joined
    .merge(df_out_r['item_for_selection'], how='left', on='item_for_selection', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
)

# Drop the helper and prediction columns before writing the file back.
cols_out = [
    c for c in df_remaining_out.columns
    if c not in {'_merge', 'rounded_confidence', 'prediction', 'item_for_selection'}
]

df_remaining_out[cols_out].to_csv(f"{model_params['model_type']}/{model_params['model_name']}/data/{model_params['model_name']}_remaining_items.csv",index = False)
df_remaining_out[cols_out]

Unnamed: 0,tier_1,tier_2,tier_3,tier_4,lineitem_name,sales_amt_gross,label
0,Food,Breakfast,Breakfast Side,Grits,Grits Large,78552.240000,
1,Food,Entree,Weighed/Build Your Own,Weighed/Build Your Own,Large Boat,190793.963333,
2,Food,Entree,Sandwich/Wrap,Sandwich/Wrap,Kitchen Fresh 1137 Italian Focaccia (7.7oz),84461.820000,
3,Food,Entree,Noodle/Grain Bowl,Noodle/Grain Bowl,Adobo Chicken Bowl,236852.690000,
4,Food,Breakfast,Oatmeal/Cereal,Oatmeal,8oz Steel Cut Oatmeal (1,125298.810000,
...,...,...,...,...,...,...,...
457606,Food,Entree,Burger,Burger,Spicy Pepperjack Burger,5.490000,
457607,Food,Entree,Other Entree,Other Entree,Charlotte SP White Egg Salad (6oz),2.990000,
457608,Food,Breakfast,Griddle,Waffles,Crisper and Waffle Combo,11.890000,
457609,Food,Breakfast,Breakfast Sandwiches,Breakfast Sandwich/Wrap,BFK - FIT Applewood Bacon Egg & Cheddar Flatbread,0.000000,


### Write out results
Let's write out the model predictions on our train, test and full dataset.

### Conclusion

If you find the performance of your model acceptable, you can stop here. If you want to continue improving your model, find the latest `additional_params_n.csv` file and:
1. Fill in the 'label' column.
2. Remove the 'prediction' and 'rounded_confidence' columns.
3. Save the file as `additional_params_n_labelled.csv`.
4. Rerun this notebook.

In [None]:
# NOTE(review): 13733 is presumably a row count noted from an earlier run; it
# does not match the recorded output - confirm or remove.
df_remaining_out.shape #13733

(8174, 17)

#### Tests

In [None]:
## Join df_joined and df_remaining_out
