In [1]:
import os
# Set in the first cell, before torch is imported by any later cell.
# NOTE(review): presumably a debugging aid (makes CUDA errors surface at the
# call that caused them) - confirm it is still wanted for normal training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import re
import string
import sys
import warnings

import numpy as np
import pandas as pd
sys.path.append('/content/drive/MyDrive/Colab Notebooks/my_packages')
from preprocessor import *

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
# Directory that holds the input CSV files
ip_file_dir = "/content/drive/MyDrive/Target/data/"

# Grocery product hierarchy information (tab-separated file)
hierarchy_path = os.path.join(ip_file_dir, 'group10_header.csv')
group10 = pd.read_csv(hierarchy_path, sep='\t', low_memory=False)

# Scraped information for the products above
products_path = os.path.join(ip_file_dir, 'products.csv')
products = pd.read_csv(products_path)

# Left-join the scraped information onto the hierarchy table by tcin, then run
# the table through preprocess_df (not defined in this notebook; presumably
# provided by the `preprocessor` star import - verify).
group10 = preprocess_df(group10.merge(products, how='left', on='tcin'))

In [5]:
def preprocess(sentence):
    """Normalise a product title into a space-separated string of words.

    Steps: lowercase, turn hyphens into spaces, delete every punctuation
    character, digit, carriage return, tab and newline, then keep only the
    words longer than two characters.

    Args:
        sentence: the title text (str).

    Returns:
        The cleaned title; an empty string when no word survives.
    """
    # Lowercase and split hyphenated words ("mini-wheats" -> "mini wheats")
    sentence = sentence.lower().replace('-', ' ')

    # Remove punctuation, digits and control whitespace. re.escape is needed:
    # string.punctuation contains `\` followed by `]`, which an unescaped
    # character class reads as an escaped `]`, leaving the backslash unmatched.
    unwanted = re.compile('[' + re.escape(string.punctuation) + r'0-9\r\t\n]')
    sentence = unwanted.sub('', sentence)

    # split() already collapses runs of whitespace, so no separate pass is needed
    return ' '.join(word for word in sentence.split() if len(word) > 2)

def remove_qty(title):
    """Remove quantity expressions such as "12oz", "0.5gal" or "8 ct" from a title.

    A quantity is a number (optionally with a `.`, `,` or `/` separated second
    part, e.g. "8.25", "1,000", "1/2"), optional whitespace, and one of the
    known unit words ending at a word boundary. Matching is case-sensitive;
    the notebook lowercases titles before calling this.

    Args:
        title: the product title (str).

    Returns:
        The title with every matched quantity deleted.
    """
    # Built by concatenation: a backslash-newline inside a raw string is kept
    # as two literal characters, so the unit list must not be wrapped that way.
    units = (
        'cans|can|boxes|box|bottles|bottle|gallons|gallon'
        '|fl oz|oz|fl|gal|pk|ct|ml|lbs|lb|qt|pt|ounce|0z|l|g'
    )
    replace_expr = r'[0-9]+(?:[.,/][0-9]+)?\s*(?:' + units + r')\b'
    return re.sub(replace_expr, '', title)

def remove_brand(title):
    """Strip the trademark sign and a leading or trailing brand name from a title.

    Reads the module-level `brands` string (a regex alternation of brand
    names), which must be defined before this function is called.
    """
    stripped = title.replace("\u2122", '')
    # Brand at the very start of the title, then brand at the very end
    for brand_pattern in (fr'^({brands})\b', fr'\b({brands})$'):
        stripped = re.sub(brand_pattern, '', stripped)
    return stripped

def replace_metacharacters(title):
    """Escape a string so it can be embedded in a regular expression literally.

    Args:
        title: the text to escape (here: a lowercased brand name).

    Returns:
        A pattern fragment that matches exactly the characters of `title`.
    """
    # re.escape covers every metacharacter, including ( ) [ ] { } | and $,
    # and cannot double-escape the backslashes it inserts itself.
    return re.escape(title)

# Lowercased titles with quantity expressions removed
group10['title_lower'] = group10['title'].str.lower().apply(remove_qty)
# Lowercased brand names, escaped for use inside a regular expression
group10['brand_lower'] = group10['brand'].str.lower().apply(replace_metacharacters)
# Alternation of all brands, longest first: regex alternation accepts the
# first branch that matches, so a short brand that is a prefix of a longer one
# (or an empty brand) must not come before the longer one.
brands = "|".join(sorted(group10['brand_lower'].unique(), key=len, reverse=True))
# Final text: brand stripped from either end, then normalised by preprocess
group10['title_processed'] = group10['title_lower'].apply(remove_brand).apply(preprocess)


In [6]:
group10[['title', 'title_processed']].head(25)

Unnamed: 0,title,title_processed
0,Blue Diamond Almonds Wasabi & Soy Sauce - 6oz,wasabi soy sauce
1,Thomas' Everything Bagel Thins - 13oz/8ct,everything bagel thins
2,Wesson Canola Oil - 128oz,canola oil
3,Entenmann's Little Bites Blueberry Muffins - 8...,little bites blueberry muffins
4,Whole Milk - 0.5gal - Good & Gather™,whole milk
5,Entenmann's Little Bites Banana Muffins - 8.25oz,little bites banana muffins
6,Arnold 100% Health Nut Bread - 24oz,health nut bread
7,Original Frosted Mini-Wheats Breakfast Cereal ...,original frosted mini wheats breakfast cereal
8,Ragu Pizza Quick Traditional Sauce - 14oz,pizza quick traditional sauce
9,Corn Chex Breakfast Cereal - 12oz - General Mills,corn chex breakfast cereal


In [7]:
# Column used as the classification target
level = 'class_name'
# Column holding the input text fed to the model
sentence = 'title_processed'

In [8]:
# Keep only the text and target columns, for rows that have a target label.
# .copy() makes df an independent frame so the assignments made to it from
# here on do not write into a slice of group10.
df = group10.loc[group10[level].notna(), [sentence, level]].copy()
df[sentence] = df[sentence].str.lower()

In [9]:
# Number of distinct values in the target column
num_labels = df[level].nunique(dropna=False)
print(num_labels)

185


In [10]:
class LabelEncoderWithNA():
    """Integer-encode a DataFrame column in place via ordered categories.

    Values that are missing, or that were not present when the encoder was
    fitted, are encoded as -1. All methods modify the frame they are given
    and return None.
    """

    def fit(self, train, col):
        """Learn the categories of train[col]; the column becomes an ordered categorical."""
        as_ordered = train[col].astype('category').cat.as_ordered()
        train[col] = as_ordered
        # Index of category values; position i is the label for code i
        self.encoder = as_ordered.cat.categories

    def transform(self, val, col):
        """Replace val[col] with the integer codes of the fitted categories."""
        categorical = pd.Categorical(val[col], categories=self.encoder, ordered=True)
        val[col] = categorical.codes

    def fit_transform(self, train, col):
        """Fit on train[col], then encode that same column."""
        self.fit(train, col)
        self.transform(train, col)

In [11]:
# Encode the target column of df in place as integer category codes;
# le.encoder keeps the category values, so le.encoder[code] gives the label
le = LabelEncoderWithNA()
le.fit_transform(df, level)

In [12]:
df.head(2)

Unnamed: 0,title_processed,class_name
0,wasabi soy sauce,56
1,everything bagel thins,17


In [13]:
le.encoder[0]

'AMBIENT BAKERY'

In [14]:
!pip install sentence-transformers



In [15]:
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util
from sentence_transformers.datasets import SentenceLabelDataset
from torch.utils.data import DataLoader
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime
import math
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator

In [16]:
import logging
import os
import random
from collections import defaultdict

In [17]:
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
logger = logging.getLogger(__name__)

In [18]:
# Class weights from label frequencies: each weight is proportional to
# (largest class count / class count), ordered by class code and normalised
# so that the weights sum to 1.
val_counts = df[level].value_counts()
prop = val_counts.div(val_counts.max())
weights = (1 / prop).sort_index()
weights = weights.div(weights.sum())

# Remove infrequently used product groups (fewer than 3 rows).
# NOTE(review): weights was computed before this filter, so it still holds an
# entry for every class, including the ones removed here.
rare_classes = val_counts.index[val_counts < 3]
df = df[~df[level].isin(rare_classes)]

In [19]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split. random_state is fixed so that every run produces the
# same split; otherwise a checkpoint saved by an earlier run may have been
# trained on rows that land in this run's validation set.
train, valid = train_test_split(df, test_size=0.2, stratify=df[level], random_state=42)
# train, valid = train_test_split(train_val, test_size=0.2, stratify=train_val['class_name'])

In [20]:
def _to_input_examples(frame):
    """Build one single-text InputExample per row of `frame`.

    Reads the text from the `sentence` column and the label from the `level`
    column (both module-level column names).
    """
    # Zipping the two columns avoids building a Series per row as iterrows() does.
    # guid numbering starts at 2, the first id the previous hand-written loops used.
    return [
        InputExample(guid=guid, texts=[text], label=label)
        for guid, (text, label) in enumerate(zip(frame[sentence], frame[level]), start=2)
    ]

train_set = _to_input_examples(train)
valid_set = _to_input_examples(valid)

# test_set = []
# guid=1
# for index, row in test.iterrows():
#   guid += 1
#   test_set.append(InputExample(guid=guid, texts=[row[sentence]], label=row[level]))    

In [21]:
# Seed every random number generator in play, not only Python's `random`:
# NumPy and PyTorch each keep their own generator state.
import torch

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [22]:
# Any Hugging Face transformers pre-trained model can be used as the base here,
# for example bert-base-uncased, roberta-base, xlm-roberta-base
model_name = 'distilroberta-base'
train_batch_size = 16
model_save_path = f'/content/drive/MyDrive/Colab Notebooks/models/nli-classification-{level}'

# Define the CrossEncoder on top of distilroberta-base with one output per
# target class (num_labels)
model = CrossEncoder(model_name, num_labels=num_labels)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.weig

2022-02-03 01:03:12 - Use pytorch device: cuda


In [23]:
# Wrap train_set, a list of InputExample objects, in a PyTorch DataLoader
train_dataloader = DataLoader(train_set, shuffle=True, batch_size=train_batch_size)

In [24]:
# During training, CESoftmaxAccuracyEvaluator measures accuracy on the validation set.
# NOTE(review): the name 'AllNLI-dev' appears in the evaluator's log lines; it looks
# like a leftover from an NLI example rather than a description of this dataset.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(valid_set, name='AllNLI-dev')

In [25]:
num_epochs = 10
# Warm-up length: 10% of all training steps (batches per epoch * number of epochs), rounded up
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of the training steps for warm-up
logger.info("Warmup-steps: {}".format(warmup_steps))

2022-02-03 01:03:12 - Warmup-steps: 1162


In [26]:
logging.info("Performance before fine-tuning:")
evaluator(model)

2022-02-03 01:03:12 - Performance before fine-tuning:
2022-02-03 01:03:12 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset:
2022-02-03 01:03:24 - Accuracy: 0.32


0.0032278889606197547

In [27]:
import torch
# Per-class loss weights, ordered by class code. The loss needs exactly one
# weight per output label, so fail early if the two have drifted apart
# (e.g. because the weights cell was re-run on an already filtered df).
assert len(weights) == num_labels, "class weights must have one entry per label"
# Use the GPU when there is one, otherwise stay on the CPU instead of raising
loss_device = 'cuda' if torch.cuda.is_available() else 'cpu'
class_weights = torch.tensor(weights.to_numpy(), dtype=torch.float32, device=loss_device)
cel = torch.nn.CrossEntropyLoss(weight=class_weights)

In [28]:
# Continue from the checkpoint a previous run saved, when there is one;
# otherwise keep the freshly initialised model so a clean run still works.
# NOTE(review): only meaningful if that checkpoint was trained with the same
# train/validation split - otherwise validation accuracy is inflated.
if os.path.isdir(model_save_path):
    model = CrossEncoder(model_save_path)

2022-02-03 01:03:26 - Use pytorch device: cuda


In [30]:
# Train the model with the class-weighted cross-entropy loss (cel).
# evaluation_steps (10000) is larger than the 1162 iterations per epoch shown
# in the progress bars below; the logged evaluations all happen at the end of
# an epoch. The log shows the model being saved to output_path only after the
# epochs in which validation accuracy reached a new best.
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          optimizer_params={'lr':2e-05},
          evaluation_steps=10000,
          loss_fct = cel,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1162 [00:00<?, ?it/s]

2022-02-03 01:22:35 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 0:
2022-02-03 01:22:37 - Accuracy: 86.21
2022-02-03 01:22:37 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-class_name


Iteration:   0%|          | 0/1162 [00:00<?, ?it/s]

2022-02-03 01:24:03 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 1:
2022-02-03 01:24:06 - Accuracy: 85.45


Iteration:   0%|          | 0/1162 [00:00<?, ?it/s]

2022-02-03 01:25:29 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 2:
2022-02-03 01:25:32 - Accuracy: 86.08


Iteration:   0%|          | 0/1162 [00:00<?, ?it/s]

2022-02-03 01:26:56 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 3:
2022-02-03 01:26:59 - Accuracy: 86.18


Iteration:   0%|          | 0/1162 [00:00<?, ?it/s]

2022-02-03 01:28:22 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 4:
2022-02-03 01:28:25 - Accuracy: 86.57
2022-02-03 01:28:25 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-class_name


Iteration:   0%|          | 0/1162 [00:00<?, ?it/s]

2022-02-03 01:29:50 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 5:
2022-02-03 01:29:53 - Accuracy: 86.59
2022-02-03 01:29:53 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-class_name


Iteration:   0%|          | 0/1162 [00:00<?, ?it/s]

2022-02-03 01:31:19 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 6:
2022-02-03 01:31:21 - Accuracy: 87.15
2022-02-03 01:31:22 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-class_name


Iteration:   0%|          | 0/1162 [00:00<?, ?it/s]

2022-02-03 01:32:47 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 7:
2022-02-03 01:32:50 - Accuracy: 87.20
2022-02-03 01:32:50 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-class_name


Iteration:   0%|          | 0/1162 [00:00<?, ?it/s]

2022-02-03 01:34:16 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 8:
2022-02-03 01:34:18 - Accuracy: 87.02


Iteration:   0%|          | 0/1162 [00:00<?, ?it/s]

2022-02-03 01:35:41 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 9:
2022-02-03 01:35:44 - Accuracy: 87.33
2022-02-03 01:35:44 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-class_name


In [31]:
# level = 'item_type_name'
# model = CrossEncoder(f'/content/drive/MyDrive/Colab Notebooks/models/nli-classification-{level}')
# df = group10[[sentence, level]]
# df[sentence] = df[sentence].str.lower()
# # Label encoding for the rest
# le = LabelEncoderWithNA()
# le.fit_transform(df, level)

In [37]:
scores = model.predict(['onion'])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [38]:
# Top-n predicted classes with their raw scores, best first
n = 1
top_idx = scores.argsort()[-n:][::-1]
labels = [(le.encoder[class_idx], scores[class_idx]) for class_idx in top_idx]

In [39]:
labels

[('MEXICAN/BADIA/GOYA', 8.532907)]

In [40]:
import torch
# Turn the raw class scores into probabilities. dim is passed explicitly:
# calling softmax without it relies on a deprecated implicit dimension choice.
# NOTE: this overwrites `scores`; re-run the predict cell before re-running
# this one, otherwise softmax is applied to already-normalised values.
scores = torch.nn.functional.softmax(torch.tensor(scores), dim=-1).numpy()

# Top-n predicted classes with their probabilities, best first
n = 5
top_idx = scores.argsort()[-n:][::-1]
labels = [(le.encoder[class_idx], scores[class_idx]) for class_idx in top_idx]

In [41]:
labels

[('MEXICAN/BADIA/GOYA', 0.82493),
 ('SPICES', 0.061022356),
 ('POTATOES', 0.01650269),
 ('FRESH CUT', 0.01599632),
 ('FRITO LAY', 0.015274064)]