In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import sys, os
import pandas as pd 
import numpy as np
import warnings
sys.path.append('/content/drive/MyDrive/Colab Notebooks/my_packages')
from preprocessor import *

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
# Set input file directory
ip_file_dir = "/content/drive/MyDrive/Target/data/"

# Get grocery product hierarchy information
group10 = pd.read_csv(os.path.join(ip_file_dir, 
                                   'group10_header.csv'),
                      sep='\t', 
                      low_memory=False)
# Get scraped information for the above products
products = pd.read_csv(os.path.join(ip_file_dir,
                                    'products.csv'))

# Merge scraped information into the hierarchy table
group10 = pd.merge(group10, products, 
                   how = 'left', on = 'tcin')

# Preprocess the table
group10 = preprocess_df(group10)

In [5]:
def preprocess(sentence):
    # Lowercase product titles
    sentence = sentence.lower()
    
    r = r'-'
    sentence = re.sub(r, ' ', sentence)                                      

    # Remove punctuations from product titles
    regex = re.compile(r'[' + string.punctuation + '0-9\r\t\n]')    
    sentence = regex.sub("", sentence)
    
    # Collapse multiple spaces into single space
    sentence = re.sub("[\s]+", " ", sentence)
    sentence = " ".join([w for w in sentence.split() if len(w) > 2])
    return sentence 

def remove_qty(title):
    # Remove qty information from product titles
    replace_expr = r'[0-9]+(.)?([0-9]+)?[\s]*(cans|can|boxes|box|bottles|bottle|gallons|gallon\
                                        |fl oz|oz|fl|gal|pk|ct|ml|lbs|lb|qt|pt|ounce|0z|l|g)\b'
    title = re.sub(replace_expr, '', title)   
    return title

def remove_brand(title):
    title = title.replace(u"\u2122", '')
    title = re.sub(fr'^({brands})\b', '', f'{title}')
    title = re.sub(fr'\b({brands})$', '', f'{title}')
    return title

def replace_metacharacters(title):
    title = title.replace('+', '\+')
    title = title.replace('\\', '\\\\')
    title = title.replace('^', '\^')
    title = title.replace('$', '\^')
    title = title.replace('*', '\*')
    title = title.replace('?', '\?')
    title = title.replace('.', '\.')
    return title

group10['title_lower'] = group10['title'].str.lower().apply(remove_qty)
group10['brand_lower'] = group10['brand'].str.lower().apply(replace_metacharacters)
brands = "|".join(group10['brand_lower'].unique())
group10['title_processed'] = group10['title_lower'].apply(remove_brand).apply(preprocess)


In [6]:
group10[['title', 'title_processed']].head(25)

Unnamed: 0,title,title_processed
0,Blue Diamond Almonds Wasabi & Soy Sauce - 6oz,wasabi soy sauce
1,Thomas' Everything Bagel Thins - 13oz/8ct,everything bagel thins
2,Wesson Canola Oil - 128oz,canola oil
3,Entenmann's Little Bites Blueberry Muffins - 8...,little bites blueberry muffins
4,Whole Milk - 0.5gal - Good & Gather™,whole milk
5,Entenmann's Little Bites Banana Muffins - 8.25oz,little bites banana muffins
6,Arnold 100% Health Nut Bread - 24oz,health nut bread
7,Original Frosted Mini-Wheats Breakfast Cereal ...,original frosted mini wheats breakfast cereal
8,Ragu Pizza Quick Traditional Sauce - 14oz,pizza quick traditional sauce
9,Corn Chex Breakfast Cereal - 12oz - General Mills,corn chex breakfast cereal


In [8]:
level = 'department_name'
sentence = 'title_processed'

In [9]:
df = group10[[sentence, level]]
df = df[~pd.isnull(df[level])]
df[sentence] = df[sentence].str.lower()

In [10]:
num_labels = len(df[level].unique())
print(num_labels)

26


In [11]:
class LabelEncoderWithNA():
    def fit(self, train, col):
        train[col] = train[col].astype('category').cat.as_ordered()
        self.encoder = train[col].cat.categories
    def transform(self, val, col):
        val[col] = pd.Categorical(val[col], categories=self.encoder, ordered=True)
        val[col] = val[col].cat.codes
    def fit_transform(self, train, col):
        self.fit(train, col)
        self.transform(train, col)

In [12]:
# Label encoding for the rest
le = LabelEncoderWithNA()
le.fit_transform(df, level)

In [13]:
df.head(2)

Unnamed: 0,title_processed,department_name
0,wasabi soy sauce,22
1,everything bagel thins,3


In [14]:
le.encoder[56]

IndexError: ignored

In [15]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 4.8 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 27.6 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 36.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 45.0 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596

In [16]:
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util
from sentence_transformers.datasets import SentenceLabelDataset
from torch.utils.data import DataLoader
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime
import math
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator

In [17]:
import logging
import os
import random
from collections import defaultdict

In [18]:
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
logger = logging.getLogger(__name__)

In [19]:
# Remove infrequently used product groups
val_counts = df[level].value_counts()
df = df[~df[level].isin(list(val_counts[val_counts < 10].index))]

2022-01-31 21:11:41 - NumExpr defaulting to 2 threads.


In [20]:
len(df[level].value_counts())

25

In [21]:
from sklearn.model_selection import train_test_split
train, valid = train_test_split(df, test_size=0.2, stratify=df[level])
# train, valid = train_test_split(train_val, test_size=0.2, stratify=train_val['class_name'])

In [22]:
train_set = []
guid=1
for index, row in train.iterrows():
  guid += 1
  train_set.append(InputExample(guid=guid, texts=[row[sentence]], label=row[level]))

valid_set = []
guid=1
for index, row in valid.iterrows():
  guid += 1
  valid_set.append(InputExample(guid=guid, texts=[row[sentence]], label=row[level]))

# test_set = []
# guid=1
# for index, row in test.iterrows():
#   guid += 1
#   test_set.append(InputExample(guid=guid, texts=[row[sentence]], label=row[level]))    

In [23]:
random.seed(42)

In [72]:
# You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = 'distilroberta-base'
train_batch_size = 16
model_save_path = '/content/drive/MyDrive/Colab Notebooks/models/nli-classification-' + level

#Define our CrossEncoder model. We use distilroberta-base as basis and setup it up to predict 3 labels
model = CrossEncoder(model_name, num_labels=num_labels)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.bias

2022-01-31 22:50:45 - Use pytorch device: cuda


In [73]:
#We wrap train_samples, which is a list ot InputExample, in a pytorch DataLoader
train_dataloader = DataLoader(train_set, shuffle=True, batch_size=train_batch_size)

In [74]:
#During training, we use CESoftmaxAccuracyEvaluator to measure the accuracy on the dev set.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(valid_set, name='AllNLI-dev')

In [75]:
num_epochs = 10
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logger.info("Warmup-steps: {}".format(warmup_steps))

2022-01-31 22:50:45 - Warmup-steps: 1163


In [76]:
logging.info("Performance before fine-tuning:")
evaluator(model)

2022-01-31 22:50:45 - Performance before fine-tuning:
2022-01-31 22:50:45 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset:
2022-01-31 22:50:48 - Accuracy: 0.58


0.005807700580770058

In [None]:
# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          optimizer_params={'lr':2e-05},
          evaluation_steps=10000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1163 [00:00<?, ?it/s]

2022-01-31 22:52:29 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 0:
2022-01-31 22:52:32 - Accuracy: 75.50
2022-01-31 22:52:32 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-department_name


Iteration:   0%|          | 0/1163 [00:00<?, ?it/s]

2022-01-31 22:54:15 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 1:
2022-01-31 22:54:18 - Accuracy: 82.25
2022-01-31 22:54:18 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-department_name


Iteration:   0%|          | 0/1163 [00:00<?, ?it/s]

2022-01-31 22:56:02 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 2:
2022-01-31 22:56:05 - Accuracy: 84.06
2022-01-31 22:56:05 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-department_name


Iteration:   0%|          | 0/1163 [00:00<?, ?it/s]

2022-01-31 22:57:49 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 3:
2022-01-31 22:57:52 - Accuracy: 86.21
2022-01-31 22:57:52 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-department_name


Iteration:   0%|          | 0/1163 [00:00<?, ?it/s]

2022-01-31 22:59:36 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 4:
2022-01-31 22:59:39 - Accuracy: 86.99
2022-01-31 22:59:39 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-department_name


Iteration:   0%|          | 0/1163 [00:00<?, ?it/s]

2022-01-31 23:01:21 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 5:
2022-01-31 23:01:24 - Accuracy: 87.12
2022-01-31 23:01:24 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-department_name


Iteration:   0%|          | 0/1163 [00:00<?, ?it/s]

2022-01-31 23:03:06 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 6:
2022-01-31 23:03:09 - Accuracy: 87.72
2022-01-31 23:03:09 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-department_name


Iteration:   0%|          | 0/1163 [00:00<?, ?it/s]

2022-01-31 23:04:51 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 7:
2022-01-31 23:04:54 - Accuracy: 88.38
2022-01-31 23:04:54 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-department_name


Iteration:   0%|          | 0/1163 [00:00<?, ?it/s]

2022-01-31 23:06:36 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 8:
2022-01-31 23:06:38 - Accuracy: 88.45
2022-01-31 23:06:38 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-department_name


Iteration:   0%|          | 0/1163 [00:00<?, ?it/s]

2022-01-31 23:08:20 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 9:
2022-01-31 23:08:23 - Accuracy: 88.56
2022-01-31 23:08:23 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-department_name


In [None]:
# level = 'item_type_name'
# model = CrossEncoder(f'/content/drive/MyDrive/Colab Notebooks/models/nli-classification-{level}')
# df = group10[[sentence, level]]
# df[sentence] = df[sentence].str.lower()
# # Label encoding for the rest
# le = LabelEncoderWithNA()
# le.fit_transform(df, level)

In [None]:
scores = model.predict(['onion'])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
n = 1
labels = [(le.encoder[score_argmax], score_max)
          for score_argmax, score_max 
          in zip(scores.argsort()[-n:][::-1], sorted(scores, reverse=True)[0:n])]

In [None]:
labels

[('MEAL ESSENTIALS', 7.008428)]

In [69]:
import torch
scores = torch.nn.functional.softmax(torch.tensor(scores)).numpy()
n = 2
labels = [(le.encoder[score_argmax], score_max)
          for score_argmax, score_max 
          in zip(scores.argsort()[-n:][::-1], sorted(scores, reverse=True)[0:n])]

In [70]:
labels

[('VEGETABLES/PACKAGED VEG', 0.6769526), ('BREAD/BAKING', 0.15166843)]