In [None]:
import os
# CUDA_LAUNCH_BLOCKING=1 asks the CUDA runtime to run kernel launches synchronously,
# which is a common debugging aid for locating device-side errors.
# NOTE(review): presumably set in the first cell so it is in place before any
# CUDA-using library is imported; it is expected to slow training - confirm it is still wanted.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
# Mount Google Drive (Colab only). Later cells read the input CSVs, import the
# local `preprocessor` module and save the model under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import sys, os
import pandas as pd 
import numpy as np
import warnings
sys.path.append('/content/drive/MyDrive/Colab Notebooks')
from preprocessor import *

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas SettingWithCopyWarning and deprecation
# notices raised by later cells.
warnings.filterwarnings("ignore")
# Do not truncate columns or rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Directory on the mounted Drive that holds the input CSV files
ip_file_dir = "/content/drive/MyDrive/Target/"

# Grocery product hierarchy table
group4 = pd.read_csv(os.path.join(ip_file_dir, 'group4_header.csv'),
                     sep=',', low_memory=False)

# Scraped information for the same products
products = pd.read_csv(os.path.join(ip_file_dir, 'products_group4.csv'))

# Left-join the scraped information onto the hierarchy table (key: tcin),
# then run preprocess_df on the merged table.
# NOTE(review): preprocess_df is presumably provided by the star-import from
# `preprocessor`; its behaviour is not visible in this notebook.
group4 = preprocess_df(group4.merge(products, how='left', on='tcin'))

In [None]:
def preprocess(sentence):
    """Normalise a product title into a space-separated string of words.

    Steps, in order: lowercase; turn hyphens into spaces; delete ASCII
    punctuation, digits and CR/TAB/LF characters; collapse whitespace; drop
    words of one or two characters.

    Args:
        sentence (str): product title to clean.

    Returns:
        str: the cleaned title; may be the empty string.
    """
    # Lowercase product titles
    sentence = sentence.lower()

    # Hyphens join words ("5-speed"), so replace them with a space instead of
    # deleting them together with the other punctuation below.
    sentence = sentence.replace('-', ' ')

    # Remove punctuation, digits and CR/TAB/LF. re.escape keeps the character
    # class well-formed regardless of which characters string.punctuation
    # contains or in what order; the raw string avoids invalid escape sequences.
    strip_chars = re.compile('[' + re.escape(string.punctuation) + r'0-9\r\t\n]')
    sentence = strip_chars.sub("", sentence)

    # str.split() already splits on runs of whitespace, so no separate
    # whitespace-collapsing pass is needed. Words of <= 2 characters are dropped.
    return " ".join(word for word in sentence.split() if len(word) > 2)

def remove_qty(title):
    """Remove quantity/size tokens such as "12oz", "2pk" or "5 gallon" from a title.

    A token is: digits, an optional single separator character, optional
    further digits, optional whitespace, then one of the known unit words
    ending at a word boundary. Matching is case-sensitive and the unit words
    are lowercase, so callers should lowercase the title first.

    Args:
        title (str): product title (expected lowercase).

    Returns:
        str: the title with every matched quantity token deleted. Surrounding
        whitespace is left untouched.
    """
    # The alternation is assembled from adjacent string literals. A
    # backslash-newline inside a raw string is NOT a line continuation: it
    # would embed the backslash, the newline and the indentation in the
    # pattern and stop the "gallon" alternative from ever matching.
    units = (
        'cans|can|boxes|box|bottles|bottle|gallons|gallon'
        '|fl oz|oz|fl|gal|pk|ct|ml|lbs|lb|qt|pt|ounce|0z|l|g'
    )
    # NOTE(review): `(.)?` accepts ANY single character, not only a decimal
    # point. That is kept unchanged so inputs such as "12-oz" still match.
    replace_expr = r'[0-9]+(.)?([0-9]+)?[\s]*(' + units + r')\b'
    return re.sub(replace_expr, '', title)

def remove_brand(title):
    """Delete the trademark sign and any known brand at the start or end of a title.

    Reads the module-level `brands` string (a regex alternation of brand
    names), which must be defined before this function is called.

    Args:
        title (str): product title.

    Returns:
        str: the title without a leading/trailing brand; whitespace that was
        next to the removed brand is left in place.
    """
    cleaned = title.replace("\u2122", '')
    # Brand anchored at the beginning of the title
    leading_brand = fr'^({brands})\b'
    # Brand anchored at the end of the title
    trailing_brand = fr'\b({brands})$'
    cleaned = re.sub(leading_brand, '', cleaned)
    return re.sub(trailing_brand, '', cleaned)

# One-pass translation table: each regex metacharacter -> backslash + itself.
_REGEX_METACHAR_TABLE = str.maketrans(
    {char: '\\' + char for char in '\\+^$*?.()[]{}|'}
)


def replace_metacharacters(title):
    """Backslash-escape regex metacharacters so `title` matches itself literally.

    The escaping is done in a single pass, so a backslash inserted for one
    character is never escaped again, and `$` is escaped as `\\$`. Grouping,
    class, quantifier-brace and alternation characters are escaped as well,
    because the results are joined with `|` into one pattern.

    Args:
        title (str): text to embed in a regular expression.

    Returns:
        str: `title` with each of  \\ + ^ $ * ? . ( ) [ ] { } |  preceded by a
        backslash; every other character is unchanged.
    """
    return title.translate(_REGEX_METACHAR_TABLE)

# Lowercase the raw title, then strip quantity tokens (remove_qty only knows lowercase units).
group4['title_lower'] = group4['title'].str.lower().apply(remove_qty)
# Lowercase the brand and escape regex metacharacters so it can be embedded in a pattern.
# NOTE(review): assumes no brand is missing after preprocess_df - TODO confirm.
group4['brand_lower'] = group4['brand'].str.lower().apply(replace_metacharacters)
# Regex alternation of every distinct brand. remove_brand reads this global,
# so it must be built before the next statement runs.
brands = "|".join(group4['brand_lower'].unique())
# Remove the brand from both ends of the title, then apply the generic text cleanup.
group4['title_processed'] = group4['title_lower'].apply(remove_brand).apply(preprocess)


In [None]:
# Spot-check: raw titles next to their cleaned versions (first 25 rows)
group4[['title', 'title_processed']].head(25)

Unnamed: 0,title,title_processed
0,Houdini Deluxe Lever Corkscrew,deluxe lever corkscrew
1,Kitchen Selectives Single Burner,kitchen selectives single burner
2,Presto PowerCup Concentrator - 8 pack,powercup concentrator pack
3,IMUSA 3pc Stamped Caldero Cookware Set,stamped caldero cookware set
4,ZeroWater Replacement Filters 2pk,replacement filters
5,Taylor 99 Minute Slim Digital Timer,minute slim digital timer
6,Taylor 11lb Glass Platform Digital Food Scale,glass platform digital food scale
7,Natural Home 12oz Recycled Glass Olive Oil Dis...,recycled glass olive oil dispenser clear
8,Sunbeam Hand & Stand 5-Speed Mixer - Black FBS...,hand stand speed mixer black fbsbh
9,Ball 4ct 16oz Collection Elite Glass Mason Jar...,collection elite glass mason jar with lid and ...


In [None]:
# Column holding the label to predict
level = 'subclass_name'
# Column holding the cleaned title text used as model input
sentence = 'title_processed'

In [None]:
# Keep only the text and label columns for rows that have a label.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments here and in later cells do not act on a slice of `group4`
# (which would raise SettingWithCopyWarning).
df = group4.loc[group4[level].notna(), [sentence, level]].copy()
df[sentence] = df[sentence].str.lower()

In [None]:
# Number of distinct labels in `df` at this point.
# NOTE(review): this is computed BEFORE the infrequent-label filter in a later
# cell, so it is larger than the number of classes that are actually trained on
# (the recorded outputs show 285 here versus 156 after filtering).
num_labels = len(df[level].unique())
print(num_labels)

285


In [None]:
# Drop product groups that have fewer than 10 examples
val_counts = df[level].value_counts()
rare_labels = val_counts[val_counts.lt(10)].index
df = df.loc[~df[level].isin(rare_labels)]

In [None]:
class LabelEncoderWithNA():
    """Label encoder built on pandas categoricals; it modifies the given frame in place.

    After `fit`, `self.encoder` holds the ordered categories, where the
    position of a category is its integer code. `transform` encodes values
    that are not among those categories as -1.
    """

    def fit(self, train, col):
        """Turn `train[col]` into an ordered categorical and remember its categories."""
        ordered = train[col].astype('category').cat.as_ordered()
        train[col] = ordered
        self.encoder = train[col].cat.categories

    def transform(self, val, col):
        """Overwrite `val[col]` with the integer code of each value (requires a prior `fit`)."""
        as_categorical = pd.Categorical(val[col], categories=self.encoder, ordered=True)
        val[col] = as_categorical.codes

    def fit_transform(self, train, col):
        """Fit on `train[col]`, then encode that same column in place."""
        self.fit(train, col)
        self.transform(train, col)

In [None]:
# Replace the label column of `df` with integer codes, in place.
# `le.encoder` keeps the categories, so a code can be mapped back to its label name.
le = LabelEncoderWithNA()
le.fit_transform(df, level)

In [None]:
!pip install sentence-transformers



In [None]:
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util
from sentence_transformers.datasets import SentenceLabelDataset
from torch.utils.data import DataLoader
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime
import math
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator

import logging
import os
import random
from collections import defaultdict
# Seeds Python's built-in `random` module only; no NumPy or torch seed is set in this cell.
random.seed(42)

In [None]:
# Configure the root logger: timestamped INFO-level messages, emitted through
# sentence-transformers' LoggingHandler.
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
# Notebook-level logger used by the cells below
logger = logging.getLogger(__name__)

In [None]:
# Number of distinct classes left in `df` after the infrequent-label filter
len(df[level].value_counts())

156

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split on the label column. random_state pins the shuffle so
# the split is the same on every run; the earlier random.seed(42) call does
# not affect scikit-learn's shuffling.
train, valid = train_test_split(df, test_size=0.2, stratify=df[level], random_state=42)

In [None]:
# Sanity check: number of distinct classes present in the validation split
len(valid[level].value_counts())

156

In [None]:
def _to_input_examples(frame):
    """Wrap every row of `frame` in an InputExample (text from `sentence`, label from `level`)."""
    # guids are numbered from 2 upwards, the same ids the explicit loops produced.
    return [
        InputExample(guid=guid, texts=[text], label=label)
        for guid, (text, label) in enumerate(zip(frame[sentence], frame[level]), start=2)
    ]


train_set = _to_input_examples(train)
valid_set = _to_input_examples(valid)

# A held-out test split, if one is created, can be wrapped the same way:
# test_set = _to_input_examples(test)

In [None]:
# Number of training examples
len(train_set)

5722

In [None]:
# Any Hugging Face transformers pre-trained model can be specified here,
# for example bert-base-uncased, roberta-base, xlm-roberta-base
model_name = 'distilroberta-base'
train_batch_size = 16
model_save_path = '/content/drive/MyDrive/Colab Notebooks/models/nli-classification-' + level

# Define the CrossEncoder classifier on top of distilroberta-base.
# The output layer is sized from the fitted label encoder, i.e. the classes
# that remain after the infrequent-label filter. `num_labels` was computed
# before that filter and would add output units that no training label uses
# and that `le.encoder` cannot map back to a name.
model = CrossEncoder(model_name, num_labels=len(le.encoder))

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 

2022-02-16 22:57:59 - Use pytorch device: cuda


In [None]:
# Wrap train_set (a list of InputExample) in a PyTorch DataLoader that shuffles and batches it
train_dataloader = DataLoader(train_set, shuffle=True, batch_size=train_batch_size)

In [None]:
# During training, CESoftmaxAccuracyEvaluator measures accuracy on the validation examples.
# NOTE(review): 'AllNLI-dev' is only the dataset name that appears in the log
# lines; the data evaluated is valid_set.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(valid_set, name='AllNLI-dev')

In [None]:
# Number of passes over the training data
num_epochs = 8
# Total optimisation steps = batches per epoch * epochs; warm up over the first 10% of them
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of all training steps
logger.info("Warmup-steps: {}".format(warmup_steps))

2022-02-16 22:58:06 - Warmup-steps: 287


In [None]:
# Baseline: accuracy on the validation examples before any fine-tuning.
# NOTE(review): judging by the recorded output, the evaluator logs accuracy as
# a percentage while the returned value is a fraction.
logging.info("Performance before fine-tuning:")
evaluator(model)

2022-02-16 22:58:00 - Performance before fine-tuning:
2022-02-16 22:58:00 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset:
2022-02-16 22:58:01 - Accuracy: 0.56


0.005590496156533892

In [None]:
# Train the model
# NOTE(review): evaluation_steps=10000 is larger than the 358 iterations per
# epoch shown in the training log, so the evaluator effectively runs only at
# the end of each epoch; the log shows a save to output_path after each of those
# evaluations.
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          optimizer_params={'lr':2e-05},
          evaluation_steps=10000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/358 [00:00<?, ?it/s]

2022-02-16 22:58:48 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 0:
2022-02-16 22:58:49 - Accuracy: 28.72
2022-02-16 22:58:49 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-subclass_name


Iteration:   0%|          | 0/358 [00:00<?, ?it/s]

2022-02-16 22:59:33 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 1:
2022-02-16 22:59:34 - Accuracy: 45.70
2022-02-16 22:59:34 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-subclass_name


Iteration:   0%|          | 0/358 [00:00<?, ?it/s]

2022-02-16 23:00:14 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 2:
2022-02-16 23:00:15 - Accuracy: 53.74
2022-02-16 23:00:15 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-subclass_name


Iteration:   0%|          | 0/358 [00:00<?, ?it/s]

2022-02-16 23:00:54 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 3:
2022-02-16 23:00:55 - Accuracy: 57.93
2022-02-16 23:00:55 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-subclass_name


Iteration:   0%|          | 0/358 [00:00<?, ?it/s]

2022-02-16 23:01:34 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 4:
2022-02-16 23:01:35 - Accuracy: 60.59
2022-02-16 23:01:35 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-subclass_name


Iteration:   0%|          | 0/358 [00:00<?, ?it/s]

2022-02-16 23:02:14 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 5:
2022-02-16 23:02:15 - Accuracy: 63.38
2022-02-16 23:02:15 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-subclass_name


Iteration:   0%|          | 0/358 [00:00<?, ?it/s]

2022-02-16 23:02:55 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 6:
2022-02-16 23:02:56 - Accuracy: 64.57
2022-02-16 23:02:56 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-subclass_name


Iteration:   0%|          | 0/358 [00:00<?, ?it/s]

2022-02-16 23:03:34 - CESoftmaxAccuracyEvaluator: Evaluating the model on AllNLI-dev dataset after epoch 7:
2022-02-16 23:03:35 - Accuracy: 65.41
2022-02-16 23:03:35 - Save model to /content/drive/MyDrive/Colab Notebooks/models/nli-classification-subclass_name


In [None]:
# Score a single example title; the cells below index `scores` by class code,
# i.e. one raw score per class.
scores = model.predict(['pan'])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Number of top-scoring labels to report
n = 2
# Class codes of the n largest scores, best first
top_idx = scores.argsort()[::-1][:n]
# Pair each label name (looked up in the encoder's categories) with its score
labels = [(le.encoder[idx], scores[idx]) for idx in top_idx]
labels

[('FRY / SAUTE PAN', 5.260729), ('ASTV KITCHEN', 4.006313)]

In [None]:
import torch

# Convert the raw scores into probabilities over the classes. `dim` is given
# explicitly because calling softmax without it relies on a deprecated implicit
# choice of dimension.
# NOTE: `scores` is overwritten, so re-running this cell alone would apply
# softmax to values that are already probabilities.
scores = torch.nn.functional.softmax(torch.tensor(scores), dim=-1).numpy()
n = 2
# Class codes of the n largest probabilities, best first (single sort)
top_idx = scores.argsort()[::-1][:n]
labels = [(le.encoder[idx], scores[idx]) for idx in top_idx]
labels

[('FRY / SAUTE PAN', 0.40161166), ('ASTV KITCHEN', 0.11455666)]