In [105]:
import tensorflow as tf
# tf.config.run_functions_eagerly(True)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import cv2
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn
import pathlib
import io, os, re, string, time, datetime
from numpy import random
import gensim.downloader as api
from PIL import Image
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense, Flatten, InputLayer, BatchNormalization, Input, Embedding, TextVectorization
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dropout, Conv1D, MultiHeadAttention, LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy, TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam
from tensorboard.plugins import projector
import traceback

from datasets import load_dataset
from transformers import BertTokenizerFast
from transformers import DataCollatorWithPadding
from transformers import TFBertForSequenceClassification, TFBertModel
from transformers import create_optimizer, AutoTokenizer, TFAutoModel

import csv

In [90]:
BATCH_SIZE = 32
MAX_LENGTH = 64

# Datasets

In [4]:
filepath_train = "localdata/Section18/data/processed/public/task_1_query-product_ranking/train-v0.3.csv"
filepath_catalog = "localdata/Section18/data/processed/public/task_1_query-product_ranking/product_catalogue-v0.3.csv"

In [5]:
df_catalogue = pd.read_csv(filepath_catalog)
df_train = pd.read_csv(filepath_train)

In [6]:
len(df_catalogue), len(df_train)

(883868, 781744)

In [7]:
df_train

Unnamed: 0,query_id,query,query_locale,product_id,esci_label
0,0,# 2 pencils not sharpened,us,B0000AQO0O,exact
1,0,# 2 pencils not sharpened,us,B0002LCZV4,exact
2,0,# 2 pencils not sharpened,us,B00125Q75Y,exact
3,0,# 2 pencils not sharpened,us,B001AZ1D3C,exact
4,0,# 2 pencils not sharpened,us,B001B097KC,exact
...,...,...,...,...,...
781739,33803,針なしほっちきす,jp,B08XGQ9RH7,substitute
781740,33803,針なしほっちきす,jp,B0987RGRF2,exact
781741,33803,針なしほっちきす,jp,B099NFJWP6,exact
781742,33803,針なしほっちきす,jp,B09F3B413J,exact


In [23]:
df_catalogue

Unnamed: 0,product_id,product_title,product_description,product_bullet_point,product_brand,product_color_name,product_locale
0,B0188A3QRM,"Amazon Basics Woodcased #2 Pencils, Unsharpene...",,144 woodcase #2 HB pencils made from high-qual...,Amazon Basics,Yellow,us
1,B075VXJ9VG,"BAZIC Pencil #2 HB Pencils, Latex Free Eraser,...",<p><strong>BACK TO BAZIC</strong></p><p>Our go...,&#11088; UN-SHARPENED #2 PREMIUM PENCILS. Each...,BAZIC Products,12-count,us
2,B07G7F6JZ6,Emraw Pre Sharpened Round Primary Size No 2 Ju...,<p><b>Emraw Pre-Sharpened #2 HB Wood Pencils -...,✓ PACK OF 8 NUMBER 2 PRESHARPENED BEGINNERS PE...,Emraw,Yellow,us
3,B07JZJLHCF,Emraw Pre Sharpened Triangular Primary Size No...,<p><b>Emraw Pre-Sharpened #2 HB Wood Pencils -...,✓ PACK OF 6 NUMBER 2 PRESHARPENED BEGINNERS PE...,Emraw,Yellow,us
4,B07MGKC3DD,"BIC Evolution Cased Pencil, #2 Lead, Gray Barr...",,Premium #2 HB lead pencils with break-resistan...,Design House,Gray,us
...,...,...,...,...,...,...,...
883863,B094NN6T8V,アイリスプラザ コーナーソファ 3点セット 高反発 2人掛け 3人掛け ブラウン 394310,,商品サイズ（cm）：幅約154×奥行約109×高さ約33（※組み合わせ時）\n商品重量：約6...,アイリスプラザ(IRIS PLAZA),ブラウン,jp
883864,B096RD95K9,Uping テーブル・ベッドの高さ調節が簡単にできる ベッドの高さをあげる足 4個セット 高...,B096RD95K9,サイズ:約16.5×11.7×8.2cm\n総重量:約1.23kg\n材質:ABSプラスチッ...,Uping,,jp
883865,B099Z4HXH1,アイリスプラザ ソファ ソファー ローソファ ローソファー 1人掛け 1人用 幅約108cm...,,商品サイズ(cm):幅約108×奥行約44×高さ約62\n商品重量:約15kg\n材質 表面...,アイリスプラザ(IRIS PLAZA),グレー,jp
883866,B09B9QMMSF,足入れヒーター 足温器 電気足温器 1～6時間タイマー 丸洗い可能 省エネ あったか脚入れヒ...,,【腰からスッポリ包む、一人様用のごたつ】 つま先・腰・お腹まですっぽり包む寝袋のようなデザイ...,タサイスク,ブラウン,jp


In [50]:
def get_product_title(product_id):
    options = []
    options.append(str(product_id))
    # print(options)
    matched_row = df_catalogue[df_catalogue['product_id'].isin(options)]
    # print(matched_row)
    # print(matched_row['product_title'])
    # print(int(str(matched_row['product_title']).split("    ")[0]))
    # return matched_row['product_title'][(int(str(matched_row['product_title']).split("    ")[0]))]
    return matched_row['product_title'].iloc[0]

In [51]:
get_product_title('B0000AQO0O')

'Ticonderoga Beginner Pencils, Wood-Cased #2 HB Soft, With Eraser, Yellow, 12-Pack (13308)'

### Dataset creation

In [61]:
filepath_train = 'localdata/Section18/csv_file_9.csv'

In [65]:
df_merged = pd.merge(df_train, df_catalogue, on='product_id').dropna()

In [93]:
df_dataset = df_merged[['query', 'product_title', 'esci_label']]
df_dataset.columns = ['query', 'product', 'label']
df_dataset.to_csv(filepath_train, header=True, index=False)

In [94]:
dataset = load_dataset('csv', data_files=[filepath_train])

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [95]:
dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'product', 'label'],
        num_rows: 236242
    })
})

In [96]:
dataset['train'][0]

{'query': '# 2 pencils not sharpened',
 'product': 'Arteza HB Pencils #2, Pack of 48, Wood-Cased Graphite Pencils in Bulk, Pre-Sharpened, with Latex-Free Erasers, Office & School Supplies for Exams and Classrooms',
 'label': 'complement'}

In [97]:
def get_label(label):
    if label == 'exact':
        return 1.0
    elif label == 'substitute':
        return 0.7
    elif label == 'complement':
        return 0.5
    else:
        return 0.0

In [85]:
model_id = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [91]:
def preprocess(dataset):
    if dataset['product'] == None:
        dataset['product'] = dataset['query']

    dataset['input_ids_query'] = []
    dataset['token_type_ids_query'] = []
    dataset['attention_mask_query'] = []

    dataset['input_ids_product'] = []
    dataset['token_type_ids_product'] = []
    dataset['attention_mask_product'] = []

    query_tokens = tokenizer(dataset['query'], max_length=MAX_LENGTH, padding='max_length', truncation=True)
    dataset['input_ids_query'].append(query_tokens['input_ids'])
    dataset['token_type_ids_query'].append(query_tokens['token_type_ids'])
    dataset['attention_mask_query'].append(query_tokens['attention_mask'])

    product_tokens = tokenizer(dataset['product'], max_length=MAX_LENGTH, padding='max_length', truncation=True)
    dataset['input_ids_product'].append(product_tokens['input_ids'])
    dataset['token_type_ids_product'].append(product_tokens['token_type_ids'])
    dataset['attention_mask_product'].append(product_tokens['attention_mask'])

    dataset['label'] = get_label(dataset['label'])

    return dataset

In [98]:
prep_dataset = dataset.map(preprocess)

Map:   0%|          | 0/236242 [00:00<?, ? examples/s]

In [100]:
prep_dataset["train"][0]

{'query': '# 2 pencils not sharpened',
 'product': 'Arteza HB Pencils #2, Pack of 48, Wood-Cased Graphite Pencils in Bulk, Pre-Sharpened, with Latex-Free Erasers, Office & School Supplies for Exams and Classrooms',
 'label': 0.5,
 'input_ids_query': [[101,
   1001,
   1016,
   14745,
   2015,
   2025,
   26694,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0]],
 'token_type_ids_query': [[0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,

In [101]:
tf_dataset = prep_dataset["train"].to_tf_dataset(
    columns=["input_ids_query", "token_type_ids_query", "attention_mask_query", "input_ids_product", "token_type_ids_product", "attention_mask_product", "label"],
    shuffle=True,
    batch_size=BATCH_SIZE,
)

2023-11-06 15:44:25.023306: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2023-11-06 15:44:25.023339: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2023-11-06 15:44:25.023344: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2023-11-06 15:44:25.023381: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-06 15:44:25.023399: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [102]:
for i in tf_dataset.take(1):
    print(i)

{'label': <tf.Tensor: shape=(32,), dtype=float32, numpy=
array([1. , 0.7, 0. , 0. , 0. , 1. , 1. , 0.7, 1. , 1. , 0.7, 1. , 0. ,
       0.5, 1. , 1. , 1. , 1. , 0.7, 0.7, 1. , 0. , 1. , 0.7, 1. , 0.5,
       1. , 1. , 1. , 1. , 0.5, 0.7], dtype=float32)>, 'input_ids_query': <tf.Tensor: shape=(32, 1, 64), dtype=int64, numpy=
array([[[  101,  4689,  2519, ...,     0,     0,     0]],

       [[  101,  1015,  1011, ...,     0,     0,     0]],

       [[  101,  7473,  7492, ...,     0,     0,     0]],

       ...,

       [[  101,  9088,  3422, ...,     0,     0,     0]],

       [[  101,  9553, 12170, ...,     0,     0,     0]],

       [[  101,  3336,  5302, ...,     0,     0,     0]]])>, 'token_type_ids_query': <tf.Tensor: shape=(32, 1, 64), dtype=int64, numpy=
array([[[0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0]],

       ...,

       [[0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0]]])>, 'atte

# Modeling

In [106]:
model = TFAutoModel.from_pretrained(model_id)
model.summary()

Downloading (…)lve/main/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/91.0M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/multi-qa-MiniLM-L6-cos-v1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  22713216  
                                                                 
Total params: 22713216 (86.64 MB)
Trainable params: 22713216 (86.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [120]:
class SentenceTransfomer(tf.keras.Model):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.dense = Dense(1, activation="sigmoid")

    def compile(self, optimizer, loss_fn):
        super().compile()
        self.loss_fn = loss_fn
        self.loss_metric = tf.keras.metrics.Mean(name='loss')

    @property
    def metrics(self):
        return [self.loss_metric]

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0]

        input_mask_expanded = tf.cast(
            tf.broadcast_to(tf.expand_dims(attention_mask, -1), tf.shape(token_embeddings)),
            tf.float32)
        return tf.math.reduce_sum(token_embeddings + input_mask_expanded, axis=1) / \
            tf.clip_by_value(tf.math.reduce_sum(input_mask_expanded, axis=1), 1e-9, tf.float32.max)

    def train_step(self, train_data):
        query = {
            'input_ids': train_data['input_ids_query'][:, 0, :],
            'token_type_ids': train_data['token_type_ids_query'][:, 0, :],
            'attention_mask': train_data['attention_mask_query'][:, 0, :]
        }
        product = {
            'input_ids': train_data['input_ids_product'][:, 0, :],
            'token_type_ids': train_data['token_type_ids_product'][:, 0, :],
            'attention_mask': train_data['attention_mask_product'][:, 0, :]
        }

        labels = train_data['label']

        with tf.GradientTape() as recorder:
            query_predication = self.model(query)
            pred_query = self.mean_pooling(query_predication, train_data['attention_mask_query'][:, 0, :])

            product_predication = self.model(product)
            pred_product = self.mean_pooling(product_predication, train_data['attention_mask_product'][:, 0, :])

            pred_concat = tf.concat([pred_query, pred_product, tf.abs(pred_query - pred_product)], axis=-1)

            predication = self.dense(pred_concat)
            loss = self.loss_fn(labels, predication)

        partial_derivatives = recorder.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(partial_derivatives, self.model.trainable_weights))

        self.loss_metric.update_state(loss)

        return {'loss': self.loss_metric.result(), }

## Model Training

In [121]:
stransformer = SentenceTransfomer(model)
stransformer.compile(
    optimizer=tf.keras.optimizers.Adam(2e-5),
    loss_fn = tf.keras.losses.BinaryCrossentropy(),
)



In [122]:
EPOCHS = 2
history = stransformer.fit(tf_dataset, epochs=EPOCHS

Epoch 1/2






2023-11-06 16:24:52.198306: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.




KeyboardInterrupt: 