In [85]:
import functools
import glob
import os
import pathlib
import random

import datasets
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection
import sklearn.preprocessing
import torch
import transformers
from datasets import load_dataset
from PIL import ImageDraw, ImageFont, Image
from torch.utils.data import DataLoader
from torchvision import transforms
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Turn off parallelism inside the Hugging Face tokenizers library.
# The library reads this setting from the process environment, so a plain
# Python variable alone has no effect; the variable is kept for any code that
# refers to it and is mirrored into os.environ as the string "false".
TOKENIZERS_PARALLELISM = False
os.environ["TOKENIZERS_PARALLELISM"] = str(TOKENIZERS_PARALLELISM).lower()


In [86]:
# Directory handed to `datasets.load_from_disk`.
# Set the BERT_DATASET_PATH environment variable to run the notebook on another
# machine; when it is unset the original local path is used unchanged.
dataset_path = os.environ.get(
    'BERT_DATASET_PATH',
    '/Users/tylerklimas/Desktop/BERTModel/dataset_processed',
)
dataset_raw = datasets.load_from_disk(dataset_path)
dataset_raw

DatasetDict({
    train: Dataset({
        features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],
        num_rows: 72743
    })
    test: Dataset({
        features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],
        num_rows: 24248
    })
    valid: Dataset({
        features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],
        num_rows: 24248
    })
})

In [87]:
# Class names of the `label` feature, read from the train split.
# The position of a name in this list is what the notebook uses as its class id.
labels = dataset_raw["train"].features["label"].names
labels

['ACCESSORY',
 'BOOT',
 'CELLULAR_PHONE_CASE',
 'CHAIR',
 'EARRING',
 'FINEEARRING',
 'FINENECKLACEBRACELETANKLET',
 'FINERING',
 'GROCERY',
 'HANDBAG',
 'HARDWARE_HANDLE',
 'HAT',
 'HEALTH_PERSONAL_CARE',
 'HOME',
 'HOME_BED_AND_BATH',
 'HOME_FURNITURE_AND_DECOR',
 'JANITORIAL_SUPPLY',
 'KITCHEN',
 'LAMP',
 'LIGHT_BULB',
 'LIGHT_FIXTURE',
 'OFFICE_PRODUCTS',
 'OUTDOOR_LIVING',
 'PET_SUPPLIES',
 'RUG',
 'SANDAL',
 'SHOES',
 'SOFA',
 'SPORTING_GOODS',
 'TABLE',
 'WALL_ART']

In [88]:
# Lookup tables between class ids (positions in `labels`) and class names,
# built in both directions.
id2label = dict(enumerate(labels))
label2id = {name: class_id for class_id, name in id2label.items()}


In [89]:
# Name of the pretrained checkpoint used for both the tokenizer and the
# classifier.
base_model = "distilbert-base-uncased"
tokenizer = transformers.AutoTokenizer.from_pretrained(base_model)
# The headless `AutoModel` that used to be loaded here was removed: nothing
# used it before `model` is bound to the AutoModelForSequenceClassification
# instance in a later cell, so it only cost an extra weight load and memory.

In [109]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the module-level tokenizer.

    The tokenizer is called with truncation enabled and ``"max_length"``
    padding, and its result is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize every split of the raw dataset, passing examples to the function in
# batches.
tokenized_datasets = dataset_raw.map(function=tokenize_function, batched=True)

# Example: tokenizing dataset_raw['train'][0] turns its text into integer token ids.
# Text:
# "Gmall Men's Summer Face Cover UV Protection Neck Gaiter Scarf Sunscreen Breathable Bandana, Black, 1 Piece"
# Token ids:
# 101, 20917, 3363, 2273, 1005, 1055, 2621, 2227, 3104, 23068, 3860, 3300, 11721, 21646, 18982, 19352, 24410, 3052, 3085, 24112, 2532, 1010, 2304, 1010, 1015, 3538, 102

Map: 100%|███████████████████████| 72743/72743 [00:09<00:00, 7936.60 examples/s]
Map: 100%|███████████████████████| 24248/24248 [00:03<00:00, 7374.20 examples/s]
Map: 100%|███████████████████████| 24248/24248 [00:03<00:00, 8027.80 examples/s]


In [110]:
def _random_subset(split, size, seed=42):
    """Return up to ``size`` rows of ``split`` after a seeded shuffle.

    The requested size is clamped to ``split.num_rows`` so that a split with
    fewer than ``size`` rows is returned whole (shuffled) instead of asking
    ``select`` for indices that do not exist.
    """
    row_count = min(size, split.num_rows)
    return split.shuffle(seed=seed).select(range(row_count))


num_rows_train = tokenized_datasets['train'].num_rows
subset = 2000  # number of rows sampled from each split

subset_train = _random_subset(tokenized_datasets['train'], subset)
subset_test = _random_subset(tokenized_datasets['test'], subset)
subset_val = _random_subset(tokenized_datasets['valid'], subset)

In [111]:
# Sequence-classification model built from the base checkpoint, with one
# output per entry of `labels` and the id/name mappings stored on its config.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    base_model,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [112]:
# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = transformers.TrainingArguments(
    f"trainer_{base_model}",  # output directory, named after the checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    # eval_steps=100,
    load_best_model_at_end=True,
    num_train_epochs=5,  # total number of training epochs
    # report_to="wandb",  # enable logging to W&B
)

In [113]:
# Load the four evaluation metrics used by `compute_metrics`, in the same
# order as the names on the left-hand side.
metric_f1, metric_accuracy, metric_precision, metric_recall = (
    datasets.load_metric(metric_name, trust_remote_code=True)
    for metric_name in ('f1', 'accuracy', 'precision', 'recall')
)

In [114]:
def compute_metrics(eval_pred):
    """Compute macro F1, accuracy, macro precision and macro recall.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds one
            score per class for each example and is reduced to a class id with
            ``argmax`` over axis 1; ``labels`` holds the reference class ids.

    Returns:
        dict: The merged results of the four module-level metric objects
        (``metric_f1``, ``metric_accuracy``, ``metric_precision``,
        ``metric_recall``).
    """
    # `references` rather than `labels`, so the module-level list of label
    # names is not shadowed inside this function.
    scores, references = eval_pred
    # Requires `numpy` imported as `np` at the top of the notebook.
    predicted_ids = np.argmax(scores, axis=1)

    metrics_dict = {}
    # Accuracy takes no averaging mode; the other three are macro-averaged.
    metrics_dict.update(
        metric_f1.compute(predictions=predicted_ids, references=references, average='macro')
    )
    metrics_dict.update(
        metric_accuracy.compute(predictions=predicted_ids, references=references)
    )
    metrics_dict.update(
        metric_precision.compute(predictions=predicted_ids, references=references, average='macro')
    )
    metrics_dict.update(
        metric_recall.compute(predictions=predicted_ids, references=references, average='macro')
    )
    return metrics_dict

In [115]:
# Bundle the model, the training configuration, the sampled train/validation
# subsets and the metric callback into a single Trainer.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=subset_train,
    eval_dataset=subset_val,
)

In [116]:
# Start fine-tuning with the configuration held by `trainer`; as the last
# expression of the cell, the call's return value is displayed as its output.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 