In [1]:
import torch
from transformers import AutoModel
from torch.utils.data import DataLoader
from torchvision import transforms
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import transformers
from datasets import load_dataset

import random
import random
from PIL import ImageDraw, ImageFont, Image
import pathlib
import sklearn
import datasets
import pandas as pd
import sklearn.preprocessing
import sklearn.model_selection
import glob
import functools
import utils

TOKENIZERS_PARALLELISM=False

In [3]:
device = 'cpu'

dataset_path = '/Users/tylerklimas/Desktop/BERTModel/dataset_processed'
dataset_raw = datasets.load_from_disk(dataset_path)

labels = dataset_raw['train'].features['label'].names


id2label = {}
label2id = {}
for idx, ele in enumerate(labels):
    label2id[ele] = idx
    id2label[idx] = ele



{0: 'ACCESSORY',
 1: 'BOOT',
 2: 'CELLULAR_PHONE_CASE',
 3: 'CHAIR',
 4: 'EARRING',
 5: 'FINEEARRING',
 6: 'FINENECKLACEBRACELETANKLET',
 7: 'FINERING',
 8: 'GROCERY',
 9: 'HANDBAG',
 10: 'HARDWARE_HANDLE',
 11: 'HAT',
 12: 'HEALTH_PERSONAL_CARE',
 13: 'HOME',
 14: 'HOME_BED_AND_BATH',
 15: 'HOME_FURNITURE_AND_DECOR',
 16: 'JANITORIAL_SUPPLY',
 17: 'KITCHEN',
 18: 'LAMP',
 19: 'LIGHT_BULB',
 20: 'LIGHT_FIXTURE',
 21: 'OFFICE_PRODUCTS',
 22: 'OUTDOOR_LIVING',
 23: 'PET_SUPPLIES',
 24: 'RUG',
 25: 'SANDAL',
 26: 'SHOES',
 27: 'SOFA',
 28: 'SPORTING_GOODS',
 29: 'TABLE',
 30: 'WALL_ART'}

In [5]:
base_model = "distilbert-base-uncased"
model_dir = '/Users/tylerklimas/Desktop/BERTModel/BERTModelArchive'


model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir, return_dict=False)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir)

In [6]:
columns = set(dataset_raw['train'].column_names) - set(['text', 'label'])
columns

{'brand', 'item_id', 'item_name', 'main_image_id', 'node'}

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset_raw.map(tokenize_function, batched=True, remove_columns=columns)

In [12]:
subset = tokenized_datasets['train'].num_rows
subset = 50 # JUST FOR INFERENCE PURPOSE

test_dataset = tokenized_datasets['test'].shuffle(seed=22).select(range(subset))
test_dataset.set_format(type='torch')
test_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 50
})

In [13]:
import utils
utils.prediction_batch(model, test_dataset)   

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|█████████████████████████████████████████████| 2/2 [00:13<00:00,  6.98s/it]


{'accuracy': 0.82}

In [14]:
quantized_model_int8 = torch.quantization.quantize_dynamic(
                       model,
                       {torch.nn.Linear},
                       dtype = torch.qint8)
# quantized_model_int8 

In [9]:
import os

def print_size_model(model):
    torch.save(model.state_dict(), 'temp.p')
    print('Size (MB): ', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')
    
print_size_model(model)
print_size_model(quantized_model_int8)

Size (MB):  267.942302
Size (MB):  138.729314


In [15]:
base_model = "distilbert-base-uncased"
script_tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir, torchscript=True)
script_model = transformers.AutoModelForSequenceClassification.from_pretrained(model_dir,
                                                                        num_labels = len(labels),
                                                                        label2id=label2id,
                                                                        id2label=id2label,
                                                                        torchscript=True)                                               
                                                                               
                                                                               

In [16]:
text = 'mens dress shoes'

res = script_tokenizer.encode_plus(text, return_tensors='pt', padding='max_length', truncation=True)

text_tokens = res['input_ids'].to(device)
text_attentions = res['attention_mask'].to(device)

dummy_input = [text_tokens, text_attentions]

script_model = script_model.to(device)

traced_model = torch.jit.trace(script_model, dummy_input)



  mask, torch.tensor(torch.finfo(scores.dtype).min)


In [17]:


utils.prediction( model=traced_model
                 , tokens_tensor=text_tokens
                 , masks_tensors=text_attentions 
                 , id2label_str=id2label)

[[26, 1, 25, 0, 13]]


KeyError: '26'

In [36]:
model_dir = '/Users/tylerklimas/Desktop/BERTModel/BERTModelArchive'
model.save_pretrained('/Users/tylerklimas/Desktop/BERTModel/BERTModelArchive')
tokenizer.save_pretrained('/Users/tylerklimas/Desktop/BERTModel/TBERTModelArchive')
torch.save(model.state_dict(), '/Users/tylerklimas/Desktop/BERTModel/TrainedModels/model_weights.pt')



In [37]:
setup_config = {
    'model_name':"pt-original",
    "do_lower_case": tokenizer.do_lower_case,
    "num_labels":len(id2label),
    'save_mode':"original",
    'max_length': tokenizer.model_max_length,
    'captum_explanation': True,
    "base_model": 'distilbert-base-uncased',
    'top_k': 5
}

In [38]:
import json

with open(f'{model_dir}/setup_config.json', 'w') as f:
    json.dump(setup_config,f)

In [39]:
setup_config_trace = {**setup_config}
setup_config_trace['model_name'] = 'pt-jit'
setup_config_trace['capture_explanation'] = False
setup_config_trace['save_model'] = 'jit'


In [40]:

with open(f'{model_dir}/setup_config.json', 'w') as f:
    json.dump(setup_config_trace,f)