In [1]:
import torch
from transformers import AutoModel
from torch.utils.data import DataLoader
from torchvision import transforms
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import transformers
from datasets import load_dataset

import random
import random
from PIL import ImageDraw, ImageFont, Image
import pathlib
import sklearn
import datasets
import pandas as pd
import sklearn.preprocessing
import sklearn.model_selection
import glob
import functools

import os

# A bare Python variable is invisible to the tokenizers library; the setting
# only takes effect when it is exported as an environment variable.
TOKENIZERS_PARALLELISM = False
os.environ["TOKENIZERS_PARALLELISM"] = str(TOKENIZERS_PARALLELISM).lower()

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# All tensors and models in this notebook are placed on the CPU.
device = 'cpu'

# NOTE(review): machine-specific absolute path; consider a configurable
# data directory so the notebook can run elsewhere.
dataset_path = '/Users/tylerklimas/Desktop/BERTModel/dataset_processed'
dataset_raw = datasets.load_from_disk(dataset_path)

# Class names of the 'label' feature, in class-id order.
labels = dataset_raw['train'].features['label'].names

# Two-way lookup between class names and integer ids. Comprehensions avoid
# leaking loop variables into the notebook namespace.
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for idx, name in enumerate(labels)}

    

In [15]:
# Checkpoint used for both the tokenizer and the classifier weights.
base_model = "distilbert-base-uncased"

tokenizer = transformers.AutoTokenizer.from_pretrained(base_model)

# The classification head is sized and labelled from the dataset's classes.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    base_model,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Every train-split column except 'text' and 'label'.
columns = set(dataset_raw['train'].column_names).difference({'text', 'label'})
columns

{'brand', 'item_id', 'item_name', 'main_image_id', 'node'}

In [17]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` key holding the text(s) to encode.

    Returns:
        Whatever ``tokenizer`` returns when called with
        ``padding="max_length"`` and ``truncation=True``.
    """
    batch_text = examples["text"]
    return tokenizer(batch_text, truncation=True, padding="max_length")

# `remove_columns` is documented as a column name or a list of names, so the
# collection is passed as a sorted list (stable order, documented type).
tokenized_datasets = dataset_raw.map(
    tokenize_function,
    batched=True,
    remove_columns=sorted(columns),
)

In [18]:
# Small sample, just for inference purposes. Clamped to the size of the test
# split so that `select` cannot be asked for rows that do not exist.
subset = min(20, tokenized_datasets['test'].num_rows)

# Fixed seed keeps the sampled rows the same from run to run.
test_dataset = tokenized_datasets['test'].shuffle(seed=42).select(range(subset))
test_dataset.set_format(type='torch')
test_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 20
})

In [19]:
import utils

# `prediction_batch` comes from the project-local `utils` module; its result is
# displayed as the cell output.
# NOTE(review): `model` does not appear to be fine-tuned before this call, so
# the reported metric presumably reflects an untrained classifier head - confirm.
eval_metrics = utils.prediction_batch(model, test_dataset)
eval_metrics

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|█████████████████████████████████████████████| 1/1 [00:04<00:00,  4.96s/it]


{'accuracy': 0.05}

In [20]:
# Dynamic quantization restricted to torch.nn.Linear modules, using qint8.
quantized_model_int8 = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

In [21]:
import os

def print_size_model(model):
    """Print the size, in megabytes, of ``model``'s serialized ``state_dict``.

    The state dict is saved to a scratch file inside a private temporary
    directory, so an unrelated ``temp.p`` in the working directory is never
    overwritten or deleted, and the scratch file is cleaned up even when
    saving or measuring raises.

    Args:
        model: Object exposing ``state_dict()`` whose result ``torch.save``
            can serialize.

    Returns:
        None. The size is printed as ``Size (MB):  <bytes / 1e6>``.
    """
    # Local imports keep the helper self-contained.
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_path = os.path.join(scratch_dir, 'temp.p')
        torch.save(model.state_dict(), scratch_path)
        print('Size (MB): ', os.path.getsize(scratch_path)/1e6)
    
# Report the serialized size of the original model, then of its quantized copy.
for candidate in (model, quantized_model_int8):
    print_size_model(candidate)

Size (MB):  267.942302
Size (MB):  138.729314


In [22]:
# Second copy of the checkpoint, loaded with the TorchScript flag for tracing.
base_model = "distilbert-base-uncased"

# NOTE(review): `torchscript` is a model configuration option; it presumably
# has no effect on the tokenizer - confirm and drop if so.
script_tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model,
    torchscript=True,
)

script_model = transformers.AutoModelForSequenceClassification.from_pretrained(
    base_model,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    torchscript=True,
)
                                                                               
                                                                               

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Example sentence used only to produce example inputs for tracing.
text = 'mens dress shoes'

# Calling the tokenizer directly replaces the deprecated `encode_plus`.
res = script_tokenizer(text, return_tensors='pt', padding='max_length', truncation=True)

text_tokens = res['input_ids'].to(device)
text_attentions = res['attention_mask'].to(device)

# Example inputs handed to the tracer: token ids first, attention mask second.
dummy_input = [text_tokens, text_attentions]

script_model = script_model.to(device)

traced_model = torch.jit.trace(script_model, dummy_input)



  mask, torch.tensor(torch.finfo(scores.dtype).min)


In [44]:
# NOTE(review): machine-specific absolute path; consider making it configurable.
model_dir = '/Users/tylerklimas/Desktop/BERTModel/TrainedModels'

# Both artifacts go to the same directory; `model_dir` is the single source of
# truth for that location instead of repeating the literal path.
# NOTE(review): no fine-tuning step for `model` is visible before this save -
# confirm that saving it under "TrainedModels" is intended.
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)




('/Users/tylerklimas/Desktop/BERTModel/TrainedModels/tokenizer_config.json',
 '/Users/tylerklimas/Desktop/BERTModel/TrainedModels/special_tokens_map.json',
 '/Users/tylerklimas/Desktop/BERTModel/TrainedModels/vocab.txt',
 '/Users/tylerklimas/Desktop/BERTModel/TrainedModels/added_tokens.json',
 '/Users/tylerklimas/Desktop/BERTModel/TrainedModels/tokenizer.json')

In [43]:
# Settings describing the saved model; values are taken from the tokenizer and
# the label mapping where available.
setup_config = {
    "model_name": "pt-original",
    "do_lower_case": tokenizer.do_lower_case,
    "num_labels": len(id2label),
    "save_mode": "original",
    "max_length": tokenizer.model_max_length,
    "captum_explanation": True,
    "base_model": "distilbert-base-uncased",
    "top_k": 5,
}

In [47]:
import json

# Serialize setup_config as JSON inside the model directory.
setup_config_path = f'{model_dir}/setup_config.json'
with open(setup_config_path, 'w') as config_file:
    json.dump(setup_config, config_file)

In [None]:
# Copy of setup_config with overrides for the traced ('pt-jit') variant.
# 'captum_explanation' is the key that setup_config defines; the earlier
# 'capture_explanation' spelling added a new key and left the flag unchanged.
setup_config_trace = {
    **setup_config,
    'model_name': 'pt-jit',
    'captum_explanation': False,
}

# setup_config defines 'save_mode'; looking up 'save_model' raised KeyError.
# TODO(review): this only displays the inherited value - decide which
# 'save_mode' the traced variant should use and assign it here.
setup_config_trace['save_mode']