# Install and Import

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
import evaluate
import numpy as np
from datasets import load_metric, load_dataset
import random


# The Work

In [6]:
# Defining a constant SEED for reproducibility in random operations
SEED = 42

# Setting the seed for the random library to ensure consistent results
random.seed(SEED)

In [7]:
# 'star' is a column in our dataset and we want to convert it to a ClassLabel column
# so we can stratify our samples.

# Importing the ClassLabel module to represent categorical class labels
from datasets import ClassLabel

# Loading the 'app_reviews' dataset's training split into the 'dataset' variable
dataset = load_dataset('app_reviews', split='train')

# Converting the 'star' column in our dataset to a ClassLabel type
# This allows for categorical representation and easier handling of classes
dataset = dataset.class_encode_column('star')

# Displaying the dataset to see the changes
dataset

Downloading readme: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5.42k/5.42k [00:00<00:00, 26.5kB/s]
Stringifying the column: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 288065/288065 [00:00<00:00, 975608.17 examples/s]
Casting to class labels: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 288065/288065 [00:00<00:00, 862885.54 examples/s]


Dataset({
    features: ['package_name', 'review', 'date', 'star'],
    num_rows: 288065
})

In [8]:
# Splitting the dataset into a training set and a test set.
# We reserve 20% of the data for testing and use stratification on the 'star' column
# to ensure both sets have an equal distribution of each star category.
dataset = dataset.train_test_split(test_size=0.2, seed=SEED, stratify_by_column='star')

# Now, we further split our training dataset to reserve 25% of it for validation.
# Again, we stratify by the 'star' column to keep the distribution consistent.
df = dataset['train'].train_test_split(test_size=.25, seed=SEED, stratify_by_column='star')

# Assigning the split datasets to their respective keys:
# - The remaining 75% of our initial training data becomes the new training dataset.
dataset['train'] = df['train']

# - The 25% split from our initial training data becomes the validation dataset.
dataset['val'] = df['test']

# Displaying the dataset to see the distribution across train, test, and validation sets.
dataset

DatasetDict({
    train: Dataset({
        features: ['package_name', 'review', 'date', 'star'],
        num_rows: 172839
    })
    test: Dataset({
        features: ['package_name', 'review', 'date', 'star'],
        num_rows: 57613
    })
    val: Dataset({
        features: ['package_name', 'review', 'date', 'star'],
        num_rows: 57613
    })
})

In [9]:
MODEL = 'distilbert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(MODEL)



In [None]:
tokenizer('Sinan')

{'input_ids': [101, 14009, 1389, 102], 'attention_mask': [1, 1, 1, 1]}

In [None]:
dataset['train'][0]['review']

'Nice😉'

In [None]:
tokenizer.decode(tokenizer('Nice😉')['input_ids'])

'[CLS] [UNK] [SEP]'

In [None]:
tokenizer.decode(tokenizer('Nice 😉')['input_ids'])

'[CLS] Nice [UNK] [SEP]'

In [None]:
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# simple function to batch tokenize utterances with truncation
def preprocess_function(examples):  # each example is an element from the Dataset
    return tokenizer(examples["review"], truncation=True)

In [None]:
dataset = dataset.map(preprocess_function, batched=True)

dataset = dataset.rename_column("star", "label")
dataset = dataset.remove_columns(['package_name', 'date'])  # not removing review for later comparison against fine-tuned GPT 3.5
dataset

Map:   0%|          | 0/172839 [00:00<?, ? examples/s]

Map:   0%|          | 0/57613 [00:00<?, ? examples/s]

Map:   0%|          | 0/57613 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['review', 'label', 'input_ids', 'attention_mask'],
        num_rows: 172839
    })
    test: Dataset({
        features: ['review', 'label', 'input_ids', 'attention_mask'],
        num_rows: 57613
    })
    val: Dataset({
        features: ['review', 'label', 'input_ids', 'attention_mask'],
        num_rows: 57613
    })
})

In [None]:
# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the
#  length of the longest element in the batch, making them all the same length.
#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
data_collator(tokenizer([dataset['train'][0]['review'], dataset['train'][1]['review']]))['input_ids']

tensor([[ 101,  100,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0],
        [ 101, 7986, 1505, 1555, 2066, 1141, 7910, 1157, 6929,  183, 1181, 1696,
          102]])

In [None]:
data_collator(tokenizer([dataset['train'][0]['review'], dataset['train'][1]['review']]))['attention_mask']

tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [10]:
sequence_clf_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=5,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
sequence_clf_model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": (preds == p.label_ids).mean()}

In [None]:
epochs = 1

training_args = TrainingArguments(
    output_dir="./bert_clf_results",
    num_train_epochs=epochs,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=128,
    load_best_model_at_end=True,

    # some deep learning parameters that the Trainer is able to take in
    warmup_ratio=0.1,
    weight_decay = 0.01,
    learning_rate=2e-5,

    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    eval_steps=50,
    save_strategy='epoch'
)

# Define the trainer:

trainer = Trainer(
    model=sequence_clf_model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['val'],
    compute_metrics=compute_metrics,  # optional
    data_collator=data_collator  # technically optional
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: review. If review are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 172,839
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 2,701
  Number of trainable parameters = 65,785,349


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0896,0.829034,0.712322


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: review. If review are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 57613
  Batch size = 64


Saving model checkpoint to ./bert_clf_results/tmp-checkpoint-2701
Configuration saved in ./bert_clf_results/tmp-checkpoint-2701/config.json
Model weights saved in ./bert_clf_results/tmp-checkpoint-2701/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./bert_clf_results/checkpoint-2701 (score: 0.8290337920188904).


TrainOutput(global_step=2701, training_loss=0.8888398967752277, metrics={'train_runtime': 1637.3473, 'train_samples_per_second': 105.56, 'train_steps_per_second': 1.65, 'total_flos': 4505733852934470.0, 'train_loss': 0.8888398967752277, 'epoch': 1.0})

In [None]:
test_results = trainer.evaluate(dataset['test'])

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: review. If review are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 57613
  Batch size = 64


In [None]:
# Print the results
print("Test Set Evaluation Results:")
for key, value in test_results.items():
    print(f"{key}: {value}")


Test Set Evaluation Results:
eval_loss: 0.8238798975944519
eval_accuracy: 0.7138840192317706
eval_runtime: 221.4381
eval_samples_per_second: 260.177
eval_steps_per_second: 4.069
epoch: 1.0


In [None]:
trainer.model.push_to_hub('distilbert-base-cased-finetuned-stars')

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Configuration saved in /tmp/tmpkskn6jwz/config.json
Model weights saved in /tmp/tmpkskn6jwz/model.safetensors
Uploading the following files to profoz/distilbert-base-cased-finetuned-stars: README.md,config.json,model.safetensors


model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/profoz/distilbert-base-cased-finetuned-stars/commit/0e26c2d100a06c78a21f7f2669c4815b71ef56e1', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='0e26c2d100a06c78a21f7f2669c4815b71ef56e1', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
tokenizer.push_to_hub('distilbert-base-cased-finetuned-stars')

tokenizer config file saved in /tmp/tmp6cuwat9d/tokenizer_config.json
Special tokens file saved in /tmp/tmp6cuwat9d/special_tokens_map.json
Uploading the following files to profoz/distilbert-base-cased-finetuned-stars: README.md,tokenizer_config.json,special_tokens_map.json,vocab.txt,tokenizer.json


CommitInfo(commit_url='https://huggingface.co/profoz/distilbert-base-cased-finetuned-stars/commit/80554e73024cd6517f29158dda129312fa9f4ed6', commit_message='Upload tokenizer', commit_description='', oid='80554e73024cd6517f29158dda129312fa9f4ed6', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
dataset['test'].select(range(5)).to_pandas()

Unnamed: 0,review,label,input_ids,attention_mask
0,Very good app easy to use... Some bugs though...,4,"[101, 6424, 1363, 12647, 3123, 1106, 1329, 119...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,Wanted to Love it Really wanted to Love this g...,1,"[101, 20601, 1106, 2185, 1122, 8762, 1458, 110...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,Great,3,"[101, 2038, 102]","[1, 1, 1]"
3,Voice command is not do what you say its needs...,0,"[101, 7900, 2663, 1110, 1136, 1202, 1184, 1128...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,Nice app to impress the boss 😀,4,"[101, 8835, 12647, 1106, 19726, 1103, 6054, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 1]"


In [None]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained('profoz/distilbert-base-cased-finetuned-stars').eval().to(0)
model.config.id2label = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}
tokenizer = AutoTokenizer.from_pretrained('profoz/distilbert-base-cased-finetuned-stars')

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--profoz--distilbert-base-cased-finetuned-stars/snapshots/0e26c2d100a06c78a21f7f2669c4815b71ef56e1/config.json
Model config DistilBertConfig {
  "_name_or_path": "profoz/distilbert-base-cased-finetuned-stars",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": f

In [None]:
clf_results

[]

In [None]:
batch_size = 128
clf_pipeline = pipeline('text-classification', model=model, tokenizer=tokenizer, device=0)

# run against dataset['test']['review']
clf_results = []
for i in tqdm(range(0, len(dataset['test']['review']), batch_size)):
    batch = dataset['test']['review'][i:i+batch_size]
    clf_results.extend(clf_pipeline(batch))
    if i % 100 == 0:
        # Accuracy
        star_preds = np.array([_['label'] for _ in clf_results])
        truth_preds = np.array(dataset['test']['label'][:len(star_preds)])
        acc = (np.array(star_preds) == truth_preds).mean()
        print(f'Accuracy is {acc}')
        # < 3 accuracy
        less_three = ((np.array(star_preds) <= 3) == (truth_preds <= 3)).mean()
        print(f'<3 Accuracy is {less_three}')
        # Accuracy (off-by-1)
        off_one = (np.abs((star_preds - truth_preds)) <= 1).mean()
        print(f'Off by one Accuracy is {off_one}')


  0%|          | 1/451 [00:01<09:38,  1.29s/it]

Accuracy is 0.734375
<3 Accuracy is 0.7890625
Off by one Accuracy is 0.9296875


  6%|▌         | 26/451 [00:29<07:53,  1.12s/it]

Accuracy is 0.7247596153846154
<3 Accuracy is 0.8064903846153846
Off by one Accuracy is 0.8792067307692307


 11%|█▏        | 51/451 [00:57<06:53,  1.03s/it]

Accuracy is 0.7170649509803921
<3 Accuracy is 0.7991727941176471
Off by one Accuracy is 0.8705575980392157


 17%|█▋        | 76/451 [01:19<05:10,  1.21it/s]

Accuracy is 0.7158717105263158
<3 Accuracy is 0.7983141447368421
Off by one Accuracy is 0.8689350328947368


 22%|██▏       | 101/451 [01:41<04:59,  1.17it/s]

Accuracy is 0.7157332920792079
<3 Accuracy is 0.7970297029702971
Off by one Accuracy is 0.8689665841584159


 28%|██▊       | 126/451 [02:04<05:34,  1.03s/it]

Accuracy is 0.7144097222222222
<3 Accuracy is 0.7967509920634921
Off by one Accuracy is 0.869171626984127


 33%|███▎      | 151/451 [02:27<05:18,  1.06s/it]

Accuracy is 0.7165252483443708
<3 Accuracy is 0.8002379966887417
Off by one Accuracy is 0.8703435430463576


 39%|███▉      | 176/451 [02:49<03:46,  1.22it/s]

Accuracy is 0.7157315340909091
<3 Accuracy is 0.7989169034090909
Off by one Accuracy is 0.8688299005681818


 45%|████▍     | 201/451 [03:11<03:34,  1.17it/s]

Accuracy is 0.7151352611940298
<3 Accuracy is 0.7973802860696517
Off by one Accuracy is 0.8682758084577115


 50%|█████     | 226/451 [03:33<03:44,  1.00it/s]

Accuracy is 0.715535121681416
<3 Accuracy is 0.7973589601769911
Off by one Accuracy is 0.8680171460176991


 56%|█████▌    | 251/451 [03:55<02:45,  1.21it/s]

Accuracy is 0.7159487051792829
<3 Accuracy is 0.7973418824701195
Off by one Accuracy is 0.8673742529880478


 61%|██████    | 276/451 [04:17<02:30,  1.16it/s]

Accuracy is 0.7150985054347826
<3 Accuracy is 0.7962522644927537
Off by one Accuracy is 0.8668761322463768


 67%|██████▋   | 301/451 [04:40<02:15,  1.11it/s]

Accuracy is 0.7157392026578073
<3 Accuracy is 0.796952865448505
Off by one Accuracy is 0.8675249169435216


 72%|███████▏  | 326/451 [05:01<01:56,  1.08it/s]

Accuracy is 0.7158263036809815
<3 Accuracy is 0.7976418711656442
Off by one Accuracy is 0.8675230061349694


 78%|███████▊  | 351/451 [05:23<01:22,  1.21it/s]

Accuracy is 0.7155893874643875
<3 Accuracy is 0.7975204772079773
Off by one Accuracy is 0.8679220085470085


 83%|████████▎ | 376/451 [05:46<01:04,  1.17it/s]

Accuracy is 0.7149476396276596
<3 Accuracy is 0.7971658909574468
Off by one Accuracy is 0.8680809507978723


 89%|████████▉ | 401/451 [06:09<00:47,  1.06it/s]

Accuracy is 0.7146391832917706
<3 Accuracy is 0.7969724127182045
Off by one Accuracy is 0.8677330112219451


 94%|█████████▍| 426/451 [06:31<00:23,  1.06it/s]

Accuracy is 0.7146786971830986
<3 Accuracy is 0.7965448943661971
Off by one Accuracy is 0.8678110328638498


100%|██████████| 451/451 [06:52<00:00,  1.09it/s]

Accuracy is 0.7138840192317706
<3 Accuracy is 0.7962091888983389
Off by one Accuracy is 0.868033256383108





In [None]:
star_preds = np.array([_['label'] for _ in clf_results])

In [None]:
truth_preds = np.array(dataset['test']['label'][:len(star_preds)])

In [None]:
# Accuracy
acc = (np.array(star_preds) == truth_preds).mean()
print(f'Accuracy is {acc}')
# < 3 accuracy
less_three = ((np.array(star_preds) <= 3) == (truth_preds <= 3)).mean()
print(f'<3 Accuracy is {less_three}')
# Accuracy (off-by-1)
off_one = (np.abs((star_preds - truth_preds)) <= 1).mean()
print(f'Off by one Accuracy is {off_one}')


Accuracy is 0.7138840192317706
<3 Accuracy is 0.7962091888983389
Off by one Accuracy is 0.868033256383108


In [None]:
# compare to gpt 3.5 and ada
    # accuracy
    # cost to train/host (on HF or make your own API)
    # latency/throughput
    # param count
    # how many requests on 40 bucks a month from chatgpt just to classify stars


In [None]:
from transformers import DataCollatorWithPadding
from tqdm import tqdm

def test_bert_throughput(batch_size=32, time_bias_in_seconds=0):

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
    test_dataloader = DataLoader(dataset['test'], batch_size=batch_size, collate_fn=data_collator)

    sequence_clf_model.eval()  # Set the model to evaluation mode
    total_time = 0
    total_samples = 0

    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            start_time = time.time()
            outputs = sequence_clf_model(**{k: v.to(sequence_clf_model.device) for k, v in batch.items() if k in ['input_ids', 'attention_mask']})
            end_time = time.time()
            total_time += (end_time - start_time) + time_bias_in_seconds  # Add bias to account for lag
            total_samples += batch['input_ids'].size(0)  # Number of samples in the batch

    throughput = total_samples / total_time  # Samples per second
    print(f"Throughput: {throughput} samples/second")

In [None]:
test_bert_throughput(64)

100%|██████████| 901/901 [03:23<00:00,  4.42it/s]

Throughput: 298.1546134400942 samples/second





In [None]:
test_bert_throughput(32)

100%|██████████| 1801/1801 [02:46<00:00, 10.80it/s]

Throughput: 369.4027813955529 samples/second





In [None]:
test_bert_throughput(1) # same assumption as chatgpt that we do 1 at a time

100%|██████████| 57613/57613 [07:55<00:00, 121.16it/s]

Throughput: 131.52179507102556 samples/second





In [None]:
test_bert_throughput(32, 0.5)  # even with a lag of a half second on each batch (that's a lot frankly) every batch, not bad

100%|██████████| 1801/1801 [02:45<00:00, 10.89it/s]

Throughput: 54.6509415462314 samples/second





In [None]:
test_bert_throughput(1, 0.02) # Assuming 20ms lag on each BERT call (pretty darn fast RPC call) we get about the throughput as ChatGPT at one sample at a time

36.227549154514456

In [None]:
import openai
from openai import AsyncOpenAI
import os

os.environ['OPENAI_API_KEY'] = 'your_api_key_here'

In [None]:
import asyncio
from openai import AsyncOpenAI
import time

async def generate_completion(client, system, review):
    completion = await client.chat.completions.create(
        model="ft:gpt-3.5-turbo-0613:personal::8GsD6MhX",
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": review}
        ],
        max_tokens=1,
        temperature=0.1
    )
    return completion.choices[0].message.content.strip()

async def measure_throughput(test_data):
    client = AsyncOpenAI()
    start_time = time.time()

    # Process all requests concurrently
    await asyncio.gather(*(generate_completion(client, system, review) for system, review in test_data))

    end_time = time.time()
    throughput = len(test_data) / (end_time - start_time)
    return throughput

# Prepare your test data
test_data = [("", review) for review in dataset['test']['review'][:10000]]  # only doing 10K

# Run throughput measurement
throughput = await measure_throughput(test_data)
print(f"Throughput: {throughput} samples/second")


Throughput: 34.896461495439766 samples/second


In [None]:
import tiktoken

# Get the tokenizer for GPT-3.5
enc = tiktoken.get_encoding("p50k_base")

total_tokens = 0

# Calculate tokens for each review
for _, review in test_data:
    total_tokens += len(enc.encode(review))

# Add 1 token per input for the output
total_tokens += len(test_data)

# Calculate the cost
cost_per_1000_input_tokens = 0.0030
cost_per_1000_output_tokens = 0.0060
cost = (total_tokens / 1000) * cost_per_1000_input_tokens + (len(test_data) / 1000) * cost_per_1000_output_tokens

print(f"Estimated Cost to run test data through ChatGPT: ${cost:.4f}")


Estimated Cost to run test data through ChatGPT: $0.5574


In [None]:
# even with bad lag, our batched bert model has better throughput (54.6 / 34.89) (1.56x / 56% better throughput) than our fine-tuned ChatGPT model
# and accuracy wise it's 71.74% (BERT) vs 73.66% (ChatGPT)
# 65.78 million parameters vs 175 BILLION
#