In [1]:
!pip install transformers datasets torch

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [3]:
###
#
# Prepare data for upload
#
###


import json
from string import punctuation

from tqdm import tqdm
import torch
import torch.nn.functional as F
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the "imdb" dataset through the `datasets` library.
imdb = load_dataset("imdb")
# Shape of one row, as consumed by the processing loop further down this cell:
# {
#     "label": <int>,      # used as an index into `classes`
#     "text": "<string>",  # raw text that gets tokenized and embedded
# }

# Class names; looked up by the row's integer label and by the argmax of the
# model's output, so the order here must match both index conventions.
classes = ['negative', 'positive']

def mean_pooling(model_output, attention_mask):
    """
    Collapse per-token embeddings into one embedding per sequence.

    Computes a mean over dimension 1 of ``model_output`` in which every
    position is weighted by ``attention_mask``, so masked-out positions
    contribute nothing.

    Args:
        model_output: tensor of token embeddings. Presumably shaped
            (batch, seq_len, hidden) -- the call site passes the last
            hidden state; confirm if used elsewhere.
        attention_mask: tensor with one fewer trailing dimension than
            ``model_output``; non-zero marks positions to include.

    Returns:
        Tensor of weighted sums over dimension 1 divided by the summed
        weights (clamped to at least 1e-9 to avoid division by zero when a
        row is fully masked).
    """
    hidden = model_output
    # Broadcast the mask across the embedding dimension and cast to float
    # so it can be used as multiplicative weights.
    weights = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    weighted_sum = torch.sum(hidden * weights, 1)
    weight_total = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / weight_total

def get_text_metadata(text):
    """
    Compute basic text metadata for easy filtering in Dioptra

    Args:
        text: the string to describe.

    Returns:
        dict with integer counts:
            'num_char'   -- total number of characters in ``text``
            'num_punct'  -- characters found in ``string.punctuation``
            'num_digits' -- ASCII digit characters ('0'-'9')
            'num_tokens' -- whitespace-delimited tokens
    """
    # Characters must be compared against digit *characters*; comparing them
    # against the integers 0-9 never matches and always yields a count of 0.
    ascii_digits = '0123456789'
    return {
        'num_char': len(text),
        'num_punct': sum(1 for ch in text if ch in punctuation),
        'num_digits': sum(1 for ch in text if ch in ascii_digits),
        # split() with no argument splits on any run of whitespace, so the
        # count is the number of tokens rather than the number of spaces.
        'num_tokens': len(text.split())
    }

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
# Use the GPU when one is present; otherwise fall back to the CPU so the cell
# still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
# Make inference mode explicit (dropout disabled).
model.eval()

# One dict per processed row; consumed later when the records are uploaded.
datapoints = []
number_of_datapoints = 100

# NOTE(review): this takes the first `number_of_datapoints` rows in stored
# order. If the split is sorted by label the sample may contain a single
# class -- confirm, and shuffle before selecting if that matters.
for row in tqdm(imdb['test'].select(range(number_of_datapoints)), desc='Processing your data...'):

    text = row['text']
    label = classes[row['label']]

    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)

    # Inference only: skip autograd bookkeeping so no graph is kept alive for
    # every sample.
    with torch.no_grad():
        model_output = model(**inputs, output_hidden_states=True)

    # Sentence embedding = masked mean of the last hidden state, flattened to
    # a plain list of floats.
    transformer_output = mean_pooling(model_output.hidden_states[-1], inputs['attention_mask']).flatten().tolist()

    logits = model_output.logits
    confidence = logits.softmax(-1)
    # .item() turns the single-element argmax tensor into a plain int before
    # it is used as a list index.
    prediction = classes[confidence.argmax(1).item()]

    datapoint = {
        'text': text,
        'embeddings': transformer_output,
        'groundtruth': label,
        'prediction': {
            # Full list of class names; presumably index-aligned with the
            # per-class 'confidence' and 'logits' lists below.
            'class_name': classes,
            'confidence': confidence.tolist()[0],
            'logits': logits.tolist()[0]
        },
        'text_metadata': get_text_metadata(text)
    }
    datapoints.append(datapoint)

Reusing dataset imdb (/home/ec2-user/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

Processing your data...: 100%|██████████| 100/100 [00:01<00:00, 97.14it/s]


In [4]:
!pip install -y dioptra

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Found existing installation: dioptra 0.2.7
Uninstalling dioptra-0.2.7:
  Successfully uninstalled dioptra-0.2.7
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
running install
running bdist_egg
running egg_info
writing dioptra.egg-info/PKG-INFO
writing dependency_links to dioptra.egg-info/dependency_links.txt
writing requirements to dioptra.egg-info/requires.txt
writing top-level names to dioptra.egg-info/top_level.txt
reading manifest file 'dioptra.egg-info/SOURCES.txt'
adding

In [5]:
###
#
#  Send data to Dioptra
#
###

import datetime
import json
import os
import uuid

from dioptra.api import Logger
from dioptra.supported_types import SupportedTypes
from tqdm import tqdm

# Identifiers attached to every record sent below.
model_id = 'distilbert-base-uncased-finetuned-sst-2-english'
model_version = 'v1.1'
dataset_id = 'imdb'
# Generated once per execution of this cell, so all records from one run
# share the same benchmark id.
benchmark_id = str(uuid.uuid4())

# Read the key from the environment so a real credential never has to be
# saved in the notebook; the original placeholder remains the fallback.
api_key = os.environ.get('DIOPTRA_API_KEY', 'MY_API_KEY')

dioptra_logger = Logger(api_key=api_key)

for datapoint in tqdm(datapoints):
    # The records in `datapoints` are updated in place; re-running the cell
    # overwrites these keys with fresh values.
    datapoint.update({
        'request_id': str(uuid.uuid4()),  # unique per record
        # utcnow() yields a naive datetime expressed in UTC.
        'timestamp': datetime.datetime.utcnow(),
        'model_id': model_id,
        'model_version': model_version,
        'dataset_id': dataset_id,
        'benchmark_id': benchmark_id,
        'model_type': SupportedTypes.TEXT_CLASSIFIER,
    })
    dioptra_logger.commit(**datapoint)

100%|██████████| 100/100 [00:09<00:00, 10.37it/s]
