In [1]:
from transformers import BertForSequenceClassification, BertTokenizer
from pathlib import Path
from transformers.convert_graph_to_onnx import convert

# Export the 'tinybert-tkd' sequence-classification checkpoint to ONNX.
# The tokenizer is the stock 'bert-base-uncased' one.
model_name = 'tinybert-tkd'
model = BertForSequenceClassification.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Persist the model (PyTorch format) and the tokenizer files side by side in
# the same directory.
model.save_pretrained("saved_model_tkd")
tokenizer.save_pretrained("saved_model_tkd")

# Target location of the exported ONNX graph.
output_dir = Path('onnx_output')
output_path = output_dir / 'onnx_model.onnx'
output_dir.mkdir(parents=True, exist_ok=True)  # Create the directory if it doesn't exist

# convert() refuses to write into a non-empty folder, so a second run of this
# cell would abort on the export left behind by the first run. Remove only
# that stale file; any other content in the folder still triggers the
# library's own safety check.
if output_path.exists():
    output_path.unlink()

# Convert to ONNX format (opset 11, traced from the PyTorch model).
convert(model=model, tokenizer=tokenizer, output=output_path, opset=11, framework='pt')

  from .autonotebook import tqdm as notebook_tqdm
2024-02-05 11:44:27.983434: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-05 11:44:28.008840: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-05 11:44:28.008864: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-05 11:44:28.009550: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-05 11:44:28.0

ONNX opset version set to: 11
Loading pipeline (model: BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
  

In [1]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
def load_labeled_tsv(path):
    """Read a headerless, tab-separated ``label<TAB>text`` file into a ``Dataset``.

    Args:
        path: Location of the TSV file.

    Returns:
        A ``datasets.Dataset`` with a ``text`` column and a ``label`` column.
    """
    frame = pd.read_csv(path, sep='\t', header=None, names=['label', 'text'])
    return Dataset.from_dict(frame[['text', 'label']].to_dict('list'))


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The training split is only handed to the Trainer constructor below and is
# left untokenized; this cell never trains, it only evaluates.
train_dataset = load_labeled_tsv('train_150k.txt')

# Evaluation split: tokenize every text to a fixed length and expose the
# tensors the model consumes.
ev_dataset = load_labeled_tsv('test_62k.txt')
ev_dataset = ev_dataset.map(
    lambda examples: bert_tokenizer(examples['text'], truncation=True, padding='max_length'),
    batched=True,
)
ev_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

trainer1 = Trainer(
    model=BertForSequenceClassification.from_pretrained('fineBERT'),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=ev_dataset,
)
results = trainer1.evaluate()
# A bare expression in the middle of a cell is not rendered by Jupyter, so
# print the metrics explicitly instead of silently dropping them.
print(results)

from torch.utils.data import DataLoader

# Manual evaluation pass over the test split: collects the per-example
# predicted class ids in `result` and reports plain accuracy.
dataloader = DataLoader(ev_dataset, batch_size=5)
fine_tuned_model = BertForSequenceClassification.from_pretrained('fineBERT')
# Move the model to the target device once, up front, rather than on every batch.
fine_tuned_model = fine_tuned_model.to(device)
fine_tuned_model.eval()

# Perform inference and calculate accuracy
correct_predictions = 0
total_samples = 0
result = []

# Gradients are never needed here, so disable them for the whole loop.
with torch.no_grad():
    for batch in dataloader:
        # The dataset is in 'torch' format, so each batch entry is already a
        # stacked tensor; it only has to be moved to the device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = fine_tuned_model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)

        # extend() appends in place; `result = result + pred` re-copied the
        # whole list on every batch.
        result.extend(predictions.cpu().tolist())

        correct_predictions += (predictions == labels).sum().item()
        total_samples += len(labels)

test_accuracy = correct_predictions / total_samples
print(f'Test Accuracy on the New Dataset: {test_accuracy}')

  from .autonotebook import tqdm as notebook_tqdm
2024-02-05 15:49:34.997850: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-05 15:49:35.031808: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-05 15:49:35.031842: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-05 15:49:35.032691: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-05 15:49:35.0

Test Accuracy on the New Dataset: 0.8498177360560019


In [2]:
from sklearn.metrics import classification_report
# Gold labels are re-read straight from the test file and compared with the
# predictions held in `result`.
# NOTE(review): assumes `result` is in the same row order as the file — confirm.
test_labels = pd.read_csv(
    'test_62k.txt', sep='\t', header=None, names=['label', 'text']
)['label']
print(classification_report(test_labels, result))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85     30969
           1       0.85      0.85      0.85     31029

    accuracy                           0.85     61998
   macro avg       0.85      0.85      0.85     61998
weighted avg       0.85      0.85      0.85     61998



In [3]:
# Number of predictions collected in `result` (61998 in the recorded run,
# which matches the total support in the classification report).
len(result)

61998

In [6]:
# Attach the predictions to the raw test rows as a `pred` column and write
# everything out as CSV (without the index).
df = pd.read_csv(
    'test_62k.txt', sep='\t', header=None, names=['label', 'text']
).assign(pred=result)
df.to_csv('result.csv', index=False)