In [1]:
# bert imports

import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

In [2]:
def data_processing(first_batch: int, last_batch: int, data_dir: str = "training_data/each_data_set"):
    """Load a range of tagged text files into a ``Dataset`` and an Arrow table.

    Files named ``data_set_<i>.txt`` are read for every ``i`` in
    ``range(first_batch, last_batch)``. Each non-blank line is expected to hold
    a word followed by its tag, separated by whitespace; any further fields on
    the line are ignored. Every file becomes one row (one example).

    Args:
        first_batch: Index of the first file to read (inclusive).
        last_batch: Index one past the last file to read (exclusive).
        data_dir: Directory containing the ``data_set_<i>.txt`` files.

    Returns:
        A ``(dataset, table)`` tuple built from the same DataFrame with the
        columns ``tokens`` and ``ner_labels``: a ``datasets.Dataset`` and a
        ``pyarrow.Table``.

    Raises:
        ValueError: If a non-blank line does not contain both a word and a tag.
        OSError: If one of the files cannot be opened.
    """
    tokens = []
    entities = []

    for i in range(first_batch, last_batch):
        words, tags = _read_tagged_file(f"{data_dir}/data_set_{i}.txt")
        tokens.append(words)
        entities.append(tags)

    df = pd.DataFrame({"tokens": tokens, "ner_labels": entities})
    table = pa.Table.from_pandas(df)
    dataset = Dataset.from_pandas(df)

    return dataset, table


def _read_tagged_file(path: str):
    """Parse one ``<word> <tag>``-per-line file and return ``(words, tags)``."""
    words = []
    tags = []

    # The context manager closes the handle even when a line fails to parse.
    with open(path, "r") as handle:
        for line_number, line in enumerate(handle, start=1):
            # split() without an argument tolerates repeated spaces, tabs and
            # trailing whitespace, which split(" ") turned into empty fields.
            fields = line.split()
            if not fields:
                continue  # blank or whitespace-only line
            if len(fields) < 2:
                raise ValueError(
                    f"{path}:{line_number}: expected '<word> <tag>', got {line!r}"
                )
            words.append(fields[0])
            tags.append(fields[1])

    return words, tags

In [11]:
# Files 0..99 form the training split; files 100..125 form the testing split.
training_dataset, training_parquet = data_processing(first_batch=0, last_batch=100)
testing_dataset, testing_parquet = data_processing(first_batch=100, last_batch=126)

# Persist the Arrow tables of both splits as Parquet files.
pq.write_table(
    training_parquet,
    where="./training_data/training_dataset.parquet",
)
pq.write_table(
    testing_parquet,
    where="./training_data/testing_dataset.parquet",
)

In [4]:
# Checkpoint that is fine-tuned, and the batch size reused for the
# per-device train/eval batch sizes further down.
model_checkpoint = 'distilbert-base-uncased'
batch_size = 16

# Tag vocabulary: a tag's position in the list is its integer class id.
label_list = ['O', 'B-PRODUCT', 'I-PRODUCT']
label_dict = {tag: class_id for class_id, tag in enumerate(label_list)}

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split words and align the word-level tags to sub-tokens.

    Intended for ``Dataset.map(..., batched=True)``: ``examples["tokens"]`` is
    a batch of word lists and ``examples["ner_labels"]`` the matching tag lists.

    Args:
        examples: Batch with the columns ``tokens`` and ``ner_labels``.
        label_all_tokens: When True (the default, matching the previous
            hard-coded behaviour) every sub-token of a word receives the word's
            label id. When False only the first sub-token is labelled and the
            remaining ones get -100.

    Returns:
        The tokenizer output with an added ``labels`` entry holding one list of
        label ids per example. Positions without a source word (special tokens)
        are set to -100.

    Raises:
        KeyError: If a tag is neither ``'0'`` nor a key of ``label_dict``.
    """
    # No padding or tensor conversion here: padding inside a batched map pads
    # every example to the longest sequence of the whole map batch and stores
    # that padding in the dataset. DataCollatorForTokenClassification pads the
    # inputs and the labels per training batch instead.
    tokenized_inputs = tokenizer(
        list(examples["tokens"]),
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    for i, word_tags in enumerate(examples["ner_labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        prev_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                # Token that does not map back to any input word.
                label_ids.append(-100)
            elif word_idx != prev_word_idx or label_all_tokens:
                label_ids.append(_tag_to_id(word_tags[word_idx]))
            else:
                # Continuation sub-token of an already labelled word.
                label_ids.append(-100)

            prev_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


def _tag_to_id(tag):
    """Map a tag string to its class id; the digit '0' is treated like 'O'."""
    if tag == '0':
        # The original code special-cased the digit zero and mapped it to
        # class 0; it is kept as an alias of the letter-O tag.
        return label_dict['O']
    return label_dict[tag]
        

In [12]:
# Tokenize both splits in batched mode so the word-level tags are aligned
# to sub-tokens for whole batches of examples at a time.
tokenized_training_dataset = training_dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)
tokenized_testing_dataset = testing_dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

In [7]:
def compute_metrics(p):
    """Turn raw evaluation output into overall precision/recall/F1/accuracy.

    Args:
        p: A pair ``(scores, labels)``. ``scores`` is reduced with an argmax
            over axis 2 to obtain one predicted class id per position;
            ``labels`` holds the reference class ids, with -100 marking
            positions that are excluded from scoring.

    Returns:
        A dict with the keys ``precision``, ``recall``, ``f1`` and
        ``accuracy``, taken from the ``overall_*`` entries computed by the
        module-level ``metric``.
    """
    scores, labels = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        # Keep only the positions whose reference label is not the -100 marker.
        kept_pairs = [
            (predicted_id, label_id)
            for predicted_id, label_id in zip(predicted_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept_pairs])
        true_labels.append([label_list[label_id] for _, label_id in kept_pairs])

    result = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": result["overall_precision"],
        "recall": result["overall_recall"],
        "f1": result["overall_f1"],
        "accuracy": result["overall_accuracy"],
    }

In [8]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Token-classification head with one output class per entry of label_list.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
)
model.to(device)

# Collator handed to the Trainer to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# NOTE(review): the captured output shows a warning about this call
# (trust_remote_code becoming mandatory) - confirm it still works with the
# installed `datasets` version.
metric = load_metric("seqeval")

training_args = TrainingArguments(
    output_dir="ner-model",
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=100,
    weight_decay=1e-6,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [9]:
# Wire the model, arguments, tokenized splits, collator and metric function
# into a single Trainer instance.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_training_dataset,
    eval_dataset=tokenized_testing_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, run one more evaluation pass, then save the model to disk.
trainer.train()
trainer.evaluate()
trainer.save_model('product-recognition.model')



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.687273,0.211538,0.013213,0.024873,0.707299
2,No log,0.604657,0.17934,0.075075,0.105843,0.726385
3,No log,0.619386,0.221526,0.127327,0.161709,0.733001
4,No log,0.698161,0.225071,0.142342,0.174393,0.729964
5,No log,0.727608,0.249107,0.167568,0.200359,0.743846
6,No log,0.684249,0.261406,0.254655,0.257986,0.731591
7,No log,0.641978,0.28693,0.222823,0.250845,0.758161
8,No log,0.978319,0.313531,0.057057,0.096545,0.77497
9,No log,0.771764,0.290909,0.096096,0.14447,0.770632
10,No log,0.757177,0.31769,0.264264,0.288525,0.763475
