In [1]:
import os
import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
train_df_file = 'furniture/furniture_train_df.gzip'
train_df = pd.read_parquet(train_df_file)
train_df.update(train_df.tokens.apply(np.ndarray.tolist))
train_df.update(train_df.ner_tags.apply(np.ndarray.tolist))
train_df

Unnamed: 0,tokens,ner_tags
0,[Also available in a selection of other colors...,"[O, O, O, I-LOC, O, O, O, O, O, O, O, O, O, O,..."
1,"[ULUDAGE SOFA SET 3M+3+1, $('.swatch[data-opti...","[I-LOC, O, O, O, I-LOC, O, O, O, O, O, O, O, O..."
2,"[Dhs. 2,500.00, With an innovative blend of pl...","[O, O, O, I-LOC, O, O, O, I-LOC, O, O, O, O, I..."
3,"[0510, Love to host but short on space? The la...","[O, O, O, O, O, O, O, O, O, O, O, O, O, I-LOC,..."
4,"[0419, $('.swatch[data-option-index=""0""] .aed-...","[O, O, O, O, O, O, O, O, O, I-LOC, O, O, O, I-..."
...,...,...
2629,"[48"" Round Outdoor Teak Dining Table Handcraft...","[O, O, O, I-LOC, O, I-LOC, O, O, O, O, I-LOC, ..."
2630,[7 pc Venice Teak Deep Seating Deluxe Sofa wit...,"[I-LOC, I-LOC, I-LOC, O, I-LOC, I-LOC, O, I-LO..."
2631,"[Email Customer Service, Daybed 72""Lx63.5""Wx16...","[O, O, I-LOC, O, O, O, O, I-LOC, O, O, I-LOC, ..."
2632,"[$8,194.99, $4,289.99, Sectional Sofa 119""x32....","[O, O, O, I-LOC, O, I-LOC, O, O, I-LOC, O, I-L..."


In [14]:
test_df_file = 'furniture/furniture_test_df.gzip'
test_df = pd.read_parquet(test_df_file)
test_df.update(test_df.tokens.apply(np.ndarray.tolist))
test_df.update(test_df.ner_tags.apply(np.ndarray.tolist))
test_df

Unnamed: 0,tokens,ner_tags
0,[Welded steel rods with polished chrome finish...,"[O, O, O, O, O, O, O, I-LOC, O, O, O, O, O, O,..."
1,[Our Touch family of designs emphasizes the be...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,"[Tall / Walnut / Slate, Losanges I, $5,257.00,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[View Spot Stool Bar Height, 18 - 27.16"" diam ...","[O, O, O, O, O, O, I-LOC, O, O, O, O, O, O, O,..."
4,[Smooth tilt mimics body's natural pivot point...,"[O, O, O, O, O, O, I-LOC, O, O, O, O, O, O, O,..."
...,...,...
963,"[Jada Stool by , Sorin Bed Pristine Vintage Fa...","[I-LOC, I-LOC, I-LOC, O, I-LOC, O, O, O, O, O,..."
964,"[Marlowe Stool Fur by , $1,634.00, This produc...","[I-LOC, O, O, O, I-LOC, O, O, I-LOC, I-LOC, I-..."
965,"[$720.00, Fergie Stool Cerused Oak by , Venus ...","[O, I-LOC, O, O, O, O, I-LOC, O, O, I-LOC, O, ..."
966,"[Zanzibar Side Table and Stool Driftwood by , ...","[I-LOC, I-LOC, I-LOC, O, O, I-LOC, I-LOC, O, O..."


In [15]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [21]:
label_list = ['O', 'I-LOC']
label_encoding_dict = {'O':0, 'I-LOC':1}

task = "ner" 
model_checkpoint = "distilbert-base-uncased"
batch_size = 16
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

loading configuration file config.json from cache at C:\Users\Fus/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\1c4513b2eedbda136f57676a34eea67aba266e5c\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\Fus/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\1c4513b2eedbda136f57676a34eea67aba266e5c\vocab.txt
loading file tokenizer.json from cache at C:\Users\Fus/.cache\huggingface\hub\models--distilber

In [22]:
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [23]:
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

100%|██████████| 3/3 [00:01<00:00,  2.34ba/s]
100%|██████████| 1/1 [00:00<00:00,  2.04ba/s]


In [70]:
train_tokenized_datasets_df = pd.DataFrame(train_tokenized_datasets)
train_tokenized_datasets_df

Unnamed: 0,tokens,ner_tags,input_ids,attention_mask,labels
0,[Also available in a selection of other colors...,"[O, O, O, I-LOC, O, O, O, O, O, O, O, O, O, O,...","[101, 2036, 2800, 1999, 1037, 4989, 1997, 2060...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[ULUDAGE SOFA SET 3M+3+1, $('.swatch[data-opti...","[I-LOC, O, O, O, I-LOC, O, O, O, O, O, O, O, O...","[101, 17359, 14066, 3351, 10682, 2275, 1017, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
2,"[Dhs. 2,500.00, With an innovative blend of pl...","[O, O, O, I-LOC, O, O, O, I-LOC, O, O, O, O, I...","[101, 28144, 2015, 1012, 1016, 1010, 3156, 101...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0510, Love to host but short on space? The la...","[O, O, O, O, O, O, O, O, O, O, O, O, O, I-LOC,...","[101, 5709, 10790, 2293, 2000, 3677, 2021, 246...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0419, $('.swatch[data-option-index=""0""] .aed-...","[O, O, O, O, O, O, O, O, O, I-LOC, O, O, O, I-...","[101, 5840, 16147, 1002, 1006, 1005, 1012, 254...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
2629,"[48"" Round Outdoor Teak Dining Table Handcraft...","[O, O, O, I-LOC, O, I-LOC, O, O, O, O, I-LOC, ...","[101, 4466, 1000, 2461, 7254, 5572, 2243, 7759...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2630,[7 pc Venice Teak Deep Seating Deluxe Sofa wit...,"[I-LOC, I-LOC, I-LOC, O, I-LOC, I-LOC, O, I-LO...","[101, 1021, 7473, 7914, 5572, 2243, 2784, 1074...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2631,"[Email Customer Service, Daybed 72""Lx63.5""Wx16...","[O, O, I-LOC, O, O, O, O, I-LOC, O, O, I-LOC, ...","[101, 10373, 8013, 2326, 2154, 8270, 5824, 100...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2632,"[$8,194.99, $4,289.99, Sectional Sofa 119""x32....","[O, O, O, I-LOC, O, I-LOC, O, O, I-LOC, O, I-L...","[101, 1002, 1022, 1010, 19955, 1012, 5585, 100...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [65]:
train_tokenized_datasets_df['tokens'][0]

['Also available in a selection of other colors',
 'Dhs. 2,240.00',
 '$(\'.swatch[data-option-index="0"] .brown-tosc-06-swatch\', \'#product-form-1823748587620product-templa',
 'ALBERO TV UNIT (FULL SET)',
 '$(\'.swatch[data-option-index="0"] .60-cm-swatch\', \'#product-form-2549553660004product-template\').rem',
 'Dhs. 1,785.00',
 '0171',
 '850 grms Hard Felts (country of origin Turkey) on both Sides.',
 '0789',
 '120 X 200 CM',
 'Dhs. 18,760.00',
 'Dhs. 11,999.00',
 'MDF ending at the back.',
 'Offer your dining room a definitively traditional style with this spindle-back chair. Creating a ref',
 'WOODEN',
 'Dimensions : H 108 x W 53 x D 45 cm.']

In [66]:
train_tokenized_datasets_df['ner_tags'][0]

['O',
 'O',
 'O',
 'I-LOC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [69]:
train_tokenized_datasets_df['input_ids'][0]

[101,
 2036,
 2800,
 1999,
 1037,
 4989,
 1997,
 2060,
 6087,
 28144,
 2015,
 1012,
 1016,
 1010,
 11212,
 1012,
 4002,
 1002,
 1006,
 1005,
 1012,
 25414,
 2818,
 1031,
 2951,
 1011,
 5724,
 1011,
 5950,
 1027,
 1000,
 1014,
 1000,
 1033,
 1012,
 2829,
 1011,
 2000,
 11020,
 1011,
 5757,
 1011,
 25414,
 2818,
 1005,
 1010,
 1005,
 1001,
 4031,
 1011,
 2433,
 1011,
 12522,
 2581,
 18139,
 27814,
 2581,
 2575,
 11387,
 21572,
 8566,
 6593,
 1011,
 8915,
 8737,
 2721,
 2632,
 5677,
 2080,
 2694,
 3131,
 1006,
 2440,
 2275,
 1007,
 1002,
 1006,
 1005,
 1012,
 25414,
 2818,
 1031,
 2951,
 1011,
 5724,
 1011,
 5950,
 1027,
 1000,
 1014,
 1000,
 1033,
 1012,
 3438,
 1011,
 4642,
 1011,
 25414,
 2818,
 1005,
 1010,
 1005,
 1001,
 4031,
 1011,
 2433,
 1011,
 22234,
 2683,
 24087,
 21619,
 16086,
 8889,
 2549,
 21572,
 8566,
 6593,
 1011,
 23561,
 1005,
 1007,
 1012,
 2128,
 2213,
 28144,
 2015,
 1012,
 1015,
 1010,
 6275,
 2629,
 1012,
 4002,
 5890,
 2581,
 2487,
 15678,
 24665,
 5244,
 2524,


In [24]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

loading configuration file config.json from cache at C:\Users\Fus/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\1c4513b2eedbda136f57676a34eea67aba266e5c\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at C:\Users\Fus/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\1c4513b2eedbda136f57676a34eea67aba266e5c\pytorch_model.bin
Some weights of the model checkpoint at distilbert-base-uncased were no

In [25]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] 
                        for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] 
                   for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"]}

In [26]:
trainer = Trainer(model,
                  args,
                  train_dataset=train_tokenized_datasets,
                  eval_dataset=test_tokenized_datasets,
                  data_collator=data_collator,
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics)
trainer.train()
trainer.evaluate()
trainer.save_model('furniture-ner2.model')

The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2634
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 495
  Number of trainable parameters = 66364418
  0%|          | 0/495 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 33%|███▎      | 165/495 [31:26<57:41, 10.49s/it]  The following columns in the evaluation set don't have a corresponding argument in `DistilBertForT

{'eval_loss': 0.31776630878448486, 'eval_precision': 0.013709677419354839, 'eval_recall': 0.028317601332593003, 'eval_f1': 0.01847491396486144, 'eval_accuracy': 0.8613398225697849, 'eval_runtime': 237.64, 'eval_samples_per_second': 4.073, 'eval_steps_per_second': 0.257, 'epoch': 1.0}


 67%|██████▋   | 330/495 [1:07:05<27:56, 10.16s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 968
  Batch size = 16
                                                   
 67%|██████▋   | 330/495 [1:11:03<27:56, 10.16s/it]

{'eval_loss': 0.3534741699695587, 'eval_precision': 0.024605214836577304, 'eval_recall': 0.03720155469183787, 'eval_f1': 0.02961980548187445, 'eval_accuracy': 0.8881197358628755, 'eval_runtime': 238.194, 'eval_samples_per_second': 4.064, 'eval_steps_per_second': 0.256, 'epoch': 2.0}


100%|██████████| 495/495 [1:42:38<00:00, 11.11s/it]  The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 968
  Batch size = 16
                                                   
100%|██████████| 495/495 [1:46:36<00:00, 11.11s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 495/495 [1:46:36<00:00, 12.92s/it]
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 968
  Batch 

{'eval_loss': 0.4284403324127197, 'eval_precision': 0.02480270574971815, 'eval_recall': 0.036646307606885066, 'eval_f1': 0.029583146571044373, 'eval_accuracy': 0.8862769187152257, 'eval_runtime': 237.944, 'eval_samples_per_second': 4.068, 'eval_steps_per_second': 0.256, 'epoch': 3.0}
{'train_runtime': 6396.4995, 'train_samples_per_second': 1.235, 'train_steps_per_second': 0.077, 'train_loss': 0.04418554980345447, 'epoch': 3.0}


100%|██████████| 61/61 [03:53<00:00,  3.83s/it]
Saving model checkpoint to furniture-ner2.model
Configuration saved in furniture-ner2.model\config.json
Model weights saved in furniture-ner2.model\pytorch_model.bin
tokenizer config file saved in furniture-ner2.model\tokenizer_config.json
Special tokens file saved in furniture-ner2.model\special_tokens_map.json


## Predict on new sentences

In [27]:
tokenizer = AutoTokenizer.from_pretrained('./furniture-ner.model/')

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [51]:
paragraph = '''Before proceeding further, I should like to inform members that action on draft resolution iv, entitled situation of human rights of Rohingya Muslims and other minorities in Myanmar is postponed to a later date to allow time for the review of its programme budget implications by the fifth committee. The assembly will take action on draft resolution iv as soon as the report of the fifth committee on the programme budget implications is available. I now give the floor to delegations wishing to deliver explanations of vote or position before voting or adoption.'''
tokens_un = tokenizer(paragraph)
tokens_un

{'input_ids': [101, 2077, 18207, 2582, 1010, 1045, 2323, 2066, 2000, 12367, 2372, 2008, 2895, 2006, 4433, 5813, 4921, 1010, 4709, 3663, 1997, 2529, 2916, 1997, 20996, 12053, 3148, 7486, 1998, 2060, 14302, 1999, 12620, 2003, 14475, 2000, 1037, 2101, 3058, 2000, 3499, 2051, 2005, 1996, 3319, 1997, 2049, 4746, 5166, 13494, 2011, 1996, 3587, 2837, 1012, 1996, 3320, 2097, 2202, 2895, 2006, 4433, 5813, 4921, 2004, 2574, 2004, 1996, 3189, 1997, 1996, 3587, 2837, 2006, 1996, 4746, 5166, 13494, 2003, 2800, 1012, 1045, 2085, 2507, 1996, 2723, 2000, 10656, 2015, 10261, 2000, 8116, 17959, 1997, 3789, 2030, 2597, 2077, 6830, 2030, 9886, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [72]:
furniture_paragraph = ', '.join(pd.read_csv('furniture/results_fulltext_crawler/brooklyncityfurniture.com.csv')['0'].tolist())
furniture_paragraph

'- Candles, Candles, - Desk, - Table, Champagne, - Sideboard, Fragrance Family:, - Standing, A handblown glass vessel crafted to enhance the burning experience., TemaHome, - Innovation Living, 0, - Lounge Chair, Bench, - Side Table, Soho Concept, Rhubarb, Essential Oil Based Fragrances, Greenington, Accessories, Your cart, Coffee Table, 0 Items, Mirrors, LAMPS, Phone : 347-457-5727, DINING, Innovative, Top notes are the first impression of a fragrance., Ottoman, 100% Cotton Wick, Middle notes are the heart the of fragrance., Sectional Sofa, - Eilersen, Bottom notes are the final and lasting impression., Lounge Chair, Amber, - Bench, Four Hands, - Ottoman, BRANDS, - Varier, Cassis Blush, WORKING, - Hanging, MIDDLE:, Pin It, Hanging, Fatboy, More from this collection, Orange Blossom, Facebook, Instagram, - Greenington, Login, Table Lamps, Fragrance Life:, Weight:, Create account |, - Stool, - Sectional Sofa, - Chair, Storage, 820 Manhattan Ave, Brooklyn, NY 11222, ABOUT US, Your cart (0)

In [77]:
tokens_f = tokenizer(furniture_paragraph)

In [79]:
torch.tensor(tokens_f['input_ids']).unsqueeze(0).size()

torch.Size([1, 1740])

In [75]:
model = AutoModelForTokenClassification.from_pretrained('./furniture-ner.model/', num_labels=len(label_list))
predictions = model.forward(input_ids=torch.tensor(tokens_f['input_ids']).unsqueeze(0),
                            attention_mask=torch.tensor(tokens_f['attention_mask']).unsqueeze(0))
predictions = torch.argmax(predictions.logits.squeeze(), axis=1)
predictions = [label_list[i] for i in predictions]

loading configuration file ./furniture-ner.model/config.json
Model config DistilBertConfig {
  "_name_or_path": "./furniture-ner.model/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading weights file ./furniture-ner.model/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForTokenClassification.

All the weights of DistilBertForTokenClassification were initialized from the model checkpoint at ./furniture-ner.model/.
If your task is similar to the task the model of the checkpoint was tra

RuntimeError: The size of tensor a (1740) must match the size of tensor b (512) at non-singleton dimension 1

In [31]:
words = tokenizer.batch_decode(tokens_f['input_ids'])
pd.DataFrame({'ner': predictions, 'words': words}).to_csv('furniture_ner.csv')