In [1]:
!pip install -q transformers datasets seqeval accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.3/564.3 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m103.9 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━

In [2]:
# =========================================================
# 1️⃣ Imports
# =========================================================
import os
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

2025-10-14 20:52:41.776565: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760475162.022025      90 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760475162.091100      90 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
import transformers

In [4]:
# Enable logging for debugging
transformers.utils.logging.set_verbosity_info()

In [5]:
# =========================================================
# 2️⃣ Setup (GPU / CPU)
# =========================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [6]:
# =========================================================
# 3️⃣ Load dataset from Kaggle input path
# =========================================================
# 📦 Dataset: https://www.kaggle.com/datasets/alaakhaled/conll003-englishversion
data_path = "/kaggle/input/conll003-englishversion/"


In [7]:
train_df = pd.read_csv(os.path.join(data_path, "train.txt"), sep=" ", names=["word", "pos", "chunk", "ner"], skip_blank_lines=False)
valid_df = pd.read_csv(os.path.join(data_path, "valid.txt"), sep=" ", names=["word", "pos", "chunk", "ner"], skip_blank_lines=False)
test_df  = pd.read_csv(os.path.join(data_path, "test.txt"),  sep=" ", names=["word", "pos", "chunk", "ner"], skip_blank_lines=False)

In [8]:
print("Train shape:", train_df.shape)
print("Validation shape:", valid_df.shape)
print("Test shape:", test_df.shape)


Train shape: (219554, 4)
Validation shape: (55044, 4)
Test shape: (50350, 4)


In [10]:
# =========================================================
# 5️⃣ Convert CoNLL format → sentence + label lists
# =========================================================
def read_conll(df):
    sentences, labels = [], []
    sent, labs = [], []
    for word, tag in zip(df["word"], df["ner"]):
        if str(word) == "nan":
            if sent:
                sentences.append(sent)
                labels.append(labs)
                sent, labs = [], []
        else:
            sent.append(word)
            labs.append(tag)
    if sent:
        sentences.append(sent)
        labels.append(labs)
    return sentences, labels

train_sentences, train_labels = read_conll(train_df)
val_sentences, val_labels = read_conll(valid_df)
test_sentences, test_labels = read_conll(test_df)

print("\nExample sentence:")
print(train_sentences[1])
print(train_labels[1])


Example sentence:
['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [11]:
# Clean NaN or invalid tags
def clean_labels(labels):
    return [[t if isinstance(t, str) and t != 'nan' else 'O' for t in seq] for seq in labels]

train_labels = clean_labels(train_labels)
val_labels = clean_labels(val_labels)
test_labels = clean_labels(test_labels)

In [12]:
# =========================================================
# 6️⃣ Map NER tags to numeric IDs
# =========================================================
# Identifies all NER tags and maps them to numeric IDs
unique_tags = sorted(set(tag for seq in train_labels for tag in seq))
print("\nUnique NER Tags:", unique_tags)

tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}
num_labels = len(unique_tags)


Unique NER Tags: ['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


In [13]:
# =========================================================
# 7️⃣ Convert into Hugging Face Datasets
# =========================================================
train_dataset = Dataset.from_dict({
    "tokens": train_sentences,
    "ner_tags": [[tag2id[t] for t in seq] for seq in train_labels]
})
val_dataset = Dataset.from_dict({
    "tokens": val_sentences,
    "ner_tags": [[tag2id[t] for t in seq] for seq in val_labels]
})
test_dataset = Dataset.from_dict({
    "tokens": test_sentences,
    "ner_tags": [[tag2id[t] for t in seq] for seq in test_labels]
})

datasets = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

print("\nDataset preview:")
print(datasets)

# Check dataset sizes
print(f"Train: {len(datasets['train'])}, Validation: {len(datasets['validation'])}, Test: {len(datasets['test'])}")


Dataset preview:
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 14991
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3470
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3690
    })
})
Train: 14991, Validation: 3470, Test: 3690


In [14]:
# =========================================================
# 8️⃣ Load tokenizer and model
# =========================================================
MODEL_NAME = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.53.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}



vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e/tokenizer_config.json
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidde

In [15]:
# =========================================================
# 9️⃣ Tokenization & label alignment
# =========================================================
max_length = 128

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=max_length,
    )

    all_labels = examples["ner_tags"]
    aligned_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(labels[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14991 [00:00<?, ? examples/s]

Map:   0%|          | 0/3470 [00:00<?, ? examples/s]

Map:   0%|          | 0/3690 [00:00<?, ? examples/s]

In [16]:
# =========================================================
# 🔟 Data collator for dynamic padding
# =========================================================
data_collator = DataCollatorForTokenClassification(tokenizer)

In [17]:
# =========================================================
# 1️⃣1️⃣ Load model
# =========================================================
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)
model.to(device)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e/model.safetensors
A pretrained model of type `BertForTokenClassification` contains parameters that have been renamed internally (a few are listed below but more are present in the model):
* `cls.predictions.transform.LayerNorm.beta` -> `cls.predictions.transform.LayerNorm.bias`
* `cls.predictions.transform.LayerNorm.gamma` -> `cls.predictions.transform.LayerNorm.weight`
If you are using a model from the Hub, consider submitting a PR to adjust these weights and help future users.
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.w

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [18]:
# =========================================================
# 1️⃣2️⃣ Define metrics
# =========================================================
def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=-1)
    batch_size, seq_len = preds.shape
    preds_list, labels_list = [], []

    for i in range(batch_size):
        pred_tags = []
        true_tags = []
        for j in range(seq_len):
            if label_ids[i, j] == -100:
                continue
            pred_tags.append(id2tag[preds[i, j]])
            true_tags.append(id2tag[label_ids[i, j]])
        preds_list.append(pred_tags)
        labels_list.append(true_tags)

    return preds_list, labels_list


In [19]:
def compute_metrics(eval_pred):
    logits, label_ids = eval_pred
    preds_list, labels_list = align_predictions(logits, label_ids)
    return {
        "precision": precision_score(labels_list, preds_list),
        "recall": recall_score(labels_list, preds_list),
        "f1": f1_score(labels_list, preds_list),
        "accuracy": accuracy_score(labels_list, preds_list),
    }

In [21]:
# =========================================================
# 1️⃣3️⃣ Training configuration
# =========================================================
training_args = TrainingArguments(
    output_dir="./bert-ner-conll2003",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # Mixed precision for faster GPU training
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",  # Avoid W&B hangs
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

PyTorch: setting up devices


In [22]:
# =========================================================
# 1️⃣4️⃣ Trainer setup
# =========================================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"] if len(datasets["validation"]) > 0 else tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(
Using auto half precision backend


In [23]:
# =========================================================
# 1️⃣5️⃣ Train
# =========================================================
trainer.train()

The following columns in the Training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14,991
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Training with DataParallel so batch size has been adjusted to: 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2,811
  Number of trainable parameters = 107,726,601


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0527,0.045772,0.937699,0.943228,0.940455,0.989743
2,0.0319,0.067903,0.936075,0.947271,0.941639,0.989999
3,0.0213,0.099579,0.945309,0.952156,0.94872,0.990922


The following columns in the Evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 3470
  Batch size = 16
Saving model checkpoint to ./bert-ner-conll2003/checkpoint-937
Configuration saved in ./bert-ner-conll2003/checkpoint-937/config.json
Model weights saved in ./bert-ner-conll2003/checkpoint-937/model.safetensors
tokenizer config file saved in ./bert-ner-conll2003/checkpoint-937/tokenizer_config.json
Special tokens file saved in ./bert-ner-conll2003/checkpoint-937/special_tokens_map.json
The following columns in the Evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore t

TrainOutput(global_step=2811, training_loss=0.052363134382543766, metrics={'train_runtime': 591.5108, 'train_samples_per_second': 76.031, 'train_steps_per_second': 4.752, 'total_flos': 1101398273750682.0, 'train_loss': 0.052363134382543766, 'epoch': 3.0})

In [24]:
# 1️⃣5️⃣ Evaluate
print("Validation results:")
val_metrics = trainer.evaluate(tokenized_datasets["validation"])
print(val_metrics)

print("\nTest results:")
test_metrics = trainer.evaluate(tokenized_datasets["test"])
print(test_metrics)

The following columns in the Evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 3470
  Batch size = 16


Validation results:




The following columns in the Evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 3690
  Batch size = 16


{'eval_loss': 0.09957929700613022, 'eval_precision': 0.9453085800301053, 'eval_recall': 0.9521563342318059, 'eval_f1': 0.9487201007133864, 'eval_accuracy': 0.9909223091130586, 'eval_runtime': 14.9391, 'eval_samples_per_second': 232.277, 'eval_steps_per_second': 14.526, 'epoch': 3.0}

Test results:
{'eval_loss': 0.264486700296402, 'eval_precision': 0.9021758050478678, 'eval_recall': 0.9183203401842664, 'eval_f1': 0.9101764860830626, 'eval_accuracy': 0.9816525671260737, 'eval_runtime': 15.3934, 'eval_samples_per_second': 239.714, 'eval_steps_per_second': 15.006, 'epoch': 3.0}


In [25]:
# 1️⃣6️⃣ Detailed classification report
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
preds_list, labels_list = align_predictions(predictions, labels)
print("\nClassification report (test):")
print(classification_report(labels_list, preds_list, digits=4))

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, ner_tags. If tokens, ner_tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.

***** Running Prediction *****
  Num examples = 3690
  Batch size = 16



Classification report (test):
              precision    recall  f1-score   support

         LOC     0.9322    0.9238    0.9279      1666
        MISC     0.7843    0.8234    0.8033       702
         ORG     0.8678    0.9127    0.8897      1661
         PER     0.9627    0.9598    0.9612      1615

   micro avg     0.9022    0.9183    0.9102      5644
   macro avg     0.8867    0.9049    0.8955      5644
weighted avg     0.9036    0.9183    0.9107      5644



In [26]:
# 1️⃣7️⃣ Save model
model.save_pretrained("./bert-ner-conll2003")
tokenizer.save_pretrained("./bert-ner-conll2003")
print("✅ Model saved successfully!")

Configuration saved in ./bert-ner-conll2003/config.json
Model weights saved in ./bert-ner-conll2003/model.safetensors
tokenizer config file saved in ./bert-ner-conll2003/tokenizer_config.json
Special tokens file saved in ./bert-ner-conll2003/special_tokens_map.json


✅ Model saved successfully!


In [29]:
# 1️⃣8️⃣ Inference on a sample sentence
x = "The amazing company Google is about to layoff 1000 people"

sample_sentence = x.split()
encoding = tokenizer(sample_sentence, is_split_into_words=True, return_tensors="pt")
encoding = {k: v.to(device) for k, v in encoding.items()}

with torch.no_grad():
    outputs = model(**encoding)
preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()[0]

predicted_tags = [id2tag[p] for p in preds[1:len(sample_sentence)+1]]  # skip CLS, SEP
print("\nSample sentence prediction:")
for word, tag in zip(sample_sentence, predicted_tags):
    print(f"{word:<12} -> {tag}")


Sample sentence prediction:
The          -> O
amazing      -> O
company      -> O
Google       -> B-ORG
is           -> O
about        -> O
to           -> O
layoff       -> O
1000         -> O
people       -> O


In [31]:
!zip -r /kaggle/working/bert-ner-conll2003.zip /kaggle/working/bert-ner-conll2003

  adding: kaggle/working/bert-ner-conll2003/ (stored 0%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-2811/ (stored 0%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-2811/training_args.bin (deflated 52%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-2811/scaler.pt (deflated 60%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-2811/rng_state.pth (deflated 25%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-2811/vocab.txt (deflated 49%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-2811/scheduler.pt (deflated 56%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-2811/model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 7%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-2811/optimizer.pt (deflated 18%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-2811/config.json (deflated 56%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-2811/trainer_state.json (deflated 73%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-2811/tokenizer.json (deflated 70%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-2811/special_tokens_map.json (deflated 42%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-2811/tokenizer_config.json (deflated 75%)
  adding: kaggle/working/bert-ner-conll2003/vocab.txt (deflated 49%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-1874/ (stored 0%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-1874/training_args.bin (deflated 52%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-1874/scaler.pt (deflated 60%)
  adding: kaggle/working/bert-ner-conll2003/checkpoint-1874/rng_state.pth (deflated 25%)
  adding: kaggle/work