# Fine-tune a GLiNER model with contrastive learning (NCELoss)

In [1]:
!pip install gliner

Collecting gliner
  Downloading gliner-0.2.22-py3-none-any.whl.metadata (9.4 kB)
Collecting transformers<=4.51.0,>=4.38.2 (from gliner)
  Downloading transformers-4.51.0-py3-none-any.whl.metadata (38 kB)
Collecting onnxruntime (from gliner)
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<=4.51.0,>=4.38.2->gliner)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting coloredlogs (from onnxruntime->gliner)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->gliner)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading gliner-0.2.22-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.3/76.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers

In [2]:
!pip install transformers



In [None]:
import os
import json
import random

import torch

from gliner.training import Trainer, TrainingArguments
from model_finetuning.model import ContrastiveGLiNER
from model_finetuning.data_collator import ContrastiveDataCollator

In [None]:
# Process-wide environment switches (presumably read by the tokenizer library
# and by PyTorch's CUDA allocator — confirm against their docs).
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "true",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

In [13]:
# Read the already-processed, training-ready records from disk, then show how
# many there are and what the first one looks like.
with open("../new_data/processed_output.json", "r", encoding="utf-8") as fh:
    original_data = json.loads(fh.read())

print(len(original_data), original_data[0], sep="\n")

2811
{'tokenized_text': ['Any', 'part', 'of', 'your', 'neck', 'muscles', ',', 'bones', ',', 'joints', ',', 'tendons', ',', 'ligaments', ',', 'or', 'nerves', 'can', 'cause', 'neck', 'problems', '.', 'Neck', 'pain', 'is', 'very', 'common', '.', 'Pain', 'may', 'also', 'come', 'from', 'your', 'shoulder', ',', 'jaw', ',', 'head', ',', 'or', 'upper', 'arms', '.', 'Muscle', 'strain', 'or', 'tension', 'often', 'causes', 'neck', 'pain', '.', 'The', 'problem', 'is', 'usually', 'overuse', ',', 'such', 'as', 'from', 'sitting', 'at', 'a', 'computer', 'for', 'too', 'long', '.', 'Sometimes', 'you', 'can', 'strain', 'your', 'neck', 'muscles', 'from', 'sleeping', 'in', 'an', 'awkward', 'position', 'or', 'overdoing', 'it', 'during', 'exercise', '.', 'Falls', 'or', 'accidents', ',', 'including', 'car', 'accidents', ',', 'are', 'another', 'common', 'cause', 'of', 'neck', 'pain', '.', 'Whiplash', ',', 'a', 'soft', 'tissue', 'injury', 'to', 'the', 'neck', ',', 'is', 'also', 'called', 'neck', 'sprain', 'or',

In [18]:
# Read the PileNER records from disk, then show how many there are and what
# the first one looks like.
with open("../data/pilener_data.json", "r", encoding="utf-8") as fh:
    pilener_data = json.loads(fh.read())

print(len(pilener_data), pilener_data[0], sep="\n")

45871
{'tokenized_text': ['Q', ':', 'Position', 'character', 'based', 'on', 'enemy', 'coordinates', 'in', 'lua', 'I', 'have', 'written', 'a', 'function', 'here', 'which', 'should', 'turn', 'my', 'character', 'based', 'on', 'enemy', 'coordinates', 'but', 'it', "'", 's', 'not', 'perfect', 'because', 'it', 'does', 'not', 'always', 'turn', 'where', 'I', 'want', 'it', 'to', 'and', 'perhaps', 'there', 'is', 'a', 'better', 'way', 'of', 'writing', 'it', 'local', 'myPosition', '=', '{', 'x', '=', '350', ',', 'y', '=', '355', '}', 'local', 'enemyPosition', '=', '{', 'x', '=', '352', ',', 'y', '=', '354', '}', 'local', 'xValue', ',', 'yValue', ',', 'xDir', ',', 'yDir', ',', 'dir', 'if', 'myPosition', '.', 'x', '>', 'enemyPosition', '.', 'x', 'then', 'xValue', '=', 'myPosition', '.', 'x', '-', 'enemyPosition', '.', 'x', 'elseif', 'myPosition', '.', 'x', '<', 'enemyPosition', '.', 'x', 'then', 'xValue', '=', 'myPosition', '.', 'x', '-', 'enemyPosition', '.', 'x', 'else', 'xValue', '=', '0', 'end', 

In [None]:
# Seed the RNG before shuffling so the subset selection and the
# train/val/test split derived from these lists are the same on every
# Restart & Run All. The shuffles are in place: re-running only this cell
# permutes the already-shuffled lists again, so re-run the load cells first.
SEED = 42
random.seed(SEED)
random.shuffle(original_data)
random.shuffle(pilener_data)

In [None]:
# Upper bounds on how many examples are taken from each source (a 1 : 2 ratio
# of original to PileNER examples when both sources are large enough).
MAX_ORIGINAL_EXAMPLES = 2250
MAX_PILENER_EXAMPLES = 4500

# Slicing already clamps to the list length, so no explicit length check is
# needed for sources shorter than the cap.
limited_original = original_data[:MAX_ORIGINAL_EXAMPLES]
limited_pilener = pilener_data[:MAX_PILENER_EXAMPLES]

In [None]:
batch_size = 8

In [None]:
def _split_into_batches(samples, size):
    """Cut ``samples`` into consecutive slices of at most ``size`` items."""
    return [samples[start:start + size] for start in range(0, len(samples), size)]


original_chunks = _split_into_batches(limited_original, batch_size)
pilener_chunks = _split_into_batches(limited_pilener, batch_size)

In [None]:
len(pilener_chunks)

In [None]:
# Build one flat list in which every chunk of original data is followed by up
# to two chunks of PileNER data; PileNER chunks still unused once the original
# chunks run out are appended at the end.
combined_data = []
pilener_idx = 0

for original_chunk in original_chunks:
    combined_data.extend(original_chunk)
    # Slicing past the end simply yields fewer (or zero) chunks.
    for pilener_chunk in pilener_chunks[pilener_idx:pilener_idx + 2]:
        combined_data.extend(pilener_chunk)
    pilener_idx = min(pilener_idx + 2, len(pilener_chunks))

# Remaining PileNER chunks, if any.
for pilener_chunk in pilener_chunks[pilener_idx:]:
    combined_data.extend(pilener_chunk)

# Both cursors end up past the last chunk, as in the index-based formulation.
original_idx = len(original_chunks)
pilener_idx = len(pilener_chunks)

In [None]:
combined_data[-17]

In [None]:
processed_output = combined_data

In [None]:
# Sequential 80 / 10 / 10 split; the two boundaries are computed once.
total_examples = len(processed_output)
train_end = int(total_examples * 0.8)
val_end = int(total_examples * 0.9)

train_dataset = processed_output[:train_end]
val_dataset = processed_output[train_end:val_end]
test_dataset = processed_output[val_end:]

In [6]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
gliner_model = ContrastiveGLiNER.from_pretrained("urchade/gliner_small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

README.md: 0.00B [00:00, ?B/s]

gliner_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/611M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [8]:
gliner_model.config

GLiNERConfig {
  "class_token_index": 128002,
  "decoder_mode": null,
  "dropout": 0.4,
  "embed_ent_token": true,
  "encoder_config": {
    "_attn_implementation_autoset": true,
    "_name_or_path": "microsoft/deberta-v3-small",
    "add_cross_attention": false,
    "architectures": null,
    "attention_probs_dropout_prob": 0.1,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_range": 0.02,

In [9]:
gliner_model.to(device)

ContrastiveGLiNER(
  (model): SpanModel(
    (token_rep_layer): Encoder(
      (bert_layer): Transformer(
        (model): DebertaV2Model(
          (embeddings): DebertaV2Embeddings(
            (word_embeddings): Embedding(128004, 768, padding_idx=0)
            (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (encoder): DebertaV2Encoder(
            (layer): ModuleList(
              (0-5): 6 x DebertaV2Layer(
                (attention): DebertaV2Attention(
                  (self): DisentangledSelfAttention(
                    (query_proj): Linear(in_features=768, out_features=768, bias=True)
                    (key_proj): Linear(in_features=768, out_features=768, bias=True)
                    (value_proj): Linear(in_features=768, out_features=768, bias=True)
                    (pos_dropout): Dropout(p=0.1, inplace=False)
                    (dropout): Dropout(p=0.1, inplace=False)
   

In [10]:
data_collator = ContrastiveDataCollator(gliner_model.config, data_processor=gliner_model.data_processor, prepare_labels=True)

In [None]:
# Derive the epoch count from a target number of training steps.
num_steps = 17000
data_size = len(train_dataset)  # 5400 when both sources reach their caps

# Ceiling division: a trailing partial batch still counts as a step (assumes
# the Trainer keeps the last incomplete batch, its default — confirm if
# dataloader_drop_last is ever enabled). max(1, ...) keeps the division below
# safe when the dataset is empty.
num_batches = max(1, -(-data_size // batch_size))
num_epochs = max(1, num_steps // num_batches)

In [12]:
num_epochs

15

In [None]:
# Configuration for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # --- checkpoints ---
    output_dir="contrastive_gliner_model",
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    # --- evaluation and model selection ---
    eval_strategy="steps",
    eval_steps=500,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # NOTE(review): with this set to False the model left in memory after
    # training is presumably the last-step one, not the best by eval_loss —
    # confirm that this is what should be saved afterwards.
    load_best_model_at_end=False,
    # --- optimisation ---
    learning_rate=5e-5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    num_train_epochs=num_epochs,
    # --- batching and hardware ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=0,
    use_cpu=False,
    # --- logging ---
    report_to="none",
)

In [14]:
import torch
from transformers import TrainerCallback

class ClearCacheCallback(TrainerCallback):
    """Trainer callback that calls ``torch.cuda.empty_cache()`` during training.

    Args:
        every_n_steps: Clear the cache once every this many training steps.
            The default of 1 clears after every step; a larger value lets the
            call run less often.

    Raises:
        ValueError: If ``every_n_steps`` is smaller than 1.
    """

    def __init__(self, every_n_steps=1):
        super().__init__()
        if every_n_steps < 1:
            raise ValueError("every_n_steps must be >= 1")
        self.every_n_steps = every_n_steps

    def on_step_end(self, args, state, control, **kwargs):
        """Clear the CUDA cache when the current step is a multiple of ``every_n_steps``."""
        if state.global_step % self.every_n_steps == 0:
            torch.cuda.empty_cache()

In [None]:
# Wire the model, data and callback into the trainer.
# `processing_class` replaces the `tokenizer` keyword, which recent
# transformers releases deprecate (it emits a warning at construction time).
# NOTE(review): needs a transformers version that already accepts
# `processing_class` — the 4.51.0 release installed above does.
trainer = Trainer(
    model=gliner_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=gliner_model.data_processor.transformer_tokenizer,
    data_collator=data_collator,
    callbacks=[ClearCacheCallback()],
)

  trainer = Trainer(


In [16]:
print("Allocated memory:", torch.cuda.memory_allocated() / 1024**3, "GB")

Allocated memory: 0.56866455078125 GB


In [17]:
torch.cuda.empty_cache()

In [18]:
trainer.train()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Step,Training Loss,Validation Loss
338,No log,189575.046875
676,256861.344000,206491.171875
1014,132599.920000,198401.65625
1352,132599.920000,208670.4375
1690,131958.240000,204255.8125
2028,132877.784000,201814.21875
2366,132877.784000,194990.484375
2704,134459.600000,195829.390625
3042,133712.544000,194346.609375
3380,133712.544000,211335.734375


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


TrainOutput(global_step=5055, training_loss=145305.34272997032, metrics={'train_runtime': 7062.513, 'train_samples_per_second': 5.724, 'train_steps_per_second': 0.716, 'total_flos': 0.0, 'train_loss': 145305.34272997032, 'epoch': 15.0})

In [None]:
# Save the model currently held by the trainer into a local directory.
# NOTE(review): since load_best_model_at_end=False, this is presumably the
# final-step model rather than the best checkpoint — confirm this is intended.
trainer.save_model("finetuned_gliner_model_NCELoss")

In [20]:
# Show which checkpoint the trainer state recorded as best.
# NOTE(review): this is informational only — nothing visible here reloads
# that checkpoint before the model is saved or used for inference.
best_ckpt = trainer.state.best_model_checkpoint
print(best_ckpt)

contrastive_gliner_model/checkpoint-338


In [None]:
# Zip the saved model directory and hand the archive to Colab's download helper.
# NOTE(review): `google.colab` is presumably only importable inside Colab, so
# this cell is expected to fail in other Jupyter environments.
!zip -r finetuned_gliner_model_NCELoss.zip finetuned_gliner_model_NCELoss
from google.colab import files
files.download("finetuned_gliner_model_NCELoss.zip")


  adding: finetuned_gliner_model_NCELoss3/ (stored 0%)
  adding: finetuned_gliner_model_NCELoss3/added_tokens.json (deflated 36%)
  adding: finetuned_gliner_model_NCELoss3/pytorch_model.bin (deflated 9%)
  adding: finetuned_gliner_model_NCELoss3/special_tokens_map.json (deflated 50%)
  adding: finetuned_gliner_model_NCELoss3/spm.model (deflated 50%)
  adding: finetuned_gliner_model_NCELoss3/gliner_config.json (deflated 65%)
  adding: finetuned_gliner_model_NCELoss3/tokenizer.json (deflated 77%)
  adding: finetuned_gliner_model_NCELoss3/tokenizer_config.json (deflated 79%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
trained_model = ContrastiveGLiNER.from_pretrained("finetuned_gliner_model_NCELoss", load_tokenizer=True)

config.json not found in /content/finetuned_gliner_model_NCELoss3


In [17]:
# Read the annotated texts (raw text plus entity annotations) and peek at the
# type list of the very first entity.
with open("../new_data/json_outputs.json", "r", encoding="utf-8") as fh:
    json_outputs = json.loads(fh.read())

first_entity = json_outputs[0]['entities'][0]
print(len(json_outputs), first_entity['types'], sep="\n")

2811
['body part']


In [30]:
json_outputs[0]

{'text': 'Any part of your neck  muscles, bones, joints, tendons, ligaments, or nerves  can cause neck problems. Neck pain is very common. Pain may also come from your shoulder, jaw, head, or upper arms. Muscle strain or tension often causes neck pain. The problem is usually overuse, such as from sitting at a computer for too long. Sometimes you can strain your neck muscles from sleeping in an awkward position or overdoing it during exercise. Falls or accidents, including car accidents, are another common cause of neck pain. Whiplash, a soft tissue injury to the neck, is also called neck sprain or strain. Treatment depends on the cause, but may include applying ice, taking pain relievers, getting physical therapy or wearing a cervical collar. You rarely need surgery.',
 'entities': [{'entity': 'neck', 'types': ['body part']},
  {'entity': 'muscles', 'types': ['body part']},
  {'entity': 'bones', 'types': ['body part']},
  {'entity': 'joints', 'types': ['body part']},
  {'entity': 'tend

In [31]:
# How many of the leading texts to include in the qualitative spot check.
NUM_PREVIEW_TEXTS = 15

# For each previewed text, collect the distinct entity types that appear in
# its annotations; these become the candidate labels for prediction.
entity_types_per_text = [
    {
        'text': output['text'],
        'entity_types': {
            entity_type
            for entity in output['entities']
            for entity_type in entity['types']
        },
    }
    for output in json_outputs[:NUM_PREVIEW_TEXTS]
]

entity_types_per_text

[{'text': 'Any part of your neck  muscles, bones, joints, tendons, ligaments, or nerves  can cause neck problems. Neck pain is very common. Pain may also come from your shoulder, jaw, head, or upper arms. Muscle strain or tension often causes neck pain. The problem is usually overuse, such as from sitting at a computer for too long. Sometimes you can strain your neck muscles from sleeping in an awkward position or overdoing it during exercise. Falls or accidents, including car accidents, are another common cause of neck pain. Whiplash, a soft tissue injury to the neck, is also called neck sprain or strain. Treatment depends on the cause, but may include applying ice, taking pain relievers, getting physical therapy or wearing a cervical collar. You rarely need surgery.',
  'entity_types': {'body part', 'condition', 'event', 'treatment'}},
 {'text': "Heel problems are common and can be painful. Often, they result from too much stress on your heel bone and the tissues that surround it. Th

In [36]:
# Run the fine-tuned model on every previewed text, offering that text's own
# annotated types as candidate labels, and print each predicted span.
for sample in entity_types_per_text:
    candidate_labels = list(sample['entity_types'])
    entities = trained_model.predict_entities(
        sample['text'],
        candidate_labels,
        threshold=0.5,
    )
    for hit in entities:
        print(f'{hit["text"]} => {hit["label"]}')
    print("____________________________")

Any part of your neck  muscles, => body part
bones, joints, tendons, ligaments, => body part
or => body part
nerves  can => body part
cause neck problems. Neck pain is => body part
very => body part
common. Pain may also come from your shoulder, => body part
jaw, => body part
head, => body part
or => body part
upper arms. Muscle strain or => body part
tension often causes neck pain. The problem is usually overuse, => body part
such => body part
as => body part
from => body part
sitting at a computer for => body part
too => body part
long. Sometimes you can strain your neck muscles from sleeping in => body part
an => body part
awkward => body part
position or overdoing it during exercise. Falls or accidents, => body part
including => body part
car accidents, => body part
are => body part
another => body part
common cause of neck pain. Whiplash, => body part
a => body part
soft tissue injury to => body part
the neck, => body part
is => body part
also => body part
called => body part
neck

In [37]:
entities

[{'start': 0,
  'end': 48,
  'text': 'Summary : Most men need to pay more attention to',
  'label': 'health condition',
  'score': 0.5000225901603699},
 {'start': 49,
  'end': 54,
  'text': 'their',
  'label': 'health condition',
  'score': 0.5000209808349609},
 {'start': 55,
  'end': 62,
  'text': 'health.',
  'label': 'health condition',
  'score': 0.5000211596488953},
 {'start': 63,
  'end': 81,
  'text': 'Compared to women,',
  'label': 'health condition',
  'score': 0.5000211000442505},
 {'start': 82,
  'end': 94,
  'text': 'men are more',
  'label': 'health condition',
  'score': 0.5000211000442505},
 {'start': 95,
  'end': 104,
  'text': 'likely to',
  'label': 'health condition',
  'score': 0.5000211000442505},
 {'start': 106,
  'end': 115,
  'text': 'Smoke and',
  'label': 'health condition',
  'score': 0.5000211000442505},
 {'start': 116,
  'end': 140,
  'text': 'drink  Make unhealthy or',
  'label': 'health condition',
  'score': 0.5000211596488953},
 {'start': 141,
  'end':