<a href="https://colab.research.google.com/github/decision-labs/geobase-ai.js/blob/automate-task-selection/task-classifier/explore-collab-notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Hub checkpoint shared by the classifier and its tokenizer.
model_name = "nreimers/BERT-Tiny_L-2_H-128_A-2"

# Sequence classifier with a four-way classification head (num_labels=4).
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
)

# Tokenizer that matches the checkpoint above.
tokenizer = BertTokenizer.from_pretrained(model_name)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nreimers/BERT-Tiny_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from datasets import load_dataset

# Fetch the task-classification dataset by its repository id.
dataset = load_dataset("geobase/geo-task-classifier")


def tokenize_function(examples):
    """Tokenize the "query" field of a batch, padding to max length and truncating."""
    queries = examples["query"]
    return tokenizer(queries, truncation=True, padding="max_length")


# Tokenize every split in batches, then drop the raw text column, which is no
# longer needed once the token ids exist.
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(["query"])

# Return torch tensors when the datasets are indexed.
tokenized_datasets.set_format("torch")


In [None]:
from torch.utils.data import DataLoader

# Batches of 16 for both splits; only the training loader shuffles.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=tokenized_datasets["test"],
    batch_size=16,
)


In [None]:
# Show a summary of the tokenized training split (feature names and row count).
print(tokenized_datasets["train"])

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 81
})


In [None]:
from torch.optim import AdamW

# Define optimizer.
# The AdamW class shipped with transformers is deprecated in favour of the PyTorch
# implementation. weight_decay=0.0 keeps the default the transformers version used
# (torch.optim.AdamW would otherwise apply 0.01).
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)




In [None]:
import torch
from transformers import get_scheduler

# Train on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Number of full passes over the training set.
epochs = 50

# The linear schedule decays the learning rate to zero after exactly
# `num_training_steps` optimizer steps, so it must cover every epoch the loop below
# runs. Sizing it for fewer epochs leaves all remaining epochs training with lr == 0.
num_training_steps = len(train_dataloader) * epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training Loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")


Epoch 1 - Average Loss: 1.0820
Epoch 2 - Average Loss: 1.0996
Epoch 3 - Average Loss: 1.0634
Epoch 4 - Average Loss: 1.0841
Epoch 5 - Average Loss: 1.0655
Epoch 6 - Average Loss: 1.0416
Epoch 7 - Average Loss: 1.0949
Epoch 8 - Average Loss: 1.0884
Epoch 9 - Average Loss: 1.0815
Epoch 10 - Average Loss: 1.0882
Epoch 11 - Average Loss: 1.1048
Epoch 12 - Average Loss: 1.0900
Epoch 13 - Average Loss: 1.1229
Epoch 14 - Average Loss: 1.0679
Epoch 15 - Average Loss: 1.0813
Epoch 16 - Average Loss: 1.1099
Epoch 17 - Average Loss: 1.0915
Epoch 18 - Average Loss: 1.0805
Epoch 19 - Average Loss: 1.0834
Epoch 20 - Average Loss: 1.0857
Epoch 21 - Average Loss: 1.1165
Epoch 22 - Average Loss: 1.1017
Epoch 23 - Average Loss: 1.0916
Epoch 24 - Average Loss: 1.0981
Epoch 25 - Average Loss: 1.0676
Epoch 26 - Average Loss: 1.0628
Epoch 27 - Average Loss: 1.0558
Epoch 28 - Average Loss: 1.0942
Epoch 29 - Average Loss: 1.0843
Epoch 30 - Average Loss: 1.1024
Epoch 31 - Average Loss: 1.0833
Epoch 32 - Averag

In [None]:
# Save the model trained by the manual loop into the local "bert_tiny_finetuned" directory.
model.save_pretrained("bert_tiny_finetuned")

# Save the tokenizer into the same directory so both can be reloaded together.
# As the cell's last expression, the returned tuple of written file paths is displayed.
tokenizer.save_pretrained("bert_tiny_finetuned")


('bert_tiny_finetuned/tokenizer_config.json',
 'bert_tiny_finetuned/special_tokens_map.json',
 'bert_tiny_finetuned/vocab.txt',
 'bert_tiny_finetuned/added_tokens.json')

In [None]:


from transformers import Trainer, TrainingArguments

# Relies on `model`, `tokenizer` and `tokenized_datasets` created by earlier cells.

# Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_tiny_trained",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=50,
    logging_dir="./logs",
    save_total_limit=2,
    load_best_model_at_end=True,
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # Evaluate on the validation split during training; the test split is only
    # used by the separate evaluation cell.
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
)

# Train the model
train_output = trainer.train()

# Per-epoch checkpoints are written to output_dir/checkpoint-*; also write the final
# model (and tokenizer) into output_dir itself so that
# from_pretrained("bert_tiny_trained") works in the cells that follow.
trainer.save_model()

# Display the training summary as the cell output.
train_output


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,1.328542
2,No log,1.314359
3,No log,1.297669
4,No log,1.268253
5,No log,1.247607
6,No log,1.225565
7,No log,1.196141
8,No log,1.185258
9,No log,1.162513
10,No log,1.128378


Epoch,Training Loss,Validation Loss
1,No log,1.328542
2,No log,1.314359
3,No log,1.297669
4,No log,1.268253
5,No log,1.247607
6,No log,1.225565
7,No log,1.196141
8,No log,1.185258
9,No log,1.162513
10,No log,1.128378


TrainOutput(global_step=550, training_loss=0.7252086015181108, metrics={'train_runtime': 428.1857, 'train_samples_per_second': 9.459, 'train_steps_per_second': 1.284, 'total_flos': 5148682444800.0, 'train_loss': 0.7252086015181108, 'epoch': 50.0})

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
# Evaluate the Trainer-produced model on the held-out test split and print metrics.
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader
import numpy as np
from evaluate import load  # metric loader from the `evaluate` library
from sklearn.metrics import classification_report

# Load the trained model and tokenizer from the Trainer's output directory
model = BertForSequenceClassification.from_pretrained("bert_tiny_trained")
tokenizer = BertTokenizer.from_pretrained("bert_tiny_trained")

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# Prediction loop over `test_dataloader` (defined in an earlier cell)
model.eval()
predictions = []
labels = []

for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predicted_class = np.argmax(logits.cpu().numpy(), axis=1)  # Get the predicted class
    predictions.extend(predicted_class)
    labels.extend(batch["labels"].cpu().numpy())

# Weighted F1 across all classes, computed with the `evaluate` library
metric = load("f1")
metric.add_batch(predictions=predictions, references=labels)
metrics = metric.compute(average="weighted")

# Print the metrics
print(metrics)


# Per-class precision / recall / F1 via scikit-learn.
# zero_division=0 reports 0.0 for classes that were never predicted without
# raising an undefined-metric warning for each of them.
print(classification_report(labels, predictions, zero_division=0))

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

{'f1': 0.7425641025641025}
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         4
           1       1.00      0.67      0.80         3
           2       0.00      0.00      0.00         2
           3       0.86      1.00      0.92         6

    accuracy                           0.80        15
   macro avg       0.63      0.67      0.63        15
weighted avg       0.72      0.80      0.74        15



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Write the trainer's model to its output directory ("./bert_tiny_trained" in the TrainingArguments above).
trainer.save_model()

In [None]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): presumably needs an authenticated Hub session - no login cell is visible in this notebook.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mhassanch/bert_tiny_trained/commit/c3b2084fd44ac65bb2f435c33143f2c5b4a71949', commit_message='End of training', commit_description='', oid='c3b2084fd44ac65bb2f435c33143f2c5b4a71949', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mhassanch/bert_tiny_trained', endpoint='https://huggingface.co', repo_type='model', repo_id='mhassanch/bert_tiny_trained'), pr_revision=None, pr_num=None)

In [None]:
!pip install optimum

Collecting optimum
  Downloading optimum-1.24.0-py3-none-any.whl.metadata (21 kB)
Downloading optimum-1.24.0-py3-none-any.whl (433 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.6/433.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optimum
Successfully installed optimum-1.24.0


In [None]:
!pip install onnxruntime

Collecting onnxruntime
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pack

In [None]:
# Export the `mhassanch/bert_tiny_trained` checkpoint to ONNX, writing the result into ./bert_tiny_trained_onnx/.
# NOTE(review): the model id looks like the Hub repo pushed above, not the local `bert_tiny_trained` directory - confirm that is intended.
!optimum-cli export onnx --model mhassanch/bert_tiny_trained bert_tiny_trained_onnx/

2025-03-05 12:06:12.380532: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741176372.405574   23097 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741176372.412941   23097 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

# Paths are relative to the working directory, matching the relative target
# directory the export step wrote to, so the cell is not tied to /content.
model_fp32 = 'bert_tiny_trained_onnx/model.onnx'
model_quant = 'bert_tiny_trained_onnx/model.quant.onnx'

# Dynamic quantization of the weights to 8-bit integers. The quantized model is
# written to `model_quant`; the call has no useful return value, so nothing is bound.
quantize_dynamic(model_fp32, model_quant, weight_type=QuantType.QInt8)



In [None]:
# Smoke-test the quantized ONNX model with a dummy input.

import onnxruntime as ort
import numpy as np

# Load the quantized ONNX model (path relative to the working directory)
sess = ort.InferenceSession("bert_tiny_trained_onnx/model.quant.onnx")

# Names of every input the exported graph requires
input_names = [model_input.name for model_input in sess.get_inputs()]

# Dummy tensors for a single sequence of 128 tokens (replace with real tokenizer
# output for a meaningful prediction). `token_type_ids` is included because the
# exported BERT graph lists it as a required input.
seq_len = 128
dummy_tensors = {
    'input_ids': np.random.randint(0, 100, size=(1, seq_len), dtype=np.int64),
    'attention_mask': np.ones((1, seq_len), dtype=np.int64),
    'token_type_ids': np.zeros((1, seq_len), dtype=np.int64),
}

# Feed exactly the inputs the session asks for, so the feed always matches the graph.
example_input = {name: dummy_tensors[name] for name in input_names}

# Run inference
outputs = sess.run(None, example_input)

# The first output holds the logits; take the highest-scoring class per row.
predicted_class = np.argmax(outputs[0], axis=1)

print(f"Predicted Class: {predicted_class}")

# Further evaluation can be performed by comparing the predictions
# with the ground truth labels of a test dataset.


ValueError: Required inputs (['token_type_ids']) are missing from input feed (['input_ids', 'attention_mask']).

In [None]:
!pip install onnx onnxruntime

In [None]:
# Free-text queries used to spot-check the classifier's predictions in the next cell.
test_queries = [
    'Detect cars in this area',
    'Mask the green field in this area',
    'Shade the rooftop in this residental areas',
    'Highlight the roadways in the map',
    'Identify the airplanes on the airstrip'
    ]

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Class index -> "<task>:<model id>" string reported for each prediction.
label_mapping = {
    0: "object-detection:geobase/WALDO30_yolov8m_640x640",
    1: "zero-shot-object-detection:onnx-community/grounding-dino-tiny-ONNX",
    2: "zero-shot-object-detection:Xenova/owlvit-base-patch32",
    3: "mask-generation:Xenova/slimsam-77-uniform"
}

# Load model and tokenizer from the same local directory so the two always match.
model_dir = "bert_tiny_trained"  # Replace with your saved model path

# Load the model with explicit label mapping
model = AutoModelForSequenceClassification.from_pretrained(
    model_dir,
    num_labels=4,
    id2label=label_mapping,
    label2id={v: k for k, v in label_mapping.items()}
)
model.eval()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir)

for query in test_queries:
    inputs = tokenizer(query, padding=True, truncation=True, return_tensors="pt")

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output = model(**inputs)

    # The highest logit is also the highest softmax probability, so argmax on the
    # logits gives the predicted class directly.
    predicted_class_index = torch.argmax(output.logits, dim=1).item()
    predicted_class_label = model.config.id2label[predicted_class_index]  # Get label from index

    print(f"Query: {query}")
    print(f"Prediction: {predicted_class_label}")
    print("-" * 20)

Query: Detect cars in this area
Prediction: object-detection:geobase/WALDO30_yolov8m_640x640
--------------------
Query: Mask the green field in this area
Prediction: mask-generation:Xenova/slimsam-77-uniform
--------------------
Query: Shade the rooftop in this residental areas
Prediction: mask-generation:Xenova/slimsam-77-uniform
--------------------
Query: Highlight the roadways in the map
Prediction: mask-generation:Xenova/slimsam-77-uniform
--------------------
Query: Identify the airplanes on the airstrip
Prediction: object-detection:geobase/WALDO30_yolov8m_640x640
--------------------
