In [1]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
import pandas as pd
import json
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
import numpy as np
from sklearn.metrics import top_k_accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def jsonConvert(json_list, key_field):
    """Index a list of JSON objects by the value stored under ``key_field``.

    Args:
        json_list: Iterable of dict-like objects.
        key_field: Name of the field whose value becomes the key in the result.

    Returns:
        A dict mapping each object's ``key_field`` value to a shallow copy of
        that object without the ``key_field`` entry. When several objects
        share the same key value, the last one wins.

    Raises:
        KeyError: If an object has no ``key_field`` entry, or stores ``None``
            under it.
    """
    indexed = {}
    for entry in json_list:
        entry_key = entry.get(key_field)
        # Guard clause: a missing field and an explicit None are both rejected.
        if entry_key is None:
            raise KeyError(f"Key '{key_field}' not found in JSON object: {entry}")
        # Build a new dict so the caller's objects are never mutated.
        indexed[entry_key] = {
            name: value for name, value in entry.items() if name != key_field
        }
    return indexed

In [3]:
# Load the validation descriptions. Per the rendered frame, each row carries a
# `filename`, a free-text `description`, and a `properties` list of per-class dicts.
test_data = pd.read_json('../datasets/DOTAv1.5/descriptions/val.json')
test_data

Unnamed: 0,filename,description,properties
0,P2215.txt,A remote sensing image containing 25 planes wi...,"[{'class': 'plane', 'count': 25, 'avg_dist': 2..."
1,P1610.txt,A remote sensing image containing 31 planes wi...,"[{'class': 'plane', 'count': 31, 'avg_dist': 2..."
2,P0787.txt,A remote sensing image containing 55 large veh...,"[{'class': 'large-vehicle', 'count': 55, 'avg_..."
3,P1213.txt,A remote sensing image containing 2 ships with...,"[{'class': 'ship', 'count': 2, 'avg_dist': 231..."
4,P0953.txt,A remote sensing image containing 11 harbors w...,"[{'class': 'harbor', 'count': 11, 'avg_dist': ..."
...,...,...,...
453,P0882.txt,A remote sensing image containing 55 small veh...,"[{'class': 'small-vehicle', 'count': 55, 'avg_..."
454,P2378.txt,A remote sensing image containing 1 ground tra...,"[{'class': 'ground-track-field', 'count': 1, '..."
455,P2539.txt,A remote sensing image containing 2 roundabout...,"[{'class': 'roundabout', 'count': 2, 'avg_dist..."
456,P2794.txt,A remote sensing image containing 135 small ve...,"[{'class': 'small-vehicle', 'count': 135, 'avg..."


In [4]:
# Local checkpoint directory used for both the model and the tokenizer.
base_model = './llama2_dota'

# dtype used for computation on top of the 4-bit quantized weights.
compute_dtype = torch.float16

# 4-bit NF4 quantization, no double quantization.
quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    load_in_4bit=True,
)

In [5]:
# Load the causal LM from `base_model` with the 4-bit quantization config.
# NOTE(review): device_map={"": 0} is understood to place the whole model on
# device index 0 — confirm against the Transformers/Accelerate docs.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.06s/it]


In [6]:
# Tokenizer comes from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): decoder-only models are usually padded on the left for batched
# generation; right padding presumably has no effect while the pipeline runs
# one prompt at a time — confirm before adding `batch_size`.
tokenizer.padding_side = "right"

In [7]:
# Smoke test: run one hand-written description through the model.
prompt = "Genereate the object bounding box properties for a remote sensing image with the following description as JSON only: A remote sensing image containing 20 small vehicles, 4 tennis courts, 1 basketball courts, 1 soccer ball fields."
# NOTE(review): `max_length` presumably counts prompt tokens as well as generated
# ones — confirm that 200 leaves enough room for the answer.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
# Wrap the prompt in the [INST] ... [/INST] instruction template.
result = pipe(f"<s>[INST] {prompt} [/INST]")
# `generated_text` echoes the prompt followed by the model's answer.
print(result[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<s>[INST] Genereate the object bounding box properties for a remote sensing image with the following description as JSON only: A remote sensing image containing 20 small vehicles, 4 tennis courts, 1 basketball courts, 1 soccer ball fields. [/INST] [{'class':'small-vehicle', 'count': 20, 'avg_dist': None}, {'class': 'tennis-court', 'count': 4, 'avg_dist': None}, {'class': 'basketball-court', 'count': 1, 'avg_dist': None}, {'class':'soccer-ball-field', 'count': 1, 'avg_dist': None}]
```


In [8]:
# Characters stripped from the answer before parsing: backticks, spaces, newlines.
characters_to_remove = '` \n'
translation_table = str.maketrans('', '', characters_to_remove)

# Keep only the text after the instruction marker, then turn the Python-style
# literal (single quotes, None) into JSON (double quotes, 0).
json_only = (
    str(result[0]['generated_text'].split('[/INST]')[1])
    .translate(translation_table)
    .replace("'", '"')
    .replace("None", "0")
)
json.loads(json_only)

[{'class': 'small-vehicle', 'count': 20, 'avg_dist': 0},
 {'class': 'tennis-court', 'count': 4, 'avg_dist': 0},
 {'class': 'basketball-court', 'count': 1, 'avg_dist': 0},
 {'class': 'soccer-ball-field', 'count': 1, 'avg_dist': 0}]

In [9]:
# Array of the description strings, in the same row order as `test_data`.
descriptions = test_data['description'].values

In [10]:
# Instruction prefix prepended to every description (trailing space is intentional).
base_prompt = "Genereate the object bounding box properties for a remote sensing image with the following description as JSON only: "
# Characters stripped from each answer before parsing: backticks, spaces, newlines.
characters_to_remove = '` \n'
translation_table = str.maketrans('', '', characters_to_remove)

errors = 0   # number of generations that could not be parsed
y_pred = []  # one {class: properties} dict per description ({} on parse failure)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=546, truncation=True)

# One record per description, wrapped in the [INST] ... [/INST] template.
prompts = [
    {'text': f"<s>[INST] {base_prompt + str(desc)} [/INST]"}
    for desc in descriptions
]

for result in tqdm(pipe(KeyDataset(prompts, 'text'))):
    try:
        # Keep only the text after the instruction marker and coerce the
        # Python-style literal (single quotes, None) into JSON.
        json_only_result = (
            str(result[0]['generated_text'].split('[/INST]')[1])
            .translate(translation_table)
            .replace("'", '"')
            .replace("None", "0")
        )
        # Cut at the first closing bracket to drop anything generated after the list.
        index = json_only_result.find(']')
        json_only_result = json.loads(json_only_result[:index+1])
        y_pred.append(jsonConvert(json_only_result, 'class'))
    except (IndexError, KeyError, ValueError, AttributeError, TypeError):
        # Only parsing failures are swallowed (missing marker, invalid JSON,
        # entries without a usable 'class'). A bare `except:` would also trap
        # KeyboardInterrupt/SystemExit and make this long loop uninterruptible.
        # The empty placeholder keeps y_pred index-aligned with y_true.
        y_pred.append({})
        errors += 1

# Ground truth goes through the same quote/None normalisation as the predictions.
y_true = []
for properties in test_data['properties']:
    json_truth = json.loads(str(properties).replace("'", '"').replace("None", "0"))
    y_true.append(jsonConvert(json_truth, 'class'))

100%|██████████| 458/458 [18:21<00:00,  2.40s/it]


In [11]:
# similarity_mat[i][j] = similarity between prediction i and ground truth j.
# For each class in the ground truth, the two property dicts are vectorised
# together and compared by cosine similarity; the per-class scores are summed
# and divided by the number of ground-truth classes. Classes missing from the
# prediction add nothing, and extra predicted classes are ignored.
similarity_mat = []
for y_p in tqdm(y_pred):
    avg_sim = []

    for y_t in y_true:
        total_sim = 0
        # Iterate the dict directly (insertion order) so the floating-point
        # summation order is the same on every run.
        for key in y_t:
            if key not in y_p:
                continue
            feats = DictVectorizer().fit_transform([y_t[key], y_p[key]])
            total_sim += cosine_similarity(feats[0], feats[1])[0][0]
        # A ground truth with no classes would divide by zero; score it as 0.
        avg_sim.append(total_sim / len(y_t) if y_t else 0.0)
    similarity_mat.append(avg_sim)

100%|██████████| 458/458 [01:05<00:00,  7.03it/s]


In [12]:
# Row i holds prediction i's scores against every ground truth, so the correct
# "label" for row i is simply i.
similarity = np.stack(similarity_mat)
y_desc_true = np.arange(len(y_true))

# Description-wise
top_k_stats = [
    {'k': k, 'score': top_k_accuracy_score(y_desc_true, similarity, k=k)}
    for k in (1, 3, 5, 10, 20, 30)
]

In [13]:
# Number of generations whose output could not be parsed into the expected JSON.
errors

3

In [15]:
# Top-k accuracy for each evaluated k.
top_k_stats

[{'k': 1, 'score': 0.7860262008733624},
 {'k': 3, 'score': 0.9235807860262009},
 {'k': 5, 'score': 0.9388646288209607},
 {'k': 10, 'score': 0.9934497816593887},
 {'k': 20, 'score': 0.9934497816593887},
 {'k': 30, 'score': 0.9934497816593887}]

In [14]:
# Mean similarity between each prediction and its own ground truth.
avg_correct_cos = sum(
    row[y_desc_true[idx]] for idx, row in enumerate(similarity)
) / len(similarity)
avg_correct_cos

0.9934497810842094