In [1]:
import pandas as pd
import os
import json
from scipy.spatial.distance import pdist

In [2]:
def genDesc(filename):
    """Build a caption and per-class statistics from one bounding-box label file.

    The file is read as space-separated values without a header row. Column 8
    is used as the class label and columns 0-7 as the four (x, y) corner
    pairs of each box.

    Args:
        filename: Path of the label file to read.

    Returns:
        A ``(text, prop)`` tuple.

        ``text`` is one sentence listing the number of objects per class,
        most frequent class first, with hyphens in class names replaced by
        spaces and the name pluralised only when the count is not 1.

        ``prop`` is a list with one dict per class containing ``'class'``,
        ``'count'`` and ``'avg_spread'``: the mean pairwise distance between
        the box centroids of that class, or ``None`` when the class has a
        single box.

        When the file cannot be read or parsed, or contains no labels, the
        fallback ``('A satellite image.', [])`` is returned.
    """
    try:
        bb_file = pd.read_csv(filename, sep=' ', header=None)
        counts = bb_file[8].value_counts()
        phrases = []
        prop = []
        for label, count in counts.items():
            count = int(count)
            name = str(label).replace('-', ' ')
            # Only pluralise when there is not exactly one object ("1 ship").
            phrases.append(f"{count} {name}" + ('' if count == 1 else 's'))
            class_prop = {'class': label, 'count': count}
            if count > 1:
                boxes = bb_file.loc[bb_file[8] == label]
                # Centroid of each box = mean of its four corner coordinates.
                centroids = pd.DataFrame({
                    'x_centroid': boxes[[0, 2, 4, 6]].sum(axis=1) / 4,
                    'y_centroid': boxes[[1, 3, 5, 7]].sum(axis=1) / 4,
                })
                # Cast to a plain float so that str()/repr() of the result does
                # not depend on the installed NumPy version's scalar repr.
                class_prop['avg_spread'] = float(pdist(centroids.values).mean())
            else:
                class_prop['avg_spread'] = None
            prop.append(class_prop)
        if not phrases:
            # No usable labels: use the fallback instead of a truncated sentence.
            return 'A satellite image.', []
        text = 'A remote sensing image containing ' + ', '.join(phrases) + '.'
        return text, prop
    except Exception:
        # Deliberate best-effort: any unreadable or malformed label file gets
        # the generic caption. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt / SystemExit propagate.
        return 'A satellite image.', []


In [3]:
train_dir = '../datasets/DOTAv2/labels/DOTA-v2.0_train_original_format/'
val_dir = '../datasets/DOTAv2/labels/DOTA-v2.0_val_original_format/'
# os.listdir returns entries in arbitrary order; sort so that the generated
# description files come out in the same order on every run.
train_files = sorted(os.listdir(train_dir))
val_files = sorted(os.listdir(val_dir))


def _describe_files(label_dir, filenames):
    """Run genDesc on every file in `filenames` (relative to `label_dir`).

    Returns a list with one dict per file holding 'filename', 'description'
    and 'properties'.
    """
    records = []
    for name in filenames:
        desc, prop = genDesc(os.path.join(label_dir, name))
        # Named `record`-style dict literal rather than a variable called
        # `dict`, which would shadow the builtin for the rest of the notebook.
        records.append({
            'filename': name,
            'description': desc,
            'properties': prop
        })
    return records


train_descriptions = _describe_files(train_dir, train_files)
val_descriptions = _describe_files(val_dir, val_files)

In [4]:
# Persist the train and val description lists as JSON, one file per split.
for json_path, records in (
    ("../datasets/DOTAv2/descriptions/train.json", train_descriptions),
    ("../datasets/DOTAv2/descriptions/val.json", val_descriptions),
):
    with open(json_path, "w") as outfile:
        json.dump(records, outfile)

In [5]:
# Write one [INST]-formatted prompt/response line per training description.
# NOTE(review): "Genereate" is misspelled in the prompt; it is kept unchanged
# here because it is part of the emitted text.
with open("../datasets/DOTAv2/descriptions/train.txt", "w") as outfile:
    for entry in train_descriptions:
        caption = entry['description']
        properties = str(entry['properties'])
        outfile.write(f"<s>[INST] Genereate the object bounding box properties for a remote sensing image with the following description: {caption} [/INST] {properties} </s> \n")

In [6]:
# Write one [INST]-formatted prompt/response line per validation description.
# The target is val.txt: opening train.txt in "w" mode here would truncate the
# training file written by the previous cell and replace it with validation data.
# NOTE(review): "Genereate" is misspelled in the prompt; it is kept unchanged
# here so the text matches what the training-split cell emits.
with open("../datasets/DOTAv2/descriptions/val.txt", "w") as outfile:
    for desc in val_descriptions:
        outfile.write(f"<s>[INST] Genereate the object bounding box properties for a remote sensing image with the following description: {desc['description']} [/INST] {str(desc['properties'])} </s> \n")