**Table of contents**<a id='toc0_'></a>    
- [Refine `.json` file](#toc1_1_)    
  - [Extract only target class's json](#toc1_2_)    
  - [Extract 2p prompts](#toc1_3_)    
  - [Extract Random Samples](#toc1_4_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

## <a id='toc1_1_'></a>[Refine `.json` file](#toc0_)
Build the condition + prompt file (`cond+prompt.json`): remove the `target` field and rename `source` to `condition`, so each record becomes `condition` + `prompt`.

In [6]:
# remove target field from json
import json
from collections import OrderedDict

# Input is read as JSON Lines (one JSON object per line); the refined copy is
# written in the same one-object-per-line layout.
input_file = "../training/fusrs_v2_256_cam_v2/prompts.json"
output_file = "../gen/fusrs_v2_cam_v2/prompts.json"

with open(input_file, "r") as infile, open(output_file, "w") as outfile:
    for line in infile:
        # Skip blank lines (e.g. a stray empty line at the end of the file);
        # json.loads would otherwise raise and abort the whole conversion.
        if not line.strip():
            continue

        # Load the JSON object from the line, preserving its key order
        data = json.loads(line, object_pairs_hook=OrderedDict)

        # Remove the "target" field if it exists
        if "target" in data:
            del data["target"]

        if "source" in data:
            # Rename "source" to "condition" and drop the "./source/"
            # directory component from the stored path.
            condition_value = data.pop("source").replace("./source/", "./")

            # Emit exactly two fields, with "condition" before "prompt".
            # A record that has "source" but no "prompt" raises KeyError here.
            ordered_data = OrderedDict(
                [("condition", condition_value), ("prompt", data["prompt"])]
            )
        else:
            # No "source" field: pass the record through (minus "target").
            ordered_data = data

        # Write the modified JSON object to the output file
        outfile.write(json.dumps(ordered_data) + "\n")

## <a id='toc1_2_'></a>[Extract only target class's json](#toc0_)

In [7]:
import json
import random

types = ["Fishing", "Tanker", "Dredger", "Cargo"]

input_file = "../gen/fusrs_v2_cam_v2/prompts.json"
# input_file = "../gen/fusrs_v2_cam_4k/cond+prompt.json"

# Read and parse the input once: it is the same for every ship type, so there
# is no reason to re-read the file and re-decode every line per class.
with open(input_file, "r") as infile:
    lines = infile.readlines()

# random.shuffle(lines)

# Pair each raw line with its lowercased prompt so the per-class pass below is
# a plain substring test. Blank lines carry no record and are ignored.
records = [
    (line, json.loads(line)["prompt"].lower()) for line in lines if line.strip()
]

for ship_type in types:
    keyword = f"{ship_type} ship"
    needle = keyword.lower()  # matching is case-insensitive

    output_file = f"../gen/fusrs_v2_cam_v2/cls_{ship_type.lower()}.json"

    # Copy every record whose prompt mentions "<type> ship", keeping the
    # original line text and order.
    with open(output_file, "w") as outfile:
        outfile.writelines(line for line, prompt in records if needle in prompt)

## <a id='toc1_3_'></a>[Extract 2p prompts](#toc0_)
Using too many prompts produces very similar results, so use fewer prompts with more random seeds. "2p" means 2 prompts: out of every 5 consecutive prompts, 2 are kept at random.

In [8]:
import random

TYPES = ["Fishing", "Tanker", "Dredger"]
# Number of consecutive lines treated as one group
# (presumably the prompts belonging to one image - confirm against the data).
PROMPTS_IMG = 5
# Number of lines kept from each group.
PICK = 2


for ship_type in TYPES:
    input_file = f"../gen/fusrs_v2_cam_v2/cls_{ship_type.lower()}.json"
    output_file = f"../gen/fusrs_v2_cam_v2/2p_{ship_type.lower()}.json"

    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        lines_buffer = []
        for line in infile:
            # random.sample reorders lines, so make sure every record ends
            # with a newline; otherwise a final line without one could be
            # written first and fuse with the record that follows it.
            if not line.endswith("\n"):
                line += "\n"
            lines_buffer.append(line)

            if len(lines_buffer) == PROMPTS_IMG:
                # Group is complete: keep PICK random lines and start over
                outfile.writelines(random.sample(lines_buffer, PICK))
                lines_buffer.clear()

        if lines_buffer:
            # Trailing partial group: keep up to PICK lines, as for full
            # groups, instead of always keeping exactly one.
            leftover_pick = min(PICK, len(lines_buffer))
            outfile.writelines(random.sample(lines_buffer, leftover_pick))

    print(f"Randomly sampled lines written to {output_file}")

Randomly sampled lines written to ../gen/fusrs_v2_cam_v2/2p_fishing.json
Randomly sampled lines written to ../gen/fusrs_v2_cam_v2/2p_tanker.json
Randomly sampled lines written to ../gen/fusrs_v2_cam_v2/2p_dredger.json


## <a id='toc1_4_'></a>[Extract Random Samples](#toc0_)

In [6]:
import os
import random


def extract_random_lines(input_file: str, output_file: str, n: int) -> None:
    """Write ``n`` randomly chosen lines of ``input_file`` to ``output_file``.

    Lines are drawn without replacement via ``random.sample``, so the output
    order is random. The number of lines found in the source file is printed
    before sampling.

    Args:
        input_file: Path of the text file to sample from.
        output_file: Path of the file to write. Its parent directory is
            created if it does not exist.
        n: Number of lines to extract.

    Raises:
        ValueError: If ``n`` exceeds the number of lines in ``input_file``.
        FileNotFoundError: If ``input_file`` does not exist.
    """
    with open(input_file, "r") as f:
        lines = f.readlines()
    print(f"Source file {input_file} has {len(lines)} lines.")

    if n > len(lines):
        raise ValueError(
            f"Requested {n} lines, but the input file only has {len(lines)} lines."
        )

    random_lines = random.sample(lines, n)

    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, "w") as f:
        f.writelines(random_lines)


# Sampling configuration: the sample size and the ship class whose
# annotation file is sampled.
n = 3076  # Replace this with the desired number of lines to extract
ship_type = "Fishing"

input_file = f"../gen/fusrs_v2_cam_4k/ann_{ship_type.lower()}_4k_cam.txt"
output_file = f"../gen/fusrs_v2_cam_4k/meta/ann_{ship_type.lower()}_4k_cam_{n}.txt"

# Run the extraction; report the two expected failures instead of raising.
try:
    extract_random_lines(input_file, output_file, n)
except FileNotFoundError:
    print(f"Error: {input_file} not found.")
except ValueError as e:
    print(e)
else:
    # Reached only when the extraction finished without an exception.
    print(
        f"Successfully extracted {n} random lines from {input_file} and wrote them to {output_file}."
    )

Source file ../gen/fusrs_v2_cam_4k/ann_fishing_4k_cam.txt has 4070 lines.
Successfully extracted 3076 random lines from ../gen/fusrs_v2_cam_4k/ann_fishing_4k_cam.txt and wrote them to ../gen/fusrs_v2_cam_4k/meta/ann_fishing_4k_cam_3076.txt.


#