In [1]:
from pathlib import Path
import json

In [2]:
# Dataset root; every other location used below is resolved relative to it.
dataset_dir = Path("/mnt/ssd2/xin/repo/DART/Liebherr_Product")

# Sub-directories of the dataset, built as Path objects.
image_dir = dataset_dir / "images"
meta_dir = dataset_dir / "metadata"
label_dir = dataset_dir / "labels"
response_dir = dataset_dir / "reviews"  # one JSON review per item is read from here

# Metadata files. `to_gpt` is later compared against the ids of the review files;
# `id_to_name` is presumably an id -> name lookup (TODO confirm against its producer).
to_gpt = json.loads((meta_dir / "to_gpt.json").read_text())
id_to_name = json.loads((meta_dir / "id_to_name.json").read_text())

In [3]:
# Gather every review JSON below response_dir (rglob also descends into sub-folders),
# then derive the sorted list of ids from the file names without extension.
response_files = [*response_dir.rglob("*.json")]
response_ids = sorted(path.stem for path in response_files)

In [4]:
# assert if all files got a response
# The comparison is on sorted lists (not sets), so a duplicated id among the
# review files also fails the check. The message is only evaluated on failure
# and names a sample of the offending ids so a mismatch can be traced.
expected_ids = sorted(to_gpt)
assert response_ids == expected_ids, (
    f"review files do not match to_gpt: "
    f"{len(response_ids)} review files vs {len(expected_ids)} expected ids; "
    f"missing (first 10): {sorted(set(expected_ids) - set(response_ids))[:10]}; "
    f"unexpected (first 10): {sorted(set(response_ids) - set(expected_ids))[:10]}"
)

In [5]:
# check the values of the dict of the content
#
# Collects every distinct value seen for the three criteria and reports
# responses that deviate from the expected format.

precisions = set()
recalls = set()
fits = set()
no_json = []

# content of response should have format like this
# '```json\n{"Precision":"No", "Recall":"No", "Fit":"No"}\n```'
start_delimiter = "```json\n"
end_delimiter = "\n```"

# Criterion name -> set collecting every value observed for it (checked in this order).
seen_values = {"Precision": precisions, "Recall": recalls, "Fit": fits}

for file in response_files:
    with open(file, "r") as f:
        response = json.load(f)
    content = response["choices"][0]["message"]["content"]

    start_pos = content.find(start_delimiter)
    if start_pos == -1:
        no_json.append(file)
        continue

    payload_start = start_pos + len(start_delimiter)
    end_pos = content.find(end_delimiter, payload_start)
    if end_pos == -1:
        # str.find returns -1 when the closing fence is absent; slicing with
        # that value would silently drop the last character instead of
        # signalling the problem, so report the file as having no JSON block.
        no_json.append(file)
        continue

    content = json.loads(content[payload_start:end_pos])

    for field, seen in seen_values.items():
        value = content[field]
        if value not in ["Yes", "No"]:
            print(f"{field} is '{value}' in '{file}'")
        seen.add(value)

print()
if len(no_json) == 0:
    print("All files have json delimiter")
else:
    for file in no_json:
        print(f"Could not find json delimiter in '{file}'")

print()
print("Unique values for Precision, Recall, Fit:")
print(precisions)
print(recalls)
print(fits)

Precision is 'Cannot be determined' in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/mobile crane/06954.json'
Recall is 'Cannot be determined' in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/mobile crane/06954.json'
Fit is 'Cannot be determined' in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/mobile crane/06954.json'

Could not find json delimiter in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/mobile crane/08502.json'
Could not find json delimiter in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/mobile crane/07013.json'
Could not find json delimiter in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/mobile crane/06931.json'
Could not find json delimiter in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/crawler crane/01092.json'
Could not find json delimiter in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/crawler crane/00868.json'
Could not find json delimiter in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/crawler crane/01385.json'
Could not find 

In [6]:
# Build the lists of ids the reviewer rejected:
#   no_gpt            - at least one criterion is "No", or the response is unusable
#   no_gpt_precision  - Precision is "No" (or the response is unusable)
#   no_gpt_recall     - Recall is "No" (or the response is unusable)
#   no_gpt_fit        - Fit is "No" (or the response is unusable)
#   no_gpt_loose      - all three criteria are "No" (unusable responses are NOT included)
no_gpt = []
no_gpt_loose = []
no_gpt_precision = []
no_gpt_recall = []
no_gpt_fit = []

# content of response should have format like this
# '```json\n{"Precision":"No", "Recall":"No", "Fit":"No"}\n```'
start_delimiter = "```json\n"
end_delimiter = "\n```"

# Criterion name -> list collecting the ids rejected on that criterion (checked in this order).
no_lists_by_field = {
    "Precision": no_gpt_precision,
    "Recall": no_gpt_recall,
    "Fit": no_gpt_fit,
}


def _add_to_all_no_lists(stem):
    """Record `stem` as rejected overall and on every single criterion."""
    no_gpt.append(stem)
    for per_field_list in no_lists_by_field.values():
        per_field_list.append(stem)


for file in response_files:
    with open(file, "r") as f:
        response = json.load(f)
    content = response["choices"][0]["message"]["content"]

    start_pos = content.find(start_delimiter)
    payload_start = start_pos + len(start_delimiter)
    # str.find returns -1 when a fence is absent. A missing closing fence is
    # handled like a missing opening one; otherwise the slice below would end
    # at index -1 and silently drop the last character.
    end_pos = -1 if start_pos == -1 else content.find(end_delimiter, payload_start)
    if start_pos == -1 or end_pos == -1:
        print(f"Could not find json delimiter in '{file}', added to all no lists")
        _add_to_all_no_lists(file.stem)
        continue

    content = json.loads(content[payload_start:end_pos])

    # check if the values are valid; only the first offending criterion is reported
    invalid_field = next(
        (field for field in no_lists_by_field if content[field] not in ["Yes", "No"]),
        None,
    )
    if invalid_field is not None:
        print(
            f"{invalid_field} is '{content[invalid_field]}' in '{file}',added to all no lists"
        )
        _add_to_all_no_lists(file.stem)
        continue

    # check if the values are 'No'
    rejected_fields = [field for field in no_lists_by_field if content[field] == "No"]
    if rejected_fields:
        no_gpt.append(file.stem)
        for field in rejected_fields:
            no_lists_by_field[field].append(file.stem)

    # only 3 no means no
    if len(rejected_fields) == len(no_lists_by_field):
        no_gpt_loose.append(file.stem)

print(f"no_gpt: {len(no_gpt)}")
print(f"no_gpt_precision: {len(no_gpt_precision)}")
print(f"no_gpt_recall: {len(no_gpt_recall)}")
print(f"no_gpt_fit: {len(no_gpt_fit)}")
print(f"no_gpt_loose: {len(no_gpt_loose)}")

Could not find json delimiter in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/mobile crane/08502.json', added to all no lists
Precision is 'Cannot be determined' in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/mobile crane/06954.json',added to all no lists
Could not find json delimiter in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/mobile crane/07013.json', added to all no lists
Could not find json delimiter in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/mobile crane/06931.json', added to all no lists
Could not find json delimiter in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/crawler crane/01092.json', added to all no lists
Could not find json delimiter in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/crawler crane/00868.json', added to all no lists
Could not find json delimiter in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/reviews/crawler crane/01385.json', added to all no lists
Could not find json delimiter in '/mnt/ssd2/xin/repo/DART/Liebherr_Product/

In [7]:
# exclude specific files
# Ids listed here are removed from every "no" list, whatever the review said.
exclude = [
    "03503",
    "09860",
    "09861",
    "09862",
    "09863",
    "09864",
    "09865",
    "09866",
    "09867",
]
excluded_ids = set(exclude)  # set membership instead of scanning the list per id

no_gpt = [x for x in no_gpt if x not in excluded_ids]
no_gpt_precision = [x for x in no_gpt_precision if x not in excluded_ids]
no_gpt_recall = [x for x in no_gpt_recall if x not in excluded_ids]
no_gpt_fit = [x for x in no_gpt_fit if x not in excluded_ids]
# Every id in no_gpt_loose is also in no_gpt (three "No" implies at least one),
# so it is filtered too; otherwise it could keep ids that no_gpt just dropped.
no_gpt_loose = [x for x in no_gpt_loose if x not in excluded_ids]

print(f"no_gpt: {len(no_gpt)}")
print(f"no_gpt_precision: {len(no_gpt_precision)}")
print(f"no_gpt_recall: {len(no_gpt_recall)}")
print(f"no_gpt_fit: {len(no_gpt_fit)}")
print(f"no_gpt_loose: {len(no_gpt_loose)}")

no_gpt: 1997
no_gpt_precision: 1858
no_gpt_recall: 944
no_gpt_fit: 1741
no_gpt_loose: 757


In [8]:
# Persist the strict list (ids with at least one rejected criterion) as JSON
# under the labels directory.
(label_dir / "no_gpt.json").write_text(json.dumps(no_gpt))

# The remaining lists (no_gpt_precision, no_gpt_recall, no_gpt_fit, no_gpt_loose)
# are not written to disk here; if needed, save each one the same way as
# "<list name>.json" under label_dir.