In [1]:
import json

# Evaluation results to analyse.
# Alternative run: "./result_REC_2025-02-09-23_35.json"
RESULT_FILE = "./result_REC_2025-02-10-17_05.json"

# Use a context manager so the file handle is closed as soon as parsing is done.
with open(RESULT_FILE, "r") as result_fh:
    data = json.load(result_fh)
print(len(data))

101


In [2]:
import pandas as pd
import re
def convert_to_corner_from_MC_format(box):
    """
    Re-express an edge-based box as a pair of corner points.

    Parameters:
    - box: Sequence of four values in the order [top, bottom, left, right].

    Returns:
    - [[left, top], [right, bottom]], i.e. the first point combines the left and
      top edges and the second point combines the right and bottom edges.
    """
    box_top, box_bottom, box_left, box_right = box

    first_corner = [box_left, box_top]
    second_corner = [box_right, box_bottom]
    return [first_corner, second_corner]

In [3]:
def find_merged_boundingbox(bboxes_list):
    """
    Compute the smallest box that encloses every box in a list.

    Parameters:
    - bboxes_list: List of boxes, each given as [[x1, y1], [x2, y2]].

    Returns:
    - [[min x1, min y1], [max x2, max y2]] over all boxes, or None when the
      list is empty.
    """
    if not bboxes_list:
        return None  # nothing to merge

    # The enclosing box starts at the smallest first-corner coordinates ...
    merged_first = [
        min(bbox[0][0] for bbox in bboxes_list),
        min(bbox[0][1] for bbox in bboxes_list),
    ]
    # ... and ends at the largest second-corner coordinates.
    merged_second = [
        max(bbox[1][0] for bbox in bboxes_list),
        max(bbox[1][1] for bbox in bboxes_list),
    ]
    return [merged_first, merged_second]

def acc_evaluation(iou_list,threshold=0.5):
    """
    Fraction of IoU scores that reach a threshold.

    Parameters:
    - iou_list: Sized iterable of IoU values, one per prediction.
    - threshold: A prediction counts as correct when its IoU is >= this value.

    Returns:
    - Number of correct predictions divided by the number of predictions,
      or 0 when there are no predictions.
    """
    total = len(iou_list)
    if not total:
        return 0

    hits = sum(1 for score in iou_list if score >= threshold)
    return hits / total

def extract_bbox_from_qwen2_str(input_string):
    """
    Parse the first box enclosed by <|box_start|> ... <|box_end|> in a string.

    Parameters:
    - input_string: Text that may contain a box such as
      "<|box_start|>(x1,y1),(x2,y2)<|box_end|>"; integer and decimal
      coordinates are both accepted.

    Returns:
    - [[x1, y1], [x2, y2]] with every coordinate converted to float, or None
      when the string contains no matching box.
    """
    # Four numeric capture groups: x1, y1, x2, y2 (each optionally with decimals).
    pattern = r"<\|box_start\|>\((\d+(?:\.\d+)?),(\d+(?:\.\d+)?)\)?,?\(?(\d+(?:\.\d+)?),(\d+(?:\.\d+)?)\)<\|box_end\|>"
    found = re.search(pattern, input_string)
    if found is None:
        return None

    x1, y1, x2, y2 = (float(token) for token in found.groups())
    return [[x1, y1], [x2, y2]]

def bbox_denormalization_qwen2(normalized_bbox,fig_size=()):
    """
    Denormalize a bounding box from Qwen2 format to pixel coordinates.

    The normalized coordinates are divided by 1000 and scaled by the image
    width (x values) or height (y values).

    Parameters:
    - normalized_bbox: [[x1, y1], [x2, y2]] in normalized units.
    - fig_size: Pair (height, width) of the image in pixels. Required; the
      empty default only exists to keep the signature backward compatible.

    Returns:
    - [[x1, y1], [x2, y2]] in pixel units.

    Raises:
    - ValueError: if fig_size does not contain exactly two values.
    """
    try:
        height, width = fig_size
    except ValueError as exc:
        # Replace the generic "not enough values to unpack" with an actionable message.
        raise ValueError(
            f"fig_size must be a (height, width) pair, got {fig_size!r}"
        ) from exc

    x1_n, y1_n = normalized_bbox[0]
    x2_n, y2_n = normalized_bbox[1]

    # Inverse of the normalization x_n = x / width * 1000 (and likewise for y).
    x1 = x1_n / 1000 * width
    x2 = x2_n / 1000 * width
    y1 = y1_n / 1000 * height
    y2 = y2_n / 1000 * height
    return [[x1, y1], [x2, y2]]


import re
def calculate_iou(box1, box2):
    """
    Intersection over Union (IoU) of two axis-aligned boxes.

    Parameters:
    - box1: First box as [[x1, y1], [x2, y2]].
    - box2: Second box as [[x1, y1], [x2, y2]].

    Returns:
    - Intersection area divided by union area, or 0 when the union area is
      not positive.
    """
    a_left, a_top = box1[0]
    a_right, a_bottom = box1[1]
    b_left, b_top = box2[0]
    b_right, b_bottom = box2[1]

    # Overlap extent along each axis, clamped to 0 when the boxes do not overlap.
    overlap_w = max(0, min(a_right, b_right) - max(a_left, b_left))
    overlap_h = max(0, min(a_bottom, b_bottom) - max(a_top, b_top))
    overlap_area = overlap_w * overlap_h

    area_a = (a_right - a_left) * (a_bottom - a_top)
    area_b = (b_right - b_left) * (b_bottom - b_top)

    # The overlap is contained in both areas, so subtract it once.
    union_area = area_a + area_b - overlap_area

    if union_area > 0:
        return overlap_area / union_area
    return 0



In [4]:
from PIL import Image

In [16]:
import os 
import random
random.seed(42)

# Folder containing the images named by each example's "img_name" entry.
IMG_DIR = "D:/minecraft_tmp/repo/MC_corpus_revisions/Minecraft_Dialogue_Corpus_Merged_2025_06_Feb/basedata/images/"

# Which box is scored against the gold box for every parsable model output:
#   "model"      -> the box parsed from the model output, denormalized to pixels
#   "full_image" -> baseline: a box covering the entire image
#   "random"     -> baseline: a random box inside the image
# NOTE: "full_image" reproduces what this cell computed before the switch existed
# (the model box was unconditionally overwritten by the entire-image box).
# Set to "model" to evaluate the actual predictions.
PRED_MODE = "full_image"
if PRED_MODE not in ("model", "full_image", "random"):
    raise ValueError(f"unknown PRED_MODE: {PRED_MODE!r}")

COLUMNS = ["n_blocks", "gold_area", "corrupt_output", "iou", "ids", "img_file", "pred_bbox", "gold_bbox", "n_utterances", "n_add", "n_remove", "n_arch", "n_builder"]


def _random_bbox(height, width):
    """Return a random box [[x1, y1], [x2, y2]] with x1 <= x2 and y1 <= y2 inside the image."""
    x1, y1, x2, y2 = (random.random() for _ in range(4))
    x1, x2 = sorted((x1, x2))
    y1, y2 = sorted((y1, y2))
    return [[x1 * width, y1 * height], [x2 * width, y2 * height]]


df_data = []
for key in data:
    for ex in data[key]:
        # Only the first mention of each example is evaluated.
        mention = ex["label"]["mentions"][0]
        id_list = mention["alphabet_digit"]
        bboxes = mention["bbox"]
        img_file = mention["img_name"][0]

        # Simple dialogue statistics derived from the log lines.
        dlog = ex["dialogue_log"]
        n_utterances = len(dlog)
        n_add = sum(": added " in line for line in dlog)
        n_remove = sum(": removed " in line for line in dlog)
        n_arch = sum(line.startswith("architect") for line in dlog)
        n_builder = sum(line.startswith("builder") for line in dlog)

        img_path = os.path.join(IMG_DIR, img_file)
        with Image.open(img_path) as img:
            # PIL reports size as (width, height); the helpers expect [height, width].
            gt_height_width = [img.size[1], img.size[0]]
        img_height, img_width = gt_height_width

        # Gold box: the union of all per-block boxes of the mention.
        total_bbox = find_merged_boundingbox([convert_to_corner_from_MC_format(x) for x in bboxes])
        output = ex["output_dict"][0]
        pred_bbox_nor = extract_bbox_from_qwen2_str(output)

        corrupt_output = pred_bbox_nor is None
        if corrupt_output:
            # No parsable box: record an explicit None instead of leaving the previous
            # example's pred_bbox in place (or hitting a NameError on the first row).
            pred_bbox = None
            iou = 0.0
        else:
            if PRED_MODE == "model":
                pred_bbox = bbox_denormalization_qwen2(pred_bbox_nor, gt_height_width)
            elif PRED_MODE == "full_image":
                pred_bbox = [[0, 0], [img_width, img_height]]
            else:  # "random"
                pred_bbox = _random_bbox(img_height, img_width)
            iou = calculate_iou(pred_bbox, total_bbox)

        xtl, ytl = total_bbox[0]  # top left
        xbr, ybr = total_bbox[1]  # bottom right

        df_data.append({
            "n_blocks": len(id_list),
            "gold_area": (xbr - xtl) * (ybr - ytl),
            "corrupt_output": corrupt_output,
            "iou": iou,
            "ids": str(id_list),
            "img_file": img_file,
            "pred_bbox": pred_bbox,
            "gold_bbox": total_bbox,
            "n_utterances": n_utterances,
            "n_add": n_add,
            "n_remove": n_remove,
            "n_arch": n_arch,
            "n_builder": n_builder,
        })

df = pd.DataFrame(df_data, columns=COLUMNS)
print(len(df))

423


In [6]:
# Store the row index as an explicit column so that a row can still be looked up
# by example id (see display_bbs) after the frame has been filtered or sorted.
df["ex_id"] = df.index

In [7]:
# Preview the first rows of the evaluation table.
df.head()

Unnamed: 0,n_blocks,gold_area,corrupt_output,iou,ids,img_file,pred_bbox,gold_bbox,n_utterances,n_add,n_remove,n_arch,n_builder,ex_id
0,1,87892,False,0.0,['a0'],B1-A3-C1-1522435497386_4.png,"[[341.5, 372.5], [409.8, 409.75000000000006]]","[[653, 91], [954, 383]]",8,0,0,3,4,0
1,8,196024,False,0.818547,"['a1', 'a6', 'a7', 'a0', 'a2', 'a4', 'a3', 'a5']",B1-A3-C1-1522435497386_14.png,"[[408.43399999999997, 136.335], [913.854, 610....","[[473, 146], [901, 604]]",18,7,0,4,13,1
2,8,76935,False,0.0,"['a1', 'a6', 'a7', 'a0', 'a2', 'a4', 'a3', 'a5']",B1-A3-C1-1522435497386_17.png,"[[230.854, 547.575], [277.298, 742.02]]","[[670, 75], [785, 744]]",22,8,0,7,14,2
3,1,13200,False,0.0,['a7'],B1-A3-C1-1522435497386_18.png,"[[681.634, 372.5], [845.554, 465.625]]","[[717, 8], [817, 140]]",23,8,0,7,15,3
4,1,69695,False,0.069594,['a9'],B1-A3-C1-1522435497386_25.png,"[[4.098, 3.725], [1360.536, 742.02]]","[[428, 235], [691, 500]]",30,10,1,10,19,4


In [17]:
import numpy as np
#np.mean(df.iou[(df.corrupt_output==False) & (df.n_blocks > 1)])
# Only the last expression of a cell is displayed, so print the mean over the
# parsable outputs explicitly instead of silently discarding it.
print("mean IoU (parsable outputs only):", np.mean(df.iou[(df.corrupt_output==False)]))
# Mean over all examples; unparsable outputs count with IoU 0.
np.mean(df.iou)

np.float64(0.10855839258830739)

In [18]:
# Share of all examples whose IoU is strictly above each threshold.
for iou_threshold in (0.25, 0.50, 0.75):
    print(np.sum(df.iou > iou_threshold) / len(df))

0.1182033096926714
0.026004728132387706
0.004728132387706856


In [None]:
import cv2
from matplotlib import pyplot as plt
import numpy as np

def display_bbs(eid, img_dir="D:/minecraft_tmp/repo/MC_corpus_revisions/Minecraft_Dialogue_Corpus_Merged_2025_06_Feb/basedata/images/"):
    """
    Show the image of one example with its predicted and gold boxes drawn on it.

    The predicted box is drawn in dark red and the gold box in dark green.

    Parameters:
    - eid: Value of the "ex_id" column identifying the example in df.
    - img_dir: Folder that contains the image named in the "img_file" column.

    Raises:
    - IndexError: if no row of df has the given ex_id.
    - FileNotFoundError: if the image cannot be read.
    """
    df_ex = df[df.ex_id == eid]
    if df_ex.empty:
        raise IndexError(f"no example with ex_id={eid!r}")
    row = df_ex.iloc[0]

    img_path = os.path.join(img_dir, row["img_file"])
    image_data = cv2.imread(img_path)
    if image_data is None:
        # cv2.imread signals an unreadable or missing file by returning None.
        raise FileNotFoundError(f"could not read image: {img_path}")

    # cv2 colours are in BGR order; rectangle corners must be integer pixels.
    pred_bbox = row["pred_bbox"]
    if pred_bbox is not None:
        pred_left, pred_top = [int(x) for x in pred_bbox[0]]
        pred_right, pred_bottom = [int(x) for x in pred_bbox[1]]
        cv2.rectangle(image_data, (pred_left, pred_top), (pred_right, pred_bottom), (0, 0, 100), 10)

    gold_left, gold_top = [int(x) for x in row["gold_bbox"][0]]
    gold_right, gold_bottom = [int(x) for x in row["gold_bbox"][1]]
    cv2.rectangle(image_data, (gold_left, gold_top), (gold_right, gold_bottom), (0, 100, 0), 10)

    # Reverse the channel axis (BGR -> RGB) because matplotlib expects RGB.
    img2 = image_data[:, :, ::-1]
    plt.imshow(img2)
    plt.title(f"ex_id={eid}  IoU={row['iou']:.3f}  (red: predicted, green: gold)")



In [None]:
# Draw the predicted and gold boxes for the example with ex_id 4.
display_bbs(4)

In [None]:
# Analysing the mistakes: parsable outputs ordered from lowest to highest IoU.
sdf = df[df.corrupt_output == False].sort_values(by="iou", ascending=True)
sdf.head(50)

In [None]:
# Draw the predicted and gold boxes for the example with ex_id 133.
display_bbs(133)

In [None]:
# Analysing the successes: parsable outputs ordered from highest to lowest IoU.
sdf = df[df.corrupt_output == False].sort_values(by="iou", ascending=False)
sdf.head(50)

In [None]:
# Draw the predicted and gold boxes for the example with ex_id 37.
display_bbs(37)

In [None]:
import copy

# Restrict to parsable outputs that refer to at most 10 blocks.
# .copy() makes ndf an independent frame, so the column assignments below do not
# write into a slice of df (avoids pandas' SettingWithCopyWarning).
ndf = df[(df.corrupt_output == False) & (df.n_blocks <= 10)].copy()

# NOTE(review): the "+ 1" presumably accounts for a block that exists before any
# "added" event is logged - confirm against the corpus.
ndf["total_blocks"] = ndf.n_add - ndf.n_remove + 1
# Share of the blocks that belong to the referred object ...
ndf["obj_img_ratio"] = ndf.n_blocks / ndf.total_blocks
# ... and the number of remaining blocks.
ndf["n_dist_blocks"] = ndf.total_blocks - ndf.n_blocks

#print(ndf)
#ndf = ndf.fillna(0)

# Correlation of the IoU with every candidate explanatory variable.
for k in ["n_blocks","gold_area","total_blocks", "obj_img_ratio", "n_utterances","n_add","n_remove","n_arch","n_builder", "n_dist_blocks"]:
    print(k, " -- ", str(ndf.iou.corr(ndf[k])))
    

In [None]:
import seaborn as sns

In [None]:
# Line plot of IoU against the object-to-image ratio, one line per number of
# referred blocks (examples with at most 4 blocks only).
ndf2 = ndf[ndf.n_blocks <= 4]

x_data = list(ndf2["obj_img_ratio"])
#x_data = [np.sqrt(x) for x in ndf2["gold_area"]]
y_data = ndf2["iou"]
hue_data = ["NP" + str(block_count) for block_count in ndf2["n_blocks"]]

fig, ax = plt.subplots()
sns.lineplot(x=x_data, y=y_data, hue=hue_data, marker="o", ax=ax)
ax.set(xlabel="Object to image ratio", ylabel="IoU")
ax.set(xlim=(0, 0.2))
#ax.set(xlim=(0.2, 1))