In [3]:
import xml.etree.ElementTree as ET
import polars as pl

In [3]:
def parse_cvat_tag_annotations(xml_file_path):
    """
    Read a CVAT annotation XML file and collect the tag labels of every image.

    Args:
        xml_file_path: Path to the CVAT annotation XML file

    Returns:
        dict: Maps each <image> element's ``name`` attribute to the list of
        non-empty ``label`` attributes found on its direct <tag> children, in
        document order. An image without usable tags maps to an empty list.
    """
    document_root = ET.parse(xml_file_path).getroot()

    def labels_of(image_node):
        # Only direct <tag> children are considered; missing or empty labels are dropped.
        candidates = (tag_node.get('label') for tag_node in image_node.findall('./tag'))
        return [label for label in candidates if label]

    # <image> elements are searched at any depth below the root. If a name
    # occurs more than once, the tags of the last occurrence win.
    return {
        image_node.get('name'): labels_of(image_node)
        for image_node in document_root.findall('.//image')
    }

In [4]:
# Example usage: parse the round-1 CVAT export into {image name: [tag labels]}.
cvat_xml_path = "/home/dangnh36/datasets/.comp/byu/processed/pseudo_label/round1/annotations.xml"
anno = parse_cvat_tag_annotations(cvat_xml_path)
print(anno)

{'FP/FP-1-tomo_0333fa-300x960x928-spacing16.799999237060547.jpg': ['0'], 'FP/FP-102-tomo_d0d9b6-800x927x959-spacing13.100000381469727.jpg': ['1'], 'FP/FP-102-tomo_d31c96-800x928x960-spacing13.100000381469727.jpg': ['1', '2'], 'FP/FP-103-tomo_dbc66d-300x960x928-spacing16.799999237060547.jpg': ['0'], 'FP/FP-105-tomo_dae195-800x928x960-spacing16.100000381469727.jpg': [], 'FP/FP-105-tomo_e2a336-800x928x928-spacing19.299999237060547.jpg': ['1'], 'FP/FP-106-tomo_e32b81-300x960x928-spacing15.600000381469727.jpg': ['0'], 'FP/FP-107-tomo_e2da77-800x928x960-spacing13.100000381469727.jpg': ['1'], 'FP/FP-108-tomo_e72e60-300x960x928-spacing13.100000381469727.jpg': ['1'], 'FP/FP-109-tomo_dd36c9-500x924x956-spacing16.100000381469727.jpg': ['0'], 'FP/FP-11-tomo_172f08-800x928x960-spacing13.100000381469727.jpg': ['1'], 'FP/FP-11-tomo_1da0da-800x928x960-spacing13.100000381469727.jpg': ['1'], 'FP/FP-110-tomo_e0739f-300x960x928-spacing16.799999237060547.jpg': ['0'], 'FP/FP-110-tomo_e96200-300x928x928-spac

In [5]:
# Re-key the annotations by tomogram id and convert the tag labels to integers.
# CVAT image names look like 'FP/FP-1-tomo_0333fa-300x960x928-spacing...jpg',
# so the tomogram id is the third '-'-separated field.
def _tomo_id_from_key(key):
    """Return the tomogram id for a CVAT image name.

    Keys that are already bare tomogram ids ('tomo_...' without any '-') are
    returned unchanged. Because this cell overwrites `anno` with a transformed
    copy of itself, that pass-through is what makes re-running the cell safe
    instead of failing with an IndexError on the second run.
    """
    if key.startswith('tomo_') and '-' not in key:
        return key
    return key.split('-')[2]


anno = {
    _tomo_id_from_key(k): [int(e) for e in v] for k, v in anno.items()
}
print(len(anno))
print(anno)

129
{'tomo_0333fa': [0], 'tomo_d0d9b6': [1], 'tomo_d31c96': [1, 2], 'tomo_dbc66d': [0], 'tomo_dae195': [], 'tomo_e2a336': [1], 'tomo_e32b81': [0], 'tomo_e2da77': [1], 'tomo_e72e60': [1], 'tomo_dd36c9': [0], 'tomo_172f08': [1], 'tomo_1da0da': [1], 'tomo_e0739f': [0], 'tomo_e96200': [0], 'tomo_e1a034': [0], 'tomo_ef1a1a': [0], 'tomo_ecbc12': [1], 'tomo_f82a15': [0], 'tomo_f8b835': [0], 'tomo_f94504': [1], 'tomo_fbb49b': [0], 'tomo_f1bf2f': [0], 'tomo_fa5d78': [0], 'tomo_fe85f6': [1], 'tomo_fea6e8': [0], 'tomo_ff7c20': [1], 'tomo_136c8d': [0], 'tomo_1dc5f9': [1], 'tomo_28f9c1': [1], 'tomo_23c8a4': [1, 3], 'tomo_2aeb29': [0], 'tomo_2b996c': [0], 'tomo_2cace2': [0], 'tomo_39b15b': [0], 'tomo_285454': [0], 'tomo_2a89bb': [1], 'tomo_3b1cc9': [1], 'tomo_2c9f35': [1], 'tomo_464108': [1], 'tomo_040b80': [1], 'tomo_098751': [0], 'tomo_2dd6bd': [0], 'tomo_2fb12d': [0], 'tomo_4c1ca8': [0], 'tomo_4d528f': [1], 'tomo_50f0bf': [1], 'tomo_466489': [1], 'tomo_4e41c2': [0], 'tomo_5308e8': [1], 'tomo_47ac

In [6]:
# Manually corrected false negatives, entered by hand rather than exported from CVAT.
# Every listed tomogram gets its own fresh [0, 1] tag list.
anno.update(
    (corrected_id, [0, 1])
    for corrected_id in ('tomo_401341', 'tomo_2b3cdf', 'tomo_fe050c', 'tomo_5b34b2')
)

In [17]:
# Number of tomogram entries currently held in `anno`.
len(anno)

133

In [7]:
# Tomograms whose tag list is anything other than exactly [0] (empty lists are included).
print(sum(1 for tags in anno.values() if tags != [0]))
# Total number of tags beyond the first one, over tomograms carrying more than one tag.
print(sum(len(tags) - 1 for tags in anno.values() if len(tags) > 1))

60
17


In [8]:
# Load the five per-fold pseudo-label CSVs and stack them into one frame,
# recording in a 'fold' column which file each row came from.
dfs = []

for fold_idx in range(5):
    csv_path = f'/home/dangnh36/datasets/.comp/byu/processed/pseudo_label/round1/fold{fold_idx}.csv'
    fold_column = pl.lit(fold_idx).alias('fold')
    lazy_fold = pl.scan_csv(csv_path)
    df = lazy_fold.with_columns(fold_column).collect()
    print(f'Fold {fold_idx}: {df.shape}')
    dfs.append(df)

pseudo_df = pl.concat(dfs)
pseudo_df

Fold 0: (152, 6)
Fold 1: (164, 6)
Fold 2: (152, 6)
Fold 3: (159, 6)
Fold 4: (166, 6)


tomo_id,motor_z,motor_y,motor_x,conf,fold
str,f64,f64,f64,f64,i32
"""tomo_19a313""",72.8125,600.5,297.5,0.706055,0
"""tomo_05f919""",118.5,463.0,840.5,0.70459,0
"""tomo_05f919""",126.6875,684.5,890.0,0.48877,0
"""tomo_16136a""",140.5,567.0,430.0,0.254639,0
"""tomo_0e9757""",-1.0,-1.0,-1.0,0.0,0
…,…,…,…,…,…
"""tomo_fb08b5""",102.0625,676.5,553.5,0.734863,4
"""tomo_fbb49b""",140.5,407.0,346.25,0.058685,4
"""tomo_fc3c39""",125.1875,536.5,491.0,0.662109,4
"""tomo_fc5ae4""",23.921875,512.5,307.25,0.4140625,4


In [9]:
# How many distinct tomograms appear in the stacked pseudo-label table.
pseudo_df.get_column('tomo_id').n_unique()

603

In [10]:
# Read the original training labels and mark every row as coming from the
# original annotation ('origin') with confidence 1.0.
gt_lazy = pl.scan_csv('/home/dangnh36/datasets/.comp/byu/raw/train_labels.csv')
source_column = pl.lit('origin').alias('source')
conf_column = pl.lit(1.0).alias('conf')
gt_df = gt_lazy.with_columns(source_column, conf_column).collect()
gt_df

row_id,tomo_id,Motor axis 0,Motor axis 1,Motor axis 2,Array shape (axis 0),Array shape (axis 1),Array shape (axis 2),Voxel spacing,Number of motors,source,conf
i64,str,f64,f64,f64,i64,i64,i64,f64,i64,str,f64
0,"""tomo_003acc""",-1.0,-1.0,-1.0,500,1912,1847,6.5,0,"""origin""",1.0
1,"""tomo_00e047""",169.0,546.0,603.0,300,959,928,15.6,1,"""origin""",1.0
2,"""tomo_00e463""",235.0,403.0,137.0,500,924,956,19.7,6,"""origin""",1.0
3,"""tomo_00e463""",243.0,363.0,153.0,500,924,956,19.7,6,"""origin""",1.0
4,"""tomo_00e463""",222.0,379.0,144.0,500,924,956,19.7,6,"""origin""",1.0
…,…,…,…,…,…,…,…,…,…,…,…
732,"""tomo_fe050c""",138.0,134.0,542.0,300,959,928,15.6,1,"""origin""",1.0
733,"""tomo_fe85f6""",-1.0,-1.0,-1.0,800,928,960,13.1,0,"""origin""",1.0
734,"""tomo_fea6e8""",-1.0,-1.0,-1.0,300,960,928,16.8,0,"""origin""",1.0
735,"""tomo_ff505c""",111.0,816.0,678.0,300,959,928,15.6,1,"""origin""",1.0


In [11]:
# Merge the reviewed pseudo-labels into the ground-truth table.
#
# Tag semantics, per reviewed tomogram in `anno`:
#   0   -> keep the existing GT row (only its 'source' is changed to NEW_SOURCE)
#   k>0 -> add a row whose motor coordinates and confidence come from the k-th
#          (1-based) pseudo-label row of that tomogram in `pseudo_df`
# Tomograms without tags keep their original rows unchanged.
#
# 'Number of motors' is copied from the original GT row and is NOT updated here.
NEW_SOURCE = 'round1'
MAX_TAG = 5  # largest tag value accepted by the range check below

# Every tomogram with a non-empty tag list must exist in the ground truth. Otherwise
# its annotation (e.g. a mistyped id among the hand-entered ones) would be silently ignored.
gt_tomo_ids = set(gt_df['tomo_id'].to_list())
unmatched_ids = sorted(
    anno_id for anno_id, anno_tags in anno.items()
    if len(anno_tags) > 0 and anno_id not in gt_tomo_ids
)
assert not unmatched_ids, f'annotated tomograms missing from gt_df: {unmatched_ids}'

new_rows = []

for (tomo_id, ), tomo_gt in gt_df.group_by('tomo_id', maintain_order=True):
    tags = anno.get(tomo_id, [])
    if len(tags) == 0:
        # Not reviewed, or reviewed without any tag: keep the original rows untouched.
        new_rows.append(tomo_gt)
        continue

    # Reviewed tomograms are expected to have exactly one GT row to start from.
    assert len(tomo_gt) == 1, f'{tomo_id}: expected 1 GT row, got {len(tomo_gt)}'
    assert all(0 <= tag <= MAX_TAG for tag in tags), f'{tomo_id}: tag out of range in {tags}'
    if 0 in tags and tomo_gt[0, 'Motor axis 2'] == -1.0:
        # 0 means keep the current GT, possibly with more positive annotations.
        # If the GT is negative it must stay negative, so [0] has to be the only tag.
        assert tags == [0], f'{tomo_id}: negative GT kept together with other tags {tags}'

    pseudo_label = pseudo_df.filter(pl.col('tomo_id') == tomo_id)

    for tag in tags:
        if tag == 0:
            new_row = tomo_gt.with_columns(
                pl.lit(NEW_SOURCE).alias('source')
            )
        else:
            assert tag <= len(pseudo_label), (
                f'{tomo_id}: tag {tag} but only {len(pseudo_label)} pseudo-label rows'
            )
            # NOTE(review): pseudo_df may contain placeholder rows with -1 coordinates;
            # a tag pointing at one is not rejected here — confirm that cannot happen.
            pseudo_row = pseudo_label.row(tag - 1, named=True)
            new_row = tomo_gt.with_columns(
                pl.lit(pseudo_row['motor_z']).alias('Motor axis 0'),
                pl.lit(pseudo_row['motor_y']).alias('Motor axis 1'),
                pl.lit(pseudo_row['motor_x']).alias('Motor axis 2'),
                pl.lit(NEW_SOURCE).alias('source'),
                pl.lit(pseudo_row['conf']).alias('conf'),
            )
        new_rows.append(new_row)

# Renumber row_id from 0 because rows were added relative to the original table.
new_df = pl.concat(new_rows).select(pl.col('*').exclude('row_id')).with_row_index('row_id')
new_df

row_id,tomo_id,Motor axis 0,Motor axis 1,Motor axis 2,Array shape (axis 0),Array shape (axis 1),Array shape (axis 2),Voxel spacing,Number of motors,source,conf
u32,str,f64,f64,f64,i64,i64,i64,f64,i64,str,f64
0,"""tomo_003acc""",-1.0,-1.0,-1.0,500,1912,1847,6.5,0,"""origin""",1.0
1,"""tomo_00e047""",169.0,546.0,603.0,300,959,928,15.6,1,"""origin""",1.0
2,"""tomo_00e463""",235.0,403.0,137.0,500,924,956,19.7,6,"""origin""",1.0
3,"""tomo_00e463""",243.0,363.0,153.0,500,924,956,19.7,6,"""origin""",1.0
4,"""tomo_00e463""",222.0,379.0,144.0,500,924,956,19.7,6,"""origin""",1.0
…,…,…,…,…,…,…,…,…,…,…,…
749,"""tomo_fe050c""",143.125,93.875,405.75,300,959,928,15.6,1,"""round1""",0.655273
750,"""tomo_fe85f6""",463.5,903.5,747.0,800,928,960,13.1,0,"""round1""",0.329102
751,"""tomo_fea6e8""",-1.0,-1.0,-1.0,300,960,928,16.8,0,"""round1""",1.0
752,"""tomo_ff505c""",111.0,816.0,678.0,300,959,928,15.6,1,"""origin""",1.0


In [12]:
# Display the merged label table.
new_df

row_id,tomo_id,Motor axis 0,Motor axis 1,Motor axis 2,Array shape (axis 0),Array shape (axis 1),Array shape (axis 2),Voxel spacing,Number of motors,source,conf
u32,str,f64,f64,f64,i64,i64,i64,f64,i64,str,f64
0,"""tomo_003acc""",-1.0,-1.0,-1.0,500,1912,1847,6.5,0,"""origin""",1.0
1,"""tomo_00e047""",169.0,546.0,603.0,300,959,928,15.6,1,"""origin""",1.0
2,"""tomo_00e463""",235.0,403.0,137.0,500,924,956,19.7,6,"""origin""",1.0
3,"""tomo_00e463""",243.0,363.0,153.0,500,924,956,19.7,6,"""origin""",1.0
4,"""tomo_00e463""",222.0,379.0,144.0,500,924,956,19.7,6,"""origin""",1.0
…,…,…,…,…,…,…,…,…,…,…,…
749,"""tomo_fe050c""",143.125,93.875,405.75,300,959,928,15.6,1,"""round1""",0.655273
750,"""tomo_fe85f6""",463.5,903.5,747.0,800,928,960,13.1,0,"""round1""",0.329102
751,"""tomo_fea6e8""",-1.0,-1.0,-1.0,300,960,928,16.8,0,"""round1""",1.0
752,"""tomo_ff505c""",111.0,816.0,678.0,300,959,928,15.6,1,"""origin""",1.0


In [13]:
# Spot-check the merged rows of a single tomogram.
new_df.filter(pl.col('tomo_id').eq('tomo_6521dc'))

row_id,tomo_id,Motor axis 0,Motor axis 1,Motor axis 2,Array shape (axis 0),Array shape (axis 1),Array shape (axis 2),Voxel spacing,Number of motors,source,conf
u32,str,f64,f64,f64,i64,i64,i64,f64,i64,str,f64
305,"""tomo_6521dc""",200.5,463.0,233.375,300,960,928,15.6,0,"""round1""",0.631348
306,"""tomo_6521dc""",126.6875,693.0,471.25,300,960,928,15.6,0,"""round1""",0.285645
307,"""tomo_6521dc""",151.25,734.0,553.5,300,960,928,15.6,0,"""round1""",0.26001


In [14]:
# Number of distinct tomograms that received at least one row from this review round.
# Uses NEW_SOURCE (set in the merge cell) rather than repeating the literal, so the
# count follows the label actually written into the 'source' column.
new_df.filter(pl.col('source') == NEW_SOURCE).n_unique('tomo_id')

132

In [None]:
# Write the merged label table to disk as the v2 training-label CSV.
new_df.write_csv('/home/dangnh36/datasets/.comp/byu/processed/train_labels_v2.csv')

In [9]:
# Sum of the `num_motors` column of gt.csv (presumably the baseline to compare the v2 total against).
gt_summary = pl.read_csv('/home/dangnh36/datasets/.comp/byu/processed/gt.csv')
gt_summary.get_column('num_motors').sum()

451

In [11]:
# Build two views of the v2 labels:
#   df  - one row per label row, with short column names plus derived columns
#   df2 - one row per tomogram, with all motor coordinates collected into a string
df = pl.scan_csv('/home/dangnh36/datasets/.comp/byu/processed/train_labels_v2.csv').collect()
df = df.rename({'Motor axis 0': 'motor_z', 'Motor axis 1': 'motor_y', 'Motor axis 2': 'motor_x', 'Array shape (axis 0)': 'Z', 'Array shape (axis 1)': 'Y', 'Array shape (axis 2)': 'X',
               'Voxel spacing': 'voxel_spacing', 'Number of motors': 'ori_num_motors'}).with_columns(
    # A row holds a real motor when its z coordinate is non-negative (-1 marks "no motor").
    (pl.col('motor_z') >= 0).alias('has_motor')
).with_columns(
    # Recount motors per tomogram from the rows themselves ('ori_num_motors' is the stale
    # value from the source CSV). Only rows that actually hold a motor are counted, so a
    # tomogram mixing placeholder (-1) rows and motor rows gets one consistent count on
    # every row instead of the group size on some rows and 0 on others.
    pl.col('has_motor').sum().over('tomo_id').cast(pl.UInt32).alias('num_motors')
).with_columns(
    # Sizes and coordinates multiplied by the voxel spacing (the '...A' columns).
    # For rows without a motor the motor_*A values are just -1 * spacing and carry no meaning.
    (pl.col('Z') * pl.col('voxel_spacing')).alias('ZA'),
    (pl.col('Y') * pl.col('voxel_spacing')).alias('YA'),
    (pl.col('X') * pl.col('voxel_spacing')).alias('XA'),
    (pl.col('motor_z') * pl.col('voxel_spacing')).alias('motor_zA'),
    (pl.col('motor_y') * pl.col('voxel_spacing')).alias('motor_yA'),
    (pl.col('motor_x') * pl.col('voxel_spacing')).alias('motor_xA')
).with_columns(
    # Volume in voxels (V) and in spacing-scaled units (VA).
    (pl.col('Z') * pl.col('Y') * pl.col('X')).alias('V'),
    (pl.col('ZA') * pl.col('YA') * pl.col('XA')).alias('VA')
)

print('DF:')
display(df)

# Collapse to one row per tomogram: per-tomogram constants and the first motor are taken
# from the first row; all motors with a non-negative z are serialized as a list-of-lists string.
df2 = df.with_columns(
    pl.concat_list(['motor_z', 'motor_y', 'motor_x']).alias('motor_zyx'),
    pl.concat_list(['motor_zA', 'motor_yA', 'motor_xA']).alias('motor_zyxA')
).group_by('tomo_id', maintain_order=True).agg(
    pl.col('Z', 'Y', 'X', 'voxel_spacing', 'ori_num_motors', 'num_motors', 'ZA', 'YA', 'XA', 'V', 'VA', 'motor_z', 'motor_y', 'motor_x', 'motor_zA', 'motor_yA', 'motor_xA').first(),
    pl.col('motor_zyx', 'motor_zyxA').map_elements(lambda x: str([e for e in x.to_list() if e[0] >= 0]), return_dtype = pl.String)
)
print('DF2:')
display(df2)


DF:


row_id,tomo_id,motor_z,motor_y,motor_x,Z,Y,X,voxel_spacing,ori_num_motors,source,conf,has_motor,num_motors,ZA,YA,XA,motor_zA,motor_yA,motor_xA,V,VA
i64,str,f64,f64,f64,i64,i64,i64,f64,i64,str,f64,bool,u32,f64,f64,f64,f64,f64,f64,i64,f64
0,"""tomo_003acc""",-1.0,-1.0,-1.0,500,1912,1847,6.5,0,"""origin""",1.0,false,0,3250.0,12428.0,12005.5,-6.5,-6.5,-6.5,1765732000,4.8491e11
1,"""tomo_00e047""",169.0,546.0,603.0,300,959,928,15.6,1,"""origin""",1.0,true,1,4680.0,14960.4,14476.8,2636.4,8517.6,9406.8,266985600,1.0136e12
2,"""tomo_00e463""",235.0,403.0,137.0,500,924,956,19.7,6,"""origin""",1.0,true,6,9850.0,18202.8,18833.2,4629.5,7939.1,2698.9,441672000,3.3767e12
3,"""tomo_00e463""",243.0,363.0,153.0,500,924,956,19.7,6,"""origin""",1.0,true,6,9850.0,18202.8,18833.2,4787.1,7151.1,3014.1,441672000,3.3767e12
4,"""tomo_00e463""",222.0,379.0,144.0,500,924,956,19.7,6,"""origin""",1.0,true,6,9850.0,18202.8,18833.2,4373.4,7466.3,2836.8,441672000,3.3767e12
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
749,"""tomo_fe050c""",143.125,93.875,405.75,300,959,928,15.6,1,"""round1""",0.655273,true,2,4680.0,14960.4,14476.8,2232.75,1464.45,6329.7,266985600,1.0136e12
750,"""tomo_fe85f6""",463.5,903.5,747.0,800,928,960,13.1,0,"""round1""",0.329102,true,1,10480.0,12156.8,12576.0,6071.85,11835.85,9785.7,712704000,1.6022e12
751,"""tomo_fea6e8""",-1.0,-1.0,-1.0,300,960,928,16.8,0,"""round1""",1.0,false,0,5040.0,16128.0,15590.4,-16.8,-16.8,-16.8,267264000,1.2673e12
752,"""tomo_ff505c""",111.0,816.0,678.0,300,959,928,15.6,1,"""origin""",1.0,true,1,4680.0,14960.4,14476.8,1731.6,12729.6,10576.8,266985600,1.0136e12


DF2:


tomo_id,Z,Y,X,voxel_spacing,ori_num_motors,num_motors,ZA,YA,XA,V,VA,motor_z,motor_y,motor_x,motor_zA,motor_yA,motor_xA,motor_zyx,motor_zyxA
str,i64,i64,i64,f64,i64,u32,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,str,str
"""tomo_003acc""",500,1912,1847,6.5,0,0,3250.0,12428.0,12005.5,1765732000,4.8491e11,-1.0,-1.0,-1.0,-6.5,-6.5,-6.5,"""[]""","""[]"""
"""tomo_00e047""",300,959,928,15.6,1,1,4680.0,14960.4,14476.8,266985600,1.0136e12,169.0,546.0,603.0,2636.4,8517.6,9406.8,"""[[169.0, 546.0, 603.0]]""","""[[2636.4, 8517.6, 9406.8]]"""
"""tomo_00e463""",500,924,956,19.7,6,6,9850.0,18202.8,18833.2,441672000,3.3767e12,235.0,403.0,137.0,4629.5,7939.1,2698.9,"""[[235.0, 403.0, 137.0], [243.0…","""[[4629.5, 7939.099999999999, 2…"
"""tomo_01a877""",300,960,928,13.1,1,1,3930.0,12576.0,12156.8,267264000,6.0083e11,147.0,638.0,286.0,1925.7,8357.8,3746.6,"""[[147.0, 638.0, 286.0]]""","""[[1925.7, 8357.8, 3746.6]]"""
"""tomo_02862f""",300,959,928,15.6,1,1,4680.0,14960.4,14476.8,266985600,1.0136e12,101.0,351.0,120.0,1575.6,5475.6,1872.0,"""[[101.0, 351.0, 120.0]]""","""[[1575.6, 5475.599999999999, 1…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""tomo_fe050c""",300,959,928,15.6,1,2,4680.0,14960.4,14476.8,266985600,1.0136e12,138.0,134.0,542.0,2152.8,2090.4,8455.2,"""[[138.0, 134.0, 542.0], [143.1…","""[[2152.7999999999997, 2090.4, …"
"""tomo_fe85f6""",800,928,960,13.1,0,1,10480.0,12156.8,12576.0,712704000,1.6022e12,463.5,903.5,747.0,6071.85,11835.85,9785.7,"""[[463.5, 903.5, 747.0]]""","""[[6071.849999999999, 11835.85,…"
"""tomo_fea6e8""",300,960,928,16.8,0,0,5040.0,16128.0,15590.4,267264000,1.2673e12,-1.0,-1.0,-1.0,-16.8,-16.8,-16.8,"""[]""","""[]"""
"""tomo_ff505c""",300,959,928,15.6,1,1,4680.0,14960.4,14476.8,266985600,1.0136e12,111.0,816.0,678.0,1731.6,12729.6,10576.8,"""[[111.0, 816.0, 678.0]]""","""[[1731.6, 12729.6, 10576.8]]"""


In [10]:
# Total number of motors across all tomograms in the per-tomogram table.
df2.get_column('num_motors').sum()

523

In [12]:
# Write the per-tomogram summary table to disk.
df2.write_csv('/home/dangnh36/datasets/.comp/byu/processed/gt_v2.csv')