In [1]:
import mdai
import pandas as pd
import json
import matplotlib.pyplot as plt
from matplotlib import gridspec
from PIL import Image 

In [2]:
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Load ROI info

In [3]:
df_roi = pd.read_csv('../data/roi_info.csv')

In [4]:
df_roi

Unnamed: 0,image_id,xmin,ymin,xmax,ymax
0,efab084dd1c47a1f5750e4c7243751b1,1964,416,2780,2535
1,349cb2da5b3a4ac1e58fc1ecddf1ef94,28,651,724,2578
2,182052c727c75eabc616db7d2f2e6d45,1918,346,2792,2497
3,5a5f443b83effe5e0fd1f0cda0659f33,14,494,844,2691
4,8aff72c26877956d98baa43d26db6e8b,0,512,905,3159
...,...,...,...,...,...
19995,470aff1ecea82d3f41e7ebce2229fd62,0,488,1047,2409
19996,816dad42c2d4dabdffe75f1103c8fd9e,18,670,722,2985
19997,f3740a36f7852a5985a1f014afe137c0,2060,908,2780,3204
19998,4e78e9d0746f7ac2a7e80433fca84835,0,200,852,3185


# Load Vindr-original boxes

In [5]:
df_boxes_vindr_original = pd.read_csv('../data/finding_annotations.csv')
df_boxes_vindr_original.dropna(subset=['xmin'], inplace=True)
df_boxes_vindr_original = df_boxes_vindr_original[['image_id', 'xmin', 'ymin', 'xmax', 'ymax']]

In [6]:
df_boxes_vindr_original

Unnamed: 0,image_id,xmin,ymin,xmax,ymax
0,4e3a578fe535ea4f5258d3f7f4419db8,2355.139893,1731.640015,2482.979980,1852.750000
1,dac39351b0f3a8c670b7f8dc88029364,2386.679932,1240.609985,2501.800049,1354.040039
2,c83f780904f25eacb44e9030f32c66e1,2279.179932,1166.510010,2704.439941,2184.260010
3,893528bc38a0362928a89364f1b692fd,1954.270020,1443.640015,2589.760010,2193.810059
4,318264c881bf12f2c1efe5f93920cc37,2172.300049,1967.410034,2388.699951,2147.159912
...,...,...,...,...,...
2249,bdf1539e07e60cfcb5e7833f5b63fa86,1959.930054,1535.310059,2110.709961,1707.640015
2250,bdf1539e07e60cfcb5e7833f5b63fa86,1734.920044,2052.899902,2061.189941,2421.520020
2251,bdf1539e07e60cfcb5e7833f5b63fa86,1993.109985,2288.360107,2200.810059,2441.879883
2252,997e15346547ff56b4209ac73e74556f,74.847801,1417.400024,727.844971,2225.929932


# Load Vindr-corrected boxes

In [7]:
df_corrected = mdai.common_utils.json_to_dataframe('../data/vindr-mammo-cleaned.json')
df_boxes_vindr_corrected = df_corrected['annotations']
df_boxes_vindr_corrected.dropna(subset=['data'], inplace=True)
df_boxes_vindr_corrected = df_boxes_vindr_corrected[['SOPInstanceUID', 'data']]
df_boxes_vindr_corrected.rename(columns={'SOPInstanceUID': 'image_id'}, inplace=True)

In [8]:
def extract_coordinates(row):
    if pd.notna(row):
        xmin = row['x']
        xmax = xmin + row['width']
        ymin = row['y']
        ymax = xmin + row['height']
        return pd.Series([xmin, ymin, xmax, ymax])
    else:
        return pd.Series([None, None, None, None])

In [9]:
# Apply the function to create new columns
df_boxes_vindr_corrected[['xmin', 'ymin', 'xmax', 'ymax']] = df_boxes_vindr_corrected['data'].apply(extract_coordinates)

# Drop the original 'data' column if needed
df_boxes_vindr_corrected.drop('data', axis=1, inplace=True)

In [10]:
df_boxes_vindr_corrected

Unnamed: 0,image_id,xmin,ymin,xmax,ymax
2,dac39351b0f3a8c670b7f8dc88029364,2377.615440,1224.905130,2517.012170,2515.153400
3,4e3a578fe535ea4f5258d3f7f4419db8,2334.770360,1716.627880,2511.218600,2496.700650
10,c83f780904f25eacb44e9030f32c66e1,2279.179932,1166.510010,2704.439941,3296.929932
11,893528bc38a0362928a89364f1b692fd,1954.270020,1443.640015,2589.760010,2704.440063
18,318264c881bf12f2c1efe5f93920cc37,2172.300049,1967.410034,2388.699951,2352.049927
...,...,...,...,...,...
32438,31573a0308d41e44ed826b9a15e6af94,195.195999,1316.410034,580.585022,584.216019
32439,f51e0c5db5201a2cf58b424437007197,56.120890,1691.946630,494.159810,596.825230
32444,b50e0e1100d4f3fb389534cee03a2b03,2059.669922,1748.079956,2675.568445,2538.216447
32445,b50e0e1100d4f3fb389534cee03a2b03,2542.350098,1392.020020,2809.939941,2791.490112


# Merge dataframes

## Roi and Vindr-Original

In [11]:
roi_and_original = pd.merge(df_roi, df_boxes_vindr_original, on='image_id', suffixes=('_roi', '_box_org'))
roi_and_original['xmin_merge'] = roi_and_original[['xmin_roi', 'xmin_box_org']].min(axis=1)
roi_and_original['ymin_merge'] = roi_and_original[['ymin_roi', 'ymin_box_org']].min(axis=1)
roi_and_original['xmax_merge'] = roi_and_original[['xmax_roi', 'xmax_box_org']].max(axis=1)
roi_and_original['ymax_merge'] = roi_and_original[['ymax_roi', 'ymax_box_org']].max(axis=1)
roi_and_original['roi_area'] = (roi_and_original['xmax_roi'] - roi_and_original['xmin_roi']) * (roi_and_original['ymax_roi'] - roi_and_original['ymin_roi'])
roi_and_original['merged_area'] = (roi_and_original['xmax_merge'] - roi_and_original['xmin_merge']) * (roi_and_original['ymax_merge'] - roi_and_original['ymin_merge'])
roi_and_original['roi_growth_pct'] = 100 * ((roi_and_original['merged_area'] - roi_and_original['roi_area']) / roi_and_original['roi_area'])
roi_and_original.sort_values(by='roi_growth_pct', ascending=False).head(100)

Unnamed: 0,image_id,xmin_roi,ymin_roi,xmax_roi,ymax_roi,xmin_box_org,ymin_box_org,xmax_box_org,ymax_box_org,xmin_merge,ymin_merge,xmax_merge,ymax_merge,roi_area,merged_area,roi_growth_pct
1380,f60e4a6ab3f58f13e178fe8d98e532c5,1625,780,2776,2970,2266.98999,29.1217,2785.300049,1145.920044,1625.0,29.1217,2785.300049,2970.0,2520690,3412301.0,35.371713
1090,53c092e62d94e3193734c7bdf81a6b49,1777,677,2767,2644,2399.52002,86.989403,2613.300049,425.355988,1777.0,86.989403,2767.0,2644.0,1947330,2531440.0,29.995455
1675,3249215304caf281062d80f4b3798634,1946,736,2807,2930,2344.23999,264.675995,2519.219971,486.041992,1946.0,264.675995,2807.0,2930.0,1889034,2294844.0,21.482407
2041,b723b49547d7fae7e4117db8d93a4fdf,0,413,867,2551,21.892799,-5.22405,233.996002,224.880997,0.0,-5.22405,867.0,2551.0,1853646,2216246.0,19.561462
1836,e751e5a947089704457741eea74c2280,1846,680,2783,2897,2374.669922,259.161987,2629.669922,657.315979,1846.0,259.161987,2783.0,2897.0,2077329,2471654.0,18.982319
6,48d67e59b596a5d42b2ca037839979e1,0,381,1114,2459,470.050161,3.9113,795.643982,489.759003,0.0,3.9113,1114.0,2459.0,2314892,2734969.0,18.146713
798,ad1966572be8828018237cd3ff44ae65,0,392,1252,2649,23.6308,32.5144,210.136002,188.938004,0.0,32.5144,1252.0,2649.0,2825764,3275840.0,15.927585
171,c1edef455c1d305e6674fcf0d9c3c195,0,396,1278,2778,262.185856,43.581213,455.799808,291.525611,0.0,43.581213,1278.0,2778.0,3044196,3494587.0,14.795079
924,a0d04a7372a60d2a83c06d731a619c16,1,299,884,2319,4.07346,0.381865,384.636993,583.603027,1.0,0.381865,884.0,2319.0,1783660,2047340.0,14.783076
1966,06a407691cadae64cea5de8e8634a4ac,0,337,1362,2650,0.54154,-3.77153,221.567993,607.68103,0.0,-3.77153,1362.0,2650.0,3150306,3614437.0,14.732881


## Roi and Vindr-corrected

In [12]:
roi_and_corrected = pd.merge(df_roi, df_boxes_vindr_corrected, on='image_id', suffixes=('_roi', '_box_cor'))
roi_and_corrected['xmin_merge'] = roi_and_corrected[['xmin_roi', 'xmin_box_cor']].min(axis=1)
roi_and_corrected['ymin_merge'] = roi_and_corrected[['ymin_roi', 'ymin_box_cor']].min(axis=1)
roi_and_corrected['xmax_merge'] = roi_and_corrected[['xmax_roi', 'xmax_box_cor']].max(axis=1)
roi_and_corrected['ymax_merge'] = roi_and_corrected[['ymax_roi', 'ymax_box_cor']].max(axis=1)
roi_and_corrected['roi_area'] = (roi_and_corrected['xmax_roi'] - roi_and_corrected['xmin_roi']) * (roi_and_corrected['ymax_roi'] - roi_and_corrected['ymin_roi'])
roi_and_corrected['merged_area'] = (roi_and_corrected['xmax_merge'] - roi_and_corrected['xmin_merge']) * (roi_and_corrected['ymax_merge'] - roi_and_corrected['ymin_merge'])
roi_and_corrected['roi_growth_pct'] = 100 * ((roi_and_corrected['merged_area'] - roi_and_corrected['roi_area']) / roi_and_corrected['roi_area'])
roi_and_corrected.sort_values(by='roi_growth_pct', ascending=False).head(100)

Unnamed: 0,image_id,xmin_roi,ymin_roi,xmax_roi,ymax_roi,xmin_box_cor,ymin_box_cor,xmax_box_cor,ymax_box_cor,xmin_merge,ymin_merge,xmax_merge,ymax_merge,roi_area,merged_area,roi_growth_pct
1727,f56a57a109aed21840b83c8453e06a6e,2054,574,2787,2062,2166.399902,685.97998,2800.550049,3206.579956,2054.0,574.0,2800.550049,3206.579956,1090704,1965353.0,80.191206
743,baebd41037a14dee4a2f864330be4328,2202,839,2772,2567,2278.030029,1045.030029,2790.219971,3595.939941,2202.0,839.0,2790.219971,3595.939941,984960,1621687.0,64.644974
741,3b95bda18918892c9a44422bd01b9ffa,2041,353,2776,2279,2107.040039,812.500977,2788.189941,3394.299072,2041.0,353.0,2788.189941,3394.299072,1415610,2272428.0,60.526422
503,bc0ae93db877210820958502af7a1461,2261,663,2765,2296,2702.139893,1313.410034,2795.429932,3021.599854,2261.0,663.0,2795.429932,3021.599854,823032,1260506.0,53.153991
1912,437601b542a20b112a2dded87a434b3c,1928,553,2780,2463,1486.9073,2267.58748,1486.9073,1486.9073,1486.9073,553.0,2780.0,2463.0,1627320,2469807.0,51.771444
1714,b6d0903ba96ff1157a6b055bd56181d3,0,595,1320,2976,1977.62503,1887.82987,1977.62503,1977.62503,0.0,595.0,1977.62503,2976.0,3142920,4708725.0,49.820078
19,f1f79501410a98782c0a03b48e66df83,2093,361,2766,1997,2216.669922,983.671021,2659.77002,2693.278931,2093.0,361.0,2766.0,2693.278931,1101028,1569624.0,42.559837
1343,614ee08af2499715e2b20b97d387fd92,1740,269,2784,2381,1854.03304,556.0573,2423.993,3260.31588,1740.0,269.0,2784.0,3260.31588,2204928,3122934.0,41.634275
167,1c1d29264b338e71d94bf33db3a2849f,2216,519,2763,2285,2753.88867,1814.43994,2798.96997,2852.59704,2216.0,519.0,2798.96997,2852.59704,966002,1360417.0,40.829625
854,4b9618d7a794a67ec6b53a187254b700,2083,420,2796,2482,2417.70588,903.62665,2812.0,3245.3957,2083.0,420.0,2812.0,3245.3957,1470206,2059713.0,40.09693


## Roi + Vindr-original + Vindr-corrected (All Together)

In [17]:
# Merge all three DataFrames
merged_df = pd.merge(df_roi, df_boxes_vindr_original, on='image_id', suffixes=('', '_box_org'))
merged_df = pd.merge(merged_df, df_boxes_vindr_corrected, on='image_id', suffixes=('_roi', '_box_corr'))

# Calculate merged coordinates
merged_df['xmin_merge'] = merged_df[['xmin_roi', 'xmin_box_org', 'xmin_box_corr']].min(axis=1)
merged_df['ymin_merge'] = merged_df[['ymin_roi', 'ymin_box_org', 'ymin_box_corr']].min(axis=1)
merged_df['xmax_merge'] = merged_df[['xmax_roi', 'xmax_box_org', 'xmax_box_corr']].max(axis=1)
merged_df['ymax_merge'] = merged_df[['ymax_roi', 'ymax_box_org', 'ymax_box_corr']].max(axis=1)

# Calculate roi_area and merged_area
merged_df['roi_area'] = (merged_df['xmax_roi'] - merged_df['xmin_roi']) * (merged_df['ymax_roi'] - merged_df['ymin_roi'])
merged_df['merged_area'] = (merged_df['xmax_merge'] - merged_df['xmin_merge']) * (merged_df['ymax_merge'] - merged_df['ymin_merge'])

# Calculate roi_growth_pct
merged_df['roi_growth_pct'] = 100 * ((merged_df['merged_area'] - merged_df['roi_area']) / merged_df['roi_area'])
merged_df.sort_values(by='roi_growth_pct', ascending=False).head(100)

Unnamed: 0,image_id,xmin_roi,ymin_roi,xmax_roi,ymax_roi,xmin_box_org,ymin_box_org,xmax_box_org,ymax_box_org,xmin_box_corr,ymin_box_corr,xmax_box_corr,ymax_box_corr,xmin_merge,ymin_merge,xmax_merge,ymax_merge,roi_area,merged_area,roi_growth_pct
2314,f56a57a109aed21840b83c8453e06a6e,2054,574,2787,2062,2048.76001,985.838013,2249.620117,1532.469971,2166.399902,685.97998,2800.550049,3206.579956,2048.76001,574.0,2800.550049,3206.579956,1090704,1979147.0,81.455958
2315,f56a57a109aed21840b83c8453e06a6e,2054,574,2787,2062,2166.399902,685.97998,2800.550049,1726.160034,2166.399902,685.97998,2800.550049,3206.579956,2054.0,574.0,2800.550049,3206.579956,1090704,1965353.0,80.191206
1021,baebd41037a14dee4a2f864330be4328,2202,839,2772,2567,2278.030029,1045.030029,2790.219971,2362.939941,2278.030029,1045.030029,2790.219971,3595.939941,2202.0,839.0,2790.219971,3595.939941,984960,1621687.0,64.644974
1019,3b95bda18918892c9a44422bd01b9ffa,2041,353,2776,2279,2107.040039,812.500977,2788.189941,2099.76001,2107.040039,812.500977,2788.189941,3394.299072,2041.0,353.0,2788.189941,3394.299072,1415610,2272428.0,60.526422
684,bc0ae93db877210820958502af7a1461,2261,663,2765,2296,2702.139893,1313.410034,2795.429932,1632.869995,2702.139893,1313.410034,2795.429932,3021.599854,2261.0,663.0,2795.429932,3021.599854,823032,1260506.0,53.153991
2290,b6d0903ba96ff1157a6b055bd56181d3,0,595,1320,2976,169.811005,1190.640015,538.967781,1469.071057,1977.62503,1887.82987,1977.62503,1977.62503,0.0,595.0,1977.62503,2976.0,3142920,4708725.0,49.820078
2293,b6d0903ba96ff1157a6b055bd56181d3,0,595,1320,2976,165.473951,1092.928334,257.302222,1175.926963,1977.62503,1887.82987,1977.62503,1977.62503,0.0,595.0,1977.62503,2976.0,3142920,4708725.0,49.820078
21,f1f79501410a98782c0a03b48e66df83,2093,361,2766,1997,2216.669922,983.671021,2659.77002,1460.280029,2216.669922,983.671021,2659.77002,2693.278931,2093.0,361.0,2766.0,2693.278931,1101028,1569624.0,42.559837
1865,614ee08af2499715e2b20b97d387fd92,1740,269,2784,2381,2386.669922,865.728027,2641.280029,1273.109985,1854.03304,556.0573,2423.993,3260.31588,1740.0,269.0,2784.0,3260.31588,2204928,3122934.0,41.634275
1863,614ee08af2499715e2b20b97d387fd92,1740,269,2784,2381,1924.719971,1058.51001,2372.120117,1647.76001,1854.03304,556.0573,2423.993,3260.31588,1740.0,269.0,2784.0,3260.31588,2204928,3122934.0,41.634275


In [20]:
df_csv = merged_df[['image_id', 'xmin_roi', 'ymin_roi', 'xmax_roi', 'ymax_roi']]
df_csv.rename(columns={'xmin_roi': 'xmin', 'ymin_roi': 'ymin', 'xmax_roi': 'xmax', 'ymax_roi': 'ymax'}, inplace=True)
df_csv.to_csv('../data/out/roi_info_merged.csv', index=False)

In [13]:
# images_dir = ''
# N = 7
# top_images = merged_df.head(N)

# # Calculate the number of rows and columns based on N
# num_cols = min(N, 6)
# num_rows = -(-N // 6)  # Ceiling division to calculate the number of rows

# # Create a subplot grid
# fig = plt.figure(figsize=(15, 5 * num_rows))
# gs = gridspec.GridSpec(num_rows, num_cols, width_ratios=[1] * num_cols)

# for i, (_, row) in enumerate(top_images.iterrows()):
#     image_id = row['image_id']
#     image_path = f"{images_dir}/{image_id}.png"  # Assuming images have a .png extension

#     # Open and plot the image
#     ax = plt.subplot(gs[i])
#     img = Image.open(image_path)
#     ax.imshow(img)
#     ax.axis('off')
#     ax.set_title(f"Image {image_id}\nGrowth Pct: {row['roi_growth_pct']}%")

# plt.tight_layout()
# plt.show()