# Setup

In [1]:
import boto3
import pandas as pd
import io
import tqdm

# Compare CoralNET sources in pyspacer with new images

In [2]:
s3 = boto3.client('s3')
bucket_name = '2310-coralnet-public-sources'

response = s3.list_objects_v2(
    Bucket=bucket_name,
    Delimiter='/'
)

# Get common prefixes (folders)
if 'CommonPrefixes' in response:
    folders = [prefix['Prefix'] for prefix in response['CommonPrefixes']]
    print(f"{len(folders)} Folders found:")
else:
    print("No folders found in the bucket")

580 Folders found:


In [3]:
len(folders)

580

In [4]:
labelset = set()
data = []
for source in tqdm.tqdm(folders):
    if not source.startswith('s'):
        print(source)

    file_key = f'{source}annotations.csv'

    # Get the file from S3
    try:
        s3.head_object(Bucket=bucket_name, Key=file_key)
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            print(f"File {file_key} not found in bucket")
            continue
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
    row = [source, len(annotations_df), annotations_df['Image ID'].nunique(), annotations_df['Label ID'].nunique()]
    data.append(row)
    labelset = labelset | set(annotations_df['Label ID'].unique())

  0%|          | 2/580 [00:00<00:38, 15.12it/s]

coralnet-public-images/
File coralnet-public-images/annotations.csv not found in bucket


 37%|███▋      | 213/580 [00:18<00:28, 12.96it/s]

File s295/annotations.csv not found in bucket


 49%|████▉     | 286/580 [00:23<00:15, 19.20it/s]

File s3182/annotations.csv not found in bucket


 53%|█████▎    | 305/580 [00:24<00:17, 16.13it/s]

File s3342/annotations.csv not found in bucket


 54%|█████▍    | 314/580 [00:25<00:21, 12.40it/s]

File s3363/annotations.csv not found in bucket


 71%|███████▏  | 414/580 [00:36<00:12, 13.77it/s]

File s372/annotations.csv not found in bucket


100%|██████████| 580/580 [00:49<00:00, 11.73it/s]


In [5]:
print(f"The total number of unique labels is {len(labelset)}")

The total number of unique labels is 2245


In [6]:
coralnet_summary_df = pd.DataFrame(data, columns=['Source', 'Num Annotations', 'Num Images', 'Num Unique Labels'])
coralnet_summary_df

Unnamed: 0,Source,Num Annotations,Num Images,Num Unique Labels
0,s1073/,45000,225,27
1,s1076/,41800,209,28
2,s109/,197100,3942,78
3,s1097/,16860,1686,78
4,s1162/,25,5,4
...,...,...,...,...
569,s841/,58200,3880,75
570,s842/,30309,940,3
571,s843/,56400,2820,64
572,s921/,1860,62,27


In [7]:
print("There are a total of %d annotations across %d images (average %d annotations per image) and %d unique labels." %
      (coralnet_summary_df["Num Annotations"].sum(), coralnet_summary_df["Num Images"].sum(), 
       coralnet_summary_df["Num Annotations"].sum() / coralnet_summary_df["Num Images"].sum(), len(labelset)))

There are a total of 11066639 annotations across 421642 images (average 26 annotations per image) and 2245 unique labels.


In [8]:
s3 = boto3.client('s3')
bucket_name = 'dev-datamermaid-sm-sources'

response = s3.list_objects_v2(
    Bucket=bucket_name,
    Delimiter='/'
)
# Get common prefixes (folders)
if 'CommonPrefixes' in response:
    folders_new = [prefix['Prefix'] for prefix in response['CommonPrefixes']]
    folder = 'coralnet-public-images/'
    # List subfolders within coralnet-public-images
    paginator = s3.get_paginator('list_objects_v2')
    sub_response = {'CommonPrefixes': []}
    for page in paginator.paginate(Bucket=bucket_name, Prefix=folder, Delimiter='/'):
        if 'CommonPrefixes' in page:
            sub_response['CommonPrefixes'].extend(page['CommonPrefixes'])
    if 'CommonPrefixes' in sub_response:
        print("Subfolders in coralnet-public-images/:")
        folders_new = [prefix['Prefix'] for prefix in sub_response['CommonPrefixes']]
        folders_new = [folder.replace("coralnet-public-images/", "") for folder in folders_new]
        # for subfolder in sub_response['CommonPrefixes']:
        #     print(f"- {subfolder['Prefix']}")
    else:
        print("No subfolders found in coralnet-public-images/")
else:
    print("No folders found in the bucket")

Subfolders in coralnet-public-images/:


In [9]:
labelset2 = set()
data2 = []
for source in tqdm.tqdm(folders_new):
    if not source.startswith('s'):
        print(source)

    file_key = f'coralnet-public-images/{source}annotations.csv'

    # Get the file from S3
    try:
        s3.head_object(Bucket=bucket_name, Key=file_key)
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            print(f"File {file_key} not found in bucket")
            continue
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
    row = [source, len(annotations_df), annotations_df['Name'].nunique(), annotations_df['Label ID'].nunique()]
    data2.append(row)
    labelset2 = labelset2 | set(annotations_df['Label ID'].unique())

  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  2%|▏         | 23/1468 [00:02<01:24, 17.17it/s]

File coralnet-public-images/s1288/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  2%|▏         | 34/1468 [00:03<01:56, 12.34it/s]

File coralnet-public-images/s1388/annotations.csv not found in bucket
File coralnet-public-images/s1514/annotations.csv not found in bucket


  3%|▎         | 45/1468 [00:04<01:56, 12.23it/s]

File coralnet-public-images/s1579/annotations.csv not found in bucket
File coralnet-public-images/s1580/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  7%|▋         | 107/1468 [00:12<02:22,  9.55it/s]

File coralnet-public-images/s2112/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 11%|█         | 163/1468 [00:18<02:30,  8.70it/s]

File coralnet-public-images/s2615/annotations.csv not found in bucket
File coralnet-public-images/s2616/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 13%|█▎        | 193/1468 [00:22<01:54, 11.10it/s]

File coralnet-public-images/s2795/annotations.csv not found in bucket


 16%|█▌        | 228/1468 [00:25<01:52, 10.97it/s]

File coralnet-public-images/s2897/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 16%|█▌        | 237/1468 [00:26<02:16,  9.02it/s]

File coralnet-public-images/s2947/annotations.csv not found in bucket
File coralnet-public-images/s295/annotations.csv not found in bucket


 16%|█▋        | 239/1468 [00:27<01:58, 10.36it/s]

File coralnet-public-images/s2959/annotations.csv not found in bucket


 17%|█▋        | 256/1468 [00:28<01:21, 14.79it/s]

File coralnet-public-images/s3015/annotations.csv not found in bucket


 18%|█▊        | 261/1468 [00:28<01:28, 13.68it/s]

File coralnet-public-images/s3058/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 22%|██▏       | 323/1468 [00:34<01:46, 10.72it/s]

File coralnet-public-images/s3294/annotations.csv not found in bucket


 23%|██▎       | 332/1468 [00:34<01:45, 10.72it/s]

File coralnet-public-images/s3342/annotations.csv not found in bucket


 23%|██▎       | 339/1468 [00:35<01:49, 10.30it/s]

File coralnet-public-images/s3354/annotations.csv not found in bucket
File coralnet-public-images/s3361/annotations.csv not found in bucket


 23%|██▎       | 342/1468 [00:35<01:25, 13.12it/s]

File coralnet-public-images/s3363/annotations.csv not found in bucket


 24%|██▎       | 346/1468 [00:36<01:34, 11.83it/s]

File coralnet-public-images/s3371/annotations.csv not found in bucket


 25%|██▍       | 360/1468 [00:38<02:48,  6.58it/s]

File coralnet-public-images/s3411/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 26%|██▌       | 384/1468 [00:44<05:07,  3.52it/s]

File coralnet-public-images/s3465/annotations.csv not found in bucket
File coralnet-public-images/s3466/annotations.csv not found in bucket


 27%|██▋       | 394/1468 [00:45<02:23,  7.49it/s]

File coralnet-public-images/s3479/annotations.csv not found in bucket


 28%|██▊       | 404/1468 [00:46<01:20, 13.19it/s]

File coralnet-public-images/s3496/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 29%|██▉       | 423/1468 [00:49<01:37, 10.76it/s]

File coralnet-public-images/s3545/annotations.csv not found in bucket
File coralnet-public-images/s3551/annotations.csv not found in bucket


 29%|██▉       | 429/1468 [00:50<02:05,  8.28it/s]

File coralnet-public-images/s3559/annotations.csv not found in bucket
File coralnet-public-images/s3567/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 30%|██▉       | 436/1468 [00:52<03:11,  5.40it/s]

File coralnet-public-images/s3581/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 31%|███       | 457/1468 [00:55<02:19,  7.27it/s]

File coralnet-public-images/s372/annotations.csv not found in bucket


 31%|███▏      | 459/1468 [00:55<02:54,  5.79it/s]

File coralnet-public-images/s373/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 38%|███▊      | 562/1468 [01:06<01:08, 13.20it/s]

File coralnet-public-images/s4148/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 44%|████▍     | 651/1468 [01:17<00:46, 17.60it/s]

File coralnet-public-images/s4559/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 50%|████▉     | 730/1468 [01:26<01:12, 10.21it/s]

File coralnet-public-images/s4957/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 51%|█████▏    | 753/1468 [01:28<00:40, 17.77it/s]

File coralnet-public-images/s5013/annotations.csv not found in bucket


 62%|██████▏   | 915/1468 [01:42<00:32, 16.88it/s]

File coralnet-public-images/s554/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 96%|█████████▌| 1406/1468 [02:37<00:04, 13.39it/s]

File coralnet-public-images/s7176/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
100%|██████████| 1468/1468 [02:45<00:00,  8.88it/s]


In [10]:
print(f"The total number of unique labels is {len(labelset2)}")

The total number of unique labels is 3312


In [11]:
coralnet_summary_upd_df = pd.DataFrame(data2, columns=['Source', 'Num Annotations', 'Num Images', 'Num Unique Labels'])
coralnet_summary_upd_df

Unnamed: 0,Source,Num Annotations,Num Images,Num Unique Labels
0,s1073/,45000,225,27
1,s1076/,41800,209,28
2,s109/,198500,3970,78
3,s1162/,25,5,4
4,s1184/,50,10,7
...,...,...,...,...
1424,s841/,58200,3880,75
1425,s842/,93739,2592,3
1426,s843/,56400,2820,64
1427,s921/,3150,105,27


In [12]:
print("There are a total of %d annotations across %d images (average %d annotations per image) and %d unique labels." %
      (coralnet_summary_upd_df["Num Annotations"].sum(), coralnet_summary_upd_df["Num Images"].sum(), 
       coralnet_summary_upd_df["Num Annotations"].sum() / coralnet_summary_upd_df["Num Images"].sum(), len(labelset2)))

There are a total of 21397835 annotations across 515658 images (average 41 annotations per image) and 3312 unique labels.


In [13]:
coralnet_comparison_df = pd.merge(coralnet_summary_df, coralnet_summary_upd_df, 
                    on='Source', 
                    how='outer',
                    suffixes=('_orig', '_upd'))
coralnet_comparison_df = coralnet_comparison_df.sort_values('Source')
coralnet_comparison_df

Unnamed: 0,Source,Num Annotations_orig,Num Images_orig,Num Unique Labels_orig,Num Annotations_upd,Num Images_upd,Num Unique Labels_upd
0,s1073/,45000.0,225.0,27.0,45000.0,225.0,27.0
1,s1076/,41800.0,209.0,28.0,41800.0,209.0,28.0
2,s109/,197100.0,3942.0,78.0,198500.0,3970.0,78.0
3,s1097/,16860.0,1686.0,78.0,,,
4,s1162/,25.0,5.0,4.0,25.0,5.0,4.0
...,...,...,...,...,...,...,...
1474,s841/,58200.0,3880.0,75.0,58200.0,3880.0,75.0
1475,s842/,30309.0,940.0,3.0,93739.0,2592.0,3.0
1476,s843/,56400.0,2820.0,64.0,56400.0,2820.0,64.0
1477,s921/,1860.0,62.0,27.0,3150.0,105.0,27.0


In [14]:
coralnet_comparison_df.isna().sum(axis=0)

Source                      0
Num Annotations_orig      905
Num Images_orig           905
Num Unique Labels_orig    905
Num Annotations_upd        50
Num Images_upd             50
Num Unique Labels_upd      50
dtype: int64

In [15]:
coralnet_comparison_df[coralnet_comparison_df["Num Annotations_orig"].isna()]

Unnamed: 0,Source,Num Annotations_orig,Num Images_orig,Num Unique Labels_orig,Num Annotations_upd,Num Images_upd,Num Unique Labels_upd
6,s1189/,,,,0.0,0.0,0.0
21,s1294/,,,,0.0,0.0,0.0
27,s1356/,,,,0.0,0.0,0.0
38,s1545/,,,,0.0,0.0,0.0
42,s1577/,,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...
1467,s7494/,,,,30.0,1.0,4.0
1468,s7513/,,,,2400.0,96.0,18.0
1469,s7519/,,,,1575.0,63.0,6.0
1470,s7525/,,,,60.0,2.0,4.0


# Check unmapped labels

In [16]:
import requests
def initialize_coralnet_mapping(
    mapping_endpoint="https://api.datamermaid.org/v1/classification/labelmappings/?provider=CoralNet",
):
    response = requests.get(mapping_endpoint)
    data = response.json()
    labelset = data["results"]

    while data["next"]:
        response = requests.get(data["next"])
        data = response.json()
        labelset.extend(data["results"])
    label_mapping = {
        label["provider_id"]: label["benthic_attribute_name"] for label in labelset
    }
    return label_mapping

labelmapping = initialize_coralnet_mapping()

In [17]:
s3 = boto3.client('s3')
bucket_name = 'dev-datamermaid-sm-sources'

labels = []

for source in tqdm.tqdm(folders_new):
    if not source.startswith('s'):
        print(source)

    file_key = f'coralnet-public-images/{source}annotations.csv'

    # Get the file from S3
    try:
        s3.head_object(Bucket=bucket_name, Key=file_key)
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            print(f"File {file_key} not found in bucket")
            continue
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
    labels = labels + list(annotations_df['Label ID'])

  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  1%|▏         | 22/1468 [00:02<01:33, 15.38it/s]

File coralnet-public-images/s1288/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  2%|▏         | 34/1468 [00:03<02:08, 11.17it/s]

File coralnet-public-images/s1388/annotations.csv not found in bucket
File coralnet-public-images/s1514/annotations.csv not found in bucket


  3%|▎         | 45/1468 [00:04<01:55, 12.33it/s]

File coralnet-public-images/s1579/annotations.csv not found in bucket
File coralnet-public-images/s1580/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  7%|▋         | 107/1468 [00:12<02:20,  9.67it/s]

File coralnet-public-images/s2112/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 11%|█         | 163/1468 [00:19<02:27,  8.84it/s]

File coralnet-public-images/s2615/annotations.csv not found in bucket
File coralnet-public-images/s2616/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 13%|█▎        | 192/1468 [00:23<02:00, 10.56it/s]

File coralnet-public-images/s2795/annotations.csv not found in bucket


 16%|█▌        | 229/1468 [00:28<02:17,  9.02it/s]

File coralnet-public-images/s2897/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 16%|█▌        | 235/1468 [00:29<02:48,  7.31it/s]

File coralnet-public-images/s2947/annotations.csv not found in bucket
File coralnet-public-images/s295/annotations.csv not found in bucket


 16%|█▋        | 239/1468 [00:29<02:14,  9.17it/s]

File coralnet-public-images/s2959/annotations.csv not found in bucket


 17%|█▋        | 255/1468 [00:31<01:43, 11.67it/s]

File coralnet-public-images/s3015/annotations.csv not found in bucket


 18%|█▊        | 261/1468 [00:31<01:49, 11.00it/s]

File coralnet-public-images/s3058/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 22%|██▏       | 325/1468 [00:38<02:05,  9.12it/s]

File coralnet-public-images/s3294/annotations.csv not found in bucket


 23%|██▎       | 332/1468 [00:39<02:21,  8.04it/s]

File coralnet-public-images/s3342/annotations.csv not found in bucket


 23%|██▎       | 339/1468 [00:40<02:07,  8.86it/s]

File coralnet-public-images/s3354/annotations.csv not found in bucket
File coralnet-public-images/s3361/annotations.csv not found in bucket


 23%|██▎       | 343/1468 [00:40<01:39, 11.27it/s]

File coralnet-public-images/s3363/annotations.csv not found in bucket


 24%|██▎       | 347/1468 [00:41<02:02,  9.17it/s]

File coralnet-public-images/s3371/annotations.csv not found in bucket


 25%|██▍       | 360/1468 [00:43<03:29,  5.29it/s]

File coralnet-public-images/s3411/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 26%|██▌       | 384/1468 [00:50<06:57,  2.59it/s]

File coralnet-public-images/s3465/annotations.csv not found in bucket
File coralnet-public-images/s3466/annotations.csv not found in bucket


 27%|██▋       | 393/1468 [00:52<02:57,  6.05it/s]

File coralnet-public-images/s3479/annotations.csv not found in bucket


 27%|██▋       | 403/1468 [00:53<02:33,  6.92it/s]

File coralnet-public-images/s3496/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 29%|██▊       | 421/1468 [00:58<02:33,  6.82it/s]

File coralnet-public-images/s3545/annotations.csv not found in bucket


 29%|██▉       | 423/1468 [00:58<02:30,  6.93it/s]

File coralnet-public-images/s3551/annotations.csv not found in bucket


 29%|██▉       | 428/1468 [00:59<02:52,  6.02it/s]

File coralnet-public-images/s3559/annotations.csv not found in bucket


 29%|██▉       | 431/1468 [01:00<02:15,  7.68it/s]

File coralnet-public-images/s3567/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 30%|██▉       | 436/1468 [01:01<04:40,  3.68it/s]

File coralnet-public-images/s3581/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 31%|███       | 457/1468 [01:06<04:28,  3.77it/s]

File coralnet-public-images/s372/annotations.csv not found in bucket


 31%|███▏      | 461/1468 [01:07<03:37,  4.63it/s]

File coralnet-public-images/s373/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 38%|███▊      | 560/1468 [01:29<02:21,  6.40it/s]

File coralnet-public-images/s4148/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 44%|████▍     | 648/1468 [01:50<02:40,  5.11it/s]

File coralnet-public-images/s4559/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 50%|████▉     | 730/1468 [02:11<02:53,  4.24it/s]

File coralnet-public-images/s4957/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 51%|█████     | 751/1468 [02:16<02:24,  4.96it/s]

File coralnet-public-images/s5013/annotations.csv not found in bucket


 62%|██████▏   | 913/1468 [02:55<02:02,  4.52it/s]

File coralnet-public-images/s554/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 96%|█████████▌| 1404/1468 [05:23<00:20,  3.10it/s]

File coralnet-public-images/s7176/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
100%|██████████| 1468/1468 [05:46<00:00,  4.23it/s]


In [18]:
s3 = boto3.client('s3')
bucket_name = 'dev-datamermaid-sm-sources'

for i, source in tqdm.tqdm(enumerate(folders_new)):
    if not source.startswith('s'):
        print(source)

    file_key = f'coralnet-public-images/{source}labelset.csv'

    # Get the file from S3
    try:
        s3.head_object(Bucket=bucket_name, Key=file_key)
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            print(f"File {file_key} not found in bucket")
            continue
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    if i==0:
        labelset_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
    else:
        labelset_df = pd.concat([labelset_df, pd.read_csv(io.BytesIO(obj['Body'].read()))], ignore_index=True)

1468it [01:17, 19.02it/s]


In [19]:
labelset_df

Unnamed: 0,Label ID,Name,Short Code
0,438,Montipora capitata,Moncap
1,2182,Montipora capitata algal overgrowth,Moncap AO
2,2185,Montipora capitata bleached,Moncap BL
3,2184,Montipora capitata tissue loss,Moncap TL
4,439,Montipora flabellata,Monfla
...,...,...,...
73693,3356,Water_H20_RR,WATE**R
73694,3190,algae others,AL
73695,3345,fleshy seawed,FS*
73696,3183,corals:Macroalga,MA**


In [20]:
coralnet_name_map = {k:v for k, v in zip(labelset_df['Label ID'], labelset_df['Name'])}

In [21]:
label_counts = pd.Series(labels).value_counts().reset_index()
label_counts.columns = ['CoralNet ID', 'Count']
label_counts = label_counts.sort_values("Count", ascending=False)
label_counts['CoralNet Name'] = label_counts['CoralNet ID'].map(lambda x: coralnet_name_map.get(x, None))
label_counts['Mermaid Name'] = label_counts['CoralNet ID'].map(lambda x: labelmapping.get(str(x), None))
label_counts

Unnamed: 0,CoralNet ID,Count,CoralNet Name,Mermaid Name
0,82,3356391,Turf algae,Turf algae
1,84,1745694,Sand,Sand
2,6911,1247926,Not coral,
3,101,951415,CCA (crustose coralline algae),Crustose coralline algae
4,2513,693971,Pavement,Other
...,...,...,...,...
3268,354,1,Molluscs: Chitons,Other invertebrates
3269,3352,1,Coral Juvenile,
3270,9056,1,Gelidium latifolium,
3271,1900,1,ARMS-CREP-Corallimorph,


In [22]:
total_annotations = label_counts['Count'].sum()
mapped_annotations = label_counts[label_counts['Mermaid Name'].notna()]["Count"].sum()
unmapped_annotations = label_counts[label_counts['Mermaid Name'].isna()]["Count"].sum()

print(f"Currently, there are {label_counts.shape[0]} labels with {total_annotations} unique coral reef annotations.")
print(f"Out of these, {label_counts['Mermaid Name'].notna().sum()} labels with {mapped_annotations} ({mapped_annotations/total_annotations:.2%}) annotations have a label mapping to Mermaid.")
print(f"The remaining, {label_counts['Mermaid Name'].isna().sum()} labels with {unmapped_annotations} ({unmapped_annotations/total_annotations:.2%}) annotations do not have a label mapping to Mermaid.")

Currently, there are 3312 labels with 21397835 unique coral reef annotations.
Out of these, 683 labels with 16466400 (76.95%) annotations have a label mapping to Mermaid.
The remaining, 2629 labels with 4931435 (23.05%) annotations do not have a label mapping to Mermaid.


In [23]:
label_counts[label_counts["Mermaid Name"].isna()]

Unnamed: 0,CoralNet ID,Count,CoralNet Name,Mermaid Name
2,6911,1247926,Not coral,
19,1871,178294,ARMS-CREP-CCA,
20,7462,171024,Blank_tile,
27,626,125453,PLATE,
31,5770,112701,Mesh,
...,...,...,...,...
3267,7260,1,Halophila,
3269,3352,1,Coral Juvenile,
3270,9056,1,Gelidium latifolium,
3271,1900,1,ARMS-CREP-Corallimorph,


In [24]:
label_counts[(label_counts["Mermaid Name"].isna())*(label_counts["Count"]>2000)]

Unnamed: 0,CoralNet ID,Count,CoralNet Name,Mermaid Name
2,6911,1247926,Not coral,
19,1871,178294,ARMS-CREP-CCA,
20,7462,171024,Blank_tile,
27,626,125453,PLATE,
31,5770,112701,Mesh,
...,...,...,...,...
566,1723,2064,Patella sp.,
569,7787,2033,Mytilus galloprovincialis,
571,7576,2015,Botrylloides niger,
572,7855,2013,Hormosiraceae banksii,
