# Setup

In [1]:
import boto3
import pandas as pd
import io
import tqdm

# Compare CoralNET sources in pyspacer with new images

In [2]:
s3 = boto3.client('s3')
bucket_name = '2310-coralnet-public-sources'

response = s3.list_objects_v2(
    Bucket=bucket_name,
    Delimiter='/'
)

# Get common prefixes (folders)
if 'CommonPrefixes' in response:
    folders = [prefix['Prefix'] for prefix in response['CommonPrefixes']]
    print(f"{len(folders)} Folders found:")
else:
    print("No folders found in the bucket")

580 Folders found:


In [3]:
len(folders)

580

In [4]:
labelset = set()
data = []
for source in tqdm.tqdm(folders):
    if not source.startswith('s'):
        print(source)

    file_key = f'{source}annotations.csv'

    # Get the file from S3
    try:
        s3.head_object(Bucket=bucket_name, Key=file_key)
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            print(f"File {file_key} not found in bucket")
            continue
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
    row = [source, len(annotations_df), annotations_df['Image ID'].nunique(), annotations_df['Label ID'].nunique()]
    data.append(row)
    labelset = labelset | set(annotations_df['Label ID'].unique())

  0%|          | 0/580 [00:00<?, ?it/s]

coralnet-public-images/
File coralnet-public-images/annotations.csv not found in bucket


 37%|███▋      | 212/580 [00:23<00:40,  9.17it/s]

File s295/annotations.csv not found in bucket


 49%|████▉     | 286/580 [00:30<00:19, 15.31it/s]

File s3182/annotations.csv not found in bucket


 52%|█████▏    | 304/580 [00:32<00:27, 10.17it/s]

File s3342/annotations.csv not found in bucket


 54%|█████▍    | 313/580 [00:33<00:32,  8.26it/s]

File s3363/annotations.csv not found in bucket


 72%|███████▏  | 416/580 [00:50<00:15, 10.50it/s]

File s372/annotations.csv not found in bucket


100%|██████████| 580/580 [01:06<00:00,  8.77it/s]


In [5]:
print(f"The total number of unique labels is {len(labelset)}")

The total number of unique labels is 2245


In [6]:
coralnet_summary_df = pd.DataFrame(data, columns=['Source', 'Num Annotations', 'Num Images', 'Num Unique Labels'])
coralnet_summary_df

Unnamed: 0,Source,Num Annotations,Num Images,Num Unique Labels
0,s1073/,45000,225,27
1,s1076/,41800,209,28
2,s109/,197100,3942,78
3,s1097/,16860,1686,78
4,s1162/,25,5,4
...,...,...,...,...
569,s841/,58200,3880,75
570,s842/,30309,940,3
571,s843/,56400,2820,64
572,s921/,1860,62,27


In [7]:
print("There are a total of %d annotations across %d images (average %d annotations per image) and %d unique labels." %
      (coralnet_summary_df["Num Annotations"].sum(), coralnet_summary_df["Num Images"].sum(), 
       coralnet_summary_df["Num Annotations"].sum() / coralnet_summary_df["Num Images"].sum(), len(labelset)))

There are a total of 11066639 annotations across 421642 images (average 26 annotations per image) and 2245 unique labels.


In [8]:
s3 = boto3.client('s3')
bucket_name = 'dev-datamermaid-sm-sources'

response = s3.list_objects_v2(
    Bucket=bucket_name,
    Delimiter='/'
)
# Get common prefixes (folders)
if 'CommonPrefixes' in response:
    folders_new = [prefix['Prefix'] for prefix in response['CommonPrefixes']]
    folder = 'coralnet-public-images/'
    # List subfolders within coralnet-public-images
    paginator = s3.get_paginator('list_objects_v2')
    sub_response = {'CommonPrefixes': []}
    for page in paginator.paginate(Bucket=bucket_name, Prefix=folder, Delimiter='/'):
        if 'CommonPrefixes' in page:
            sub_response['CommonPrefixes'].extend(page['CommonPrefixes'])
    if 'CommonPrefixes' in sub_response:
        print("Subfolders in coralnet-public-images/:")
        folders_new = [prefix['Prefix'] for prefix in sub_response['CommonPrefixes']]
        folders_new = [folder.replace("coralnet-public-images/", "") for folder in folders_new]
        # for subfolder in sub_response['CommonPrefixes']:
        #     print(f"- {subfolder['Prefix']}")
    else:
        print("No subfolders found in coralnet-public-images/")
else:
    print("No folders found in the bucket")

Subfolders in coralnet-public-images/:


In [9]:
labelset2 = set()
data2 = []
for source in tqdm.tqdm(folders_new):
    if not source.startswith('s'):
        print(source)

    file_key = f'coralnet-public-images/{source}annotations.csv'

    # Get the file from S3
    try:
        s3.head_object(Bucket=bucket_name, Key=file_key)
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            print(f"File {file_key} not found in bucket")
            continue
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
    row = [source, len(annotations_df), annotations_df['Name'].nunique(), annotations_df['Label ID'].nunique()]
    data2.append(row)
    labelset2 = labelset2 | set(annotations_df['Label ID'].unique())

  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  2%|▏         | 35/1506 [00:07<04:35,  5.33it/s]

File coralnet-public-images/s1514/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].r

File coralnet-public-images/s3342/annotations.csv not found in bucket


 22%|██▏       | 337/1506 [01:29<02:30,  7.78it/s]

File coralnet-public-images/s3354/annotations.csv not found in bucket
File coralnet-public-images/s3361/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].r

File coralnet-public-images/s5013/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 97%|█████████▋| 1468/1506 [04:36<00:01, 30.37it/s]

File coralnet-public-images/s7538/annotations.csv not found in bucket
File coralnet-public-images/s7539/annotations.csv not found in bucket
File coralnet-public-images/s7540/annotations.csv not found in bucket
File coralnet-public-images/s7541/annotations.csv not found in bucket
File coralnet-public-images/s7543/annotations.csv not found in bucket
File coralnet-public-images/s7544/annotations.csv not found in bucket
File coralnet-public-images/s7545/annotations.csv not found in bucket
File coralnet-public-images/s7546/annotations.csv not found in bucket
File coralnet-public-images/s7547/annotations.csv not found in bucket
File coralnet-public-images/s7548/annotations.csv not found in bucket
File coralnet-public-images/s7549/annotations.csv not found in bucket
File coralnet-public-images/s7550/annotations.csv not found in bucket
File coralnet-public-images/s7551/annotations.csv not found in bucket
File coralnet-public-images/s7552/annotations.csv not found in bucket
File coralnet-public

 98%|█████████▊| 1483/1506 [04:36<00:00, 48.75it/s]

File coralnet-public-images/s7555/annotations.csv not found in bucket
File coralnet-public-images/s7556/annotations.csv not found in bucket
File coralnet-public-images/s7559/annotations.csv not found in bucket
File coralnet-public-images/s7560/annotations.csv not found in bucket
File coralnet-public-images/s7561/annotations.csv not found in bucket
File coralnet-public-images/s7562/annotations.csv not found in bucket
File coralnet-public-images/s7563/annotations.csv not found in bucket
File coralnet-public-images/s7564/annotations.csv not found in bucket
File coralnet-public-images/s7566/annotations.csv not found in bucket
File coralnet-public-images/s7567/annotations.csv not found in bucket
File coralnet-public-images/s7568/annotations.csv not found in bucket
File coralnet-public-images/s7569/annotations.csv not found in bucket
File coralnet-public-images/s7570/annotations.csv not found in bucket
File coralnet-public-images/s7571/annotations.csv not found in bucket


 99%|█████████▉| 1491/1506 [04:37<00:00, 55.28it/s]

File coralnet-public-images/s7572/annotations.csv not found in bucket
File coralnet-public-images/s7573/annotations.csv not found in bucket
File coralnet-public-images/s7580/annotations.csv not found in bucket
File coralnet-public-images/s7584/annotations.csv not found in bucket
File coralnet-public-images/s7594/annotations.csv not found in bucket
File coralnet-public-images/s7613/annotations.csv not found in bucket
File coralnet-public-images/s7614/annotations.csv not found in bucket
File coralnet-public-images/s7629/annotations.csv not found in bucket
File coralnet-public-images/s7638/annotations.csv not found in bucket
File coralnet-public-images/s7653/annotations.csv not found in bucket
File coralnet-public-images/s7663/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
100%|██████████| 1506/1506 [04:40<00:00,  5.38it/s]


In [10]:
print(f"The total number of unique labels is {len(labelset2)}")

The total number of unique labels is 3378


In [11]:
coralnet_summary_upd_df = pd.DataFrame(data2, columns=['Source', 'Num Annotations', 'Num Images', 'Num Unique Labels'])
coralnet_summary_upd_df

Unnamed: 0,Source,Num Annotations,Num Images,Num Unique Labels
0,s1076/,41800,209,28
1,s109/,198500,3970,78
2,s1162/,25,5,4
3,s1184/,50,10,7
4,s1189/,0,0,0
...,...,...,...,...
1455,s841/,58200,3880,75
1456,s842/,93739,2592,3
1457,s843/,56400,2820,64
1458,s921/,3150,105,27


In [12]:
print("There are a total of %d annotations across %d images (average %d annotations per image) and %d unique labels." %
      (coralnet_summary_upd_df["Num Annotations"].sum(), coralnet_summary_upd_df["Num Images"].sum(), 
       coralnet_summary_upd_df["Num Annotations"].sum() / coralnet_summary_upd_df["Num Images"].sum(), len(labelset2)))

There are a total of 35154107 annotations across 1144868 images (average 30 annotations per image) and 3378 unique labels.


In [13]:
coralnet_comparison_df = pd.merge(coralnet_summary_df, coralnet_summary_upd_df, 
                    on='Source', 
                    how='outer',
                    suffixes=('_orig', '_upd'))
coralnet_comparison_df = coralnet_comparison_df.sort_values('Source')
coralnet_comparison_df

Unnamed: 0,Source,Num Annotations_orig,Num Images_orig,Num Unique Labels_orig,Num Annotations_upd,Num Images_upd,Num Unique Labels_upd
0,s1073/,45000.0,225.0,27.0,,,
1,s1076/,41800.0,209.0,28.0,41800.0,209.0,28.0
2,s109/,197100.0,3942.0,78.0,198500.0,3970.0,78.0
3,s1097/,16860.0,1686.0,78.0,,,
4,s1162/,25.0,5.0,4.0,25.0,5.0,4.0
...,...,...,...,...,...,...,...
1481,s841/,58200.0,3880.0,75.0,58200.0,3880.0,75.0
1482,s842/,30309.0,940.0,3.0,93739.0,2592.0,3.0
1483,s843/,56400.0,2820.0,64.0,56400.0,2820.0,64.0
1484,s921/,1860.0,62.0,27.0,3150.0,105.0,27.0


In [14]:
coralnet_comparison_df.isna().sum(axis=0)

Source                      0
Num Annotations_orig      912
Num Images_orig           912
Num Unique Labels_orig    912
Num Annotations_upd        26
Num Images_upd             26
Num Unique Labels_upd      26
dtype: int64

In [15]:
coralnet_comparison_df[coralnet_comparison_df["Num Annotations_orig"].isna()]

Unnamed: 0,Source,Num Annotations_orig,Num Images_orig,Num Unique Labels_orig,Num Annotations_upd,Num Images_upd,Num Unique Labels_upd
6,s1189/,,,,0.0,0.0,0.0
21,s1294/,,,,0.0,0.0,0.0
27,s1356/,,,,0.0,0.0,0.0
38,s1545/,,,,0.0,0.0,0.0
42,s1577/,,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...
1474,s7494/,,,,30.0,1.0,4.0
1475,s7513/,,,,2400.0,96.0,18.0
1476,s7519/,,,,1575.0,63.0,6.0
1477,s7525/,,,,60.0,2.0,4.0


# Check unmapped labels

In [16]:
import requests
def initialize_coralnet_mapping(
    mapping_endpoint="https://api.datamermaid.org/v1/classification/labelmappings/?provider=CoralNet",
):
    response = requests.get(mapping_endpoint)
    data = response.json()
    labelset = data["results"]

    while data["next"]:
        response = requests.get(data["next"])
        data = response.json()
        labelset.extend(data["results"])
    label_mapping = {
        label["provider_id"]: label["benthic_attribute_name"] for label in labelset
    }
    return label_mapping

labelmapping = initialize_coralnet_mapping()

In [17]:
s3 = boto3.client('s3')
bucket_name = 'dev-datamermaid-sm-sources'

labels = []

for source in tqdm.tqdm(folders_new):
    if not source.startswith('s'):
        print(source)

    file_key = f'coralnet-public-images/{source}annotations.csv'

    # Get the file from S3
    try:
        s3.head_object(Bucket=bucket_name, Key=file_key)
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            print(f"File {file_key} not found in bucket")
            continue
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
    labels = labels + list(annotations_df['Label ID'])

  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  2%|▏         | 36/1506 [00:06<04:07,  5.95it/s]

File coralnet-public-images/s1514/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].r

File coralnet-public-images/s3342/annotations.csv not found in bucket


 22%|██▏       | 335/1506 [01:38<05:06,  3.83it/s]

File coralnet-public-images/s3354/annotations.csv not found in bucket


 23%|██▎       | 339/1506 [01:39<03:27,  5.62it/s]

File coralnet-public-images/s3361/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].r

File coralnet-public-images/s5013/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 98%|█████████▊| 1475/1506 [10:22<00:01, 17.38it/s]

File coralnet-public-images/s7538/annotations.csv not found in bucket
File coralnet-public-images/s7539/annotations.csv not found in bucket
File coralnet-public-images/s7540/annotations.csv not found in bucket
File coralnet-public-images/s7541/annotations.csv not found in bucket
File coralnet-public-images/s7543/annotations.csv not found in bucket
File coralnet-public-images/s7544/annotations.csv not found in bucket
File coralnet-public-images/s7545/annotations.csv not found in bucket
File coralnet-public-images/s7546/annotations.csv not found in bucket
File coralnet-public-images/s7547/annotations.csv not found in bucket
File coralnet-public-images/s7548/annotations.csv not found in bucket
File coralnet-public-images/s7549/annotations.csv not found in bucket
File coralnet-public-images/s7550/annotations.csv not found in bucket
File coralnet-public-images/s7551/annotations.csv not found in bucket
File coralnet-public-images/s7552/annotations.csv not found in bucket
File coralnet-public

 99%|█████████▉| 1493/1506 [10:23<00:00, 35.77it/s]

File coralnet-public-images/s7559/annotations.csv not found in bucket
File coralnet-public-images/s7560/annotations.csv not found in bucket
File coralnet-public-images/s7561/annotations.csv not found in bucket
File coralnet-public-images/s7562/annotations.csv not found in bucket
File coralnet-public-images/s7563/annotations.csv not found in bucket
File coralnet-public-images/s7564/annotations.csv not found in bucket
File coralnet-public-images/s7566/annotations.csv not found in bucket
File coralnet-public-images/s7567/annotations.csv not found in bucket
File coralnet-public-images/s7568/annotations.csv not found in bucket
File coralnet-public-images/s7569/annotations.csv not found in bucket
File coralnet-public-images/s7570/annotations.csv not found in bucket
File coralnet-public-images/s7571/annotations.csv not found in bucket
File coralnet-public-images/s7572/annotations.csv not found in bucket
File coralnet-public-images/s7573/annotations.csv not found in bucket
File coralnet-public

  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
100%|██████████| 1506/1506 [10:29<00:00,  2.39it/s]


In [18]:
s3 = boto3.client('s3')
bucket_name = 'dev-datamermaid-sm-sources'

for i, source in tqdm.tqdm(enumerate(folders_new)):
    if not source.startswith('s'):
        print(source)

    file_key = f'coralnet-public-images/{source}labelset.csv'

    # Get the file from S3
    try:
        s3.head_object(Bucket=bucket_name, Key=file_key)
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            print(f"File {file_key} not found in bucket")
            continue
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    if i==0:
        labelset_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
    else:
        labelset_df = pd.concat([labelset_df, pd.read_csv(io.BytesIO(obj['Body'].read()))], ignore_index=True)

1471it [01:44, 35.85it/s]

File coralnet-public-images/s7538/labelset.csv not found in bucket
File coralnet-public-images/s7539/labelset.csv not found in bucket
File coralnet-public-images/s7540/labelset.csv not found in bucket
File coralnet-public-images/s7541/labelset.csv not found in bucket
File coralnet-public-images/s7543/labelset.csv not found in bucket
File coralnet-public-images/s7544/labelset.csv not found in bucket
File coralnet-public-images/s7545/labelset.csv not found in bucket
File coralnet-public-images/s7546/labelset.csv not found in bucket
File coralnet-public-images/s7547/labelset.csv not found in bucket
File coralnet-public-images/s7548/labelset.csv not found in bucket
File coralnet-public-images/s7549/labelset.csv not found in bucket
File coralnet-public-images/s7550/labelset.csv not found in bucket
File coralnet-public-images/s7551/labelset.csv not found in bucket
File coralnet-public-images/s7552/labelset.csv not found in bucket
File coralnet-public-images/s7553/labelset.csv not found in bu

1488it [01:44, 55.36it/s]

File coralnet-public-images/s7556/labelset.csv not found in bucket
File coralnet-public-images/s7559/labelset.csv not found in bucket
File coralnet-public-images/s7560/labelset.csv not found in bucket
File coralnet-public-images/s7561/labelset.csv not found in bucket
File coralnet-public-images/s7562/labelset.csv not found in bucket
File coralnet-public-images/s7563/labelset.csv not found in bucket
File coralnet-public-images/s7564/labelset.csv not found in bucket
File coralnet-public-images/s7566/labelset.csv not found in bucket
File coralnet-public-images/s7567/labelset.csv not found in bucket
File coralnet-public-images/s7568/labelset.csv not found in bucket
File coralnet-public-images/s7569/labelset.csv not found in bucket
File coralnet-public-images/s7570/labelset.csv not found in bucket
File coralnet-public-images/s7571/labelset.csv not found in bucket
File coralnet-public-images/s7572/labelset.csv not found in bucket
File coralnet-public-images/s7573/labelset.csv not found in bu

1497it [01:44, 62.50it/s]

File coralnet-public-images/s7584/labelset.csv not found in bucket
File coralnet-public-images/s7594/labelset.csv not found in bucket
File coralnet-public-images/s7613/labelset.csv not found in bucket
File coralnet-public-images/s7614/labelset.csv not found in bucket
File coralnet-public-images/s7629/labelset.csv not found in bucket
File coralnet-public-images/s7638/labelset.csv not found in bucket
File coralnet-public-images/s7653/labelset.csv not found in bucket
File coralnet-public-images/s7663/labelset.csv not found in bucket


1506it [01:45, 14.32it/s]


In [19]:
labelset_df

Unnamed: 0,Label ID,Name,Short Code
0,438,Montipora capitata,Moncap
1,2182,Montipora capitata algal overgrowth,Moncap AO
2,2185,Montipora capitata bleached,Moncap BL
3,2184,Montipora capitata tissue loss,Moncap TL
4,439,Montipora flabellata,Monfla
...,...,...,...
73454,3356,Water_H20_RR,WATE**R
73455,3190,algae others,AL
73456,3345,fleshy seawed,FS*
73457,3183,corals:Macroalga,MA**


In [20]:
coralnet_name_map = {k:v for k, v in zip(labelset_df['Label ID'], labelset_df['Name'])}

In [21]:
label_counts = pd.Series(labels).value_counts().reset_index()
label_counts.columns = ['CoralNet ID', 'Count']
label_counts = label_counts.sort_values("Count", ascending=False)
label_counts['CoralNet Name'] = label_counts['CoralNet ID'].map(lambda x: coralnet_name_map.get(x, None))
label_counts['Mermaid Name'] = label_counts['CoralNet ID'].map(lambda x: labelmapping.get(str(x), None))
label_counts

Unnamed: 0,CoralNet ID,Count,CoralNet Name,Mermaid Name
0,82,5513010,Turf algae,Turf algae
1,84,3102147,Sand,Sand
2,1348,1741948,CRED-Turf growing on hard substrate,Turf algae
3,101,1438919,CCA (crustose coralline algae),Crustose coralline algae
4,6911,1247926,Not coral,
...,...,...,...,...
3221,1476,1,Buccinum undatum,
3222,1845,1,Polychaete_branching,
3223,7229,1,Caulastraea,
3224,4066,1,Fish.,


In [22]:
total_annotations = label_counts['Count'].sum()
mapped_annotations = label_counts[label_counts['Mermaid Name'].notna()]["Count"].sum()
unmapped_annotations = label_counts[label_counts['Mermaid Name'].isna()]["Count"].sum()

print(f"Currently, there are {label_counts.shape[0]} labels with {total_annotations} unique coral reef annotations.")
print(f"Out of these, {label_counts['Mermaid Name'].notna().sum()} labels with {mapped_annotations} ({mapped_annotations/total_annotations:.2%}) annotations have a label mapping to Mermaid.")
print(f"The remaining, {label_counts['Mermaid Name'].isna().sum()} labels with {unmapped_annotations} ({unmapped_annotations/total_annotations:.2%}) annotations do not have a label mapping to Mermaid.")

Currently, there are 3378 labels with 35154107 unique coral reef annotations.
Out of these, 702 labels with 29843625 (84.89%) annotations have a label mapping to Mermaid.
The remaining, 2676 labels with 5310482 (15.11%) annotations do not have a label mapping to Mermaid.


In [23]:
label_counts[label_counts["Mermaid Name"].isna()]

Unnamed: 0,CoralNet ID,Count,CoralNet Name,Mermaid Name
4,6911,1247926,Not coral,
26,2784,260039,Turfy Biotic Matrix,
34,1871,178294,ARMS-CREP-CCA,
35,7462,171024,Blank_tile,
45,626,125453,PLATE,
...,...,...,...,...
3221,1476,1,Buccinum undatum,
3222,1845,1,Polychaete_branching,
3223,7229,1,Caulastraea,
3224,4066,1,Fish.,


In [24]:
label_counts[(label_counts["Mermaid Name"].isna())*(label_counts["Count"]>2000)]

Unnamed: 0,CoralNet ID,Count,CoralNet Name,Mermaid Name
4,6911,1247926,Not coral,
26,2784,260039,Turfy Biotic Matrix,
34,1871,178294,ARMS-CREP-CCA,
35,7462,171024,Blank_tile,
45,626,125453,PLATE,
...,...,...,...,...
638,1723,2064,Patella sp.,
641,7787,2033,Mytilus galloprovincialis,
643,7576,2015,Botrylloides niger,
644,7855,2013,Hormosiraceae banksii,
