# Setup

In [1]:
import boto3
import pandas as pd
import io
import tqdm

# Compare CoralNet sources in pyspacer with new images

In [2]:
s3 = boto3.client('s3')
bucket_name = '2310-coralnet-public-sources'

# A single list_objects_v2 call returns at most one page of results, so walk
# every page with a paginator instead of trusting the first response only.
paginator = s3.get_paginator('list_objects_v2')

# Collect the common prefixes ("folders") at the bucket root. `folders` is
# always defined, even when the bucket has no prefixes, so later cells do not
# fail with a NameError.
folders = [
    prefix['Prefix']
    for page in paginator.paginate(Bucket=bucket_name, Delimiter='/')
    for prefix in page.get('CommonPrefixes', [])
]

if folders:
    print(f"{len(folders)} Folders found:")
else:
    print("No folders found in the bucket")

580 Folders found:


In [3]:
# Sanity check: number of top-level prefixes collected from the bucket listing.
len(folders)

580

In [4]:
# Per-source statistics for the original bucket:
#   labelset - union of every 'Label ID' value seen across all sources
#   data     - one row per source: [source, #annotations, #images, #labels]
labelset = set()
data = []
for source in tqdm.tqdm(folders):
    # Print any prefix that does not start with 's' so it can be inspected.
    if not source.startswith('s'):
        print(source)

    file_key = f'{source}annotations.csv'

    # Fetch the file directly; a missing key is reported and skipped, while any
    # other S3 error (permissions, throttling, ...) is re-raised instead of
    # being silently ignored.
    try:
        obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] in ('NoSuchKey', '404'):
            print(f"File {file_key} not found in bucket")
            continue
        raise

    annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
    label_ids = annotations_df['Label ID']
    data.append([source, len(annotations_df), annotations_df['Image ID'].nunique(), label_ids.nunique()])
    # In-place update avoids rebuilding the whole set on every iteration.
    labelset.update(label_ids.unique())

  0%|          | 2/580 [00:00<00:42, 13.75it/s]

coralnet-public-images/
File coralnet-public-images/annotations.csv not found in bucket


 37%|███▋      | 213/580 [00:20<00:30, 11.94it/s]

File s295/annotations.csv not found in bucket


 49%|████▉     | 286/580 [00:25<00:19, 15.37it/s]

File s3182/annotations.csv not found in bucket


 53%|█████▎    | 305/580 [00:27<00:19, 14.23it/s]

File s3342/annotations.csv not found in bucket


 54%|█████▍    | 314/580 [00:28<00:23, 11.12it/s]

File s3363/annotations.csv not found in bucket


 72%|███████▏  | 416/580 [00:40<00:14, 11.47it/s]

File s372/annotations.csv not found in bucket


100%|██████████| 580/580 [00:53<00:00, 10.84it/s]


In [5]:
# Report how many distinct 'Label ID' values were accumulated in `labelset`.
print(f"The total number of unique labels is {len(labelset)}")

The total number of unique labels is 2245


In [7]:
# Turn the per-source rows collected in `data` into a summary table.
coralnet_summary_df = pd.DataFrame(
    data=data,
    columns=[
        'Source',
        'Num Annotations',
        'Num Images',
        'Num Unique Labels',
    ],
)
coralnet_summary_df

Unnamed: 0,Source,Num Annotations,Num Images,Num Unique Labels
0,s1073/,45000,225,27
1,s1076/,41800,209,28
2,s109/,197100,3942,78
3,s1097/,16860,1686,78
4,s1162/,25,5,4
...,...,...,...,...
569,s841/,58200,3880,75
570,s842/,30309,940,3
571,s843/,56400,2820,64
572,s921/,1860,62,27


In [8]:
# Overall totals for the original bucket, summed over the per-source rows.
orig_annotation_total = coralnet_summary_df["Num Annotations"].sum()
orig_image_total = coralnet_summary_df["Num Images"].sum()

# %d truncates the (float) average to a whole number.
print(
    "There are a total of %d annotations across %d images (average %d annotations per image) and %d unique labels."
    % (
        orig_annotation_total,
        orig_image_total,
        orig_annotation_total / orig_image_total,
        len(labelset),
    )
)

There are a total of 11066639 annotations across 421642 images (average 26 annotations per image) and 2245 unique labels.


In [9]:
s3 = boto3.client('s3')
bucket_name = 'dev-datamermaid-sm-sources'
folder = 'coralnet-public-images/'

# List the sub-prefixes directly under `folder`. A paginator is used because a
# single list_objects_v2 call only returns one page of results. The former
# bucket-root listing is dropped: its result was never used beyond an
# existence check.
paginator = s3.get_paginator('list_objects_v2')
subfolder_prefixes = [
    prefix['Prefix']
    for page in paginator.paginate(Bucket=bucket_name, Prefix=folder, Delimiter='/')
    for prefix in page.get('CommonPrefixes', [])
]

# Every returned prefix starts with `folder`; slice it off the front rather
# than using str.replace, which would also remove matches elsewhere in the key.
# `folders_new` is always defined (empty list when nothing is found).
folders_new = [prefix[len(folder):] for prefix in subfolder_prefixes]

if folders_new:
    print("Subfolders in coralnet-public-images/:")
    for prefix in subfolder_prefixes:
        print(f"- {prefix}")
else:
    print("No subfolders found in coralnet-public-images/")

Subfolders in coralnet-public-images/:
- coralnet-public-images/s1073/
- coralnet-public-images/s1076/
- coralnet-public-images/s109/
- coralnet-public-images/s1162/
- coralnet-public-images/s1184/
- coralnet-public-images/s1212/
- coralnet-public-images/s1264/
- coralnet-public-images/s1265/
- coralnet-public-images/s1266/
- coralnet-public-images/s1268/
- coralnet-public-images/s1269/
- coralnet-public-images/s1270/
- coralnet-public-images/s1271/
- coralnet-public-images/s1272/
- coralnet-public-images/s1273/
- coralnet-public-images/s1274/
- coralnet-public-images/s1276/
- coralnet-public-images/s1277/
- coralnet-public-images/s1288/
- coralnet-public-images/s1300/
- coralnet-public-images/s1301/
- coralnet-public-images/s1304/
- coralnet-public-images/s1308/
- coralnet-public-images/s1353/
- coralnet-public-images/s1357/
- coralnet-public-images/s1358/
- coralnet-public-images/s1360/
- coralnet-public-images/s1368/
- coralnet-public-images/s1388/
- coralnet-public-images/s1432/
- 

In [10]:
# Per-source statistics for the new bucket:
#   labelset2 - union of every 'Label ID' value seen across all sources
#   data2     - one row per source: [source, #annotations, #images, #labels]
labelset2 = set()
data2 = []
for source in tqdm.tqdm(folders_new):
    # Print any prefix that does not start with 's' so it can be inspected.
    if not source.startswith('s'):
        print(source)

    file_key = f'coralnet-public-images/{source}annotations.csv'

    # Fetch the file directly; a missing key is reported and skipped, while any
    # other S3 error is re-raised instead of being silently ignored.
    try:
        obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] in ('NoSuchKey', '404'):
            print(f"File {file_key} not found in bucket")
            continue
        raise

    # Only these two columns are used below; skipping the rest keeps parsing
    # cheap. NOTE(review): this probably also avoids the pandas warnings seen
    # in the saved output if they came from other columns - confirm on re-run.
    annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()), usecols=['Name', 'Label ID'])
    label_ids = annotations_df['Label ID']
    # Images are counted by distinct 'Name' values in this bucket's files.
    data2.append([source, len(annotations_df), annotations_df['Name'].nunique(), label_ids.nunique()])
    # In-place update avoids rebuilding the whole set on every iteration.
    labelset2.update(label_ids.unique())

  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  4%|▎         | 20/557 [00:02<00:32, 16.60it/s]

File coralnet-public-images/s1288/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  5%|▌         | 30/557 [00:03<00:51, 10.27it/s]

File coralnet-public-images/s1388/annotations.csv not found in bucket


  7%|▋         | 39/557 [00:04<00:51,  9.98it/s]

File coralnet-public-images/s1579/annotations.csv not found in bucket
File coralnet-public-images/s1580/annotations.csv not found in bucket


  8%|▊         | 43/557 [00:04<00:44, 11.60it/s]

File coralnet-public-images/s1645/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 17%|█▋        | 93/557 [00:11<00:46,  9.99it/s]

File coralnet-public-images/s2112/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 26%|██▌       | 145/557 [00:18<00:43,  9.48it/s]

File coralnet-public-images/s2615/annotations.csv not found in bucket
File coralnet-public-images/s2616/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 30%|███       | 168/557 [00:21<00:32, 11.90it/s]

File coralnet-public-images/s2795/annotations.csv not found in bucket


 35%|███▌      | 196/557 [00:24<00:39,  9.07it/s]

File coralnet-public-images/s2897/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 36%|███▌      | 200/557 [00:25<00:56,  6.34it/s]

File coralnet-public-images/s2947/annotations.csv not found in bucket
File coralnet-public-images/s295/annotations.csv not found in bucket


 37%|███▋      | 205/557 [00:25<00:37,  9.49it/s]

File coralnet-public-images/s2959/annotations.csv not found in bucket


 40%|███▉      | 221/557 [00:27<00:26, 12.91it/s]

File coralnet-public-images/s3015/annotations.csv not found in bucket


 40%|████      | 225/557 [00:27<00:24, 13.54it/s]

File coralnet-public-images/s3058/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 52%|█████▏    | 289/557 [00:33<00:28,  9.28it/s]

File coralnet-public-images/s3342/annotations.csv not found in bucket


 53%|█████▎    | 295/557 [00:33<00:26,  9.88it/s]

File coralnet-public-images/s3354/annotations.csv not found in bucket
File coralnet-public-images/s3361/annotations.csv not found in bucket


 54%|█████▎    | 299/557 [00:34<00:20, 12.51it/s]

File coralnet-public-images/s3363/annotations.csv not found in bucket


 54%|█████▍    | 303/557 [00:34<00:22, 11.50it/s]

File coralnet-public-images/s3371/annotations.csv not found in bucket


 56%|█████▌    | 313/557 [00:35<00:29,  8.19it/s]

File coralnet-public-images/s3401/annotations.csv not found in bucket


 56%|█████▋    | 314/557 [00:36<00:28,  8.52it/s]

File coralnet-public-images/s3411/annotations.csv not found in bucket


 57%|█████▋    | 317/557 [00:36<00:30,  7.95it/s]

File coralnet-public-images/s3413/annotations.csv not found in bucket
File coralnet-public-images/s3414/annotations.csv not found in bucket


 57%|█████▋    | 320/557 [00:36<00:28,  8.35it/s]

File coralnet-public-images/s3416/annotations.csv not found in bucket


 58%|█████▊    | 325/557 [00:37<00:40,  5.80it/s]

File coralnet-public-images/s3421/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 62%|██████▏   | 343/557 [00:39<00:10, 19.88it/s]

File coralnet-public-images/s3446/annotations.csv not found in bucket
File coralnet-public-images/s3460/annotations.csv not found in bucket
File coralnet-public-images/s3465/annotations.csv not found in bucket
File coralnet-public-images/s3466/annotations.csv not found in bucket
File coralnet-public-images/s3467/annotations.csv not found in bucket
File coralnet-public-images/s3478/annotations.csv not found in bucket
File coralnet-public-images/s3479/annotations.csv not found in bucket


 63%|██████▎   | 351/557 [00:39<00:13, 15.79it/s]

File coralnet-public-images/s3496/annotations.csv not found in bucket
File coralnet-public-images/s3497/annotations.csv not found in bucket


 64%|██████▍   | 357/557 [00:40<00:17, 11.76it/s]

File coralnet-public-images/s3499/annotations.csv not found in bucket
File coralnet-public-images/s3500/annotations.csv not found in bucket


 65%|██████▌   | 363/557 [00:41<00:20,  9.57it/s]

File coralnet-public-images/s3522/annotations.csv not found in bucket


 67%|██████▋   | 372/557 [00:41<00:15, 12.13it/s]

File coralnet-public-images/s3545/annotations.csv not found in bucket
File coralnet-public-images/s3551/annotations.csv not found in bucket
File coralnet-public-images/s3554/annotations.csv not found in bucket


 67%|██████▋   | 375/557 [00:42<00:14, 12.52it/s]

File coralnet-public-images/s3559/annotations.csv not found in bucket
File coralnet-public-images/s3567/annotations.csv not found in bucket


 68%|██████▊   | 381/557 [00:42<00:15, 11.56it/s]

File coralnet-public-images/s3577/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 72%|███████▏  | 399/557 [00:45<00:23,  6.83it/s]

File coralnet-public-images/s372/annotations.csv not found in bucket


 72%|███████▏  | 403/557 [00:45<00:21,  7.14it/s]

File coralnet-public-images/s373/annotations.csv not found in bucket


 86%|████████▌ | 480/557 [00:53<00:05, 12.98it/s]

File coralnet-public-images/s4009/annotations.csv not found in bucket


 89%|████████▉ | 495/557 [00:54<00:04, 14.22it/s]

File coralnet-public-images/s4148/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 97%|█████████▋| 540/557 [01:01<00:02,  5.98it/s]

File coralnet-public-images/s554/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 99%|█████████▉| 551/557 [01:03<00:01,  5.48it/s]

File coralnet-public-images/s800/annotations.csv not found in bucket


100%|██████████| 557/557 [01:04<00:00,  8.61it/s]


In [11]:
# Report how many distinct 'Label ID' values were accumulated in `labelset2`.
print(f"The total number of unique labels is {len(labelset2)}")

The total number of unique labels is 2098


In [12]:
# Turn the per-source rows collected in `data2` into a summary table.
coralnet_summary_upd_df = pd.DataFrame(
    data=data2,
    columns=[
        'Source',
        'Num Annotations',
        'Num Images',
        'Num Unique Labels',
    ],
)
coralnet_summary_upd_df

Unnamed: 0,Source,Num Annotations,Num Images,Num Unique Labels
0,s1073/,45000,225,27
1,s1076/,41800,209,28
2,s109/,198500,3970,78
3,s1162/,25,5,4
4,s1184/,50,10,7
...,...,...,...,...
502,s841/,58200,3880,75
503,s842/,93739,2592,3
504,s843/,56400,2820,64
505,s921/,3150,105,27


In [13]:
# Overall totals for the new bucket, summed over the per-source rows.
upd_annotation_total = coralnet_summary_upd_df["Num Annotations"].sum()
upd_image_total = coralnet_summary_upd_df["Num Images"].sum()

# %d truncates the (float) average to a whole number.
print(
    "There are a total of %d annotations across %d images (average %d annotations per image) and %d unique labels."
    % (
        upd_annotation_total,
        upd_image_total,
        upd_annotation_total / upd_image_total,
        len(labelset2),
    )
)

There are a total of 8088751 annotations across 183619 images (average 44 annotations per image) and 2098 unique labels.


In [14]:
# Outer-join both summaries on 'Source' so sources present in only one bucket
# are kept (their missing side shows up as NaN), then order rows by source.
coralnet_comparison_df = (
    coralnet_summary_df
    .merge(
        coralnet_summary_upd_df,
        on='Source',
        how='outer',
        suffixes=('_orig', '_upd'),
    )
    .sort_values('Source')
)
coralnet_comparison_df

Unnamed: 0,Source,Num Annotations_orig,Num Images_orig,Num Unique Labels_orig,Num Annotations_upd,Num Images_upd,Num Unique Labels_upd
0,s1073/,45000.0,225.0,27.0,45000.0,225.0,27.0
1,s1076/,41800.0,209.0,28.0,41800.0,209.0,28.0
2,s109/,197100.0,3942.0,78.0,198500.0,3970.0,78.0
3,s1097/,16860.0,1686.0,78.0,,,
4,s1162/,25.0,5.0,4.0,25.0,5.0,4.0
...,...,...,...,...,...,...,...
570,s841/,58200.0,3880.0,75.0,58200.0,3880.0,75.0
571,s842/,30309.0,940.0,3.0,93739.0,2592.0,3.0
572,s843/,56400.0,2820.0,64.0,56400.0,2820.0,64.0
573,s921/,1860.0,62.0,27.0,3150.0,105.0,27.0


In [15]:
# Missing values per column, i.e. how many sources lack data on each side.
coralnet_comparison_df.isna().sum()

Source                     0
Num Annotations_orig       1
Num Images_orig            1
Num Unique Labels_orig     1
Num Annotations_upd       68
Num Images_upd            68
Num Unique Labels_upd     68
dtype: int64

In [17]:
# Rows with no statistics on the original side of the comparison.
coralnet_comparison_df.loc[lambda df: df["Num Annotations_orig"].isna()]

Unnamed: 0,Source,Num Annotations_orig,Num Images_orig,Num Unique Labels_orig,Num Annotations_upd,Num Images_upd,Num Unique Labels_upd
555,s5027/,,,,520.0,40.0,19.0


# Check unmapped labels

In [16]:
import requests
def initialize_coralnet_mapping(
    mapping_endpoint="https://api.datamermaid.org/v1/classification/labelmappings/?provider=CoralNet",
    session=None,
    timeout=30,
):
    """Download the label mapping from a paginated JSON endpoint.

    Each page is expected to hold a ``"results"`` list of label records and,
    optionally, a ``"next"`` URL pointing at the following page. Pages are
    followed until no next URL is given.

    Args:
        mapping_endpoint: URL of the first page.
        session: Optional object exposing ``get(url, timeout=...)`` (for
            example a ``requests.Session``). Defaults to the ``requests``
            module itself.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        dict mapping each record's ``provider_id`` to its
        ``benthic_attribute_name``. If a ``provider_id`` occurs more than once,
        the last occurrence wins.

    Raises:
        Whatever ``raise_for_status()`` raises for an unsuccessful response
        (``requests.HTTPError`` with the default client).
    """
    http = session if session is not None else requests

    label_mapping = {}
    next_url = mapping_endpoint
    while next_url:
        response = http.get(next_url, timeout=timeout)
        # Fail loudly on HTTP errors instead of trying to parse an error body.
        response.raise_for_status()
        page = response.json()
        for label in page["results"]:
            label_mapping[label["provider_id"]] = label["benthic_attribute_name"]
        # A missing "next" key is treated the same as a null one: last page.
        next_url = page.get("next")
    return label_mapping

# Fetch the label mapping once from the default endpoint (performs HTTP requests).
labelmapping = initialize_coralnet_mapping()

In [17]:
s3 = boto3.client('s3')
bucket_name = 'dev-datamermaid-sm-sources'

# Flat list with the 'Label ID' of every annotation across all sources.
labels = []

for source in tqdm.tqdm(folders_new):
    # Print any prefix that does not start with 's' so it can be inspected.
    if not source.startswith('s'):
        print(source)

    file_key = f'coralnet-public-images/{source}annotations.csv'

    # Fetch the file directly; a missing key is reported and skipped, while any
    # other S3 error is re-raised instead of being silently ignored.
    try:
        obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] in ('NoSuchKey', '404'):
            print(f"File {file_key} not found in bucket")
            continue
        raise

    # Only the label column is needed here, so skip parsing the others.
    annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()), usecols=['Label ID'])
    # extend() appends in place; `labels = labels + [...]` copied the whole
    # (multi-million element) list on every iteration.
    labels.extend(annotations_df['Label ID'])

  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  4%|▍         | 21/557 [00:02<00:34, 15.64it/s]

File coralnet-public-images/s1288/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  5%|▌         | 30/557 [00:03<00:43, 12.03it/s]

File coralnet-public-images/s1388/annotations.csv not found in bucket


  7%|▋         | 39/557 [00:03<00:44, 11.70it/s]

File coralnet-public-images/s1579/annotations.csv not found in bucket
File coralnet-public-images/s1580/annotations.csv not found in bucket


  8%|▊         | 43/557 [00:04<00:40, 12.79it/s]

File coralnet-public-images/s1645/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 17%|█▋        | 93/557 [00:11<00:48,  9.55it/s]

File coralnet-public-images/s2112/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 26%|██▌       | 144/557 [00:18<00:47,  8.70it/s]

File coralnet-public-images/s2615/annotations.csv not found in bucket
File coralnet-public-images/s2616/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 30%|███       | 168/557 [00:21<00:33, 11.75it/s]

File coralnet-public-images/s2795/annotations.csv not found in bucket


 35%|███▌      | 196/557 [00:24<00:43,  8.37it/s]

File coralnet-public-images/s2897/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 36%|███▌      | 200/557 [00:25<01:09,  5.13it/s]

File coralnet-public-images/s2947/annotations.csv not found in bucket
File coralnet-public-images/s295/annotations.csv not found in bucket


 37%|███▋      | 205/557 [00:26<00:40,  8.67it/s]

File coralnet-public-images/s2959/annotations.csv not found in bucket


 39%|███▉      | 220/557 [00:27<00:28, 11.75it/s]

File coralnet-public-images/s3015/annotations.csv not found in bucket


 40%|████      | 225/557 [00:28<00:28, 11.50it/s]

File coralnet-public-images/s3058/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 52%|█████▏    | 289/557 [00:34<00:34,  7.75it/s]

File coralnet-public-images/s3342/annotations.csv not found in bucket


 53%|█████▎    | 295/557 [00:35<00:30,  8.46it/s]

File coralnet-public-images/s3354/annotations.csv not found in bucket
File coralnet-public-images/s3361/annotations.csv not found in bucket


 54%|█████▎    | 299/557 [00:35<00:22, 11.25it/s]

File coralnet-public-images/s3363/annotations.csv not found in bucket


 54%|█████▍    | 303/557 [00:36<00:26,  9.74it/s]

File coralnet-public-images/s3371/annotations.csv not found in bucket


 56%|█████▌    | 313/557 [00:38<00:34,  7.11it/s]

File coralnet-public-images/s3401/annotations.csv not found in bucket


 57%|█████▋    | 316/557 [00:38<00:24,  9.78it/s]

File coralnet-public-images/s3411/annotations.csv not found in bucket


 57%|█████▋    | 318/557 [00:38<00:33,  7.11it/s]

File coralnet-public-images/s3413/annotations.csv not found in bucket
File coralnet-public-images/s3414/annotations.csv not found in bucket


 57%|█████▋    | 320/557 [00:39<00:36,  6.55it/s]

File coralnet-public-images/s3416/annotations.csv not found in bucket


 58%|█████▊    | 325/557 [00:40<00:54,  4.27it/s]

File coralnet-public-images/s3421/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 62%|██████▏   | 343/557 [00:41<00:11, 17.89it/s]

File coralnet-public-images/s3446/annotations.csv not found in bucket
File coralnet-public-images/s3460/annotations.csv not found in bucket
File coralnet-public-images/s3465/annotations.csv not found in bucket
File coralnet-public-images/s3466/annotations.csv not found in bucket
File coralnet-public-images/s3467/annotations.csv not found in bucket
File coralnet-public-images/s3478/annotations.csv not found in bucket
File coralnet-public-images/s3479/annotations.csv not found in bucket


 63%|██████▎   | 353/557 [00:42<00:16, 12.49it/s]

File coralnet-public-images/s3496/annotations.csv not found in bucket
File coralnet-public-images/s3497/annotations.csv not found in bucket


 64%|██████▍   | 357/557 [00:43<00:22,  8.83it/s]

File coralnet-public-images/s3499/annotations.csv not found in bucket


 64%|██████▍   | 359/557 [00:43<00:21,  9.09it/s]

File coralnet-public-images/s3500/annotations.csv not found in bucket


 65%|██████▌   | 364/557 [00:44<00:23,  8.15it/s]

File coralnet-public-images/s3522/annotations.csv not found in bucket


 66%|██████▋   | 370/557 [00:45<00:21,  8.68it/s]

File coralnet-public-images/s3545/annotations.csv not found in bucket


 67%|██████▋   | 372/557 [00:45<00:21,  8.75it/s]

File coralnet-public-images/s3551/annotations.csv not found in bucket
File coralnet-public-images/s3554/annotations.csv not found in bucket


 68%|██████▊   | 377/557 [00:45<00:17, 10.37it/s]

File coralnet-public-images/s3559/annotations.csv not found in bucket
File coralnet-public-images/s3567/annotations.csv not found in bucket


 69%|██████▊   | 382/557 [00:46<00:19,  8.97it/s]

File coralnet-public-images/s3577/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 72%|███████▏  | 399/557 [00:49<00:34,  4.63it/s]

File coralnet-public-images/s372/annotations.csv not found in bucket


 72%|███████▏  | 403/557 [00:50<00:29,  5.29it/s]

File coralnet-public-images/s373/annotations.csv not found in bucket


 86%|████████▌ | 478/557 [01:01<00:10,  7.68it/s]

File coralnet-public-images/s4009/annotations.csv not found in bucket


 89%|████████▊ | 494/557 [01:03<00:06, 10.02it/s]

File coralnet-public-images/s4148/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 97%|█████████▋| 540/557 [01:14<00:04,  3.43it/s]

File coralnet-public-images/s554/annotations.csv not found in bucket


  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
  annotations_df = pd.read_csv(io.BytesIO(obj['Body'].read()))
 99%|█████████▉| 551/557 [01:17<00:01,  3.59it/s]

File coralnet-public-images/s800/annotations.csv not found in bucket


100%|██████████| 557/557 [01:19<00:00,  7.05it/s]


In [18]:
s3 = boto3.client('s3')
bucket_name = 'dev-datamermaid-sm-sources'

# One DataFrame per source that has a labelset.csv; concatenated once at the end.
labelset_frames = []

# Iterate the list directly (not enumerate) so tqdm knows the total.
for source in tqdm.tqdm(folders_new):
    # Print any prefix that does not start with 's' so it can be inspected.
    if not source.startswith('s'):
        print(source)

    file_key = f'coralnet-public-images/{source}labelset.csv'

    # Fetch the file directly; a missing key is reported and skipped, while any
    # other S3 error is re-raised instead of being silently ignored.
    try:
        obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    except s3.exceptions.ClientError as e:
        if e.response['Error']['Code'] in ('NoSuchKey', '404'):
            print(f"File {file_key} not found in bucket")
            continue
        raise

    labelset_frames.append(pd.read_csv(io.BytesIO(obj['Body'].read())))

# Build the combined frame in a single concat. This no longer depends on the
# first source having a labelset.csv (the old `i == 0` initialisation did).
if labelset_frames:
    labelset_df = pd.concat(labelset_frames, ignore_index=True)
else:
    # NOTE(review): column names taken from the labelset files seen so far - confirm.
    labelset_df = pd.DataFrame(columns=['Label ID', 'Name', 'Short Code'])

557it [00:32, 17.25it/s]


In [19]:
# Display the combined label sets read from every source's labelset.csv.
labelset_df

Unnamed: 0,Label ID,Name,Short Code
0,438,Montipora capitata,Moncap
1,2182,Montipora capitata algal overgrowth,Moncap AO
2,2185,Montipora capitata bleached,Moncap BL
3,2184,Montipora capitata tissue loss,Moncap TL
4,439,Montipora flabellata,Monfla
...,...,...,...
22530,3356,Water_H20_RR,WATE**R
22531,3190,algae others,AL
22532,3345,fleshy seawed,FS*
22533,3183,corals:Macroalga,MA**


In [20]:
# Lookup from 'Label ID' to 'Name'; for repeated IDs the last row wins.
coralnet_name_map = dict(zip(labelset_df['Label ID'], labelset_df['Name']))

In [21]:
# Count how often each label ID occurs, most frequent first.
label_counts = (
    pd.Series(labels)
    .value_counts()
    .rename_axis('CoralNet ID')
    .reset_index(name='Count')
    .sort_values("Count", ascending=False)
)

# Attach the CoralNet name and the mapped MERMAID name; IDs without an entry get None.
# The label mapping is looked up by the string form of the ID.
label_counts['CoralNet Name'] = label_counts['CoralNet ID'].map(
    lambda label_id: coralnet_name_map.get(label_id)
)
label_counts['Mermaid Name'] = label_counts['CoralNet ID'].map(
    lambda label_id: labelmapping.get(str(label_id))
)
label_counts

Unnamed: 0,CoralNet ID,Count,CoralNet Name,Mermaid Name
0,82,1198991,Turf algae,Turf algae
1,84,599557,Sand,Sand
2,101,383890,CCA (crustose coralline algae),Crustose coralline algae
3,4114,240715,Substrate: Consolidated (hard),Bare substrate
4,7462,170229,Blank_tile,
...,...,...,...,...
2054,6622,1,Reef Star submassive,
2055,7967,1,Polycyathus senegalensis,
2021,3166,1,Corals:Stony Coral:Branching,
2022,2192,1,Montipora patula tissue loss,


In [22]:
# Split the annotation totals by whether the label has a MERMAID mapping.
is_mapped = label_counts['Mermaid Name'].notna()
total_annotations = label_counts['Count'].sum()
mapped_annotations = label_counts.loc[is_mapped, "Count"].sum()
unmapped_annotations = label_counts.loc[~is_mapped, "Count"].sum()

# The labels are what is unique here; the annotation figure is a plain total,
# so the first sentence no longer calls the annotations "unique".
print(f"Currently, there are {label_counts.shape[0]} unique labels with {total_annotations} coral reef annotations in total.")
print(f"Out of these, {is_mapped.sum()} labels with {mapped_annotations} ({mapped_annotations/total_annotations:.2%}) annotations have a label mapping to Mermaid.")
print(f"The remaining {(~is_mapped).sum()} labels with {unmapped_annotations} ({unmapped_annotations/total_annotations:.2%}) annotations do not have a label mapping to Mermaid.")

Currently, there are 2098 labels with 8088751 unique coral reef annotations.
Out of these, 647 labels with 6289897 (77.76%) annotations have a label mapping to Mermaid.
The remaining, 1451 labels with 1798854 (22.24%) annotations do not have a label mapping to Mermaid.


In [23]:
# Labels that have no MERMAID mapping.
label_counts.loc[lambda df: df["Mermaid Name"].isna()]

Unnamed: 0,CoralNet ID,Count,CoralNet Name,Mermaid Name
4,7462,170229,Blank_tile,
9,5770,112701,Mesh,
12,1639,86682,sediment (fine),
18,7454,69869,Green_algal_stain,
21,2787,65529,ARMS-Biofilm,
...,...,...,...,...
2054,6622,1,Reef Star submassive,
2055,7967,1,Polycyathus senegalensis,
2021,3166,1,Corals:Stony Coral:Branching,
2022,2192,1,Montipora patula tissue loss,


In [25]:
# Unmapped labels that occur more than this many times.
MIN_UNMAPPED_COUNT = 2000

# Combine the boolean masks with `&` (element-wise AND) rather than `*`.
label_counts[label_counts["Mermaid Name"].isna() & (label_counts["Count"] > MIN_UNMAPPED_COUNT)]

Unnamed: 0,CoralNet ID,Count,CoralNet Name,Mermaid Name
4,7462,170229,Blank_tile,
9,5770,112701,Mesh,
12,1639,86682,sediment (fine),
18,7454,69869,Green_algal_stain,
21,2787,65529,ARMS-Biofilm,
...,...,...,...,...
358,1499,2051,Diplosoma listerianum,
360,7787,2033,Mytilus galloprovincialis,
362,7576,2015,Botrylloides niger,
363,7855,2013,Hormosiraceae banksii,
