# Setup

In [1]:
from coralnet_scraper import CoralNetDownloader
import getpass
import boto3
import numpy as np 

In [2]:
def check_s3_prefix_exists(bucket_name, s3_prefix, source_id):
    s3 = boto3.client('s3')
    prefix = f"{s3_prefix}/s{source_id}/annotations.csv"
    
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix, MaxKeys=1)
    
    if 'Contents' in response:
        print(f"Prefix exists: {prefix}")
        return True
    else:
        print(f"Prefix does not exist: {prefix}")
        return False

# Scrape Single Source

In [3]:
username = input("CoralNet username: ")
password = getpass.getpass("CoralNet password: ")

In [5]:
source_ids = ["109"]

bucket_name = "dev-datamermaid-sm-sources"
prefix = "coralnet-public-images"

In [19]:
downloader = CoralNetDownloader(username=username, password=password)
for source_id in source_ids:
    if check_s3_prefix_exists(bucket_name=bucket_name, s3_prefix=prefix, source_id=source_id):
        continue
    downloader.download_source(source_id=source_id, bucket_name=bucket_name, s3_prefix=prefix)

Prefix exists: coralnet-public-images/s109/images/


# Check sources in 2310-coralnet-public-sources

In [6]:
s3 = boto3.client('s3')
coralnet_bucket_name = '2310-coralnet-public-sources'

response = s3.list_objects_v2(Bucket=coralnet_bucket_name, Delimiter='/')
if 'CommonPrefixes' in response:
    subdirectories = [prefix['Prefix'] for prefix in response['CommonPrefixes']]
else:
    print("No subdirectories found.")

In [7]:
coralnet_sources = np.sort([int(source[1:-1]) for source in subdirectories if source[0]=="s"]).astype(str)

In [9]:
downloader = CoralNetDownloader(username=username, password=password)
for source_id in coralnet_sources:
    print("Source ID", source_id)
    if check_s3_prefix_exists(bucket_name=bucket_name, s3_prefix=prefix, source_id=source_id):
        continue
    downloader.download_source(source_id=source_id, bucket_name=bucket_name, s3_prefix=prefix)

Source ID 23
Prefix exists: coralnet-public-images/s23/images/
Source ID 57
Prefix exists: coralnet-public-images/s57/images/
Source ID 69
Prefix exists: coralnet-public-images/s69/images/
Source ID 70
Prefix exists: coralnet-public-images/s70/images/
Source ID 109
Prefix exists: coralnet-public-images/s109/images/
Source ID 155
Prefix exists: coralnet-public-images/s155/images/
Source ID 172
Prefix exists: coralnet-public-images/s172/images/
Source ID 173
Prefix exists: coralnet-public-images/s173/images/
Source ID 174
Prefix exists: coralnet-public-images/s174/images/
Source ID 258
Prefix exists: coralnet-public-images/s258/images/
Source ID 290
Prefix exists: coralnet-public-images/s290/images/
Source ID 295
Prefix does not exist: coralnet-public-images/s295/images/

=== Downloading Source 295 ===
âœ“ Login successful
ERROR: Permission check failed for source 295: HTTPSConnectionPool(host='coralnet.ucsd.edu', port=443): Read timed out. (read timeout=30)


Exception: Cannot access source 295

# Get All Sources on the CoralNet Webpage

In [10]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

url = "https://coralnet.ucsd.edu/source/about/"

resp = requests.get(url, timeout=50)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html.parser")
anchors = soup.find_all("a", href=True)

# Build absolute URLs and keep only http/https links
links = sorted({
    urljoin(url, a["href"])
    for a in anchors
    if urlparse(urljoin(url, a["href"])).scheme in ("http", "https")
})

In [11]:
print("Found", len(links), "links on the page.")

Found 1720 links on the page.


In [12]:
source_links = [link for link in links if "/source/" in link]

In [13]:
print("Found", len(source_links), "links on the page.")

Found 1711 links on the page.


In [17]:
all_coralnet_sources = [int(link.split("/")[-2]) for link in source_links]
all_coralnet_sources

[1073,
 1076,
 109,
 1162,
 1184,
 1189,
 1212,
 1264,
 1265,
 1266,
 1268,
 1269,
 1270,
 1271,
 1272,
 1273,
 1274,
 1276,
 1277,
 1288,
 1294,
 1300,
 1301,
 1304,
 1308,
 1353,
 1356,
 1357,
 1358,
 1360,
 1368,
 1388,
 1432,
 1440,
 1514,
 1519,
 1532,
 1545,
 1546,
 155,
 1554,
 1556,
 1577,
 1578,
 1579,
 1580,
 1588,
 1598,
 1622,
 1634,
 1642,
 1645,
 1653,
 1655,
 1669,
 1686,
 1695,
 1718,
 172,
 1724,
 1725,
 173,
 174,
 1745,
 1749,
 1767,
 1768,
 1771,
 1773,
 1776,
 1781,
 1783,
 1789,
 1810,
 1821,
 1822,
 1826,
 1831,
 1832,
 1835,
 1848,
 1895,
 1900,
 1933,
 1940,
 1944,
 1968,
 1970,
 1972,
 1981,
 1982,
 1983,
 1986,
 1987,
 1988,
 1989,
 1990,
 1991,
 1998,
 2001,
 2030,
 2043,
 2044,
 2048,
 2050,
 2052,
 2055,
 2064,
 2070,
 2074,
 2083,
 2084,
 2088,
 2091,
 2112,
 2120,
 2121,
 2122,
 2123,
 2124,
 2125,
 2127,
 2130,
 2131,
 2152,
 2153,
 2159,
 2162,
 2170,
 2172,
 2180,
 2190,
 2194,
 2200,
 2208,
 2211,
 2221,
 2222,
 2236,
 2238,
 2239,
 2246,
 2258,
 226

In [None]:
source_links

['https://coralnet.ucsd.edu/source/1073/',
 'https://coralnet.ucsd.edu/source/1076/',
 'https://coralnet.ucsd.edu/source/109/',
 'https://coralnet.ucsd.edu/source/1162/',
 'https://coralnet.ucsd.edu/source/1184/',
 'https://coralnet.ucsd.edu/source/1189/',
 'https://coralnet.ucsd.edu/source/1212/',
 'https://coralnet.ucsd.edu/source/1264/',
 'https://coralnet.ucsd.edu/source/1265/',
 'https://coralnet.ucsd.edu/source/1266/',
 'https://coralnet.ucsd.edu/source/1268/',
 'https://coralnet.ucsd.edu/source/1269/',
 'https://coralnet.ucsd.edu/source/1270/',
 'https://coralnet.ucsd.edu/source/1271/',
 'https://coralnet.ucsd.edu/source/1272/',
 'https://coralnet.ucsd.edu/source/1273/',
 'https://coralnet.ucsd.edu/source/1274/',
 'https://coralnet.ucsd.edu/source/1276/',
 'https://coralnet.ucsd.edu/source/1277/',
 'https://coralnet.ucsd.edu/source/1288/',
 'https://coralnet.ucsd.edu/source/1294/',
 'https://coralnet.ucsd.edu/source/1300/',
 'https://coralnet.ucsd.edu/source/1301/',
 'https://co