In [None]:
# Download the lichen images referenced by the iNaturalist and LichenPortal CSV exports

In [1]:
import hashlib
import os

import pandas as pd
import requests
# %pip install tqdm
from tqdm import tqdm

In [2]:
# Root folder that receives the downloaded images; it is created if it does not exist yet
output_dir = "/Users/eabowman/Dropbox/LichenProject/dataset"
os.makedirs(name=output_dir, exist_ok=True)

In [3]:
# --- LichenPortal ---
# LichenPortal CSV export, decoded as latin1
lichen_df = pd.read_csv(
    filepath_or_buffer="/Users/eabowman/Dropbox/LichenProject/references/LichenPortal_Texas_22Apr24_filtered.csv",
    encoding='latin1',
)

In [4]:
# --- iNaturalist ---
# iNaturalist CSV export, read with pandas' default encoding
inat_df = pd.read_csv(
    filepath_or_buffer="/Users/eabowman/Dropbox/LichenProject/references/iNaturalist_21Apr25.csv",
)

In [5]:
# Stack both sources into one two-column table (LichenPortal rows first).
# LichenPortal's column names are mapped onto the iNaturalist ones so the
# frames line up, and the row index is renumbered from 0.
combined = pd.concat(
    [
        lichen_df[['scientificName', 'goodQualityAccessURI']].rename(
            columns={
                'scientificName': 'scientific_name',
                'goodQualityAccessURI': 'image_url',
            }
        ),
        inat_df[['scientific_name', 'image_url']],
    ],
    ignore_index=True,
)

In [6]:
# Check column names: the combined frame should expose 'scientific_name' and 'image_url'
print(combined.columns)

Index(['scientific_name', 'image_url'], dtype='object')


In [7]:
# Clean species names to create valid folder names:
#   1. trim leading/trailing whitespace so "Name " and "Name" share one folder,
#   2. collapse every internal whitespace run into a single underscore,
#   3. replace path separators and characters that are reserved in file names
#      on common filesystems, so a name can never produce nested directories.
# Missing names stay NaN here.
combined['clean_name'] = (
    combined['scientific_name']
    .str.strip()
    .str.replace(r'\s+', '_', regex=True)
    .str.replace(r'[\\/:*?"<>|]', '_', regex=True)
)

In [11]:
# Keep only the rows that have both a folder name and a URL to fetch
combined = combined.dropna(axis=0, how='any', subset=['clean_name', 'image_url'])

In [12]:
# List the distinct cleaned names (one download folder is created per name)
print(pd.unique(combined['clean_name']))

['Cladonia_hammeri' 'Caloplaca_texana' 'Placopyrenium_caeruleopulvinum'
 ... 'Ramalina_fraxinea' 'Parmotrema_diffractaicum'
 'Speerschneidera_euploca']


In [13]:
# Download function
def download_image(row):
    """Download one image into the folder of its species.

    Parameters
    ----------
    row : mapping-like (for example a pandas Series)
        Must provide 'clean_name' (sub-folder of ``output_dir``) and
        'image_url' (address of the image).

    Returns
    -------
    None
        The image is written to ``<output_dir>/<clean_name>/<file name>``.
        Any failure while fetching or writing is reported with ``print``
        and otherwise ignored, so a long batch keeps running.

    Notes
    -----
    The file name is the last path component of the URL. Two URLs that end
    in the same name (for example ``.../medium.jpg``) therefore overwrite
    each other when they belong to the same species folder.
    """
    folder = os.path.join(output_dir, row['clean_name'])
    os.makedirs(folder, exist_ok=True)

    # Some URLs carry stray surrounding blanks; left in place they are sent
    # as "%20" and also end up in the saved file name.
    url = str(row['image_url']).strip()

    # Remove the query string *before* taking the basename; otherwise a '/'
    # inside the query would be mistaken for a path separator.
    image_name = os.path.basename(url.split('?')[0])
    filename = os.path.join(folder, image_name)

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        with open(filename, 'wb') as f:
            f.write(response.content)
    except Exception as e:
        # Best effort: log and continue with the next image
        print(f"Failed to download {url}: {e}")

In [14]:
# Download images with a progress bar
for _, row in tqdm(combined.iterrows(), total=len(combined)):
    # Rows lacking the folder name or the URL cannot be downloaded
    if pd.isna(row['clean_name']) or pd.isna(row['image_url']):
        continue
    download_image(row)

  0%|▏                                                                            | 43/14077 [00:53<1:32:55,  2.52it/s]

Failed to download http://bornnaturalist.org/images/20070213/4.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070213/4.jpg


  0%|▏                                                                            | 44/14077 [00:53<1:20:13,  2.92it/s]

Failed to download http://bornnaturalist.org/images/20070213/24.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070213/24.jpg


  0%|▏                                                                            | 45/14077 [00:54<1:11:47,  3.26it/s]

Failed to download http://bornnaturalist.org/images/20070213/32.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070213/32.jpg


  0%|▎                                                                            | 47/14077 [00:55<1:22:11,  2.84it/s]

Failed to download http://bornnaturalist.org/images/20070221/58.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070221/58.jpg


  0%|▎                                                                            | 48/14077 [00:55<1:13:31,  3.18it/s]

Failed to download http://bornnaturalist.org/images/20070221/42.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070221/42.jpg


  0%|▎                                                                            | 49/14077 [00:55<1:06:58,  3.49it/s]

Failed to download http://bornnaturalist.org/images/20070221/43.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070221/43.jpg


  0%|▎                                                                            | 50/14077 [00:55<1:02:05,  3.77it/s]

Failed to download http://bornnaturalist.org/images/20070221/54.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070221/54.jpg


  0%|▎                                                                              | 51/14077 [00:55<58:59,  3.96it/s]

Failed to download http://bornnaturalist.org/images/20070221/4.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070221/4.jpg


  0%|▎                                                                              | 53/14077 [00:56<59:31,  3.93it/s]

Failed to download http://bornnaturalist.org/images/20070223/66.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070223/66.jpg


  0%|▎                                                                              | 54/14077 [00:56<57:56,  4.03it/s]

Failed to download http://bornnaturalist.org/images/20070223/65.jpg : 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070223/65.jpg%20


  0%|▎                                                                              | 55/14077 [00:56<56:45,  4.12it/s]

Failed to download http://bornnaturalist.org/images/20070224/54.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070224/54.jpg


  0%|▎                                                                              | 56/14077 [00:57<55:56,  4.18it/s]

Failed to download http://bornnaturalist.org/images/20070226/3.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070226/3.jpg


  0%|▎                                                                              | 57/14077 [00:57<54:47,  4.26it/s]

Failed to download http://bornnaturalist.org/images/20091006/23.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20091006/23.jpg


  0%|▎                                                                              | 58/14077 [00:57<54:08,  4.32it/s]

Failed to download http://bornnaturalist.org/images/20091007/196.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20091007/196.jpg


  0%|▎                                                                              | 59/14077 [00:57<53:13,  4.39it/s]

Failed to download http://bornnaturalist.org/images/20091007/316.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20091007/316.jpg


  0%|▎                                                                              | 60/14077 [00:58<52:48,  4.42it/s]

Failed to download http://bornnaturalist.org/images/20091027/196.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20091027/196.jpg


  0%|▎                                                                              | 61/14077 [00:58<52:28,  4.45it/s]

Failed to download http://bornnaturalist.org/images/20091027/248.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20091027/248.jpg


  0%|▎                                                                              | 62/14077 [00:58<52:41,  4.43it/s]

Failed to download http://bornnaturalist.org/images/20091027/291.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20091027/291.jpg


  0%|▎                                                                              | 63/14077 [00:58<53:01,  4.40it/s]

Failed to download http://bornnaturalist.org/images/20100312/7.jpg: 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20100312/7.jpg


  0%|▎                                                                              | 64/14077 [00:58<52:30,  4.45it/s]

Failed to download http://bornnaturalist.org/images/20070102/45.jpg : 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070102/45.jpg%20


 92%|█████████████████████████████████████████████████████████████████▉      | 12889/14077 [1:30:37<1:07:01,  3.38s/it]

Failed to download https://inaturalist-open-data.s3.amazonaws.com/photos/194746401/medium.png: HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)


100%|██████████████████████████████████████████████████████████████████████████| 14077/14077 [1:37:58<00:00,  2.39it/s]


In [17]:
# check to see if the iNaturalist files have the same file names
# File name per URL: last path component with any query string cut off
combined['filename'] = combined['image_url'].map(
    lambda url: os.path.basename(str(url)).split('?')[0]
)

# Keep every occurrence of a name that is used by more than one row
# (counted over the whole table, not per species folder)
duplicates = combined.loc[combined['filename'].duplicated(keep=False), 'filename']

# Report how often each repeated name occurs
print("Duplicate filenames:")
print(duplicates.value_counts())

Duplicate filenames:
filename
medium.jpg                       1349
medium.jpeg                       897
medium.JPG                         10
medium.png                          6
ASUL031876_03_print.jpg             2
ASUL031916_04_print.jpg             2
ASUL031907_03_print.jpg             2
ASUL031943_03_print.jpg             2
ASUL031911_04_print.jpg             2
ASUL031909_03_print.jpg             2
ASUL031899_03_print.jpg             2
ASUL031921_04_print.jpg             2
54.jpg                              2
ASUL031872_03_print.jpg             2
ASUL031852_03_print.jpg             2
ASUL031939_03_print.jpg             2
ASUL031922_03_print.jpg             2
4.jpg                               2
ASUL031906_04_print.jpg             2
ASUL031919_03_print.jpg             2
ASUL031947_02_print.jpg             2
ASUL031940_03_print.jpg             2
ASUL031941_03_print.jpg             2
ASUL031928_03_print.jpg             2
ASUL031905_03_print.jpg             2
ASUL031867_02_print.

In [24]:
# Download function with a way to fix the duplicates
# 1. Extract raw filenames from image_url
#    (query string removed first, then the last path component is taken)
combined['raw_filename'] = combined['image_url'].map(
    lambda x: os.path.basename(str(x).partition('?')[0])
)

In [25]:
# 2. Find duplicate filenames: names that occur on more than one row
filename_counts = combined['raw_filename'].value_counts()
duplicate_names = filename_counts.loc[filename_counts.gt(1)].index

In [26]:
# 3. Filter DataFrame to only those with duplicate filenames
duplicates_df = combined.loc[combined['raw_filename'].isin(duplicate_names)]

print(f"Found {len(duplicates_df)} rows with duplicate raw filenames.")

Found 2322 rows with duplicate raw filenames.


In [27]:
# 4. Re-download only duplicates
def download_image_unique(row):
    """Download one image under a file name derived from its URL.

    Parameters
    ----------
    row : mapping-like (for example a pandas Series)
        Must provide 'clean_name' (sub-folder of ``output_dir``; a missing
        value falls back to the folder "Unknown") and 'image_url'.

    Returns
    -------
    None
        The image is written to ``<output_dir>/<folder>/<md5 of URL><ext>``.
        Any failure while fetching or writing is reported with ``print``
        and otherwise ignored, so a long batch keeps running.

    Notes
    -----
    Naming the file after the MD5 hex digest of the URL keeps images whose
    URLs end in the same base name (for example ``medium.jpg``) from
    overwriting each other. The digest is used only as a name, not for
    security.
    """
    folder = os.path.join(output_dir, row['clean_name'] if pd.notnull(row['clean_name']) else "Unknown")
    os.makedirs(folder, exist_ok=True)

    # Some URLs carry stray surrounding blanks; left in place they are sent
    # as "%20" and would leak into both the extension and the hash.
    url = str(row['image_url']).strip()

    # Extension of the URL without its query string; default to .jpg if none
    ext = os.path.splitext(url.split('?')[0])[1] or '.jpg'

    # Make filename unique using a hash of the full URL
    unique_id = hashlib.md5(url.encode()).hexdigest()
    filename = os.path.join(folder, f"{unique_id}{ext}")

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        with open(filename, 'wb') as f:
            f.write(response.content)
    except Exception as e:
        # Best effort: log and continue with the next image
        print(f"Failed: {url} - {e}")

In [28]:
# Re-download just the duplicates, this time under hash-based file names.
# tqdm comes from the import cell at the top of the notebook.
for _, row in tqdm(duplicates_df.iterrows(), total=len(duplicates_df)):
    download_image_unique(row)

  0%|                                                                                 | 1/2322 [00:00<17:50,  2.17it/s]

Failed: http://bornnaturalist.org/images/20070213/4.jpg - 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070213/4.jpg


  0%|                                                                                 | 2/2322 [00:00<13:11,  2.93it/s]

Failed: http://bornnaturalist.org/images/20070221/54.jpg - 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070221/54.jpg


  0%|                                                                                 | 3/2322 [00:00<11:44,  3.29it/s]

Failed: http://bornnaturalist.org/images/20070221/4.jpg - 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070221/4.jpg


  0%|▏                                                                                | 4/2322 [00:01<10:54,  3.54it/s]

Failed: http://bornnaturalist.org/images/20070224/54.jpg - 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20070224/54.jpg


  0%|▏                                                                                | 5/2322 [00:01<10:26,  3.70it/s]

Failed: http://bornnaturalist.org/images/20091007/196.jpg - 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20091007/196.jpg


  0%|▏                                                                                | 6/2322 [00:01<10:09,  3.80it/s]

Failed: http://bornnaturalist.org/images/20091027/196.jpg - 404 Client Error: Not Found for url: https://bornnaturalist.org/images/20091027/196.jpg


 17%|█████████████                                                                | 394/2322 [03:03<1:45:05,  3.27s/it]

Failed: https://inaturalist-open-data.s3.amazonaws.com/photos/32406124/medium.jpeg - HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)


 17%|█████████████                                                                | 395/2322 [03:14<2:50:36,  5.31s/it]

Failed: https://inaturalist-open-data.s3.amazonaws.com/photos/32440230/medium.jpeg - HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)


 17%|█████████████▏                                                               | 396/2322 [03:24<3:35:54,  6.73s/it]

Failed: https://static.inaturalist.org/photos/32531374/medium.jpg - HTTPSConnectionPool(host='static.inaturalist.org', port=443): Read timed out. (read timeout=10)


 17%|█████████████▏                                                               | 397/2322 [03:34<4:07:35,  7.72s/it]

Failed: https://static.inaturalist.org/photos/32531432/medium.jpg - HTTPSConnectionPool(host='static.inaturalist.org', port=443): Read timed out. (read timeout=10)


 17%|█████████████▏                                                               | 398/2322 [03:44<4:29:40,  8.41s/it]

Failed: https://static.inaturalist.org/photos/32655147/medium.jpg - HTTPSConnectionPool(host='static.inaturalist.org', port=443): Read timed out. (read timeout=10)


 17%|█████████████▏                                                               | 399/2322 [03:54<4:45:30,  8.91s/it]

Failed: https://inaturalist-open-data.s3.amazonaws.com/photos/32689970/medium.jpg - HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)


 17%|█████████████▎                                                               | 400/2322 [04:04<4:56:24,  9.25s/it]

Failed: https://inaturalist-open-data.s3.amazonaws.com/photos/32822307/medium.jpg - HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)


 17%|█████████████▎                                                               | 401/2322 [04:14<5:04:04,  9.50s/it]

Failed: https://inaturalist-open-data.s3.amazonaws.com/photos/32832325/medium.jpg - HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)


 17%|█████████████▎                                                               | 402/2322 [04:24<5:09:21,  9.67s/it]

Failed: https://inaturalist-open-data.s3.amazonaws.com/photos/32844720/medium.jpg - HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)


 17%|█████████████▎                                                               | 403/2322 [04:34<5:13:03,  9.79s/it]

Failed: https://inaturalist-open-data.s3.amazonaws.com/photos/32855101/medium.jpg - HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)


 17%|█████████████▍                                                               | 404/2322 [04:44<5:15:34,  9.87s/it]

Failed: https://inaturalist-open-data.s3.amazonaws.com/photos/32917559/medium.jpg - HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)


 17%|█████████████▍                                                               | 405/2322 [04:54<5:17:19,  9.93s/it]

Failed: https://inaturalist-open-data.s3.amazonaws.com/photos/33098305/medium.jpg - HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)


 17%|█████████████▍                                                               | 406/2322 [05:04<5:18:27,  9.97s/it]

Failed: https://inaturalist-open-data.s3.amazonaws.com/photos/33110522/medium.jpg - HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)


 18%|█████████████▍                                                               | 407/2322 [05:14<5:19:24, 10.01s/it]

Failed: https://static.inaturalist.org/photos/33210359/medium.jpg - HTTPSConnectionPool(host='static.inaturalist.org', port=443): Read timed out. (read timeout=10)


100%|██████████████████████████████████████████████████████████████████████████████| 2322/2322 [16:23<00:00,  2.36it/s]


In [29]:
# Log duplicates: save the rows that were passed to the re-download step.
# The relative path means the CSV lands in the current working directory.
duplicates_df.to_csv(path_or_buf="duplicate_images_to_redownload.csv", index=False)