## Scrapping images of cetaceans from Wikipedia page

In [1]:
import pandas as pd

In [29]:
path_1 = "data/HappyWhales/location_cetacea_full_info.csv"
path_2 = "data/HappyWhales/obis_seamap_custom_part1_points_csv.csv"

# columns_to_keep_df1 = COLUMNS_TO_KEEP_1
# columns_to_keep_df2 = COLUMNS_TO_KEEP_2 

MAPPING = {
            'scientific': 'scientific_name',
            'common': 'common_name',
            "sp_code" : 'species_name',
            'tsn': 'itis_tsn',
            'count': 'individual_count',
            'obs_date': 'event_date',
            'oid' : 'organism_id'
            }

column_mapping = MAPPING

In [None]:
df_location_1 = pd.read_csv(path_1, encoding="utf-8")
#df_location_1['organism_id'] = df_location_1['organism_id'].str.extract(r'(\d+)$')
dataset_id = df_location_1["dataset_id"].unique()

df_location_2 = pd.read_csv(path_2, low_memory=False)
#df_location_2 = df_location_2[~df_location_2["dataset_id"].isin(dataset_id)][columns_to_keep_df2]

df_location_2 = df_location_2.rename(columns=column_mapping)

df_location = pd.concat([df_location_1, df_location_2], ignore_index=True)

species_list = df_location['species_name'].dropna().unique()
species_list

array(['Humpback Whale', 'Northern bottlenose whale', 'Franciscana',
       'Gray Whale', "Arnoux's Beaked Whale", 'Killer Whale',
       "Dall's Porpoise", 'Fin Whale', 'Harbor Porpoise', 'Minke Whale',
       'Atlantic White-sided Dolphin', 'Blue Whale',
       "Baird's Beaked Whale", 'Bottlenose Dolphin', 'Dwarf sperm whale',
       "Bryde's whale", 'Atlantic Spotted Dolphin', 'False Killer Whale',
       'Long-finned Pilot Whale', 'North Atlantic Right Whale',
       'Hourglass Dolphin', "Commerson's Dolphin",
       "Cuvier's beaked whale", "Haviside's Dolphin",
       'Indian humpback dolphin', "Omurai's Whale", 'Common minke whale',
       'Beluga', 'Bowhead', "Cuvier's Beaked Whale",
       'Northern Right Whale Dolphin', 'Melon-headed Whale',
       "Hector's Dolphin", 'Dusky Dolphin', 'Black Dolphin',
       "Layard's beaked whale", 'Clymene Dolphin',
       "Blainville's Beaked Whale", "Burmeister's porpoise",
       "Gervais' beaked whale", 'Amazon River Dolphin',
       'I

In [2]:
IDs_and_names = pd.read_csv("images_and_sounds/aphiaIDs_and_names.csv").set_index("AphiaID")
IDs_and_names

Unnamed: 0_level_0,Description,Common name
AphiaID,Unnamed: 1_level_1,Unnamed: 2_level_1
105740,Alopias,Thresher shark
105835,Alopias superciliosus,Bigeye thresher shark
105836,Alopias vulpinus,common thresher shark
231390,Arctocephalus,
231435,Arctocephalus australis,South American fur seal
...,...,...
255003,Zalophus californianus,California sea lion
255004,Zalophus japonicus,Japanese sea lion
255005,Zalophus wollebaeki,Galápagos sea lion
136986,Ziphiidae,Beaked whales


In [33]:
import requests, os
from tqdm import tqdm

# Set the folder name where images will be stored
my_folder = 'wiki_images'
os.makedirs(my_folder, exist_ok=True)

# Base URL for Wikipedia API
query = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='

# Wikipedia article to download the main image from
my_list = species_list#IDs_and_names["Common name"].dropna().tolist()

# Define User-Agent header to respect Wikipedia's user-client policy :,)
HEADERS = {
    "User-Agent": "WhereWereWhales: cetaceans images from Wikipedia"
}


def get_image_url(partial_url):
    """ Gets the main image URL from a Wikipedia article """
    try:
        api_res = requests.get(query + partial_url, headers=HEADERS).json()
        first_part = api_res['query']['pages']
        for key, value in first_part.items():
            if 'original' in value:
                return value['original']['source']
    except Exception as exc:
        print(exc)
        print("Partial URL: " + partial_url)
    return None


def download_image(the_url, the_page):
    res = requests.get(the_url, headers=HEADERS) 
    res.raise_for_status()
    
    if res.status_code != 200:
        print(f"Error scraping {the_page}: {res.status_code}")

    # Get the correct file extension
    file_ext = '.' + the_url.split('.')[-1].lower()
    image_file = os.path.join(my_folder, os.path.basename(the_page + file_ext))

    # Save the image
    with open(image_file, 'wb') as f:
        for chunk in res.iter_content(100000):
            f.write(chunk)

    #print(f"Downloaded: {image_file}")

# Download images from Wikipedia articles
for the_page in tqdm(my_list, desc="Scraping Wikipedia images", unit="pages"):
    the_url = get_image_url(the_page)
    if the_url:
        #print(f"Downloading an image of a magnificent {the_page}...")
        download_image(the_url, the_page)
    else:
        print(f"No image found for {the_page} :(")

print("All done!")


Scraping Wikipedia images:   2%|▏         | 1/46 [00:28<21:07, 28.17s/pages]

No image found for Humpback Whale :(


Scraping Wikipedia images:   7%|▋         | 3/46 [00:29<04:53,  6.81s/pages]

No image found for Franciscana :(


Scraping Wikipedia images:   9%|▊         | 4/46 [00:30<02:59,  4.28s/pages]

No image found for Gray Whale :(


Scraping Wikipedia images:  11%|█         | 5/46 [00:30<01:58,  2.88s/pages]

No image found for Arnoux's Beaked Whale :(


Scraping Wikipedia images:  13%|█▎        | 6/46 [00:30<01:21,  2.03s/pages]

No image found for Killer Whale :(


Scraping Wikipedia images:  15%|█▌        | 7/46 [00:31<00:58,  1.49s/pages]

No image found for Dall's Porpoise :(


Scraping Wikipedia images:  17%|█▋        | 8/46 [00:31<00:43,  1.14s/pages]

No image found for Fin Whale :(


Scraping Wikipedia images:  20%|█▉        | 9/46 [00:31<00:33,  1.10pages/s]

No image found for Harbor Porpoise :(


Scraping Wikipedia images:  22%|██▏       | 10/46 [00:32<00:26,  1.34pages/s]

No image found for Minke Whale :(


Scraping Wikipedia images:  24%|██▍       | 11/46 [00:32<00:22,  1.57pages/s]

No image found for Atlantic White-sided Dolphin :(


Scraping Wikipedia images:  26%|██▌       | 12/46 [00:33<00:19,  1.75pages/s]

No image found for Blue Whale :(


Scraping Wikipedia images:  28%|██▊       | 13/46 [00:33<00:17,  1.92pages/s]

No image found for Baird's Beaked Whale :(


Scraping Wikipedia images:  30%|███       | 14/46 [00:33<00:15,  2.05pages/s]

No image found for Bottlenose Dolphin :(


Scraping Wikipedia images:  37%|███▋      | 17/46 [00:36<00:17,  1.67pages/s]

No image found for Atlantic Spotted Dolphin :(


Scraping Wikipedia images:  39%|███▉      | 18/46 [00:36<00:15,  1.85pages/s]

No image found for False Killer Whale :(


Scraping Wikipedia images:  41%|████▏     | 19/46 [00:37<00:13,  2.02pages/s]

No image found for Long-finned Pilot Whale :(


Scraping Wikipedia images:  43%|████▎     | 20/46 [00:37<00:11,  2.19pages/s]

No image found for North Atlantic Right Whale :(


Scraping Wikipedia images:  46%|████▌     | 21/46 [00:37<00:10,  2.33pages/s]

No image found for Hourglass Dolphin :(


Scraping Wikipedia images:  48%|████▊     | 22/46 [00:38<00:10,  2.36pages/s]

No image found for Commerson's Dolphin :(


Scraping Wikipedia images:  52%|█████▏    | 24/46 [00:39<00:13,  1.64pages/s]

No image found for Haviside's Dolphin :(


Scraping Wikipedia images:  54%|█████▍    | 25/46 [00:40<00:12,  1.72pages/s]

No image found for Indian humpback dolphin :(


Scraping Wikipedia images:  57%|█████▋    | 26/46 [00:40<00:10,  1.91pages/s]

No image found for Omurai's Whale :(


Scraping Wikipedia images:  61%|██████    | 28/46 [00:41<00:09,  1.92pages/s]

No image found for Beluga :(


Scraping Wikipedia images:  63%|██████▎   | 29/46 [00:42<00:08,  2.08pages/s]

No image found for Bowhead :(


Scraping Wikipedia images:  65%|██████▌   | 30/46 [00:42<00:07,  2.18pages/s]

No image found for Cuvier's Beaked Whale :(


Scraping Wikipedia images:  67%|██████▋   | 31/46 [00:43<00:06,  2.30pages/s]

No image found for Northern Right Whale Dolphin :(


Scraping Wikipedia images:  70%|██████▉   | 32/46 [00:43<00:06,  2.32pages/s]

No image found for Melon-headed Whale :(


Scraping Wikipedia images:  72%|███████▏  | 33/46 [00:43<00:05,  2.40pages/s]

No image found for Hector's Dolphin :(


Scraping Wikipedia images:  74%|███████▍  | 34/46 [00:44<00:04,  2.40pages/s]

No image found for Dusky Dolphin :(


Scraping Wikipedia images:  76%|███████▌  | 35/46 [00:44<00:04,  2.48pages/s]

No image found for Black Dolphin :(


Scraping Wikipedia images:  78%|███████▊  | 36/46 [00:45<00:04,  2.25pages/s]

No image found for Layard's beaked whale :(


Scraping Wikipedia images:  80%|████████  | 37/46 [00:45<00:04,  1.92pages/s]

No image found for Clymene Dolphin :(


Scraping Wikipedia images:  83%|████████▎ | 38/46 [00:46<00:04,  1.91pages/s]

No image found for Blainville's Beaked Whale :(


Scraping Wikipedia images:  87%|████████▋ | 40/46 [00:47<00:03,  1.75pages/s]

No image found for Gervais' beaked whale :(


Scraping Wikipedia images:  89%|████████▉ | 41/46 [00:48<00:02,  1.91pages/s]

No image found for Amazon River Dolphin :(


Scraping Wikipedia images:  98%|█████████▊| 45/46 [00:54<00:01,  1.43s/pages]

No image found for North Pacific Right Whale :(


Scraping Wikipedia images: 100%|██████████| 46/46 [00:55<00:00,  1.21s/pages]

All done!



