## Scrapping images of cetaceans from Wikipedia page

In [1]:
import pandas as pd

In [2]:
path_1 = "data/HappyWhales/location_cetacea_full_info.csv"
path_2 = "data/HappyWhales/obis_seamap_custom_part1_points_csv.csv"

# columns_to_keep_df1 = COLUMNS_TO_KEEP_1
# columns_to_keep_df2 = COLUMNS_TO_KEEP_2 

MAPPING = {
            'scientific': 'scientific_name',
            'common': 'common_name',
            "sp_code" : 'species_name',
            'tsn': 'itis_tsn',
            'count': 'individual_count',
            'obs_date': 'event_date',
            'oid' : 'organism_id'
            }

column_mapping = MAPPING

In [3]:
df_location_1 = pd.read_csv(path_1, encoding="utf-8")
#df_location_1['organism_id'] = df_location_1['organism_id'].str.extract(r'(\d+)$')
dataset_id = df_location_1["dataset_id"].unique()

df_location_2 = pd.read_csv(path_2, low_memory=False)
#df_location_2 = df_location_2[~df_location_2["dataset_id"].isin(dataset_id)][columns_to_keep_df2]

df_location_2 = df_location_2.rename(columns=column_mapping)

df_location = pd.concat([df_location_1, df_location_2], ignore_index=True)

species_list = df_location['species_name'].dropna().unique()
species_list

array(['Bottlenose Dolphin', 'Short-beaked Common Dolphin',
       'Dusky Dolphin', "Peale's Dolphin", "Commerson's Dolphin",
       "Hector's Dolphin", 'Southern Right Whale Dolphin',
       "Risso's Dolphin", 'Melon-headed Whale', 'False Killer Whale',
       'Killer Whale', "Dall's Porpoise", 'Beluga',
       "Arnoux's Beaked Whale", "Baird's Beaked Whale",
       "Cuvier's Beaked Whale", 'Southern Bottlenose Whale',
       "Blainville's Beaked Whale", 'Minke Whale', 'Blue Whale',
       'Bowhead', 'Long-finned Pilot Whale', "Bryde's whale",
       'Atlantic Spotted Dolphin', 'North Pacific Right Whale',
       'Sei Whale', 'Black Dolphin', 'Northern bottlenose whale',
       'Atlantic White-sided Dolphin', 'Harbor Porpoise',
       'Northern Right Whale Dolphin', 'Amazon River Dolphin',
       "Layard's beaked whale", 'Pantropical Spotted Dolphin',
       'Clymene Dolphin', 'Striped Dolphin', 'Rough-toothed Dolphin',
       'Franciscana', 'Hourglass Dolphin', "Gervais' beaked whale

In [4]:
IDs_and_names = pd.read_csv("images_and_sounds/aphiaIDs_and_names.csv").set_index("AphiaID")
IDs_and_names

Unnamed: 0_level_0,Description,Common name
AphiaID,Unnamed: 1_level_1,Unnamed: 2_level_1
105740,Alopias,Thresher shark
105835,Alopias superciliosus,Bigeye thresher shark
105836,Alopias vulpinus,common thresher shark
231390,Arctocephalus,
231435,Arctocephalus australis,South American fur seal
...,...,...
255003,Zalophus californianus,California sea lion
255004,Zalophus japonicus,Japanese sea lion
255005,Zalophus wollebaeki,Galápagos sea lion
136986,Ziphiidae,Beaked whales


In [5]:
mapping_names = {descr : com_nam for _, (descr, com_nam) in IDs_and_names.iterrows()}
mapping_names

{'Alopias': 'Thresher shark',
 'Alopias superciliosus': 'Bigeye thresher shark',
 'Alopias vulpinus': 'common thresher shark',
 'Arctocephalus ': nan,
 'Arctocephalus australis': 'South American fur seal',
 'Arctocephalus galapagoensis': 'Galápagos fur seal',
 'Arctocephalus gazella': 'Antarctic fur seal',
 'Arctocephalus philippii ': 'Juan Fernandez fur seal',
 'Arctocephalus pusillus': 'Cape fur seal',
 'Arctocephalus tropicalis': 'subantarctic fur seal',
 'Balaena mysticetus': 'Bowhead whale',
 'Balaenidae': 'Right and bowhead whales',
 'Balaenoptera acutorostrata': 'Common minke whale',
 'Balaenoptera bonaerensis': 'Antarctic minke whale',
 'Balaenoptera borealis': 'Sei whale',
 'Balaenoptera edeni': "Bryde's whale",
 'Balaenoptera musculus': 'Blue whale',
 'Balaenoptera omurai': "Omura's whale",
 'Balaenoptera physalus': 'Fin whale',
 'Balaenopteridae ': 'Rorquals',
 'Berardius arnuxii': "Arnoux's beaked whale",
 'Berardius bairdii': "Baird's beaked whale",
 'Callorhinus ': nan,
 

In [6]:
philogenetic_tree_names = ['Stenella longirostris',
'Stenella clymene',
'Lagenodelphis hosei',
'Delphinus delphis',
'Stenella coeruleoalba',
'Stenella attenuata',
'Stenella frontalis',
'Tursiops aduncus',
'Tursiops truncatus',
'Sousa chinensis',
'Sotalia guianensis',
'Globicephala melas',
'Globicephala macrorhynchus',
'Peponocephala electra',
'Feresa attenuata',
'Pseudorca crassidens',
'Grampus griseus',
'Steno bredanensis',
'Orcaella brevirostris',
'Orcaella heinsohnii',
'Lagenorhynchus obliquidens',
'Lagenorhynchus obscurus',
'Lissodelphis peroni',
'Lissodelphis borealis',
'Lagenorhynchus australis',
'Cephalorhynchus heavisidil',
'Cephalorhynchus commersoni',
'Lagenorhynchus albirostris',
'Orcinus orca',
'Lagenorhynchus acutus',
'Phocoena dioptrica',
'Phocoena spinnipinis',
'Phocoena phocoena',
'Phocoenoides dalli',
'Neophocaena phocaenoides',
'Delphinapterus leucas',
'Monodon monoceros',
'Inia geoffrensis',
'Pontoporia blainviller',
'Ipotes vexillife',
'lesoplodon peruvianus',
'Mesoplodon perrini',
'Mesoplodon densirostris',
'Mesolpodon stejnegeri',
'Viesoplodon grayli',
'Mesoplodon hectori',
'Mesoplodon carlhubbsi',
'Mesoplodon bowdoini',
'Mesoplodon layardi',
'Mesoplodon europaeus',
'Mesoplodon mirus',
'Mesoplodon gingkodens',
'Mesoplodon bidens',
'Hyperoodon ampullatus',
'Hyperoodon planifrons',
'Ziphius cavirostris lasmacetus shepherdi',
'Berardius bairdit',
'Berardius arnouxii',
'Kogia sima',
'Kogia brevirostris',
'Physeter macrocephalus',
'Balaenoptera borealis',
'Balaenoptera edeni',
'Balaenoptera musculus',
'Balaenoptera physalus',
'Megaptera novaeangliae',
'Eschrictius robustus',
'Balaenoptera acutorostrata',
'Balaenoptera bonaerensis',
'Caperea marginata',
'Eubalaena glacialis',
'Eubalaena japonica',
'Eubalaena australis',
'Balaena mysticetus']

In [16]:
len(philogenetic_tree_names)

74

In [12]:
mapping_names['Orcaella heinsohnii'] = 'Australian snubfin dolphin'
mapping_names['Lissodelphis peroni'] = "Southern right whale dolphin"
mapping_names['Cephalorhynchus heavisidil'] = "Heaviside's dolphin"
mapping_names['Cephalorhynchus commersoni'] = "Commerson's dolphin"
mapping_names['Phocoena spinnipinis'] = "Burmeister's porpoise"
mapping_names['Inia geoffrensis'] = "Amazon river dolphin"
mapping_names['Pontoporia blainviller'] = "La Plata dolphin"
mapping_names['Ipotes vexillife'] = "Baiji"
mapping_names['lesoplodon peruvianus'] = "Pygmy beaked whale"
mapping_names['Mesolpodon stejnegeri'] = "Stejneger's beaked whale"
mapping_names['Viesoplodon grayli'] = "Gray's beaked whale"
mapping_names['Mesoplodon hectori'] = "Hector's beaked whale"
mapping_names['Mesoplodon layardi'] = "Strap-toothed beaked whale"
mapping_names['Mesoplodon gingkodens'] = "Ginkgo-toothed beaked whale"
mapping_names['Ziphius cavirostris lasmacetus shepherdi'] = "Cuvier's beaked whale"
mapping_names['Berardius bairdit'] = "Baird's beaked whale"
mapping_names['Berardius arnouxii'] = "Arnoux's beaked whale"
mapping_names['Kogia brevirostris'] = "Pygmy sperm whale"
mapping_names['Eschrictius robustus'] = "Gray whale"

In [13]:
for name in philogenetic_tree_names:
    if name not in mapping_names:
        print(name)

In [14]:
philo_common_names = [mapping_names[name] for name in philogenetic_tree_names]
philo_common_names

['Long-snouted spinner dolphin',
 'Short-snouted spinner dolphin',
 "Fraser's dolphin",
 'Common dolphin',
 'Striped dolphin',
 'Pantropical spotted dolphin',
 'Atlantic spotted dolphin',
 'Indo-Pacific bottlenose dolphin',
 'Common bottlenose dolphin',
 'Indo-Pacific hump-backed dolphin',
 'Guiana dolphin',
 'Long-finned pilot whale',
 'Short-finned pilot whale',
 'Melon-headed whale',
 'Pygmy killer whale',
 'False killer whale',
 "Risso's dolphin",
 'Rough-toothed dolphin',
 'Irrawaddy dolphin',
 'Australian snubfin dolphin',
 'Pacific white-sided dolphin',
 'Dusky dolphin',
 'Southern right whale dolphin',
 'Northern right whale dolphin',
 "Peale's dolphin",
 "Heaviside's dolphin",
 "Commerson's dolphin",
 'White-beaked dolphin',
 'Orca',
 'Atlantic white-sided dolphin',
 'Spectacled porpoise',
 "Burmeister's porpoise",
 'Harbour porpoise',
 'Dall porpoise',
 'Finless porpoise',
 'Beluga',
 'Narwhal',
 'Amazon river dolphin',
 'La Plata dolphin',
 'Baiji',
 'Pygmy beaked whale',
 "

In [10]:
import requests, os
from tqdm import tqdm

# Set the folder name where images will be stored
my_folder = 'wiki_images'
os.makedirs(my_folder, exist_ok=True)

# Base URL for Wikipedia API
query = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='

# Wikipedia article to download the main image from
my_list = philo_common_names#species_list#IDs_and_names["Common name"].dropna().tolist()

# Define User-Agent header to respect Wikipedia's user-client policy :,)
HEADERS = {
    "User-Agent": "WhereWereWhales: cetaceans images from Wikipedia"
}


def get_image_url(partial_url):
    """ Gets the main image URL from a Wikipedia article """
    try:
        api_res = requests.get(query + partial_url, headers=HEADERS).json()
        first_part = api_res['query']['pages']
        for key, value in first_part.items():
            if 'original' in value:
                return value['original']['source']
    except Exception as exc:
        print(exc)
        print("Partial URL: " + partial_url)
    return None


def download_image(the_url, the_page):
    res = requests.get(the_url, headers=HEADERS) 
    res.raise_for_status()
    
    if res.status_code != 200:
        print(f"Error scraping {the_page}: {res.status_code}")

    # Get the correct file extension
    file_ext = '.' + the_url.split('.')[-1].lower()
    image_file = os.path.join(my_folder, os.path.basename(the_page + file_ext))

    # Save the image
    with open(image_file, 'wb') as f:
        for chunk in res.iter_content(100000):
            f.write(chunk)

    #print(f"Downloaded: {image_file}")

# Download images from Wikipedia articles
for the_page in tqdm(my_list, desc="Scraping Wikipedia images", unit="pages"):
    the_url = get_image_url(the_page)
    if the_url:
        #print(f"Downloading an image of a magnificent {the_page}...")
        download_image(the_url, the_page)
    else:
        print(f"No image found for {the_page} :(")

print("All done!")


Scraping Wikipedia images:   1%|▏         | 1/74 [00:00<00:15,  4.59pages/s]

No image found for Long-snouted spinner dolphin :(


Scraping Wikipedia images:   3%|▎         | 2/74 [00:00<00:15,  4.78pages/s]

No image found for Short-snouted spinner dolphin :(


Scraping Wikipedia images:  14%|█▎        | 10/74 [00:03<00:21,  3.04pages/s]

No image found for Indo-Pacific hump-backed dolphin :(


Scraping Wikipedia images:  46%|████▌     | 34/74 [00:13<00:12,  3.25pages/s]

No image found for Dall porpoise :(


Scraping Wikipedia images:  49%|████▊     | 36/74 [00:13<00:11,  3.44pages/s]

No image found for Beluga :(


Scraping Wikipedia images:  65%|██████▍   | 48/74 [00:20<00:07,  3.32pages/s]

No image found for Andrew's beaked whale :(


Scraping Wikipedia images:  66%|██████▌   | 49/74 [00:20<00:06,  3.65pages/s]

No image found for strap-toothed beaked whale :(


Scraping Wikipedia images:  68%|██████▊   | 50/74 [00:20<00:06,  3.77pages/s]

No image found for Gervais' beaked whale :(


Scraping Wikipedia images:  97%|█████████▋| 72/74 [00:29<00:00,  2.95pages/s]

No image found for Pacific right whale :(


Scraping Wikipedia images: 100%|██████████| 74/74 [00:30<00:00,  2.43pages/s]

All done!





In [None]:
mapping_names["Long-snouted spinner dolphin"] = "Spinner dolphin"
mapping_names["Short-snouted spinner dolphin"] = "Spinner dolphin"
mapping_names["Indo-Pacific hump-backed dolphin"] = "Indo-Pacific humpback dolphin"
mapping_names["Dall porpoise"] = "Dall's porpoise"
mapping_names["Beluga"] = "Beluga whale" 
mapping_names["Andrew's beaked whale"] = ""
mapping_names["Strap-toothed beaked whale"] = ""
mapping_names["Gervais' beaked whale"] = ""
mapping_names["Pacific right whale"] = ""

SyntaxError: invalid syntax (2611185562.py, line 1)