# Watkins Marine Mammal Sound database

https://cis.whoi.edu/science/B/whalesounds

The list of available species is on page 41 of https://whoicf2.whoi.edu/science/B/whalesounds/WHOI-92-31.pdf

## Imports

In [None]:
import time
import random
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from google.colab import drive

In [None]:
# Mount Google Drive into the Colab runtime; the project folder used by this
# notebook lives below the mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)
project_path = f'{drive_mount_point}/MyDrive/lewagon-deepdive/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Scraping "best-of cuts"  
https://cis.whoi.edu/science/B/whalesounds/index.cfm

### Parameters of scraping

In [None]:
# Endpoint serving the per-species "Best of" listing; the species code is
# appended through the ``code`` query parameter.
best_root_url = (
    'https://cis.whoi.edu/science/B/whalesounds/'
    'bestOf.cfm'
)

In [None]:
# Species codes of the 'Best of' cuts section: 1,694 sound cuts deemed to be of
# higher sound quality and lower noise, from 32 different species.
# The codes keep their original order; each row groups codes that share the
# same two-letter prefix.
best_species = [
    'AA1A', 'AA3A', 'AA3B',
    'AC1A', 'AC1F', 'AC2A',
    'BA2A',
    'BB1A', 'BB2A',
    'BD3B', 'BD4A', 'BD5A', 'BD6A', 'BD6B', 'BD10A',
    'BD15A', 'BD15B', 'BD15C', 'BD15F', 'BD15L', 'BD17A', 'BD19D',
    'BE3C', 'BE3D', 'BE7A', 'BE9A',
    'CB1A',
    'CC2A', 'CC4A', 'CC5A', 'CC12G', 'CC14A',
]

# Sanity check: the count should match the 32 species announced above.
species_count = len(best_species)
print("number of species :", species_count)

number of species : 32


### Test with only one page (AA1A)

In [None]:
# Fetch and parse the listing page of the first species only, as a quick check
# of the page structure before scraping every species.
url = f'{best_root_url}?code={best_species[0]}'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Stop on HTTP errors (4xx/5xx) instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
# Walk down the parsed page one step at a time: "database" container ->
# "large-12 columns" container -> first table -> its rows.
db_container = soup.find("div", class_="database")
grid_container = db_container.find("div", class_="large-12 columns")
listing_rows = grid_container.find("table").find_all("tr")

# Row index 1 is the second <tr> (presumably the first data row after a header
# row; confirm against the page). Its fourth cell holds the download anchor.
row_cells = listing_rows[1].find_all("td")
row_cells[3].find("a")['href']

'/science/B/whalesounds/WhaleSounds/59037001.wav'

### Scraping all 32 species

In [None]:
# Scrape the "Best of" listing of every species and write one CSV per species
# to <project_path>/data/csv_files/best_of_<species>.csv (';'-separated).

# Column order of every CSV. Passing it explicitly keeps the header stable even
# when a page yields no data rows.
best_of_columns = [
    'species_code', 'location', 'observation_date', 'download_link', 'metadata'
]
# Seconds to wait for the server; without a timeout requests.get can block
# indefinitely and stall the whole loop.
request_timeout = 30

for species in tqdm(best_species):
    # Random pause before each request in order not to overload the server.
    time.sleep(random.uniform(2, 5))

    url = f'{best_root_url}?code={species}'
    response = requests.get(url, timeout=request_timeout)
    # Stop on HTTP errors (4xx/5xx) instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Locate the listing table step by step so that a missing element is
    # reported with the species and URL rather than as an AttributeError on
    # None.
    results_table = None
    database_div = soup.find("div", class_="database")
    if database_div is not None:
        columns_div = database_div.find("div", class_="large-12 columns")
        if columns_div is not None:
            results_table = columns_div.find("table")
    if results_table is None:
        raise ValueError(
            f'no results table found for species {species} at {url}')

    table = []
    for line in results_table.find_all("tr"):
        info = line.find_all("td")
        # Only rows with exactly five <td> cells are recorded; any other row
        # (presumably the header row) is skipped.
        if len(info) != 5:
            continue
        # NOTE(review): `.string` is None for a cell that holds more than one
        # child node (see Beautiful Soup docs) - confirm the location/date
        # cells are plain text.
        my_dict = {
            'species_code': species,
            'location': info[1].string,
            'observation_date': info[2].string,
            # Links are stored exactly as found in the page (site-relative
            # paths in the sample output of the single-page test).
            'download_link': info[3].find("a")['href'],
            'metadata': info[4].find("a")['href']
        }
        table.append(my_dict)

    my_df = pd.DataFrame(table, columns=best_of_columns)
    my_df.to_csv(project_path + f'data/csv_files/best_of_{species}.csv', sep=';')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [02:00<00:00,  3.76s/it]
