In [3]:
# Mount Google Drive in the Colab runtime; the dataset paths used below live under it.
# NOTE(review): later cells use the relative path "gdrive/MyDrive/...", which only resolves to
# this mount point when the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os
import random
from collections import defaultdict
import shutil

In [4]:
spectrograms_dir = "gdrive/MyDrive/Cogfee Beans/spectrograms"
animals = os.listdir(spectrograms_dir)

# Lookup table: species folder name -> coarse class label ("dolphin", "seal" or "whale").
animal2genus = {}
for animal in animals:
  lowered = animal.lower()
  # "Killer Whale" is tested first so it is labelled as a dolphin rather than
  # falling through to the generic "whal" substring rule.
  if animal == "Killer Whale":
    genus = "dolphin"
  elif "seal" in lowered:
    genus = "seal"
  elif "dolphin" in lowered:
    genus = "dolphin"
  elif "whal" in lowered or animal == "Short-Finned":
    genus = "whale"
  else:
    # No rule matched: surface the folder name and leave it out of the mapping.
    print(animal)
    continue
  animal2genus[animal] = genus
animal2genus

{'Atlantic Spotted Dolphin': 'dolphin',
 'Bearded Seal': 'seal',
 'Beluga, White Whale': 'whale',
 'Bowhead Whale': 'whale',
 'Clymene Dolphin': 'dolphin',
 'Common Dolphin': 'dolphin',
 'False Killer Whale': 'whale',
 'Fin, Finback Whale': 'whale',
 "Fraser's Dolphin": 'dolphin',
 "Grampus, Risso's Dolphin": 'dolphin',
 'Harp Seal': 'seal',
 'Humpback Whale': 'whale',
 'Killer Whale': 'dolphin',
 'Leopard Seal': 'seal',
 'Long-Finned Pilot Whale': 'whale',
 'Melon Headed Whale': 'whale',
 'Narwhal': 'whale',
 'Northern Right Whale': 'whale',
 'Pantropical Spotted Dolphin': 'dolphin',
 'Ross Seal': 'seal',
 'Rough-Toothed Dolphin': 'dolphin',
 'Short-Finned': 'whale',
 'Sperm Whale': 'whale',
 'Spinner Dolphin': 'dolphin',
 'Striped Dolphin': 'dolphin',
 'White-beaked Dolphin': 'dolphin',
 'White-sided Dolphin': 'dolphin'}

In [5]:
# Number of species folders that received a class label.
len(animal2genus)


27

In [6]:
# Number of species folders found on disk; should equal len(animal2genus) if every folder was
# labelled. (`animal` is the leftover loop variable, so len(animal) only measured the length of
# the last folder's name.)
len(animals)

27

In [7]:
def display_audio_count(root=spectrograms_dir):
    """Print every entry found in ``root`` together with the number of files inside it."""
    for species in os.listdir(root):
        species_path = os.path.join(root, species)
        label = species_path.split("/")[-1]  # last path component, i.e. the folder name
        n_files = len(os.listdir(species_path))
        print(label, n_files)
display_audio_count()

Narwhal 47
Ross Seal 49
Sperm Whale 29
Harp Seal 46
Bearded Seal 33
Killer Whale 32
Short-Finned 56
Leopard Seal 10
Bowhead Whale 60
Common Dolphin 24
Humpback Whale 23
Clymene Dolphin 21
Spinner Dolphin 70
Striped Dolphin 45
Fraser's Dolphin 40
False Killer Whale 50
Fin, Finback Whale 472
Melon Headed Whale 48
Beluga, White Whale 33
White-sided Dolphin 9
Northern Right Whale 54
White-beaked Dolphin 12
Rough-Toothed Dolphin 38
Long-Finned Pilot Whale 42
Atlantic Spotted Dolphin 38
Grampus, Risso's Dolphin 50
Pantropical Spotted Dolphin 56


{'dolphin', 'seal', 'whale'}

In [8]:
# Destination root under which the train/ and val/ folders are created.
data_dir = "gdrive/MyDrive/Cogfee Beans/train_val_data"

In [36]:
random.seed(2022)
def make_train_val_paths(root=data_dir, spectrograms_dir=spectrograms_dir, animal2genus=animal2genus):
    """Build a class-balanced train/val split of spectrogram image paths, grouped by genus.

    Each species folder in ``spectrograms_dir`` is shuffled and split roughly 80/20
    (``int(n * 0.8)`` train images, ``int(n * 0.2)`` val images). The per-species lists
    are pooled by genus, then every genus is shuffled again and truncated to the size
    of the smallest genus so all classes end up with the same number of images.

    Args:
        root: directory in which the ``train`` and ``val`` folders are created.
        spectrograms_dir: directory holding one sub-folder of images per species.
        animal2genus: mapping from species folder name to genus label.

    Returns:
        defaultdict mapping genus -> ``[train_paths, val_paths]``.

    Raises:
        KeyError: if a folder in ``spectrograms_dir`` has no entry in ``animal2genus``.
    """
    # makedirs(exist_ok=True) creates each folder independently. With two mkdir calls
    # inside one try/except, an already existing train/ prevented val/ from being created.
    for split in ("train", "val"):
        os.makedirs(os.path.join(root, split), exist_ok=True)

    train_val_data = defaultdict(lambda: [[], []])
    # os.listdir returns entries in arbitrary order; sorting makes the seeded shuffles
    # below produce the same split on every run.
    for animal in sorted(os.listdir(spectrograms_dir)):
        subroot = os.path.join(spectrograms_dir, animal)
        images = [os.path.join(subroot, x) for x in sorted(os.listdir(subroot))]
        random.shuffle(images)
        genus = animal2genus[animal]
        num_train = int(len(images) * 0.8)
        num_val = int(len(images) * 0.2)
        train_val_data[genus][0] += images[:num_train]
        # Slice from an explicit start index: when num_val is 0 (fewer than 5 images),
        # images[-0:] would be the whole list and leak training images into val.
        train_val_data[genus][1] += images[len(images) - num_val:]

    # Sorted so the order of the shuffles below does not depend on string hash randomization.
    genera = sorted(set(animal2genus.values()))
    if genera:
        # Size of the smallest class decides how many images every class keeps.
        min_num_train = min(len(train_val_data[genus][0]) for genus in genera)
        min_num_val = min(len(train_val_data[genus][1]) for genus in genera)
        for genus in genera:
            random.shuffle(train_val_data[genus][0])
            random.shuffle(train_val_data[genus][1])
            train_val_data[genus][0] = train_val_data[genus][0][:min_num_train]
            train_val_data[genus][1] = train_val_data[genus][1][:min_num_val]
    return train_val_data
train_val_data = make_train_val_paths()

In [37]:
# Per-genus size of the balanced split: training count first, then validation count.
for genus in ("whale", "seal", "dolphin"):
    train_paths, val_paths = train_val_data[genus]
    print(len(train_paths))
    print(len(val_paths))

109
26
109
26
109
26


In [39]:
# Peek at the first dolphin training path to check what the stored source paths look like.
train_val_data["dolphin"][0][0]

'gdrive/MyDrive/Cogfee Beans/spectrograms/Clymene Dolphin/Clymene Dolphin220804_205933384760_33231.png'

In [40]:
data_dir = "gdrive/MyDrive/Cogfee Beans/train_val_data"
def move_train_test_files(root=data_dir,train_test=train_val_data):
    """Copy the selected images into ``root/train/<genus>`` and ``root/val/<genus>``.

    Despite the name, files are copied (``shutil.copy``), so the sources stay in place.

    Args:
        root: destination directory that holds the ``train`` and ``val`` folders.
        train_test: mapping genus -> ``[train_paths, val_paths]`` of source image paths.
    """
    # Iterate the argument, not the module-level train_val_data, so a caller-supplied
    # split is actually honoured.
    for genus, (train_paths, val_paths) in train_test.items():
        for split, paths in (("train", train_paths), ("val", val_paths)):
            genus_dir = os.path.join(root, split, genus)
            # Create each folder independently and tolerate re-runs. If the folder were
            # missing, shutil.copy would write every image onto a single file at this path.
            os.makedirs(genus_dir, exist_ok=True)
            for path in paths:
                shutil.copy(path, genus_dir) # params: src, dst


move_train_test_files()

In [41]:
# List the class folders created under each split.
for split in ("train", "val"):
    print(os.listdir(f"{data_dir}/{split}"))

['whale', 'seal', 'dolphin']
['whale', 'seal', 'dolphin']


In [42]:
# Count the files that landed in each class folder: train then val for every genus.
for genus in ("whale", "seal", "dolphin"):
    for split in ("train", "val"):
        print(len(os.listdir(f"{data_dir}/{split}/{genus}")))

109
26
109
26
109
26
