###### Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')
project_path = '/content/drive/MyDrive/lewagon-deepdive/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import time
import librosa
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.io import wavfile
import matplotlib.pyplot as plt
from tqdm import tqdm
import soundfile as sf

# using tqdm with pandas
tqdm.pandas(desc="Progress")

# Dataframe construction

## Track list

In [None]:
sound_list = pd.read_csv('/content/drive/MyDrive/lewagon-deepdive/raw_data/watkins_all_cuts.csv',sep=';')

In [None]:
sound_list.shape

(15566, 6)

In [None]:
sound_list.head(2)

Unnamed: 0,species_code,species_name,location,observation_date,download_link,metadata
0,BD15F,Stenella frontalis,,6-Aug-1961,/science/B/whalesounds/WhaleSounds/61025001.wav,javascript:popUpWin('metaData.cfm?RN=61025001')
1,BD15F,Stenella frontalis,,6-Aug-1961,/science/B/whalesounds/WhaleSounds/61025002.wav,javascript:popUpWin('metaData.cfm?RN=61025002')


## Species list

In [None]:
df_species = pd.read_csv('/content/drive/MyDrive/lewagon-deepdive/raw_data/species_table.csv',sep=';')

In [None]:
df_species.shape

(133, 9)

In [None]:
df_species.head(2)

Unnamed: 0,order_code,order_name,family_code,family_name,genus_code,genus_name,species_code,species_name,common_name
0,A,Mysticeti,AA,Balaenidae,AA1,Balaena,AA1A,Balaena mysticetus Linnaeus 1758,Bowhead whale
1,A,Mysticeti,AA,Balaenidae,AA3,Eubalaena,AA3A,Eubalaena glacialis (Borowski) 1781,Northern right whale


## Merge both

In [None]:
sound_list = pd.merge(
                    sound_list, 
                    df_species, 
                    on ='species_code', 
                    how ='left')

In [None]:
sound_list.head(2)

Unnamed: 0,species_code,species_name_x,location,observation_date,download_link,metadata,order_code,order_name,family_code,family_name,genus_code,genus_name,species_name_y,common_name
0,BD15F,Stenella frontalis,,6-Aug-1961,/science/B/whalesounds/WhaleSounds/61025001.wav,javascript:popUpWin('metaData.cfm?RN=61025001'),B,Odontoceti,BD,Delphinidae,BD15,Stenella,Stenella frontalis (G. Cuvier) 1829,Atlantic Spotted Dolphin
1,BD15F,Stenella frontalis,,6-Aug-1961,/science/B/whalesounds/WhaleSounds/61025002.wav,javascript:popUpWin('metaData.cfm?RN=61025002'),B,Odontoceti,BD,Delphinidae,BD15,Stenella,Stenella frontalis (G. Cuvier) 1829,Atlantic Spotted Dolphin


## Add a filename column

In [None]:
sound_list['filename']=sound_list['species_code'] + "_" + sound_list['download_link'].str.rsplit('/',n=1, expand=True)[1]

In [None]:
sound_list.head(2)

Unnamed: 0,species_code,species_name_x,location,observation_date,download_link,metadata,order_code,order_name,family_code,family_name,genus_code,genus_name,species_name_y,common_name,filename
0,BD15F,Stenella frontalis,,6-Aug-1961,/science/B/whalesounds/WhaleSounds/61025001.wav,javascript:popUpWin('metaData.cfm?RN=61025001'),B,Odontoceti,BD,Delphinidae,BD15,Stenella,Stenella frontalis (G. Cuvier) 1829,Atlantic Spotted Dolphin,BD15F_61025001.wav
1,BD15F,Stenella frontalis,,6-Aug-1961,/science/B/whalesounds/WhaleSounds/61025002.wav,javascript:popUpWin('metaData.cfm?RN=61025002'),B,Odontoceti,BD,Delphinidae,BD15,Stenella,Stenella frontalis (G. Cuvier) 1829,Atlantic Spotted Dolphin,BD15F_61025002.wav


## Add a duration column and a sampling rate column

### Define a function to get duration and a function for sampling rate

In [None]:
sf.info('/content/drive/MyDrive/lewagon-deepdive/raw_data/Watkins_all_cuts_part1/BD15F_61025007.wav').__getattribute__('duration')

1.763986332574032

In [None]:
def file_duration(filename):
    """ return the duration in seconds of an audio file """
    
    # directory where .wav files are stored
    directory = '/content/drive/MyDrive/lewagon-deepdive/raw_data/Watkins_all_cuts_part2'
    
    path = f'{directory}/{filename}'
    
    # try except to avoid errors due to corrupted files
    try:
        return sf.info(path).__getattribute__('duration')
    except:
        return np.nan


In [None]:
def file_sampling_rate(filename):
  """ return the sampling rate of a .wav file """
  
  # directory where .wav files are stored
  directory = project_path + 'raw_data/wav_files'
  
  path = f'{directory}/{filename}'
  
  # try except to avoid errors due to corrupted files
  try:
      sampling_rate, data = wavfile.read(path)
      return sampling_rate
  except:
      return np.nan  

### Apply the functions on entire dataframe  
❗Attention : next cell takes a long time to compute

In [None]:
# splitting my df in two parts
sound_list_part1 = sound_list.iloc[:8_114]
sound_list_part2 = sound_list.iloc[8_114:]

In [None]:
sound_list_part2

Unnamed: 0,species_code,species_name_x,location,observation_date,download_link,metadata,order_code,order_name,family_code,family_name,genus_code,genus_name,species_name_y,common_name,filename
8114,BE3C,Globicephala melaena,Western North Atlantic,12-sept-75,/science/B/whalesounds/WhaleSounds/7501000M.wav,javascript:popUpWin('metaData.cfm?RN=7501000M'),B,Odontoceti,BE,Globicephalidae,BE3,Globicephala,Globicephala melaena (Traill) 1809,Long-finned pilot,BE3C_7501000M.wav
8115,BE3C,Globicephala melaena,Western North Atlantic,12-sept-75,/science/B/whalesounds/WhaleSounds/7501000N.wav,javascript:popUpWin('metaData.cfm?RN=7501000N'),B,Odontoceti,BE,Globicephalidae,BE3,Globicephala,Globicephala melaena (Traill) 1809,Long-finned pilot,BE3C_7501000N.wav
8116,BE3C,Globicephala melaena,Western North Atlantic,12-sept-75,/science/B/whalesounds/WhaleSounds/7501000O.wav,javascript:popUpWin('metaData.cfm?RN=7501000O'),B,Odontoceti,BE,Globicephalidae,BE3,Globicephala,Globicephala melaena (Traill) 1809,Long-finned pilot,BE3C_7501000O.wav
8117,BE3C,Globicephala melaena,Western North Atlantic,12-sept-75,/science/B/whalesounds/WhaleSounds/7501000P.wav,javascript:popUpWin('metaData.cfm?RN=7501000P'),B,Odontoceti,BE,Globicephalidae,BE3,Globicephala,Globicephala melaena (Traill) 1809,Long-finned pilot,BE3C_7501000P.wav
8118,BE3C,Globicephala melaena | Transient X,Western North Atlantic X,12-Sep-1975 X,/science/B/whalesounds/WhaleSounds/7501000Q.wav,javascript:popUpWin('metaData.cfm?RN=7501000Q'),B,Odontoceti,BE,Globicephalidae,BE3,Globicephala,Globicephala melaena (Traill) 1809,Long-finned pilot,BE3C_7501000Q.wav
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15561,BD6A,Lagenorhynchus acutus,E of Stellwagen Bank,25-Apr-1979,/science/B/whalesounds/WhaleSounds/79006015.wav,javascript:popUpWin('metaData.cfm?RN=79006015'),B,Odontoceti,BD,Delphinidae,BD6,Lagenorhynchus,Lagenorhynchus acutus (Gray) 1828,White-sided dolphin,BD6A_79006015.wav
15562,BD6A,Lagenorhynchus acutus,E of Stellwagen Bank,25-Apr-1979,/science/B/whalesounds/WhaleSounds/79006016.wav,javascript:popUpWin('metaData.cfm?RN=79006016'),B,Odontoceti,BD,Delphinidae,BD6,Lagenorhynchus,Lagenorhynchus acutus (Gray) 1828,White-sided dolphin,BD6A_79006016.wav
15563,BD6A,Lagenorhynchus acutus,E of Stellwagen Bank,25-Apr-1979,/science/B/whalesounds/WhaleSounds/79006017.wav,javascript:popUpWin('metaData.cfm?RN=79006017'),B,Odontoceti,BD,Delphinidae,BD6,Lagenorhynchus,Lagenorhynchus acutus (Gray) 1828,White-sided dolphin,BD6A_79006017.wav
15564,BD6A,Lagenorhynchus acutus,E of Stellwagen Bank,25-Apr-1979,/science/B/whalesounds/WhaleSounds/79006018.wav,javascript:popUpWin('metaData.cfm?RN=79006018'),B,Odontoceti,BD,Delphinidae,BD6,Lagenorhynchus,Lagenorhynchus acutus (Gray) 1828,White-sided dolphin,BD6A_79006018.wav


In [None]:
sound_list_part2['duration'] = sound_list_part2['filename'].progress_apply(lambda x: file_duration(x))
sound_list_part2.to_csv('/content/drive/MyDrive/lewagon-deepdive/raw_data/all_sounds_part2.csv', index=False)

Progress: 100%|██████████| 7452/7452 [28:48<00:00,  4.31it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [3]:
sound_list_part2 = pd.read_csv('/content/drive/MyDrive/lewagon-deepdive/raw_data/all_sounds_part2.csv')

In [4]:
sound_list_part2.head(2)

Unnamed: 0,species_code,species_name_x,location,observation_date,download_link,metadata,order_code,order_name,family_code,family_name,genus_code,genus_name,species_name_y,common_name,filename,duration
0,BE3C,Globicephala melaena,Western North Atlantic,12-sept-75,/science/B/whalesounds/WhaleSounds/7501000M.wav,javascript:popUpWin('metaData.cfm?RN=7501000M'),B,Odontoceti,BE,Globicephalidae,BE3,Globicephala,Globicephala melaena (Traill) 1809,Long-finned pilot,BE3C_7501000M.wav,2.023001
1,BE3C,Globicephala melaena,Western North Atlantic,12-sept-75,/science/B/whalesounds/WhaleSounds/7501000N.wav,javascript:popUpWin('metaData.cfm?RN=7501000N'),B,Odontoceti,BE,Globicephalidae,BE3,Globicephala,Globicephala melaena (Traill) 1809,Long-finned pilot,BE3C_7501000N.wav,1.388547


In [5]:
# counting the number of wrong files
sound_list_part2.duration.isna().sum()

0

## Drop unnecessary rows and columns and reorder columns

In [6]:
sound_list_part2.shape

(7452, 16)

In [None]:
# drop rows where duration is nan
# sound_list_part2 = sound_list_part2[sound_list_part2.duration.isna() == False]
# sound_list_part2.shape

In [7]:
sound_list_part2.columns

Index(['species_code', 'species_name_x', 'location', 'observation_date',
       'download_link', 'metadata', 'order_code', 'order_name', 'family_code',
       'family_name', 'genus_code', 'genus_name', 'species_name_y',
       'common_name', 'filename', 'duration'],
      dtype='object')

In [9]:
# dropping columns
sound_list_part2.drop(columns=['order_code', 'order_name', 'genus_code', 'genus_name', 'download_link','metadata','species_name_y'], inplace=True)

In [10]:
sound_list_part2.columns

Index(['species_code', 'species_name_x', 'location', 'observation_date',
       'family_code', 'family_name', 'common_name', 'filename', 'duration'],
      dtype='object')

In [11]:
# reordering
columns_in_order = ['family_code', 'family_name',
                    'species_code', 'species_name_x', 'common_name',
                    'filename', 'duration',
                    'location', 'observation_date'
                    ]

sound_list_part2 = sound_list_part2[columns_in_order]

In [12]:
sound_list_part2.head(10)

Unnamed: 0,family_code,family_name,species_code,species_name_x,common_name,filename,duration,location,observation_date
0,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000M.wav,2.023001,Western North Atlantic,12-sept-75
1,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000N.wav,1.388547,Western North Atlantic,12-sept-75
2,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000O.wav,1.435678,Western North Atlantic,12-sept-75
3,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000P.wav,1.098511,Western North Atlantic,12-sept-75
4,BE,Globicephalidae,BE3C,Globicephala melaena | Transient X,Long-finned pilot,BE3C_7501000Q.wav,1.696711,Western North Atlantic X,12-Sep-1975 X
5,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000R.wav,1.794598,Western North Atlantic,12-sept-75
6,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000S.wav,1.45018,Western North Atlantic,12-sept-75
7,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000T.wav,2.320288,Western North Atlantic,12-sept-75
8,BE,Globicephalidae,BE3C,Globicephala melaena | Transient X,Long-finned pilot,BE3C_7501000U.wav,2.099136,Western North Atlantic X,12-Sep-1975 X
9,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000V.wav,1.819976,Western North Atlantic,12-sept-75


## Replacing all ',' by ' -' in order not to mess up with csv conversion

In [13]:
sound_list_part2.replace(',', ' -', inplace=True, regex=True)

In [14]:
#renaming columns
sound_list_part2.rename(columns={"species_name_x": "species_name"}, inplace=True)

In [15]:
sound_list_part2

Unnamed: 0,family_code,family_name,species_code,species_name,common_name,filename,duration,location,observation_date
0,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000M.wav,2.023001,Western North Atlantic,12-sept-75
1,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000N.wav,1.388547,Western North Atlantic,12-sept-75
2,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000O.wav,1.435678,Western North Atlantic,12-sept-75
3,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000P.wav,1.098511,Western North Atlantic,12-sept-75
4,BE,Globicephalidae,BE3C,Globicephala melaena | Transient X,Long-finned pilot,BE3C_7501000Q.wav,1.696711,Western North Atlantic X,12-Sep-1975 X
...,...,...,...,...,...,...,...,...,...
7447,BD,Delphinidae,BD6A,Lagenorhynchus acutus,White-sided dolphin,BD6A_79006015.wav,1.011072,E of Stellwagen Bank,25-Apr-1979
7448,BD,Delphinidae,BD6A,Lagenorhynchus acutus,White-sided dolphin,BD6A_79006016.wav,1.490625,E of Stellwagen Bank,25-Apr-1979
7449,BD,Delphinidae,BD6A,Lagenorhynchus acutus,White-sided dolphin,BD6A_79006017.wav,0.889673,E of Stellwagen Bank,25-Apr-1979
7450,BD,Delphinidae,BD6A,Lagenorhynchus acutus,White-sided dolphin,BD6A_79006018.wav,0.730884,E of Stellwagen Bank,25-Apr-1979


## Identifying rows with several species

In [None]:
# TODO : remove species codes from location and date columns and add a column with multi-species T/F and a columns by additional species

## Identifying rows with noise

In [None]:
# TODO : remove noise code from location and date columns and add a column with noise T/F and a columns by noise type

## Final tracklist

In [16]:
sound_list_part2.to_csv('/content/drive/MyDrive/lewagon-deepdive/raw_data/all_sounds_part2.csv', index=False)

In [17]:
time.strftime('%H:%M:%S', time.gmtime(sound_list_part2.duration.sum()))

'17:43:47'

In [20]:
sound_list_part1 = pd.read_csv('/content/drive/MyDrive/lewagon-deepdive/raw_data/all_sounds_part1.csv')

In [21]:
sound_list_part1.head(2)

Unnamed: 0,family_code,family_name,species_code,species_name_x,common_name,filename,duration,location,observation_date
0,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025001.wav,3.527995,,6-Aug-1961
1,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025002.wav,3.527995,,6-Aug-1961


In [22]:
#renaming columns
sound_list_part1.rename(columns={"species_name_x": "species_name"}, inplace=True)

In [23]:
sound_list_part1.head(2)

Unnamed: 0,family_code,family_name,species_code,species_name,common_name,filename,duration,location,observation_date
0,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025001.wav,3.527995,,6-Aug-1961
1,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025002.wav,3.527995,,6-Aug-1961


In [24]:
sound_list_part2.head(2)

Unnamed: 0,family_code,family_name,species_code,species_name,common_name,filename,duration,location,observation_date
0,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000M.wav,2.023001,Western North Atlantic,12-sept-75
1,BE,Globicephalidae,BE3C,Globicephala melaena,Long-finned pilot,BE3C_7501000N.wav,1.388547,Western North Atlantic,12-sept-75


In [27]:
total_sound_list = pd.concat([sound_list_part1,sound_list_part2], ignore_index=True)

In [28]:
total_sound_list

Unnamed: 0,family_code,family_name,species_code,species_name,common_name,filename,duration,location,observation_date
0,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025001.wav,3.527995,,6-Aug-1961
1,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025002.wav,3.527995,,6-Aug-1961
2,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025003.wav,3.527995,,6-Aug-1961
3,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025004.wav,1.763986,,6-Aug-1961
4,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025005.wav,1.323007,,6-Aug-1961
...,...,...,...,...,...,...,...,...,...
15560,BD,Delphinidae,BD6A,Lagenorhynchus acutus,White-sided dolphin,BD6A_79006015.wav,1.011072,E of Stellwagen Bank,25-Apr-1979
15561,BD,Delphinidae,BD6A,Lagenorhynchus acutus,White-sided dolphin,BD6A_79006016.wav,1.490625,E of Stellwagen Bank,25-Apr-1979
15562,BD,Delphinidae,BD6A,Lagenorhynchus acutus,White-sided dolphin,BD6A_79006017.wav,0.889673,E of Stellwagen Bank,25-Apr-1979
15563,BD,Delphinidae,BD6A,Lagenorhynchus acutus,White-sided dolphin,BD6A_79006018.wav,0.730884,E of Stellwagen Bank,25-Apr-1979


In [30]:
# total_sound_list.to_csv('/content/drive/MyDrive/lewagon-deepdive/raw_data/watkins_sound_list_extended.csv',index=False)

In [3]:
check = pd.read_csv('/content/drive/MyDrive/lewagon-deepdive/raw_data/watkins_sound_list_extended.csv')

In [4]:
check

Unnamed: 0,family_code,family_name,species_code,species_name,common_name,filename,duration,location,observation_date,noise,multi_species
0,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025001.wav,3.527995,,6-Aug-1961,False,False
1,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025002.wav,3.527995,,6-Aug-1961,False,False
2,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025003.wav,3.527995,,6-Aug-1961,False,False
3,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025004.wav,1.763986,,6-Aug-1961,False,False
4,BD,Delphinidae,BD15F,Stenella frontalis,Atlantic Spotted Dolphin,BD15F_61025005.wav,1.323007,,6-Aug-1961,False,False
...,...,...,...,...,...,...,...,...,...,...,...
15560,BD,Delphinidae,BD6A,Lagenorhynchus acutus,White-sided dolphin,BD6A_79006015.wav,1.011072,E of Stellwagen Bank,25-Apr-1979,False,False
15561,BD,Delphinidae,BD6A,Lagenorhynchus acutus,White-sided dolphin,BD6A_79006016.wav,1.490625,E of Stellwagen Bank,25-Apr-1979,False,False
15562,BD,Delphinidae,BD6A,Lagenorhynchus acutus,White-sided dolphin,BD6A_79006017.wav,0.889673,E of Stellwagen Bank,25-Apr-1979,False,False
15563,BD,Delphinidae,BD6A,Lagenorhynchus acutus,White-sided dolphin,BD6A_79006018.wav,0.730884,E of Stellwagen Bank,25-Apr-1979,False,False
