#### imports

In [None]:
from google.colab import drive
drive.mount('/content/drive') # mount our Google Drive into the notebook

Mounted at /content/drive


In [None]:
import os
import pickle
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm

# using tqdm with pandas: this call makes `progress_apply` available on
# DataFrames, which is used further down to display a progress bar.
tqdm.pandas(desc="Progress")

# Function to get the tuple (data, species_code) for **one** audio file

In [None]:
def get_audio_data_and_species_code(audio_directory, filename, species_code, sr=44_100):
  '''
  Load one audio file and pair its samples with the species code.

  Input:
    audio_directory : path to the directory where audio is stored
    filename : name of the audio file inside audio_directory
    species_code : code of the species recorded on the audio file (returned unchanged)
    sr : target sampling rate passed to librosa.load(). Default = 44100 Hz

  Output:
    audio_data : audio time series returned by librosa.load(). The file is
                 loaded with librosa's default mono=True, so the signal is
                 down-mixed to a single channel: shape=(n,).
    species_code : the species code
  '''

  # path to audio file (audio_directory and filename)
  filepath = os.path.join(audio_directory, filename)

  # librosa.load() returns (samples, sampling_rate); the rate is not needed
  # here, so only the samples are kept.
  audio_data, _ = librosa.load(filepath, sr=sr)

  return audio_data, species_code

In [None]:
# test with mp3 sound: load a single external mp3 file directly with librosa,
# resampled to 44.1 kHz, to check that mp3 decoding works in this environment
path = '/content/drive/MyDrive/lewagon-deepdive/raw_data/external_audio_files/AA3A_NOAA_Northern-right-whale.mp3'
audio_data, sampling_rate = librosa.load(path,sr=44_100)



In [None]:
audio_data.shape

(663552,)

# Function to get the list of tuples (data, species_code) for **all** audio files in a dataframe

In [None]:
def get_list_of_tuples(dataset, audio_directory, sr=44_100, nb_rows=None):
  '''
  Takes a pandas dataframe containing the names of the sound files to be treated.
  Returns a list of tuples of two items:
    - the data of the audio file
    - the corresponding species code

  Requires tqdm.pandas() to have been called beforehand, since the rows are
  processed with DataFrame.progress_apply.

  Input:
    dataset : dataframe with (at least) the columns `filename` and `species_code`
    audio_directory : path to the directory where audio is stored
    sr : sampling rate (default = 44100 Hz)
    nb_rows : number of rows of the dataframe to process, taken from the top
              (to allow testing on a small number of rows).
              None (default) processes every row; 0 processes none.

  Output:
    list_of_tuples : a list of tuples containing the data and the species code
                     (empty list when there is no row to process)
  '''

  # resize the dataset if requested in parameters.
  # `is not None` (rather than truthiness) so that nb_rows=0 really means
  # "no rows" instead of silently processing the whole dataset.
  if nb_rows is not None:
    dataset = dataset.head(nb_rows)

  # nothing to load: on a frame without rows, apply(axis=1) does not return a
  # Series of tuples, so the .tolist() call below would fail.
  if len(dataset) == 0:
    return []

  # iterate over the rows of the dataset to get the audio data and the species
  # code for each audio file (progress_apply displays the tqdm progress bar)
  list_of_tuples = dataset.progress_apply(
    lambda row: get_audio_data_and_species_code(audio_directory, row.filename, row.species_code, sr),
    axis=1,
  ).tolist()

  return list_of_tuples


# Test

In [None]:
# our parameters

## path to the directory where all .wav files are stored
audio_directory = '/content/drive/MyDrive/lewagon-deepdive/raw_data/wav_files'

## load a dataframe for the test
## (the `with` block closes the file handle as soon as the pickle has been read)
## NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('/content/drive/MyDrive/lewagon-deepdive/working_environment/01.getting_data/dataset_df.pkl', 'rb') as f:
  dataset = pickle.load(f)

In [None]:
dataset.shape

(727, 18)

In [None]:
nb_rows = 20 # to test only on a few rows (set to None to process the whole dataset)

# pass the limit defined above; it was previously ignored because None was hard-coded here
list_of_tuples = get_list_of_tuples(dataset, audio_directory, nb_rows=nb_rows)

Progress: 100%|██████████| 727/727 [07:27<00:00,  1.62it/s]


In [None]:
len(list_of_tuples)

20

# Using a pickle file

In [None]:
# name of the pickle file used to store the list of (audio_data, species_code) tuples
pickle_name = 'test_audio_data.pkl'
# directory (on the mounted Drive) where the pickle file is written and read back
pickle_location = '/content/drive/MyDrive/lewagon-deepdive/working_environment/01.getting_data'

In [None]:
# save the list into a pickle file
# (opened in binary write mode; the file is closed when the `with` block exits)
with open(f'{pickle_location}/{pickle_name}', 'wb') as f:
  pickle.dump(list_of_tuples, f)

In [None]:
# load the list from a pickle file
# (the `with` block closes the file handle once the list has been read)
with open(f'{pickle_location}/{pickle_name}', 'rb') as f:
  my_list = pickle.load(f)

# Separate the tuples into two different lists

In [None]:
# Unzip the (audio_data, species_code) pairs into two parallel lists:
# index i of each list refers to the same audio file.
audio_data_list, species_code_list = [], []

for audio_data, species_code in list_of_tuples:
  audio_data_list += [audio_data]
  species_code_list += [species_code]