#### imports

In [None]:
from google.colab import drive
drive.mount('/content/drive') # mount our Google Drive so the notebook can read and write files under /content/drive

Mounted at /content/drive


In [None]:
import pickle
import pandas as pd

## Function to get the dataset (csv + dataframe)



In [None]:
def get_dataset(path_to_watkins_sound_list, families_to_keep=None, species_to_keep=None, multi_species=False, noise = False, min_duration = 0, output_dir='/content/drive/MyDrive/lewagon-deepdive/raw_data'):
  """
  Load the watkins_sound_list and return a df with the sounds we want to have in the dataset.

  The selected rows are also written to a csv file whose name encodes the
  filters that were applied, e.g.
  ``families-AC-BA_species-all_mono-species_no-noise_min-1sec.csv``.

  Input:
    path_to_watkins_sound_list : path to watkins_sound_list.csv file. The file must
      contain the columns family_code, species_code, multi_species, noise and duration.
    families_to_keep : list of families that we want to include in the dataset.
      A single family code may also be given as a plain string.
      Default: None (or an empty list) means all families.
    species_to_keep : list of species that we want to include in the dataset.
      A single species code may also be given as a plain string.
      Default: None (or an empty list) means all species.
    multi_species : True if you want to include sounds with several species. Default: False.
    noise : True if you want to include sounds with polluting noise. Default: False.
    min_duration : minimum duration (seconds) of the sounds you want to include in the dataset. Default: 0.
    output_dir : directory in which the csv file is written. It is not created by this
      function. Default: the project's raw_data folder on the mounted Google Drive.

  Output:
    Creates a csv file with all the selected sounds
    Returns a dataframe (with a fresh 0..n-1 index) with the sounds we want to keep.
  """

  # strings for the name of the final csv file to be created
  family_string = 'all'
  species_string = 'all'
  nb_species = 'multi'
  included_noise = 'polluting'

  # a bare string would be rejected by isin() and split into characters by join(),
  # so treat it as a one-element list
  if isinstance(families_to_keep, str):
    families_to_keep = [families_to_keep]
  if isinstance(species_to_keep, str):
    species_to_keep = [species_to_keep]

  # loading the watkins_sound_list in a dataframe
  dataset = pd.read_csv(path_to_watkins_sound_list)

  # keep only required families
  if families_to_keep:
    families = list(families_to_keep)
    dataset = dataset[dataset.family_code.isin(families)]
    # str() so that non-string codes do not break the file name
    family_string = '-'.join(str(code) for code in families)

  # keep only required species
  if species_to_keep:
    species = list(species_to_keep)
    dataset = dataset[dataset.species_code.isin(species)]
    species_string = '-'.join(str(code) for code in species)

  # if multi_species is False, keep only sounds with one species
  if not multi_species:
    dataset = dataset[dataset.multi_species==False]
    nb_species = 'mono'

  # if noise is False, keep only sounds with no polluting noise
  if not noise:
    dataset = dataset[dataset.noise==False]
    included_noise = 'no'

  # keep only sounds at least as long as min_duration
  if min_duration != 0:
    dataset = dataset[dataset.duration >= min_duration]

  # resetting the index (reassign instead of inplace so we never mutate a filtered view)
  dataset = dataset.reset_index(drop=True)

  # create csv file
  csv_name = f"families-{family_string}_species-{species_string}_{nb_species}-species_{included_noise}-noise_min-{min_duration}sec"
  dataset.to_csv(f'{output_dir}/{csv_name}.csv')

  return dataset


# Test

In [None]:
# our parameters: the values passed to get_dataset in the test below

## path to the csv document where all the sounds are listed
watkins_sound_list = '/content/drive/MyDrive/lewagon-deepdive/raw_data/watkins_sound_list.csv'

## we want to keep only the four biggest families (family codes as found in the family_code column)
families_to_keep=['AC','BA','BD','BE']

## Use the following argument if you want to filter species
# species_to_keep=['here your list of species']

## Use the following argument if you want to include sounds with several species
# multi_species=True

## Use the following argument if you want to include sounds with polluting noise
# noise = True

## we want to keep recordings that last at least 1 second
min_duration = 1

In [None]:
# test: build the filtered dataframe (get_dataset also writes the matching csv file)
dataset = get_dataset(watkins_sound_list, families_to_keep=families_to_keep, min_duration = min_duration)

In [None]:
# preview the first rows of the filtered dataset
dataset.head()

Unnamed: 0,family_code,family_name,species_code,species_name,common_name,filename,duration,sampling_rate,location,observation_date,noise,noise_description,multi_species,species2,species3
0,AC,Balaenopteridae,AC1A,Balaenoptera acutorostrata Lacépède 1804,Minke whale,AC1A_64103004.wav,1.417969,1280,Cape Crozier - Antarctica,22-Nov-1964,False,,False,,
1,AC,Balaenopteridae,AC1A,Balaenoptera acutorostrata Lacépède 1804,Minke whale,AC1A_64103005.wav,1.417969,1280,Cape Crozier - Antarctica,22-Nov-1964,False,,False,,
2,AC,Balaenopteridae,AC1A,Balaenoptera acutorostrata Lacépède 1804,Minke whale,AC1A_64103006.wav,1.417969,1280,Cape Crozier - Antarctica,22-Nov-1964,False,,False,,
3,AC,Balaenopteridae,AC1A,Balaenoptera acutorostrata Lacépède 1804,Minke whale,AC1A_64103007.wav,1.417969,1280,Cape Crozier - Antarctica,22-Nov-1964,False,,False,,
4,AC,Balaenopteridae,AC1A,Balaenoptera acutorostrata Lacépède 1804,Minke whale,AC1A_64103009.wav,1.413281,1280,Cape Crozier - Antarctica,22-Nov-1964,False,,False,,


# save the dataframe to a pickle file to use it in another notebook

In [None]:
# file name of the pickled dataframe
pickle_name = 'dataset_df.pkl'
# folder on the mounted Google Drive where the pickle is written
pickle_location = '/content/drive/MyDrive/lewagon-deepdive/working_environment/01.getting_data'

In [None]:
# full path of the pickle file, built from the folder and file name defined above
pickle_path = f'{pickle_location}/{pickle_name}'

# serialize the dataframe in binary mode; the context manager closes the file for us
with open(pickle_path, 'wb') as pickle_file:
  pickle.dump(dataset, pickle_file)