# Data Preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import numpy as np
import seaborn as sns
import json
import librosa
import soundfile as sf
import random
import pandas as pd
from pydub import AudioSegment
from tqdm import tqdm
from birdlib import utils

In [2]:
# Name of the dataset folder; interpolated into the segment paths below.
DATASET_NAME = "dataset"
# Root folder of the source audio (not referenced by the other visible cells).
AUDIO_SOURCE = "/home/giacomoschiavo/Tovanella"

In [3]:
# Dataset root, with one sub-folder per split (train / valid / test).
DATASET_PATH = f'/home/giacomoschiavo/segments/{DATASET_NAME}'
TRAIN_PATH, VALID_PATH, TEST_PATH = (
    f'{DATASET_PATH}/{split}' for split in ('train', 'valid', 'test')
)

In [4]:
# Species that have a folder in both the test and the train split,
# minus two labels that are always left out.
species_list = set(os.listdir(TEST_PATH)) & set(os.listdir(TRAIN_PATH))
species_list -= {'Engine_Engine', 'Cuculus canorus_Common Cuckoo'}

In [5]:
def _count_entries(folder):
    """Return the number of entries in ``folder``, or 0 if it is not a directory."""
    return len(os.listdir(folder)) if os.path.isdir(folder) else 0


def print_dataset_count_table(dataset_path, species=None):
    """Build a per-species table of entry counts for the train/valid/test splits.

    Despite its name, this function prints nothing: it returns the table so the
    notebook can display it as the cell's last expression.

    Parameters
    ----------
    dataset_path : str
        Dataset root containing the ``train``, ``valid`` and ``test`` folders.
    species : collection of str, optional
        Species (folder names) to include. When omitted, the notebook-level
        ``species_list`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by species (index name ``'Species'``) with integer columns
        ``train``, ``valid`` and ``test``, sorted by ``train`` descending.
        Only species that have an entry in the ``test`` folder are listed.
        A split folder that is missing for a species counts as 0.
    """
    if species is None:
        # Backward-compatible default: fall back to the notebook global.
        species = species_list

    splits = ("train", "valid", "test")
    test_folder = os.path.join(dataset_path, "test")

    dataset_count = {
        name: {
            split: _count_entries(os.path.join(dataset_path, split, name))
            for split in splits
        }
        for name in os.listdir(test_folder)
        if name in species
    }

    # Passing the columns explicitly keeps the frame well-formed (and sortable)
    # even when no species matched; otherwise sort_values raises KeyError.
    dataset_species_count_df = pd.DataFrame.from_dict(
        dataset_count, orient='index', columns=list(splits)
    )
    dataset_species_count_df.index.name = 'Species'
    return dataset_species_count_df.sort_values(by=["train"], ascending=False)


In [6]:
# Per-species entry counts for each split, before any segments are moved.
print_dataset_count_table(DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,13860,0,4683
Fringilla coelebs_Common Chaffinch,8330,0,1370
Sylvia atricapilla_Eurasian Blackcap,3789,0,493
Regulus ignicapilla_Common Firecrest,3218,0,238
Phylloscopus collybita_Common Chiffchaff,2172,0,674
Erithacus rubecula_European Robin,1726,0,556
Troglodytes troglodytes_Eurasian Wren,1394,0,111
Periparus ater_Coal Tit,1160,0,29
Regulus regulus_Goldcrest,877,0,16
Anthus trivialis_Tree Pipit,870,0,17


In [7]:
# Repeats the previous cell's call; the displayed table is identical.
print_dataset_count_table(DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,13860,0,4683
Fringilla coelebs_Common Chaffinch,8330,0,1370
Sylvia atricapilla_Eurasian Blackcap,3789,0,493
Regulus ignicapilla_Common Firecrest,3218,0,238
Phylloscopus collybita_Common Chiffchaff,2172,0,674
Erithacus rubecula_European Robin,1726,0,556
Troglodytes troglodytes_Eurasian Wren,1394,0,111
Periparus ater_Coal Tit,1160,0,29
Regulus regulus_Goldcrest,877,0,16
Anthus trivialis_Tree Pipit,870,0,17


In [8]:
# Species whose test folder currently holds fewer than 100 entries.
species_to_integrate = [
    name
    for name in species_list
    if len(os.listdir(os.path.join(TEST_PATH, name))) < 100
]

In [9]:
# Date counts for the training split, restricted to species_list.
# NOTE(review): the structure returned by utils.get_date_count is not visible
# here — presumably a per-species mapping of recording date to count; confirm
# against birdlib.utils.
dates_count = utils.get_date_count(TRAIN_PATH, species_list)

# Test Integration

In [10]:
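# One-off generation of the test-integration split. Kept commented out so the
# JSON files loaded in the next cell are reused instead of being regenerated.
# train_integration, test_integration = utils.split_dataset(dates_count, TEST_PATH, test_ratio=0.2)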

# with open(f"utils/{DATASET_NAME}/train_integration.json", 'w') as f:
#     json.dump(train_integration, f)

# with open(f"utils/{DATASET_NAME}/test_integration.json", 'w') as f:
#     json.dump(test_integration, f)

In [13]:
# Reload the previously saved test-integration split from disk.
with open(f"utils/{DATASET_NAME}/train_integration.json") as train_file:
    train_integration = json.load(train_file)

with open(f"utils/{DATASET_NAME}/test_integration.json") as test_file:
    test_integration = json.load(test_file)

In [14]:
# Move the segments selected in test_integration from the training split to the
# test split (the later count table shows train shrinking and test growing).
# The helper prints one file name per segment, as seen in the cell output.
# NOTE(review): this physically relocates files — confirm how
# utils.move_by_date behaves when the cell is run a second time.
utils.move_by_date(test_integration, TRAIN_PATH, TEST_PATH)

XC808139_0_63_0.wav
20200215_100000_138_0.wav
20200215_100000_279_0.wav
20200215_100000_334_5.wav
XC808139_0_31_5.wav
20200215_100000_133_5.wav
20200215_100000_256_5.wav
20190608_080000_219_0.wav
20190621_060000_189_0.wav
20200215_100000_342_0.wav
20200215_100000_346_5.wav
20200215_100000_324_0.wav
20200215_100000_276_0.wav
XC896342_0_117_0.wav
20200215_100000_204_0.wav
20200215_100000_229_5.wav
XC808139_0_70_5.wav
20190608_060000_586_5.wav
XC808139_0_66_0.wav
XC808139_0_64_5.wav
XC808139_0_51_0.wav
XC808139_0_21_0.wav
20200215_100000_333_0.wav
20190608_060000_595_5.wav
XC793860_0_16_5.wav
XC896342_0_118_5.wav
XC808139_0_40_5.wav
20190608_060000_594_0.wav
XC808139_0_36_0.wav
XC808139_0_52_5.wav
XC808139_0_48_0.wav
20200215_100000_252_0.wav
20190608_080000_237_0.wav
20200215_100000_364_5.wav
20190608_060000_585_0.wav
20200215_100000_235_5.wav
XC808139_0_58_5.wav
20200215_100000_336_0.wav
XC808139_0_57_0.wav
XC793860_0_19_5.wav
XC808139_0_25_5.wav
20200215_100000_234_0.wav
20200215_10000

# Validation creation

Prendiamo sempre 1/10 random del training set (90/10), come impostato da `test_ratio=0.1` nella cella di split.

ATTENZIONE: siccome è random, bisogna salvare in un file gli esempi spostati in modo da poter annullare lo spostamento.

In [15]:
# Recompute the date counts on the training split, now that the
# test-integration segments have been moved out of it.
# NOTE(review): return structure of utils.get_date_count not visible here.
dates_count_valid = utils.get_date_count(TRAIN_PATH, species_list)

In [16]:
# Make sure every retained species has a folder in the validation split.
for species_name in species_list:
    species_valid_dir = os.path.join(VALID_PATH, species_name)
    os.makedirs(species_valid_dir, exist_ok=True)

In [17]:
# Draw the train/validation split once and persist it as JSON. The split is
# random, so the saved files are the only way to reproduce it or to undo the
# move performed in the next cell. Delete both files to draw a fresh split.
split_dir = f"utils/{DATASET_NAME}"
train_split_file = os.path.join(split_dir, "train_split.json")
valid_split_file = os.path.join(split_dir, "valid_split.json")

if os.path.exists(train_split_file) and os.path.exists(valid_split_file):
    # Reuse the saved split so a re-run moves exactly the same segments.
    with open(train_split_file) as f:
        train_split = json.load(f)
    with open(valid_split_file) as f:
        valid_split = json.load(f)
else:
    train_split, valid_split = utils.split_dataset(dates_count_valid, VALID_PATH, test_ratio=0.1)
    os.makedirs(split_dir, exist_ok=True)
    with open(train_split_file, 'w') as f:
        json.dump(train_split, f)
    with open(valid_split_file, 'w') as f:
        json.dump(valid_split, f)

In [18]:
# Move the segments selected in valid_split from the training split into the
# validation split (the following count table shows valid going from 0 to
# non-zero). The helper prints one file name per segment.
# NOTE(review): this physically relocates files — confirm how
# utils.move_by_date behaves when the cell is run a second time.
utils.move_by_date(valid_split, TRAIN_PATH, VALID_PATH)

XC816699_0_69_0.wav
XC879488_0_31_5.wav
XC688536_0_172_5.wav
XC804860_0_30_0.wav
XC629088_0_28_5.wav
XC530744_0_48_0.wav
XC863397_0_13_5.wav
XC826468_0_84_0.wav
XC908334_0_0_0.wav
XC629088_0_30_0.wav
XC816699_0_18_0.wav
XC897973_0_57_0.wav
XC911446_0_120_0.wav
XC793771_0_28_5.wav
XC688536_0_114_0.wav
XC911446_0_1_5.wav
XC826468_0_87_0.wav
XC826468_0_108_0.wav
XC911446_0_91_5.wav
XC804860_0_151_5.wav
XC688536_0_34_5.wav
XC530744_0_13_5.wav
XC806969_0_37_5.wav
XC816699_0_64_5.wav
XC911446_0_114_0.wav
XC808139_0_0_0.wav
XC898557_0_57_0.wav
XC826468_0_96_0.wav
XC897680_0_150_0.wav
XC748029_0_12_0.wav
XC883372_0_60_0.wav
XC911446_0_87_0.wav
XC826468_0_129_0.wav
XC816699_0_39_0.wav
XC897973_0_61_5.wav
XC816699_0_82_5.wav
XC826468_0_91_5.wav
XC913313_0_31_5.wav
XC911446_0_105_0.wav
XC897973_0_13_5.wav
XC816699_0_25_5.wav
XC863397_0_15_0.wav
XC897973_0_36_0.wav
XC826468_0_78_0.wav
XC688536_0_45_0.wav
XC530744_0_45_0.wav
XC626692_0_55_5.wav
XC871044_0_115_5.wav
XC826468_0_136_5.wav
XC883372_0_5

XC901399_0_37_5.wav
20190621_050000_307_5.wav
XC911446_0_30_0.wav
20190621_130000_487_5.wav
XC715955_0_9_0.wav
20190621_130000_223_5.wav
XC901399_0_55_5.wav
20190621_050000_306_0.wav
20190621_050000_442_5.wav
XC904408_0_40_5.wav
20190621_130000_489_0.wav
20190621_050000_537_0.wav
20190621_030000_79_5.wav
20190621_050000_351_0.wav
20190621_030000_52_5.wav
20190621_130000_384_0.wav
20190621_050000_402_0.wav
20190621_030000_0_0.wav
20190621_100000_358_5.wav
20190621_030000_9_0.wav
20190621_100000_364_5.wav
20190621_100000_370_5.wav
20190621_050000_349_5.wav
20190621_050000_292_5.wav
20190621_030000_27_0.wav
20190621_050000_294_0.wav
20190621_130000_490_5.wav
XC715955_0_10_5.wav
20190621_130000_222_0.wav
20190621_030000_276_0.wav
20190621_030000_277_5.wav
20190621_030000_10_5.wav
20190621_050000_465_0.wav
XC901399_0_22_5.wav
20190621_130000_220_5.wav
20190621_120000_558_0.wav
20190621_100000_369_0.wav
20190621_030000_24_0.wav
20190621_130000_388_5.wav
XC901399_0_54_0.wav
20190621_100000_36

In [19]:
# Per-species entry counts after the test-integration and validation moves.
print_dataset_count_table(DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,13374,486,4683
Fringilla coelebs_Common Chaffinch,7901,429,1370
Sylvia atricapilla_Eurasian Blackcap,3742,47,493
Regulus ignicapilla_Common Firecrest,2777,104,575
Phylloscopus collybita_Common Chiffchaff,2014,158,674
Erithacus rubecula_European Robin,1554,172,556
Troglodytes troglodytes_Eurasian Wren,1175,122,208
Periparus ater_Coal Tit,881,96,212
Regulus regulus_Goldcrest,725,32,136
Turdus merula_Eurasian Blackbird,658,72,315


In [20]:
# Number of species folders in the train, test and valid splits, in that order.
tuple(len(os.listdir(split_dir)) for split_dir in (TRAIN_PATH, TEST_PATH, VALID_PATH))

(55, 23, 20)

Manteniamo solo le specie in cui sono presenti elementi nei tre set

In [21]:
# Training species that have no folder in the validation split are moved (not
# deleted) to <dataset>/removed/train.
valid_species = os.listdir(VALID_PATH)
REMOVED_PATH = f'{DATASET_PATH}/removed'
os.makedirs(REMOVED_PATH, exist_ok=True)
REMOVED_TRAIN_PATH = f'{REMOVED_PATH}/train'
os.makedirs(REMOVED_TRAIN_PATH, exist_ok=True)
for species in os.listdir(TRAIN_PATH):
    if species not in valid_species:
        # Do not pre-create the destination folder: os.rename onto an existing
        # directory raises FileExistsError on Windows, and on POSIX it only
        # works while that directory is empty. The parent already exists.
        os.rename(
            os.path.join(TRAIN_PATH, species),
            os.path.join(REMOVED_TRAIN_PATH, species)
        )

In [22]:
# Test species that have no folder in the validation split are moved (not
# deleted) to <dataset>/removed/test.
valid_species = os.listdir(VALID_PATH)
REMOVED_TEST_PATH = f'{REMOVED_PATH}/test'
os.makedirs(REMOVED_TEST_PATH, exist_ok=True)
for species in os.listdir(TEST_PATH):
    if species not in valid_species:
        # Do not pre-create the destination folder: os.rename onto an existing
        # directory raises FileExistsError on Windows, and on POSIX it only
        # works while that directory is empty. The parent already exists.
        os.rename(
            os.path.join(TEST_PATH, species),
            os.path.join(REMOVED_TEST_PATH, species)
        )

In [23]:
# Species folders left in train, test and valid after the unmatched ones were moved away.
tuple(len(os.listdir(split_dir)) for split_dir in (TRAIN_PATH, TEST_PATH, VALID_PATH))

(20, 20, 20)

In [24]:
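    # Legacy, disabled: per-file move of test-date segments from train to test.
    # It references `train_test_division`, which is not defined in the visible notebook.
    # for audio in os.listdir(os.path.join(TRAIN_PATH, species)):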
    #     test_dates = [date for date in train_test_division[species].keys() 
    #                   if train_test_division[species][date]["is_training"] == False]
    #     audio_date = audio.split("_")[0]
    #     if audio_date in test_dates:
    #         source = os.path.join(TRAIN_PATH, species, audio)
    #         dest = os.path.join(TEST_PATH, species, audio)
    #         os.rename(source, dest)

In [25]:
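# # MOVE EVERYTHING BACK TO TRAINING (disabled undo script)
# NOTE: the E:/ paths below differ from DATASET_PATH used in the rest of the
# notebook, and `removed` now holds `train/` and `test/` sub-folders rather
# than species folders; update both before re-enabling this cell.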
# train_folder = f"E:/Giacomo/Tovanella/{DATASET_NAME}/train"
# test_folder = f"E:/Giacomo/Tovanella/{DATASET_NAME}/test"
# valid_folder = f"E:/Giacomo/Tovanella/{DATASET_NAME}/valid"
# removed = f"E:/Giacomo/Tovanella/{DATASET_NAME}/removed"

# for species in os.listdir(test_folder):
    # for audio in os.listdir(os.path.join(test_folder, species)):
    #     os.rename(
    #         os.path.join(test_folder, species, audio),
    #         os.path.join(train_folder, species, audio),
    #     )
# for species in os.listdir(removed):
#     os.rename(
#         os.path.join(removed, species),
#         os.path.join(train_folder, species),
#     )

In [1]:
# Needs the cell that defines print_dataset_count_table to have been run in the
# current kernel; the NameError recorded below comes from executing this cell
# first on a fresh kernel (execution count 1).
print_dataset_count_table(DATASET_PATH)

NameError: name 'print_dataset_count_table' is not defined