# Data Preprocessing

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import numpy as np
import seaborn as sns
import json
import librosa
import soundfile as sf
import random
import pandas as pd
from pydub import AudioSegment
from tqdm import tqdm
from birdlib import utils

In [2]:
# Name of the dataset folder; interpolated into the dataset paths built in the next cell.
DATASET_NAME = 'dataset'
# NOTE(review): presumably the folder holding the original recordings; it is not
# referenced by any other visible cell — confirm it is still needed.
AUDIO_SOURCE = '/home/giacomoschiavo/Tovanella'

In [3]:
# Dataset root, followed by one sub-folder per split (train / valid / test).
DATASET_PATH = f'/home/giacomoschiavo/segments/{DATASET_NAME}'
TRAIN_PATH, VALID_PATH, TEST_PATH = (
    f'{DATASET_PATH}/{split}' for split in ('train', 'valid', 'test')
)

In [4]:
# Species that have a folder in both the test and the train split ...
species_list = set(os.listdir(TEST_PATH)) & set(os.listdir(TRAIN_PATH))
# ... minus two classes that are excluded by name (no-op when they are absent).
species_list.difference_update({'Engine_Engine', 'Cuculus canorus_Common Cuckoo'})

In [5]:
def print_dataset_count_table(dataset_path, species=None):
    """Build a per-species table of file counts for the train/valid/test splits.

    Despite the name, nothing is printed: the table is returned so that the
    notebook renders it as the cell output.

    Parameters
    ----------
    dataset_path : str
        Dataset root containing ``train``, ``valid`` and ``test`` sub-folders,
        each holding one folder per species.
    species : collection of str, optional
        Species folder names to include. When omitted, the notebook-level
        ``species_list`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per selected species that has a folder under ``test``; index
        named ``'Species'``; columns ``train``, ``valid``, ``test`` holding the
        number of entries in the corresponding folder (0 when the folder does
        not exist); sorted by ``train`` in descending order.
    """
    if species is None:
        species = species_list

    split_names = ("train", "valid", "test")

    def count_files(split, name):
        # A species may be missing from a split entirely; count that as zero.
        folder = os.path.join(f"{dataset_path}/{split}", name)
        return len(os.listdir(folder)) if os.path.exists(folder) else 0

    # The test split drives the row set: species without a test folder are not listed.
    dataset_count = {
        name: {split: count_files(split, name) for split in split_names}
        for name in os.listdir(f"{dataset_path}/test")
        if name in species
    }

    # reindex guarantees the three columns exist even when no species matched,
    # so the sort below cannot fail with a KeyError on an empty table.
    dataset_species_count_df = pd.DataFrame.from_dict(
        dataset_count, orient='index'
    ).reindex(columns=list(split_names))
    dataset_species_count_df.index.name = 'Species'
    return dataset_species_count_df.sort_values(by=["train"], ascending=False)


In [6]:
# Show the per-species file counts of the train/valid/test splits, largest train count first.
print_dataset_count_table(DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fringilla coelebs_Common Chaffinch,9674,424,1370
,6353,729,4683
Sylvia atricapilla_Eurasian Blackcap,3768,192,493
Regulus ignicapilla_Common Firecrest,3301,162,238
Phylloscopus collybita_Common Chiffchaff,2235,112,674
Erithacus rubecula_European Robin,1780,93,556
Loxia curvirostra_Common Crossbill,1449,13,11
Troglodytes troglodytes_Eurasian Wren,1377,73,111
Periparus ater_Coal Tit,1237,57,29
Regulus regulus_Goldcrest,1200,43,16


In [18]:
# Same count table displayed a second time.
# NOTE(review): this cell's execution count (18) is out of sequence with its neighbours and
# its saved output differs from the cell above, so the two outputs reflect different
# on-disk states of the dataset.
print_dataset_count_table(DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fringilla coelebs_Common Chaffinch,9581,517,1370
,6663,419,4683
Sylvia atricapilla_Eurasian Blackcap,3759,201,493
Regulus ignicapilla_Common Firecrest,3290,173,238
Phylloscopus collybita_Common Chiffchaff,2224,123,674
Erithacus rubecula_European Robin,1772,101,556
Troglodytes troglodytes_Eurasian Wren,1371,79,111
Periparus ater_Coal Tit,1229,65,29
Regulus regulus_Goldcrest,1182,61,16
Wind,841,43,194


In [14]:
# Date information for the training set, restricted to species_list. Later cells index it
# as dates_count[species] and use each item as a filename prefix.
# NOTE(review): the exact return structure comes from birdlib.utils — confirm.
dates_count = utils.get_date_count(TRAIN_PATH, species_list)

# Test Integration

In [57]:
# train_integration, test_integration = split_dataset(dates_count, TEST_PATH, test_ratio=0.2)

# with open(f"utils/{DATASET_NAME}/train_integration.json", 'w') as f:
#     json.dump(train_integration, f)

# with open(f"utils/{DATASET_NAME}/test_integration.json", 'w') as f:
#     json.dump(test_integration, f)

In [58]:
# with open(f"utils/{DATASET_NAME}/train_integration.json") as f:
#     train_integration = json.load(f)

# with open(f"utils/{DATASET_NAME}/test_integration.json") as f:
#     test_integration = json.load(f)

In [59]:
# utils.move_by_date(test_integration, TRAIN_PATH, TEST_PATH)

# Validation creation

Prendiamo sempre 1/5 del training set, scelto in modo casuale (split 80/20).

ATTENZIONE: siccome la selezione è casuale, bisogna salvare in un file gli esempi spostati, in modo da poter annullare lo spostamento.

In [16]:
# Undo a previous validation split: every file found under valid/<species> is moved back
# to train/<species>. Species outside species_list are left alone, and a file whose name
# already exists in train is reported and kept in valid.
for species in tqdm(os.listdir(VALID_PATH)):
    if species in species_list:
        valid_dir = os.path.join(VALID_PATH, species)
        train_dir = os.path.join(TRAIN_PATH, species)
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)

        for file in os.listdir(valid_dir):
            src, dst = os.path.join(valid_dir, file), os.path.join(train_dir, file)
            if os.path.exists(dst):
                print(f"File {dst} already exists, skipping {src}")
                continue
            os.rename(src, dst)

100%|██████████| 20/20 [00:00<00:00, 3769.48it/s]


In [17]:
# Build the validation split: for each species and each date, move the last 5% of the
# segments (at least one per date) from train to valid. Within a date, segments are ordered
# by the number encoded in the last two "_"-separated filename fields ("..._32_5.wav" -> 32.5).
VALID_TAIL_FRACTION = 0.05


def _segment_number(filename):
    """Return the segment number encoded at the end of ``filename`` ('..._32_5.wav' -> 32.5)."""
    fields = filename.split('_')
    return float(fields[-2] + '.' + fields[-1].split('.')[0])


# Recount the dates once, right before splitting, so the loop works on the current contents
# of train (e.g. after files were moved back from valid) instead of a count taken by an
# earlier cell.
# NOTE(review): assumes utils.get_date_count returns a mapping species -> iterable of
# date prefixes, as the loop below requires — confirm against birdlib.utils.
dates_count = utils.get_date_count(TRAIN_PATH, species_list)

for species in species_list:
    valid_species_path = os.path.join(VALID_PATH, species)
    os.makedirs(valid_species_path, exist_ok=True)
    train_species_path = os.path.join(TRAIN_PATH, species)
    train_files = sorted(os.listdir(train_species_path))
    for date in dates_count[species]:
        date_files = [f for f in train_files if f.startswith(date)]
        if not date_files:
            continue
        date_files.sort(key=_segment_number)
        # max(1, ...) guarantees every date contributes at least one validation sample.
        num_samples_to_move = max(1, int(len(date_files) * VALID_TAIL_FRACTION))
        for sample in date_files[-num_samples_to_move:]:
            os.rename(
                os.path.join(train_species_path, sample),
                os.path.join(valid_species_path, sample),
            )


In [15]:
# Date information of the training set, used as input to the date-based split two cells below.
# NOTE(review): this is the same call that produces dates_count earlier in the notebook.
dates_count_valid = utils.get_date_count(TRAIN_PATH, species_list)

In [16]:
# Make sure every selected species has a folder under valid (no error if it already exists).
for species in species_list:
    os.makedirs(os.path.join(VALID_PATH, species), exist_ok=True)

In [17]:
# Date-based train/valid partition with a 0.1 ratio; only valid_split is used afterwards.
# NOTE(review): how split_dataset uses VALID_PATH and test_ratio is defined in
# birdlib.utils — confirm. The markdown above mentions an 80/20 split, the call uses 0.1.
train_split, valid_split = utils.split_dataset(dates_count_valid, VALID_PATH, test_ratio=0.1)

In [None]:
# Apply the split computed above.
# NOTE(review): presumably moves the files of the dates listed in valid_split from
# TRAIN_PATH to VALID_PATH — confirm against birdlib.utils.move_by_date. The cell has no
# execution count, i.e. it was not run in the saved session.
utils.move_by_date(valid_split, TRAIN_PATH, VALID_PATH)

In [8]:
# Count table again.
# NOTE(review): the saved output shows valid = 0 for every species, so it was produced
# while no validation files were in place.
print_dataset_count_table(DATASET_PATH)

Unnamed: 0_level_0,train,valid,test
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fringilla coelebs_Common Chaffinch,10098,0,1370
,7082,0,4683
Sylvia atricapilla_Eurasian Blackcap,3960,0,493
Regulus ignicapilla_Common Firecrest,3463,0,238
Phylloscopus collybita_Common Chiffchaff,2347,0,674
Erithacus rubecula_European Robin,1873,0,556
Loxia curvirostra_Common Crossbill,1462,0,11
Troglodytes troglodytes_Eurasian Wren,1450,0,111
Periparus ater_Coal Tit,1294,0,29
Regulus regulus_Goldcrest,1243,0,16


In [9]:
# Number of species folders in train, test and valid (in that order).
tuple(len(os.listdir(split_dir)) for split_dir in (TRAIN_PATH, TEST_PATH, VALID_PATH))

(55, 23, 20)

Manteniamo solo le specie per cui sono presenti elementi in tutti e tre i set.

In [10]:
# Keep in train only the species that also have a folder in valid; every other species
# folder is moved (not deleted) to <dataset>/removed/train so the step can be undone.
valid_species = os.listdir(VALID_PATH)
REMOVED_PATH = f'{DATASET_PATH}/removed'
REMOVED_TRAIN_PATH = f'{REMOVED_PATH}/train'
# makedirs creates the intermediate REMOVED_PATH as well.
os.makedirs(REMOVED_TRAIN_PATH, exist_ok=True)
for species in os.listdir(TRAIN_PATH):
    if species in valid_species:
        continue
    # The destination folder is deliberately NOT created beforehand: os.rename moves the
    # whole species directory, and renaming onto an existing directory always fails on
    # Windows (on Unix it only works while that directory is empty).
    os.rename(
        os.path.join(TRAIN_PATH, species),
        os.path.join(REMOVED_TRAIN_PATH, species)
    )

In [11]:
# Keep in test only the species that also have a folder in valid; every other species
# folder is moved (not deleted) to <dataset>/removed/test so the step can be undone.
# REMOVED_PATH is defined by the cell that handles the train split.
valid_species = os.listdir(VALID_PATH)
REMOVED_TEST_PATH = f'{REMOVED_PATH}/test'
os.makedirs(REMOVED_TEST_PATH, exist_ok=True)
for species in os.listdir(TEST_PATH):
    if species in valid_species:
        continue
    # The destination folder is deliberately NOT created beforehand: os.rename moves the
    # whole species directory, and renaming onto an existing directory always fails on
    # Windows (on Unix it only works while that directory is empty).
    os.rename(
        os.path.join(TEST_PATH, species),
        os.path.join(REMOVED_TEST_PATH, species)
    )

In [12]:
# Number of species folders in train, test and valid (in that order) after the clean-up.
tuple(len(os.listdir(split_dir)) for split_dir in (TRAIN_PATH, TEST_PATH, VALID_PATH))

(20, 20, 20)

In [24]:
    # for audio in os.listdir(os.path.join(TRAIN_PATH, species)):
    #     test_dates = [date for date in train_test_division[species].keys() 
    #                   if train_test_division[species][date]["is_training"] == False]
    #     audio_date = audio.split("_")[0]
    #     if audio_date in test_dates:
    #         source = os.path.join(TRAIN_PATH, species, audio)
    #         dest = os.path.join(TEST_PATH, species, audio)
    #         os.rename(source, dest)

In [25]:
# # MOVE EVERYTHING BACK TO TRAINING
# train_folder = f"E:/Giacomo/Tovanella/{DATASET_NAME}/train"
# test_folder = f"E:/Giacomo/Tovanella/{DATASET_NAME}/test"
# valid_folder = f"E:/Giacomo/Tovanella/{DATASET_NAME}/valid"
# removed = f"E:/Giacomo/Tovanella/{DATASET_NAME}/removed"

# for species in os.listdir(test_folder):
    # for audio in os.listdir(os.path.join(test_folder, species)):
    #     os.rename(
    #         os.path.join(test_folder, species, audio),
    #         os.path.join(train_folder, species, audio),
    #     )
# for species in os.listdir(removed):
#     os.rename(
#         os.path.join(removed, species),
#         os.path.join(train_folder, species),
#     )

In [1]:
# Final count table.
# NOTE(review): the saved output is a NameError — the cell was run as In [1] on a fresh
# kernel, before the cell defining print_dataset_count_table; re-run it after that cell.
print_dataset_count_table(DATASET_PATH)

NameError: name 'print_dataset_count_table' is not defined