In [1]:
import os
import json
import pandas as pd

In [2]:
# Read the category metadata; its keys are used below as the species list.
category_info_path = "../utils/category_info.json"
with open(category_info_path) as category_file:
    category_info = json.loads(category_file.read())

In [3]:
# Dataset location. The root folder defaults to the original workstation path
# and can be overridden with the TOVANELLA_DATASET_ROOT environment variable,
# so the notebook can run on another machine without editing this cell.
DATASET_NAME = "NEW_DATASET"
DATASET_ROOT = os.environ.get("TOVANELLA_DATASET_ROOT", "E:/Giacomo/Tovanella")
DATASET_PATH = f"{DATASET_ROOT}/{DATASET_NAME}"

### Count samples per species in each split

In [4]:
# Load the train/test division: species name -> date -> entry.
with open(f"../utils/{DATASET_NAME}/train_test_division.json") as f:
    train_test_div = json.load(f)

# Tally, for every species, how many samples fall in each split.
# Each date entry is read for its "count" and its "is_training" flag.
species_count = {}
for species_name, dates in train_test_div.items():
    counts = {"train": 0, "test": 0}
    for date_info in dates.values():
        # Plain truthiness (rather than `== True` / `== False`) is the same
        # rule used further down when audio files are assigned to a split,
        # so the counts and the file assignment always agree.
        counts["train" if date_info["is_training"] else "test"] += date_info["count"]
    species_count[species_name] = counts

In [5]:
# One row per species, one column per split; persisted next to the other
# dataset utilities.
species_count_csv = f"../utils/{DATASET_NAME}/species_count_df.csv"
species_count_df = pd.DataFrame(species_count).transpose()
species_count_df.to_csv(species_count_csv)

### Divide each species' audio files into "train" and "test"

In [7]:
# Every species listed in category_info starts with two empty sets of audio
# file names, one per split.
species_list = [*category_info.keys()]
species_split = {species: {"train": set(), "test": set()} for species in species_list}

In [8]:
# Scan the on-disk "train" folder of every species and file each audio under
# the split recorded for its date in train_test_div. The folder being scanned
# does not decide the split; the date's "is_training" flag does.
split_count = "train"
for species in species_list:
    # Prefer the species folder inside the split directory, fall back to the
    # "removed" directory, and skip the species when neither exists.
    split_path = os.path.join(DATASET_PATH, split_count, species)
    removed_path = os.path.join(DATASET_PATH, "removed", species)
    if os.path.exists(split_path):
        folder_path = split_path
    elif os.path.exists(removed_path):
        folder_path = removed_path
    else:
        print("Deleted species?", species)
        continue
    # setdefault creates both buckets for a species that was not
    # pre-initialised, and needs no variable assigned later in the loop.
    buckets = species_split.setdefault(species, {"train": set(), "test": set()})
    for audio in os.listdir(folder_path):
        # File names look like 20190608_070000_64.wav; the first
        # underscore-separated field is the date key.
        date = audio.split("_")[0]
        is_training = train_test_div[species][date]["is_training"]
        split = "train" if is_training else "test"
        buckets[split].add(audio)

Deleted species? Prunella modularis_Dunnock
Deleted species? Spinus spinus_Eurasian Siskin


In [9]:
# Scan the on-disk "test" folder of every species and file each audio under
# the split recorded for its date in train_test_div. The folder being scanned
# does not decide the split; the date's "is_training" flag does.
split_count = "test"
for species in species_list:
    # Prefer the species folder inside the split directory, fall back to the
    # "removed" directory, and skip the species when neither exists.
    split_path = os.path.join(DATASET_PATH, split_count, species)
    removed_path = os.path.join(DATASET_PATH, "removed", species)
    if os.path.exists(split_path):
        folder_path = split_path
    elif os.path.exists(removed_path):
        folder_path = removed_path
    else:
        print("Deleted species?", species)
        continue
    # setdefault creates both buckets for a species that was not
    # pre-initialised, and does not depend on a `split` value left over from
    # a previously executed cell.
    buckets = species_split.setdefault(species, {"train": set(), "test": set()})
    for audio in os.listdir(folder_path):
        # File names look like 20190608_070000_64.wav; the first
        # underscore-separated field is the date key.
        date = audio.split("_")[0]
        is_training = train_test_div[species][date]["is_training"]
        split = "train" if is_training else "test"
        buckets[split].add(audio)


Deleted species? Prunella modularis_Dunnock
Deleted species? Spinus spinus_Eurasian Siskin


In [10]:
# Turn each set of file names into a list so the structure can be passed to
# json.dump. Sorting fixes the order: plain list(set) order for strings can
# change from one interpreter run to the next, which would make any saved
# file differ between otherwise identical runs.
for species, splits in species_split.items():
    for split_name in ("train", "test"):
        splits[split_name] = sorted(splits[split_name])

In [None]:
# with open(f"../utils/{DATASET_NAME}/species_split.json", "w") as f:
#     json.dump(species_split, f)

### Get mean confidence score from all models for each audio in test

In [12]:
# Load the test-set segment predictions of every entry under CLASSIFIERS_PATH
# that ships a prediction file, keyed by the entry's name.
CLASSIFIERS_PATH = "../classifiers/official/"
all_pred_test_segments = {}
for model in os.listdir(CLASSIFIERS_PATH):
    path_to_json = os.path.join(CLASSIFIERS_PATH, model, "test_complete_pred_segments_1.json")
    # Entries without a prediction file are ignored.
    if os.path.exists(path_to_json):
        with open(path_to_json) as pred_file:
            pred_segments = json.loads(pred_file.read())
        all_pred_test_segments[model] = pred_segments

In [14]:
from collections import defaultdict

# audio -> segment -> species -> list of scores, one score per model that
# reported that species for that segment.
aggregated_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

# Flatten the nested prediction dicts into (audio, segment, species, score)
# records; the model name itself is not needed for the aggregation.
score_records = (
    (audio, segment, species, score)
    for audio_data in all_pred_test_segments.values()
    for audio, segments in audio_data.items()
    for segment, species_scores in segments.items()
    for species, score in species_scores.items()
)
for audio, segment, species, score in score_records:
    aggregated_data[audio][segment][species].append(score)


In [16]:
# Collapse every list of per-model scores into its arithmetic mean, keeping
# the audio -> segment -> species nesting of aggregated_data.
mean_conf_scores = {
    audio: {
        segment: {
            species: sum(scores) / len(scores)
            for species, scores in species_scores.items()
        }
        for segment, species_scores in segments.items()
    }
    for audio, segments in aggregated_data.items()
}

# Show one recording as a sanity check.
mean_conf_scores["20190608_070000.WAV"]

{'63': {'Turdus philomelos_Song Thrush': 0.3069578359524409,
  'Erithacus rubecula_European Robin': 0.2717607033749421,
  'Dryobates minor_Lesser Spotted Woodpecker': 0.1505056917667389,
  'Fringilla coelebs_Common Chaffinch': 0.180677759150664,
  'Regulus regulus_Goldcrest': 0.11488642543554306,
  'Troglodytes troglodytes_Eurasian Wren': 0.1187060996890068},
 '82': {'Erithacus rubecula_European Robin': 0.953721210360527,
  'Periparus ater_Coal Tit': 0.11245439201593399},
 '83': {'Erithacus rubecula_European Robin': 0.4993184916675091,
  'Turdus philomelos_Song Thrush': 0.1883356049656868,
  'Regulus ignicapilla_Common Firecrest': 0.11537600308656693,
  'Certhia familiaris_Eurasian Treecreeper': 0.12419329211115837,
  'Fringilla coelebs_Common Chaffinch': 0.1872944012284279,
  'Periparus ater_Coal Tit': 0.11864691972732544},
 '88': {'Turdus philomelos_Song Thrush': 0.39168381028705174,
  'Erithacus rubecula_European Robin': 0.18211418949067593,
  'Regulus regulus_Goldcrest': 0.14786259

In [None]:
# with open("../utils/mean_conf_scores.json", "w") as f:
#     json.dump(mean_conf_scores, f)