# Multi-Layer Perceptron Validation

In [1]:
import os
import numpy as np
import torch

from datetime import datetime

from tqdm import tqdm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn import preprocessing

from bring_features_and_file_paths import bring_features_and_file_paths
from bring_features_and_file_paths import from_spectrogram_path_to_BirdNET_output_path
from bring_features_and_file_paths import get_BirdNET_detections
from bring_features_and_file_paths import get_spectrogram_time_mark_in_file

In [2]:
# Load the feature vectors together with their encoded spectrogram file paths.
# Alternative feature directory (commented out):
#feats, fpaths = bring_features_and_file_paths('/grand/projects/BirdAudio/Soundscapes/Features/', sub_sample=None)
feats, fpaths = bring_features_and_file_paths('/grand/projects/BirdAudio/Soundscapes/Second_Features/', sub_sample=None)

# Shuffle features and paths with the same permutation so they stay aligned.
# A dedicated seeded generator makes the permutation (and therefore the subset
# scanned by the later cells, which stop early) identical on every run without
# touching torch's global RNG state.
shuffle_seed = 0
shuffle_generator = torch.Generator().manual_seed(shuffle_seed)
indices = torch.randperm(feats.size()[0], generator=shuffle_generator)
feats=feats[indices]
fpaths=fpaths[indices]

We have 1976583 feature vectors.


In [3]:
# Maps a recorder device number to the name of the site it recorded at.
# A plain string means one fixed site.  A (datetime, [before, after]) tuple
# means the site depends on the detection time: the first name applies to
# detections strictly earlier than the datetime, the second name otherwise.
# NOTE(review): the cutoff is built from hour and second only (minute is left
# at its default of 0) -- confirm that `second=55` was not meant to be `minute=55`.
_device_23757_cutoff = datetime(year=2021, month=7, day=14, hour=14, second=55)

device_to_location = {
    # Grassland recorders
    4862: "Grassland 1",
    4879: "Grassland 2",
    # Forest recorders
    23734: "Forest 1",
    23764: "Forest 2",
    23795: "Forest 3",
    23771: "Forest 4",
    23757: (_device_23757_cutoff, ["Forest 3", "Forest 5"]),
    23700: "Forest 6",
}

In [None]:
# Build a labelled dataset: every BirdNET detection that falls inside a
# spectrogram's time interval contributes one (feature vector, label) pair,
# where the label is the detection's 'detection' field.
label='detection'

vectors=[]
labels=[]
BirdNET_BASE_PATH = '/grand/projects/BirdAudio/Soundscapes/BirdNET_Output'
# Not used in this cell; kept because other cells may read it.
Spectrograms_BASE_PATH = '/grand/projects/BirdAudio/Soundscapes/Second_Filtered_Spectrograms/'
# Spectrograms_BASE_PATH = '/grand/projects/BirdAudio/Soundscapes/First_Filtered_Spectrograms'
spectrogram_duration = 9.0
# The scan stops as soon as the loop index exceeds this value.
max_spectrogram_index = 100000
for k, (fpath, feat) in enumerate(tqdm(zip(fpaths,feats), total=len(feats))):
    # fpath stores the path as character codes; '~' characters are stripped
    # (presumably padding -- TODO confirm against bring_features_and_file_paths).
    file_path = ''.join([chr(int(x)) for x in fpath]).replace('~','')
    BirdNET_PATH = from_spectrogram_path_to_BirdNET_output_path(fpath)

    try:
        start_time = get_spectrogram_time_mark_in_file(fpath, spectrogram_duration)

        interval = (start_time, start_time + spectrogram_duration)
        data_elements = get_BirdNET_detections(os.path.join(BirdNET_BASE_PATH, BirdNET_PATH), interval, confidence_threshold = 0.0)
        for data_element in data_elements:
            # The same feature vector is repeated once per detection in the interval.
            vectors.append(np.array(feat))
            labels.append(data_element[label])

    except Exception as error:
        # Best effort: report the failing file and carry on.  Catching Exception
        # rather than using a bare `except:` keeps KeyboardInterrupt working, so
        # this long-running loop can still be stopped from the notebook.
        print("Something was wrong with")
        print(file_path)
        print(repr(error))

    if k > max_spectrogram_index:
        break

vectors=np.array(vectors)
# Encode labels as integer positions in the sorted array of distinct labels.
# Done in one pass by np.unique, and without rebinding `label`.
unique_labels, labels = np.unique(np.array(labels), return_inverse=True)

In [4]:
# Build a labelled dataset: every BirdNET detection that falls inside a
# spectrogram's time interval contributes one (feature vector, label) pair.
# With label='location' the label is the recording site looked up from the
# detection's device number through device_to_location.
label='location'

vectors=[]
labels=[]
BirdNET_BASE_PATH = '/grand/projects/BirdAudio/Soundscapes/BirdNET_Output'
# Not used in this cell; kept because other cells may read it.
Spectrograms_BASE_PATH = '/grand/projects/BirdAudio/Soundscapes/Second_Filtered_Spectrograms/'
# Spectrograms_BASE_PATH = '/grand/projects/BirdAudio/Soundscapes/First_Filtered_Spectrograms'
spectrogram_duration = 9.0
# Detections from this device number are skipped (it has no entry in device_to_location).
excluded_device = 23788
# The scan stops as soon as the loop index exceeds this value.
max_spectrogram_index = 100000
for k, (fpath, feat) in enumerate(tqdm(zip(fpaths,feats), total=len(feats))):
    # fpath stores the path as character codes; '~' characters are stripped
    # (presumably padding -- TODO confirm against bring_features_and_file_paths).
    file_path = ''.join([chr(int(x)) for x in fpath]).replace('~','')
    BirdNET_PATH = from_spectrogram_path_to_BirdNET_output_path(fpath)

    try:
        start_time = get_spectrogram_time_mark_in_file(fpath, spectrogram_duration)

        interval = (start_time, start_time + spectrogram_duration)
        data_elements = get_BirdNET_detections(os.path.join(BirdNET_BASE_PATH, BirdNET_PATH), interval, confidence_threshold = 0.0)
        for data_element in data_elements:
            device=int(data_element['device'])
            if device == excluded_device:
                continue
            if label=='location':
                location = device_to_location[device]
                if isinstance(location, tuple):
                    # Time-dependent site: (cutoff datetime, [site before, site from cutoff on]).
                    # The minute is not passed here, matching how the cutoff itself
                    # is built -- NOTE(review): confirm 'second' is the intended field.
                    date = datetime(month=data_element['month'], day=data_element['day'], year=data_element['year'], hour=data_element['hour'], second=data_element['second'])
                    if date < location[0]:
                        data_element['location'] = location[1][0]
                    else:
                        data_element['location'] = location[1][1]
                else:
                    data_element['location'] = location
            # The same feature vector is repeated once per kept detection.
            vectors.append(np.array(feat))
            labels.append(data_element[label])

    except Exception as error:
        # Best effort: report the failing file and carry on.  Catching Exception
        # rather than using a bare `except:` keeps KeyboardInterrupt working, so
        # this long-running loop can still be stopped from the notebook.  The
        # printed error also exposes causes such as an unmapped device number.
        print("Something was wrong with")
        print(file_path)
        print(repr(error))

    if k > max_spectrogram_index:
        break

vectors=np.array(vectors)
# Encode labels as integer positions in the sorted array of distinct labels.
# Done in one pass by np.unique, and without rebinding `label`.
unique_labels, labels = np.unique(np.array(labels), return_inverse=True)

  5%|▌         | 100001/1976583 [05:37<1:45:29, 296.50it/s]


In [5]:
# Shape of the stacked feature array built by the dataset cell.
vectors.shape

(114698, 384)

In [6]:
# Shape of the integer-encoded label array; its length should match the first dimension of `vectors`.
labels.shape

(114698,)

In [7]:
# Distinct label values as returned by np.unique; each integer in `labels` is an index into this array.
unique_labels

array(['Forest 1', 'Forest 2', 'Forest 3', 'Forest 4', 'Forest 5',
       'Forest 6', 'Grassland 1', 'Grassland 2'], dtype='<U11')

In [8]:
# Classifier under evaluation: an MLP with a single hidden layer of 20 units,
# identity activation, trained with the L-BFGS solver.
# Alternative configuration (commented out):
# clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1, activation='identity', max_iter=2000)
clf = MLPClassifier(
    hidden_layer_sizes=(20,),
    activation='identity',
    solver='lbfgs',
    alpha=1e-5,
    max_iter=2000,
    random_state=1,
)

In [9]:
# Standardise every feature using statistics from the complete dataset, then
# train the classifier on all samples (no held-out data in this cell).
scaler = preprocessing.StandardScaler()
scaler.fit(vectors)
standardised_vectors = scaler.transform(vectors)
clf.fit(standardised_vectors, labels)

In [10]:
# 10-fold cross-validation splitter; shuffling is not requested here, so the
# folds follow the order in which the samples are stored.
kf = KFold(n_splits=10)

In [None]:
# Cross-validated score of `clf` over all samples: one score per fold.
# Note that this rebinds `scaler` and refits `clf`, replacing any earlier fit.
performances=[]
# The progress-bar total is taken from the splitter so it cannot drift from n_splits.
for train_indices, test_indices in tqdm(kf.split(vectors), total=kf.get_n_splits()):
    # The scaler is fitted on the training fold only, so no statistics from
    # the held-out fold leak into training.
    scaler = preprocessing.StandardScaler().fit(vectors[train_indices])
    clf.fit(scaler.transform(vectors[train_indices]), labels[train_indices])
    performances.append(clf.score(scaler.transform(vectors[test_indices]), labels[test_indices]))

 60%|██████    | 6/10 [1:14:17<50:12, 753.01s/it]  

In [None]:
# Average of the per-fold scores collected in `performances`.
np.mean(performances)

In [None]:
# Label values kept for the restricted evaluation; entries that are commented
# out are excluded from the selection.
# NOTE(review): these look like values of the 'detection' label, so this cell
# presumably needs `unique_labels` produced with label='detection' -- confirm.
chosen_label_names = [
    'Blue Jay',
    'Eastern Wood-Pewee',
    'Indigo Bunting',
    # 'Ovenbird',
    # 'Scarlet Tanager',
    'No detection',
]

# One boolean per entry of unique_labels, stored as a single row of shape (1, n).
mask=np.isin(unique_labels, chosen_label_names)[np.newaxis, :]

# Integer codes (positions in unique_labels) of the chosen label values.
chosen_labels=np.where(mask[0])[0]
chosen_labels

In [None]:
def find_indices(A, B):
    """Locate the elements of ``A`` whose value occurs in ``B``.

    Args:
        A: Array-like of values to search through.
        B: Array-like of values to look for.

    Returns:
        The tuple produced by ``np.where`` on the membership test: one index
        array per dimension of ``A`` (a 1-tuple when ``A`` is one-dimensional).
    """
    is_member = np.isin(A, B)
    return np.where(is_member)


In [None]:
# Positions of the samples whose integer label is one of `chosen_labels`.
# Note: this rebinds `indices`, the name the shuffle cell used for its permutation.
indices=find_indices(labels, chosen_labels)[0]
indices

In [None]:
# Cross-validated score of `clf` restricted to the samples selected by `indices`.
# The labels keep their original integer codes, so they are not renumbered.
f_vectors = vectors[indices]
f_labels = labels[indices]
f_performances=[]
# The progress-bar total is taken from the splitter so it cannot drift from n_splits.
for train_indices, test_indices in tqdm(kf.split(f_vectors), total=kf.get_n_splits()):
    # The scaler is fitted on the training fold only, so no statistics from
    # the held-out fold leak into training.
    scaler = preprocessing.StandardScaler().fit(f_vectors[train_indices])
    clf.fit(scaler.transform(f_vectors[train_indices]), f_labels[train_indices])
    f_performances.append(clf.score(scaler.transform(f_vectors[test_indices]), f_labels[test_indices]))

In [None]:
# Average of the per-fold scores collected in `f_performances`.
np.mean(f_performances)

In [None]:
# Shape of the second weight matrix of the most recently fitted `clf`
# (with one hidden layer this should be the hidden-to-output weights -- confirm against scikit-learn's `coefs_` docs).
clf.coefs_[1].shape