In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import os, collections
import numpy as np
import pandas as pd
import scipy, soundfile, librosa
import mir_eval
from tqdm import tqdm_notebook as tqdm
from fnmatch import fnmatch
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from thundersvm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer  # for imputation

import warnings
warnings.filterwarnings('ignore')

# Load data

In [2]:
data_root = 'data/'

# Read the list of recordings, one file name per line.
# Blank lines are skipped: an empty name would otherwise crash the chick-ID
# parsing below (int('') raises ValueError).
with open('file_names.txt', 'r') as f:
    wav_files = [line.strip() for line in f if line.strip()]

print('Number of audio files:', format(len(wav_files)))
print(*wav_files, sep='\n')

# chick call annotation files (same base name as the audio, .csv extension)
csv_files = [file.replace('.wav', '.csv') for file in wav_files]
# the first two characters of each file name are parsed as the numeric chick ID
chickID = [int(name[:2]) for name in wav_files]
print('Chick ID:', *chickID, sep=' ')

Number of audio files: 4
85SM_2020-01-27_14-40-03.wav
87SM_2020-01-27_15-43-59.wav
89SF_2020-01-27_16-51-22.wav
91SM_2020-01-30_10-20-53.wav
Chick ID: 85 87 89 91


In [3]:
# get call segs information
# Read one annotation table per recording and tag every row with the chick ID
# parsed from the first two characters of the file name.
per_file_anno = []
for csv_name in csv_files:
    table = pd.read_csv(data_root + csv_name)
    table['chick'] = [int(csv_name[:2])] * len(table)
    per_file_anno.append(table)

anno = pd.concat(per_file_anno, axis=0, ignore_index=True)

# Rows are consumed in pairs: even rows supply the start time, odd rows supply
# the end time together with the chick ID and the label of the call.
onset_rows = anno.iloc[0::2]
offset_rows = anno.iloc[1::2]
n_calls = len(offset_rows)

call_segs = pd.DataFrame({
    'chickID': list(offset_rows['chick']),
    'callID': np.arange(n_calls, dtype=int),
    'truth_start': list(onset_rows['time']),
    'truth_end': list(offset_rows['time']),
    'truth_label': list(offset_rows['anno']),
})

# replace the letter codes used in the annotation files with numeric class labels
for letter, number in (('p', 1), ('d', 2), ('q', 3)):
    call_segs.loc[(call_segs.truth_label == letter), 'truth_label'] = number
call_segs

Unnamed: 0,chickID,callID,truth_start,truth_end,truth_label
0,85,0,0.000000,0.123356,1
1,85,1,0.336961,0.595465,2
2,85,2,0.830385,1.137415,2
3,85,3,1.258957,1.551020,2
4,85,4,1.659864,1.991837,2
...,...,...,...,...,...
3008,91,3008,629.097506,629.471202,2
3009,91,3009,632.406349,632.756825,2
3010,91,3010,632.883809,633.269841,2
3011,91,3011,633.361270,633.747302,2


# MFCC feature extraction

In [4]:
sr = 44100
hop_sample = 1103  # hop length in samples (~25 ms at 44.1 kHz)
print('frame size: %sms' % (int(hop_sample/sr*1000)))

# MFCC feature extraction: one (n_mfcc, n_frames) matrix per recording.
# n_fft is left at its default; the hop length is overridden with hop_sample.
feature = {}

print('MFCC feature shape: ')
for k in range(len(wav_files)):
    # NOTE: sr is rebound to the sampling rate stored in the file; later cells
    # use it to convert times in seconds into frame indices.
    y, sr = soundfile.read(data_root + wav_files[k])
    # down-mix multi-channel audio to mono; a mono file is already 1-D and
    # averaging over axis 1 would raise an error
    if y.ndim > 1:
        y = np.mean(y, axis=1)
    feature[k] = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=24, hop_length=hop_sample)
    print(feature[k].shape)

frame size: 25ms
MFCC feature shape: 
(24, 27189)
(24, 27102)
(24, 25678)
(24, 25414)


In [5]:
# Stack the per-recording MFCC matrices along the frame axis in one call, then
# put frames on rows: the result has shape (total_frames, n_mfcc).
# The cast keeps the float64 dtype produced by the previous implementation,
# which prepended a float64 zero column and sliced it off again afterwards.
feature_conca = np.concatenate(
    [feature[k] for k in range(len(wav_files))], axis=1
).T.astype(np.float64, copy=False)

# one chick ID per frame, in the same order as the rows of feature_conca
chick_id = np.repeat(
    np.array([int(wav_files[k][:2]) for k in range(len(wav_files))], dtype=int),
    [feature[k].shape[1] for k in range(len(wav_files))],
)

In [6]:
# frame-wise label_id and call_id
# Every frame starts as 0 (no annotated call); frames covered by an annotated
# call receive that call's label and ID.
frame_num = []
label_id = np.zeros((len(chick_id)), dtype=int)   # 1=pleasure, 2=distress, 3=unknown
call_id = np.zeros((len(chick_id)), dtype=int)

frame_offset = 0  # index of the current recording's first frame in the concatenated arrays
for chick in range(len(wav_files)):
    frame_num.append(feature[chick].shape[1])
    seg_chick = call_segs[call_segs['chickID'] == int(wav_files[chick][:2])]

    # convert annotation times (seconds) into global frame indices
    sample_start = [int(t * sr / hop_sample) + frame_offset for t in seg_chick['truth_start']]
    sample_end = [int(t * sr / hop_sample) + frame_offset for t in seg_chick['truth_end']]

    # materialise the columns once instead of rebuilding a list for every call
    seg_labels = list(seg_chick['truth_label'])
    seg_calls = list(seg_chick['callID'])

    for first, last, seg_label, seg_call in zip(sample_start, sample_end, seg_labels, seg_calls):
        label_id[first:last] = seg_label
        call_id[first:last] = seg_call

    frame_offset += frame_num[chick]

In [7]:
# sanity check: the feature matrix and the three frame-wise vectors should agree on the number of frames
print(*(arr.shape for arr in (feature_conca, chick_id, label_id, call_id)))

(105383, 24) (105383,) (105383,) (105383,)


# Detected segments

In [8]:
# Frame-wise markers for the automatically detected call segments.
# label_pred_id: 0 = no detection, 100 = placeholder for "detected" (the
#                classification loop only tests it for != 0).
# call_id_pred:  running index of the detected call covering the frame.
label_pred_id = np.zeros((len(label_id)), dtype=int)
call_id_pred = np.zeros((len(label_id)), dtype=int)

# Open the archive once, read one array per recording ('arr_0', 'arr_1', ...)
# and close the file handle afterwards.
with np.load('refined_segs.npz') as refined:
    segs = {k: refined['arr_%d' % k] for k in range(len(wav_files))}

# format detected segments as [chick_id, call_id, start, end, label]
seg_pred = []
call_num_detect = [len(segs[k]) for k in range(len(segs))]

for chick in range(len(wav_files)):
    # offsets of this recording inside the concatenated frame / call numbering
    frame_offset = sum(frame_num[:chick])
    call_offset = sum(call_num_detect[:chick])
    chick_number = int(wav_files[chick][:2])

    for k in range(len(segs[chick])):
        seg_start, seg_end = segs[chick][k, 0], segs[chick][k, 1]
        # the label column stays NaN here; it is filled in after classification
        seg_pred.append([chick_number, k + call_offset, seg_start, seg_end, np.nan])

        # floor the start and ceil the end so the frame span covers the whole segment
        start_sam = int(np.floor(seg_start * sr / hop_sample)) + frame_offset
        end_sam = int(np.ceil(seg_end * sr / hop_sample)) + frame_offset
        label_pred_id[start_sam:end_sam] = 100
        call_id_pred[start_sam:end_sam] = k + call_offset

# keep a 2-D (n_segments, 5) array even when nothing was detected
seg_pred = np.array(seg_pred, dtype=float).reshape(-1, 5)

# Classification

In [9]:
# SVM settings
kernel = 'rbf'
gpu_id = 0  # thundersvm uses GPU to train SVM classifier

# hyper-parameter grid and model-selection settings handed to the grid search
param_grid = {
    'C': [100, 10, 1],
    'gamma': [.0001, .001, .01],
}
scoring = 'f1_macro'
cv = 3

# accumulators for the per-chick event-based and frame-based F-scores
F_event = []
F_frame = []

# for event-based evaluation: duration of detected event should be at least 50% of the ground truth duration
dur = .5

In [10]:
# Parameters for resampling the frame-wise predictions onto the frame size of
# the joint scattering feature, for a fair comparison of the results.
inframeSize = hop_sample / sr  # duration of one MFCC frame, in seconds
outframeSize = 0.186  # target frame duration, in seconds

# keep every subsample_rate-th frame; at least 1 so that [::subsample_rate] stays valid
subsample_rate = max(1, int(outframeSize / inframeSize))

## subject-independent train-test

In [11]:
# Leave-one-chick-out evaluation.
# For every chick: train on the annotated call frames of all other chicks,
# classify the frames of the held-out chick that fall inside detected segments,
# then score the result per frame and per event.
for chick in tqdm(chickID):

    # subset: 0 = training frame, 1 = test frame, 100 = unused frame
    subset = np.full(len(label_id), 100, dtype=int)
    subset[(chick_id != chick) & (label_id != 0)] = 0
    subset[(chick_id == chick) & (label_pred_id != 0)] = 1  # detected call segments of the held-out chick

    feature_tr, label_tr = feature_conca[subset == 0], label_id[subset == 0]
    feature_te, label_te = feature_conca[subset == 1], label_id[subset == 1]

    # imputation (statistics are fitted on the training frames only)
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    feature_tr = imp.fit_transform(feature_tr)
    feature_te = imp.transform(feature_te)

    # standardisation (statistics are fitted on the training frames only)
    stdscaler = StandardScaler()
    feature_tr = stdscaler.fit_transform(feature_tr)
    feature_te = stdscaler.transform(feature_te)
    print(feature_tr.shape, feature_te.shape)

    # classification
    clf = GridSearchCV(SVC(kernel=kernel, gpu_id=gpu_id), param_grid=param_grid, cv=cv, scoring=scoring)
    clf = clf.fit(feature_tr, label_tr)
    label_pred = clf.predict(feature_te)

    ########## frame-based evaluation ##########
    # detected segments back into frame level
    held_out = chick_id == chick
    label_te_ori = label_id[held_out]  # include label=0
    label_pred_ori = label_pred_id[held_out]  # include label=0
    label_pred_true = np.zeros((len(label_te_ori)), dtype=int)
    label_pred_true[label_pred_ori != 0] = label_pred
    label_pred_true[label_pred_true == 100] = 0

    # resample into the same frame size as that of the JTFS feature
    label_te_ori = label_te_ori[::subsample_rate]
    label_pred_true = label_pred_true[::subsample_rate]

    # Requesting the three call classes explicitly guarantees that the keys
    # '1', '2' and '3' exist even when a class is missing for this chick.
    report = pd.DataFrame(classification_report(label_te_ori, label_pred_true,
                                                labels=[1, 2, 3], output_dict=True))
    F_frame.append([chick, report['1']['f1-score'], report['2']['f1-score'], report['3']['f1-score']])

    ########## event-based evaluation ##########
    # predicted segments: [chick_id, call_id, start, end, label]
    call_id_test = call_id_pred[subset == 1]
    _, idx = np.unique(call_id_test, return_index=True)
    call_id_test_unique = call_id_test[np.sort(idx)]  # unique call IDs in order of first appearance

    # each detected call gets the label predicted for most of its frames
    seg_label_pred = [
        collections.Counter(label_pred[call_id_test == call]).most_common(1)[0][0]
        for call in call_id_test_unique
    ]

    # reshape (rather than squeeze) so that a single detected call still yields a 2-D array
    pred_event = np.array(
        [seg_pred[seg_pred[:, 1] == call] for call in call_id_test_unique]
    ).reshape(-1, seg_pred.shape[1])
    pred_event[:, -1] = seg_label_pred

    truth_event = call_segs[call_segs['chickID'] == chick].to_numpy()

    pred_event_ori = pred_event; truth_event_ori = truth_event

    event_result = []
    for label in range(1, len(np.unique(label_id))):  # 1=pleasure, 2=distress, 3=uncertain
        # should compare only the target events
        pred_event = pred_event_ori[pred_event_ori[:, -1] == label]
        truth_event = truth_event_ori[truth_event_ori[:, -1] == label]

        # pair truth and predicted events whose start times lie within the window
        matched = mir_eval.util.match_events(truth_event[:, 2], pred_event[:, 2], window=.2, distance=None)  # start

        # check each one on the duration
        TP = 0
        for truth_idx, pred_idx in matched:
            truth_start, truth_end = truth_event[truth_idx, 2], truth_event[truth_idx, 3]
            pred_start, pred_end = pred_event[pred_idx, 2], pred_event[pred_idx, 3]
            interval_truth = pd.Interval(truth_start, truth_end)
            interval_pred = pd.Interval(pred_start, pred_end)
            if interval_truth.overlaps(interval_pred):
                # for overlapping intervals the two middle time stamps bound the shared part
                time_sorted = np.sort([truth_start, truth_end, pred_start, pred_end])
                event_dur = time_sorted[2] - time_sorted[1]
                if event_dur / (truth_end - truth_start) > dur:  # at least half duration
                    TP += 1

        # both arrays are already restricted to the current label
        FN = len(truth_event) - TP
        FP = len(pred_event) - TP

        if TP != 0:
            P_event = TP / (TP + FP) * 100; R_event = TP / (TP + FN) * 100
            event_result.extend([2 * P_event * R_event / (P_event + R_event)])
        else:
            event_result.extend([0])

    F_event.append([chick] + event_result)

  0%|          | 0/4 [00:00<?, ?it/s]

(24802, 24) (6988, 24)
(20405, 24) (9584, 24)
(19087, 24) (10684, 24)
(23792, 24) (7053, 24)


## average

In [12]:
# Convert the frame-based F-scores from fractions to rounded percentages.
# F_frame is a plain list until this conversion has run once, so the type check
# keeps a re-run of this cell from multiplying by 100 a second time.
# Column 0 (the chick ID) is scaled as well but is dropped by the [1:] below.
if not isinstance(F_frame, np.ndarray):
    F_frame = np.round(np.array(F_frame) * 100)
print('Frame-based F-scores for pleasure, contact, and uncertain calls: {}'.format(
    [elem for elem in np.mean(F_frame,0)[1:]]))
print('Event-based F-scores for pleasure, contact, and uncertain calls: {}'.format(
    [round(elem,2) for elem in np.mean(F_event,0)[1:]]))

Frame-based F-scores for pleasure, contact, and uncertain calls: [4.5, 77.0, 2.75]
Event-based F-scores for pleasure, contact, and uncertain calls: [3.58, 75.04, 2.93]


In [13]:
# Average over chicks first (axis 0), drop the chick-ID column, then average over the call classes.
frame_class_means = np.mean(F_frame, axis=0)[1:]
event_class_means = np.mean(F_event, axis=0)[1:]
print('Frame-based average F-score: {}'.format(round(np.mean(frame_class_means), 2)))
print('Event-based average F-score: {}'.format(round(np.mean(event_class_means), 2)))

Frame-based average F-score: 28.08
Event-based average F-score: 27.18
