In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
import scipy
import mir_eval
from time import time
from fnmatch import fnmatch
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from thundersvm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer  # for imputation

import warnings
warnings.filterwarnings('ignore')

# Load data

In [2]:
data_root = 'data/'
wav_files = [] 
with open('file_names.txt', 'r') as f:
    Lines = f.readlines()
    for line in Lines: 
        wav_files.append(line.strip())
        
print('Number of audio files:', format(len(wav_files)))
print(*wav_files, sep='\n')

# chick call annotation files
csv_files = [file.replace('.wav', '.csv') for file in wav_files]
chickID = [int(wav_files[k][:2]) for k in range(len(wav_files))]
print('Chick ID:', *chickID, sep=' ')

Number of audio files: 4
85SM_2020-01-27_14-40-03.wav
87SM_2020-01-27_15-43-59.wav
89SF_2020-01-27_16-51-22.wav
91SM_2020-01-30_10-20-53.wav
Chick ID: 85 87 89 91


In [3]:
# get call segment information
anno = []
for file in csv_files:
    file_anno = pd.read_csv(data_root+file)
    file_anno['chick'] = [int(file[:2])] * len(file_anno)
    anno.append(file_anno)
    
anno = pd.concat(anno, axis=0, ignore_index=True)

call_segs = pd.DataFrame({'chickID':[], 'callID':[], 'truth_start': [], 'truth_end': [], 'truth_label':[]})
call_segs['chickID'] = list(anno['chick'][1::2])
call_segs['callID'] = np.arange(len(call_segs['chickID']), dtype=int)
call_segs['truth_start'] = list(anno['time'][0::2])
call_segs['truth_end'] = list(anno['time'][1::2])
call_segs['truth_label'] = list(anno['anno'][1::2])

call_segs.loc[(call_segs.truth_label == 'p'),'truth_label']= 1
call_segs.loc[(call_segs.truth_label == 'd'),'truth_label']= 2
call_segs.loc[(call_segs.truth_label == 'q'),'truth_label']= 3
call_segs

Unnamed: 0,chickID,callID,truth_start,truth_end,truth_label
0,85,0,0.000000,0.123356,1
1,85,1,0.336961,0.595465,2
2,85,2,0.830385,1.137415,2
3,85,3,1.258957,1.551020,2
4,85,4,1.659864,1.991837,2
...,...,...,...,...,...
3008,91,3008,629.097506,629.471202,2
3009,91,3009,632.406349,632.756825,2
3010,91,3010,632.883809,633.269841,2
3011,91,3011,633.361270,633.747302,2


# JTFS feature

In [4]:
# load extracted JTFS feature
joint = scipy.io.loadmat('JTFS_feature.mat')['fileFeatures'].squeeze()

In [5]:
# scattering params used
sr = 44100
T = 2**14
oversampling = 2
hop_sample = int(T/(2**oversampling))
print('frame size: %sms' % (int(hop_sample/44100*1000)))
print('frame samples: %d' % hop_sample)

frame size: 92ms
frame samples: 4096


In [6]:
feature = {k: np.vstack((joint[k], joint[k])) for k in range(len(joint))}

for k in range(len(joint)):
    for m in range(2,joint[k].shape[1]-2):
        feature[k][:joint[k].shape[0],m] = np.mean(joint[k][:,m-2:m+3], axis=1)
        feature[k][joint[k].shape[0]:, m] = np.std(joint[k][:,m-2:m+3], axis=1)
del(joint)

for k in range(len(feature)):
    print(feature[k].shape)
    
feature_conca = np.zeros((feature[0].shape[0],1))
chick_id = []

for k in range(len(wav_files)):
    feature_conca = np.hstack((feature_conca, feature[k]))
    chick_id.extend([int(wav_files[k][:2])] * feature[k].shape[1])
        
chick_id = np.array(chick_id)
feature_conca = np.transpose(feature_conca)
feature_conca = feature_conca[1:]

(850, 7322)
(850, 7299)
(850, 6915)
(850, 6844)


In [7]:
# frame-wise label_id and call_id
frame_num = []
label_id = np.zeros((len(chick_id)), dtype=int)   # 1=pleasure, 2=distress, 3=uncertain
call_id = np.zeros((len(chick_id)), dtype=int)

for chick in range(len(wav_files)):
    frame_num.append(feature[chick].shape[1])
    seg_chick = call_segs[call_segs['chickID']==int(wav_files[chick][:2])]
    sample_start = list(map(lambda x:int(x * sr / hop_sample) + sum(frame_num[:chick]), seg_chick['truth_start']))
    sample_end = list(map(lambda x:int(x * sr / hop_sample) + sum(frame_num[:chick]), seg_chick['truth_end']))
    
    for k in range(len(seg_chick)): # for each sample
        label_id[sample_start[k]:sample_end[k]] = np.zeros((sample_end[k]-sample_start[k])) + list(seg_chick['truth_label'])[k]
        call_id[sample_start[k]:sample_end[k]] = np.zeros((sample_end[k]-sample_start[k])) + list(seg_chick['callID'])[k]

In [8]:
print(feature_conca.shape, chick_id.shape, call_id.shape, label_id.shape)

(28380, 850) (28380,) (28380,) (28380,)



# Classification

In [9]:
# SVM settings
kernel = 'rbf'
gpu_id = 0 # thundersvm uses GPU to train SVM classifier
param_grid = {'C': [100,10,1], 'gamma': [.0001, .001, .01]}
scoring = 'f1_macro'
cv = 3

F_event = []; F_frame = []
dur = .5  # for event-based evaluation: duration of detected event should be at least 50% of the ground truth duration

## subject-independent train-test

In [10]:
for chick in chickID:
    subset = np.ones((len(label_id)), dtype=int) * 100

    for k in range(len(label_id)):
        if chick_id[k] != chick:
            subset[k] = 0
        else:
            subset[k] = 1

    feature_tr, label_tr = feature_conca[subset == 0], label_id[subset == 0]
    feature_te, label_te = feature_conca[subset == 1], label_id[subset == 1]

    # imputation
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    feature_tr = imp.fit_transform(feature_tr)
    feature_te = imp.transform(feature_te)

    # standardisation
    stdscaler = StandardScaler()
    feature_tr = stdscaler.fit_transform(feature_tr)
    feature_te = stdscaler.transform(feature_te)
    print(feature_tr.shape, feature_te.shape)

    # classification
    clf =  GridSearchCV(SVC(kernel=kernel, gpu_id=gpu_id), param_grid=param_grid, cv=cv, scoring=scoring)
    clf = clf.fit(feature_tr, label_tr)
    label_pred = clf.predict(feature_te)
    
    ###### frame-based evaluation ######
    report = pd.DataFrame(classification_report(label_te, label_pred, output_dict=True))
    F_frame.append([chick, report['1']['f1-score'], report['2']['f1-score'], report['3']['f1-score']])
    
    ###### event-based evaluation ######
    trainDur = []
    for k in range(len(call_segs)): 
        if call_segs['chickID'][k] != chick:   # [chickID, callID, truth_start, truth_end, label]
            trainDur.append(call_segs['truth_end'][k] - call_segs['truth_start'][k])
    minDurTrain = min(trainDur)

    frame_time = T / sr / (2**oversampling)

    # pred events
    pred = pd.DataFrame({'pred': label_pred}, dtype=int)
    pred['block'] = (pred.pred.shift(1) != pred.pred).astype(int).cumsum()
    pred_event = pred.reset_index().groupby(['block','pred'])['index'].apply(np.array)

    # truth events
    truth = pd.DataFrame({'truth': label_te}, dtype=int)
    truth['block'] = (truth.truth.shift(1) != truth.truth).astype(int).cumsum()
    truth_event = truth.reset_index().groupby(['block','truth'])['index'].apply(np.array)

    # pred_event: [index, start, duration, label]
    predAll = np.zeros((1,4), dtype=int); ind = 0
    for key, elem in pred_event.items():
        predAll =  np.vstack((predAll, np.array([ind, elem[0] * frame_time, (elem[-1]-elem[0]+1)* frame_time, key[1]])))
        ind += 1
    predAll=predAll[1:]; predAll_ori = predAll

    # truth_event
    truthAll = np.zeros((1,4), dtype=int); ind = 0
    for key, elem in truth_event.items():
        truthAll =  np.vstack((truthAll, np.array([ind, elem[0] * frame_time, (elem[-1]-elem[0]+1)* frame_time, key[1]])))
        ind += 1
    truthAll=truthAll[1:];  truthAll_ori = truthAll

    print(predAll.shape, truthAll.shape)

    # gap filling
    count = 0; k = 0

    while count < predAll.shape[0]-1:
        if predAll[k+1,1] - (predAll[k,1]+predAll[k,2]) < minDurTrain: # start of next - end of current
            predAll[k+1,2] = predAll[k+1,1] + predAll[k+1,2] - predAll[k,1]   # duration 
            predAll[k+1,1] = predAll[k,1]    # start move left
            predAll = np.delete(predAll, k, 0)
        else: 
            k += 1
        count += 1

    # minimum duration prunning
    predAll=predAll[predAll[:,2]>=minDurTrain]
    minDurPred = min(predAll[:,2])
    print(minDurTrain, minDurPred, frame_time)

    event_result = []
    for label in range(1,len(np.unique(label_id))):
        # should compare only the target events
        predAll = predAll_ori; truthAll = truthAll_ori
        predAll=predAll[predAll[:,3]==label]; truthAll=truthAll[truthAll[:,3]==label]

        matched = mir_eval.util.match_events(truthAll[:,1], predAll[:,1],window=.2, distance=None)

        # check each one on the duration
        TP = 0
        for k in range(len(matched)):
            interval_truth = pd.Interval(truthAll[matched[k][0],1], truthAll[matched[k][0],2]+truthAll[matched[k][0],1])
            interval_pred = pd.Interval(predAll[matched[k][1],1], predAll[matched[k][1],2]+predAll[matched[k][1],1])
            if interval_truth.overlaps(interval_pred):
                time_sorted = np.sort([truthAll[matched[k][0],1], truthAll[matched[k][0],2]+truthAll[matched[k][0],1], 
                      predAll[matched[k][1],1], predAll[matched[k][1],2]+predAll[matched[k][1],1]]) # start, end, start, end
                event_dur = time_sorted[2] - time_sorted[1]
                if event_dur / truthAll[matched[k][0],2] > dur: # at least half duration
                     TP += 1

        FN = len(truthAll[truthAll[:,3]==label]) - TP
        FP = len(predAll[predAll[:,3]==label]) - TP

        if TP != 0:
            P_event = TP / (TP + FP) * 100; R_event = TP / (TP + FN) * 100; 
            event_result.extend([2 * P_event * R_event / (P_event + R_event)])
        else:
            event_result.extend([0])
            
    F_event.append([chick] + event_result)

(21058, 850) (7322, 850)
(442, 4) (820, 4)
0.04825389999996332 0.09287981859410431 0.09287981859410431
(21081, 850) (7299, 850)
(1078, 4) (2266, 4)
0.03936509999994087 0.09287981859410431 0.09287981859410431
(21465, 850) (6915, 850)
(179, 4) (1425, 4)
0.03936509999994087 0.09287981859410431 0.09287981859410431
(21536, 850) (6844, 850)
(99, 4) (959, 4)
0.03936509999994087 0.09287981859410431 0.09287981859410431


## average

In [11]:
print('Frame-based F-scores for pleasure, contact, and uncertain calls: {}'.format(
    [round(elem, 2) for elem in np.mean(F_frame,0)[1:]*100]))
print('Event-based F-scores for pleasure, contact, and uncertain calls: {}'.format(
    [round(elem, 2) for elem in np.mean(F_event,0)[1:]]))

Frame-based F-scores for pleasure, contact, and uncertain calls: [10.5, 79.54, 6.25]
Event-based F-scores for pleasure, contact, and uncertain calls: [7.04, 29.77, 1.41]


In [12]:
print('Frame-based average F-score: {}'.format(round(np.mean(np.mean(F_frame,0)[1:]*100), 2)))
print('Event-based average F-score: {}'.format(round(np.mean(np.mean(F_event,0)[1:]), 2)))

Frame-based average F-score: 32.1
Event-based average F-score: 12.74
