In [1]:
import pandas as pd
import librosa
import sklearn.metrics as metrics


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive



# Obtain label and prediction sets

1. Labels: parse the XML file of selected audios from the clips database to get information on start and end position (in samples) of speech segments; obtain a dictionary with the timestamps

2. Predictions: Apply the SS-VAD on these files and obtain a dictionary with the predicted timestamps from the output dataframe containing speech and silence segments.



In [3]:
#xml parsing


from bs4 import BeautifulSoup

xml_file = '/content/drive/MyDrive/Thesis/Data/radio_firenze/radio_lungo/labels/RTV_FIRENZE_etichettato_all.xml'


with open(xml_file, 'r', encoding='ISO-8859-1') as file:
    soup = BeautifulSoup(file, 'xml')

#
annotated_sil = {} #timestamps will be stored here for each file

for turno in soup.find_all('TURNO'):
  dialogo = turno.parent

  x=0 #for each segment

  timestamp = {}
  turno_id = turno['id']
  for wrd_tag in turno.find_all('WRD'):
    wrd_id = wrd_tag['id']
    start_time = wrd_tag['start']
    end_time = wrd_tag['end']
    pos = wrd_tag['POS']
    lemma = wrd_tag['lemma']
    turno_key = dialogo['id']+'#'+turno_id


    if lemma == "__" or lemma == "" and not wrd_tag.find('eeh'):

      timestamp[f'segment_{x}_silence'] = (start_time, end_time)

      annotated_sil[turno_key] = timestamp
      x+=1


    else:

      timestamp[f'segment_{x}_speech'] = (start_time, end_time)
      annotated_sil[turno_key] = timestamp

      x+=1


annotated_sil.keys()


dict_keys(['RDdc_01F#1', 'RDdc_01F#2', 'RDis_01F#1', 'RDis_01F#2', 'RDis_01F#3', 'RDis_01F#4', 'RDit_02F#2', 'RDit_03F#1', 'RDit_03F#2', 'RDit_03F#3', 'RDit_05F#1', 'RDit_05F#2', 'RDit_05F#3', 'RDit_06F#1', 'RDit_06F#2', 'RDit_06F#3', 'RDit_06F#9', 'RDit_06F#10', 'RDit_06F#12', 'RDit_06F#13', 'RDit_06F#22'])

In [4]:
fold= '/content/drive/MyDrive/Thesis/Data/radio_firenze/LABELED_RDTV/'

segments_vad = pd.read_csv(fold+'/rdtv_segments_vad.csv')
segments_vad = segments_vad[segments_vad['file'].str.contains('22050')] # the clips corpus has files with different sampling rates; we keep the ones with 22.5 kHz


In [5]:
import pandas as pd

vad_silence_timestamps = {}

df=segments_vad

grouped_df = df.groupby('file') #each segment; each key


for file_name, group in grouped_df:
    id=0

    segment_dict = {}
    for index, row in group.iterrows():
      if row['label'] == 'h#': #get silence segment timestamps

        segment_key = f'segment_{id}_silence'
        start = int(row['start_time'])
        end = int(row['end_time'])
        segment_dict[segment_key] = (start,end)
        id+=1


      if row['label'] == 'S': #get speech segment timestamps
        segment_key = f'segment_{id}_speech'
        start = int(row['start_time'])
        end = int(row['end_time'])

        segment_dict[segment_key] = (start,end)

        id+=1


    file_name = file_name.replace('_22050Hz','')
    file_name= file_name.replace('.wav','')
    file_name = file_name.replace('_p0','')
    file_name = file_name.replace('_p1','')  #remove this info as we want the dictionary keys to match


    vad_silence_timestamps[file_name] = segment_dict

vad_silence_timestamps.keys()

vad_silence_timestamps = dict(list(vad_silence_timestamps.items())[:10]) #keep the first 10 files
vad_silence_timestamps.keys()

dict_keys(['RDdc_01F#1', 'RDdc_01F#2', 'RDis_01F#1', 'RDis_01F#2', 'RDis_01F#3', 'RDis_01F#4', 'RDit_02F#2', 'RDit_05F#1', 'RDit_05F#2', 'RDit_05F#3'])

##Keep the same dictionary keys for both dictionaries (vad prediction and annotated data).

In [6]:
common_keys = set(vad_silence_timestamps.keys()).intersection(annotated_sil.keys())

annotated_sil = {key: annotated_sil[key] for key in common_keys}
print(f"REF: {annotated_sil['RDdc_01F#2']}, \nVAD PRED: {vad_silence_timestamps['RDdc_01F#2']}")

REF: {'segment_0_silence': ('0', '8593'), 'segment_1_speech': ('8593', '19353'), 'segment_2_speech': ('19353', '23437'), 'segment_3_speech': ('23437', '38307'), 'segment_4_speech': ('38307', '51049'), 'segment_5_speech': ('51049', '56296'), 'segment_6_speech': ('56296', '82306'), 'segment_7_silence': ('82306', '86200'), 'segment_8_speech': ('86200', '98924'), 'segment_9_speech': ('98924', '119283'), 'segment_10_silence': ('119283', '129156'), 'segment_11_speech': ('129156', '136268'), 'segment_12_speech': ('136268', '145275'), 'segment_13_speech': ('145275', '171884'), 'segment_14_speech': ('145275', '171884'), 'segment_15_speech': ('145275', '171884'), 'segment_16_speech': ('171884', '183935')}, 
VAD PRED: {'segment_0_silence': (0, 2090), 'segment_1_speech': (2090, 2332), 'segment_2_silence': (2332, 4070), 'segment_3_speech': (4070, 6314), 'segment_4_silence': (6314, 10362), 'segment_5_speech': (10362, 16302), 'segment_6_silence': (16302, 17622), 'segment_7_speech': (17622, 20064), 's

The VAD catches many small silences which are not annotated in the clips corpus; to perform a
binary classification we create two sets from the dictionaries, based on their timestamps positions in sample numbers.

In [7]:
import numpy as np

sig_labels={}

for key, segments in annotated_sil.items():
    # get the last tuple value (sample position of last segment) for each key,
    # since the annotations from clips only mark speech segments OR long silences

    last_tuple_value = segments[list(segments.keys())[-1]][1]

    sig_labels[key] =int(last_tuple_value)

sig_labels

# convert timestamp segments to binary arrays where 1 marks speech and 0 silence
def segments_to_binary(segments, signal_length, label):
    binary_array = np.zeros(signal_length)
    for id, segment in segments.items():
        start_sample, end_sample = int(segment[0]), int(segment[1])

        if label in id:
          binary_array[start_sample:end_sample] = 1


    return binary_array


vad_binary_sil = {}
ref_binary_sil = {}

ref_binary_spe = {}
vad_binary_spe = {}

#1 Create ref and pref dictionaries on silence

# populate vad_binary based on vad_silence_timestamps
for key, segments in vad_silence_timestamps.items():
    signal_length = sig_labels[key]

    vad_binary_sil[key] = segments_to_binary(segments, signal_length, "silence")

# populate ref_binary based on annotated_sil
for key, segments in annotated_sil.items():
    signal_length = sig_labels[key]
    ref_binary_sil[key] = segments_to_binary(segments, signal_length, "silence")

#2 Create ref and pref dictionaries on speech


# populate vad_binary based on vad_silence_timestamps
for key, segments in vad_silence_timestamps.items():
    signal_length = sig_labels[key]
    vad_binary_spe[key] = segments_to_binary(segments, signal_length, "speech")

# populate ref_binary based on annotated_sil
for key, segments in annotated_sil.items():
    signal_length = sig_labels[key]
    ref_binary_spe[key] = segments_to_binary(segments, signal_length, "speech")



In [8]:
x, y= len(ref_binary_spe['RDdc_01F#1']), len(vad_binary_spe['RDdc_01F#1'])
x,y

(85683, 85683)

Several metrics are used to characterize VAD performance.

We calculate: precision and recall, F1, and two other typically used metrics, error rate and detection cost function

ER = 100 ∗ # false rejections + false insertions / #total  ;

False alarm rate Pfa is the percentage of non-speech instances being misclassified as speech and miss rate
Pmiss is the percentage of speech instances being misclassified as non-speech.

Detection cost function (DCF)
is defined as
DCF = (1 − γ) ∗ Pmiss + γ ∗ Pfa (12)
where the weight γ is equal to 0.25, which penalizes missed speech frames more heavily

<!-- from https://arxiv.org/pdf/1906.03588.pdf -->

In [9]:
# Exploratory evaluation to see how it performs without a tolerance

In [39]:
def scores(ref, prediction): #without tolerance
    CM = metrics.confusion_matrix(ref, prediction)

    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]

    TOTAL_PREDS = TP+FP+TN+FN

    ER = 100 * (FP + FN) / TOTAL_PREDS
    accuracy = (TP + TN) / TOTAL_PREDS
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    falses = FP+FN

    PMISS= FN/TOTAL_PREDS
    PFA = FP/TOTAL_PREDS
    g= 0.25
    DCF = ((1-g) * PMISS) + (g * PFA )


    g= 0.25
    DCF = ((1-g) * PMISS) + (g * PFA )
    F1 = 2* ((precision * recall) / (precision+recall))

    roc_auc= metrics.roc_auc_score(ref, prediction, average = "samples")


    return  f"F: {F1}, ER: {ER}, DCF: {DCF}, ROC: {roc_auc}"



scores(ref_binary_spe['RDdc_01F#1'], vad_binary_spe['RDdc_01F#1']) , scores(ref_binary_sil['RDdc_01F#1'], vad_binary_sil['RDdc_01F#1'])

('F: 0.8927850356294538, ER: 16.857486315838614, DCF: 0.08584550027426677, ROC: 0.7520348441674637',
 'F: 0.605850570321454, ER: 16.857486315838614, DCF: 0.08272936288411938, ROC: 0.7520348441674638')

Usually VADS are evaluated with a tolerance of 200ms. We perform this by checking if the prediction label matches either within the range of 200 (ms) x 22.05 (sample rate) = 4410 samples.

In [31]:
def metrics_with_tolerance(ref_data, pred_data, tolerance):
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    for i in range(len(ref_data)):
        ref_value = ref_data[i]
        pred_value = pred_data[i]


        # check for a match within the specified tolerance
        match_range = range(i - tolerance, i + tolerance + 1)
        match = any(0 <= idx < len(ref_data) and ref_data[idx] == pred_value for idx in match_range)
        if match:

            if pred_value == 1 and ref_value == 1:
                TP += 1

            elif pred_value == 0 and ref_value == 0:
                TN += 1
        elif not match:

            if pred_value == 1 and ref_value == 0:
                FP += 1
            elif pred_value == 0 and ref_value == 1:
                FN += 1


    TOTAL_PREDS = TP + FP + FN + TN


    accuracy = (TP+TN)/TOTAL_PREDS
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)

    falses = FP+FN
    ER = 1-accuracy

    PMISS= FN/TOTAL_PREDS
    PFA = FP/TOTAL_PREDS
    g= 0.25
    DCF = ((1-g) * PMISS) + (g * PFA )

    F1 = 2* ((precision * recall) / (precision+recall))

    roc_auc= metrics.roc_auc_score(ref_data, pred_data, average = "samples")


    # print(f"ACCURACY: {accuracy}, PRECISION: {precision}, RECALL: {recall}, F1: {F1}, ER: {ER} DCF: {DCF}, TOTAL PREDICTIONS: {TOTAL_PREDS}, ROC_AUC: {roc_auc}")
    return accuracy, precision, recall, F1, ER, DCF, roc_auc

tolerance = 4410 #200ms x 22.05 sr



In [32]:
tolerance = 4410
# iterate x2 for two dictionaries. we care about false insertions in this case!

all_accuracies = []
all_precisions = []
all_recalls = []
all_F1=[]
all_ERs = []
all_DCFs = []
all_roc=[]

for key in ref_binary_spe.keys():
    ref_data = ref_binary_spe[key]
    pred_data = vad_binary_spe[key]
    print(f"working on key : {key}")

    accuracy, precision, recall, F1, ER, DCF, roc_auc = metrics_with_tolerance(ref_data, pred_data, tolerance)

    print(f"accuracy: {accuracy}, precision: {precision}, recall: {recall}, ER: {ER}")
    all_accuracies.append(accuracy)
    all_precisions.append(precision)
    all_recalls.append(recall)
    all_ERs.append(ER)
    all_DCFs.append(DCF)
    all_roc.append(roc_auc)
    all_F1.append(F1)



working on key : RDdc_01F#2
accuracy: 0.9012215585322971, precision: 0.990252277699316, recall: 0.9002996047231804, ER: 0.09877844146770287
working on key : RDis_01F#2
accuracy: 0.6457765667574932, precision: 1.0, recall: 0.6334155798378569, ER: 0.3542234332425068
working on key : RDit_02F#2
accuracy: 0.8469578994145719, precision: 1.0, recall: 0.8359135478525066, ER: 0.1530421005854281
working on key : RDit_05F#3
accuracy: 0.9146458147744895, precision: 0.9589933111002829, recall: 0.9436933711118722, ER: 0.08535418522551053
working on key : RDit_05F#1
accuracy: 0.9548557202547299, precision: 0.9921872415868963, recall: 0.9547642085964555, ER: 0.04514427974527013
working on key : RDis_01F#1
accuracy: 0.8227173115562226, precision: 1.0, recall: 0.7918976881898983, ER: 0.17728268844377737
working on key : RDis_01F#4
accuracy: 0.8015660859026482, precision: 0.9825217176124301, recall: 0.7762571643188624, ER: 0.19843391409735178
working on key : RDit_05F#2
accuracy: 0.7373408282666055, pre

In [33]:
dict_eval_spe = {}

accuracy_array = np.array(all_accuracies)
precision_array = np.array(all_precisions)
recall_array = np.array(all_recalls)
f1_array = np.array(all_F1)
ER_array = np.array(all_ERs)
DCF_array = np.array(all_DCFs)
roc_array = np.array(all_roc)

dict_eval_spe = {
    'accuracy': np.mean(accuracy_array),
    'precision': np.mean(precision_array),
    'recall': np.mean(recall_array),
    'f1': np.mean(f1_array),
    'ER': np.mean(ER_array),
    'DCF': np.mean(DCF_array),
    'roc': np.mean(roc_array)

}
dict_eval_spe #metrics on speech identification

{'accuracy': 0.8304261936234234,
 'precision': 0.9616140035101811,
 'recall': 0.8206417729481252,
 'f1': 0.8793828359051898,
 'ER': 22.52989347002147,
 'DCF': 0.11540710559416054,
 'roc': 0.7678070630472189}

In [34]:
all_accuracies = []
all_precisions = []
all_recalls = []
all_F1=[]
all_ERs = []
all_DCFs = []
all_roc=[]


for key in ref_binary_sil.keys():
    ref_data = ref_binary_sil[key]
    pred_data = vad_binary_sil[key]
    print(f"working on key : {key}")

    accuracy, precision, recall, F1, ER, DCF, roc_auc = metrics_with_tolerance(ref_data, pred_data, tolerance)

    print(f"accuracy: {accuracy}, precision: {precision}, recall: {recall}, ER: {ER}")
    all_accuracies.append(accuracy)
    all_precisions.append(precision)
    all_recalls.append(recall)
    all_ERs.append(ER)
    all_DCFs.append(DCF)
    all_roc.append(roc_auc)
    all_F1.append(F1)

dict_eval_sil = {}

accuracy_array = np.array(all_accuracies)
precision_array = np.array(all_precisions)
recall_array = np.array(all_recalls)
f1_array = np.array(all_F1)
ER_array = np.array(all_ERs)
DCF_array = np.array(all_DCFs)
roc_array = np.array(all_roc)

dict_eval_sil = {
    'accuracy': np.mean(accuracy_array),
    'precision': np.mean(precision_array),
    'recall': np.mean(recall_array),
    'f1': np.mean(f1_array),
    'ER': np.mean(ER_array),
    'DCF': np.mean(DCF_array),
    'roc': np.mean(roc_array)

}
dict_eval_sil

working on key : RDdc_01F#2
accuracy: 0.9012215585322971, precision: 0.47495773807550795, recall: 0.9105293257927178, ER: 0.09877844146770287
working on key : RDis_01F#2
accuracy: 0.6457765667574932, precision: 0.08691834942932397, recall: 1.0, ER: 0.3542234332425068
working on key : RDit_02F#2
accuracy: 0.8469578994145719, precision: 0.3054597533173555, recall: 1.0, ER: 0.1530421005854281
working on key : RDit_05F#3
accuracy: 0.9140064930028476, precision: 0.5977579073802215, recall: 0.6746497966561229, ER: 0.08599350699715236
working on key : RDit_05F#1
accuracy: 0.9548557202547299, precision: 0.780706085668436, recall: 0.9553986177725745, ER: 0.04514427974527013
working on key : RDis_01F#1
accuracy: 0.8227173115562226, precision: 0.45515371238246827, recall: 1.0, ER: 0.17728268844377737
working on key : RDis_01F#4
accuracy: 0.8133590049939402, precision: 0.46653398973499527, recall: 0.9299509343355257, ER: 0.1866409950060598
working on key : RDit_05F#2
accuracy: 0.7373408282666055, 

{'accuracy': 0.8299159407272935,
 'precision': 0.5154177851393539,
 'recall': 0.907657098805527,
 'f1': 0.6195283836109461,
 'FER': 0.16957380637657654,
 'DCF': 0.055120415266637725,
 'roc': 0.7643263046998953}