In [1]:
import numpy as np
import torch
import os
import pandas as pd

from utils import segment_features, predictions
from model_architectures import EnhancedAudioCNN

In [2]:
data_path = "../../Data/development_scenes_npy/development_scenes/"
data_csv = pd.read_csv("../../Data/development_scenes_npy/development_scene_annotations.csv")
data = []
filenames = []
for filename in data_csv['filename']:
    data.append((np.load(os.path.join(data_path, filename + ".npy"))))
    filenames.append(filename)

In [3]:
reshaped_data = []
for file in data:
    reshaped_data.append(np.expand_dims(file, axis=0))

In [4]:
def load_model(model_path, device='cpu'):
    checkpoint = torch.load(model_path, map_location=device)
    model = EnhancedAudioCNN()
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    model.to(device)
    return model

In [5]:
def inference_results(sample, model_path, device):
    # Load the model
    model = load_model(model_path, device=device)
    results = []
    sample = segment_features(sample)
    sample = torch.tensor(sample, dtype=torch.float32).to(device)
    sample = sample.unsqueeze(1)
    for segment_tensor in sample:
        # Each tensor in segments is (1, feature_dim, segment_frames), which should match the expected input shape of your model
        segment_tensor = segment_tensor.unsqueeze(0)
        
        with torch.no_grad():
            output = model(segment_tensor)
            results.append(torch.softmax(output, dim=1).detach().cpu().numpy())
    return results

In [6]:
use_mps = torch.backends.mps.is_available()
device = torch.device("mps" if use_mps else "cpu")

In [15]:
len(filenames)

1190

In [7]:
model_path = "model_epoch_47.pth"
final_predictions = {'filename': [],
                     'command': [],
                     'timestamp': []
                     }
i = 0
final_pred_last = []
for sample, file in zip(reshaped_data, filenames):
    result = inference_results(sample[0], model_path, device)
    final_prediction = (predictions(result))
    if final_prediction == final_pred_last:
        continue
    for pred in final_prediction:
        final_predictions['filename'].append(file)
        final_predictions['command'].append(pred[0])
        final_predictions['timestamp'].append(pred[1]/39 + 0.5)
        i += 1
        print(i, "/", len(filenames))
    final_pred_last = final_prediction
    

1 / 1190
2 / 1190
3 / 1190
4 / 1190
5 / 1190
6 / 1190
7 / 1190
8 / 1190
9 / 1190
10 / 1190
11 / 1190
12 / 1190
13 / 1190
14 / 1190
15 / 1190
16 / 1190
17 / 1190
18 / 1190
19 / 1190
20 / 1190
21 / 1190
22 / 1190
23 / 1190
24 / 1190
25 / 1190
26 / 1190
27 / 1190
28 / 1190
29 / 1190
30 / 1190
31 / 1190
32 / 1190
33 / 1190
34 / 1190
35 / 1190
36 / 1190
37 / 1190
38 / 1190
39 / 1190
40 / 1190
41 / 1190
42 / 1190
43 / 1190
44 / 1190
45 / 1190
46 / 1190
47 / 1190
48 / 1190
49 / 1190
50 / 1190
51 / 1190
52 / 1190
53 / 1190
54 / 1190
55 / 1190
56 / 1190
57 / 1190
58 / 1190
59 / 1190
60 / 1190
61 / 1190
62 / 1190
63 / 1190
64 / 1190
65 / 1190
66 / 1190
67 / 1190
68 / 1190
69 / 1190
70 / 1190
71 / 1190
72 / 1190
73 / 1190
74 / 1190
75 / 1190
76 / 1190
77 / 1190
78 / 1190
79 / 1190
80 / 1190
81 / 1190
82 / 1190
83 / 1190
84 / 1190
85 / 1190
86 / 1190
87 / 1190
88 / 1190
89 / 1190
90 / 1190
91 / 1190
92 / 1190
93 / 1190
94 / 1190
95 / 1190
96 / 1190
97 / 1190
98 / 1190
99 / 1190
100 / 1190
101 / 11

In [8]:
df = pd.DataFrame(final_predictions)
df

Unnamed: 0,filename,command,timestamp
0,2_speech_true_Ofen_aus,Heizung aus,6.166667
1,4_speech_true_Alarm_an,Fernseher an,4.500000
2,19_speech_false_Lüftung_aus,Lüftung aus,8.884615
3,22_speech_false_Heizung_an,Heizung an,7.243590
4,24_speech_true_Licht_an,Licht an,13.243590
...,...,...,...
935,2019_speech_true_Heizung_an,Heizung an,16.705128
936,2020_speech_true_Lüftung_an,Lüftung an,13.679487
937,2021_speech_false_Radio_an,Radio an,11.166667
938,2022_speech_true_Alarm_an,Alarm an,21.192308


In [32]:
df_1 = pd.DataFrame(final_predictions)
df_1.to_csv("data_leakage_fixed.csv", index=False)

In [9]:
df = pd.DataFrame(final_predictions)
csv_path = 'predictions.csv'
df.to_csv(csv_path, index=False)

SmartVoiceControl has also released a specification of the costs associated with correct and
incorrect predictions. This new evaluation metric is better aligned with the actual goals of the
application and deviates significantly from the purely classification-based evaluation used in
the previous assignment. The proposed cost function distinguishes between four types of
events:

- True Positives: A speech command was issued and correctly detected by the system.
A detection is considered correct if the predicted timestamp is within the
annotation’s onset and offset. The associated cost is -1.

- False Negatives: A speech command was issued, but the system did not detect a
command. This is annoying but often a result of sloppy pronunciation; users will
adjust their speech to the system over time. The associated cost is 0.5


- False Positives: The system wrongfully detects a speech command. Depending on
the command, the consequences of wrongful detections range from spooky and
bothersome to potentially dangerous (luckily, there will be some additional
safeguards to avoid the worst). The associated costs, therefore, depend on the
wrongfully detected command:
 * {“Fernseher”, “Licht”, “Radio”, or “Staubsauger”} + {“an” or “aus”}: 2
 * {“Heizung” or “Lüftung”} + {“an” or “aus”}: 3
 * {“Ofen” or “Alarm” } + {“an” or “aus”}: 4

- Cross-Triggers: Cross-Triggers are a special type of false positives. They occur when
a user intends to interact with the system by issuing a speech command, but the
system does not recognize the correct command. This lowers the perceived quality of
the voice control system and leads to user frustration. However, Cross-Triggers are
less expensive compared to general false positives, as the user receives feedback
from the system and can manually intervene if the system makes a critical mistake
(e.g., detecting "Ofen an", or "Alarm aus" instead of the intended speech command).
In general, Cross-Triggers result in a cost of 1. Luckily, not all Cross-Triggers are
that expensive; the system automatically corrects incorrect action keywords. For
example, if the radio (“Radio”) is on and the user issues “Radio aus” (i.e., radio off),
but the system detects “Radio an” (i.e., radio on), the system will automatically
correct the classification to “Radio aus” (i.e., radio off). The cost for incorrect action
keywords with correct device keywords is, therefore, 0.1.