In [2]:
import os
from functools import lru_cache

import numpy as np
import pandas as pd
import torch

from utils import segment_features, predictions
from model_architectures import EnhancedAudioCNN

In [8]:
# Directory that holds the test-scene arrays stored as .npy files.
data_path = "../../Data/test_scenes/"

# Keep only the entries of the directory whose name ends in '.npy'.
files = [name for name in os.listdir(data_path) if name.endswith('.npy')]

# Map each file's base name (extension stripped) to the array loaded from it.
data = {
    os.path.splitext(name)[0]: np.load(os.path.join(data_path, name))
    for name in files
}

In [20]:
# Same keys as `data`; every array gains a new leading axis of length 1.
reshaped_data = {
    key: np.expand_dims(array, axis=0)
    for key, array in data.items()
}

In [11]:
def load_model(model_path, device='cpu'):
    """Build an ``EnhancedAudioCNN`` and restore its weights from a checkpoint.

    Args:
        model_path: Location of the checkpoint, passed straight to ``torch.load``.
        device: Device the checkpoint tensors are mapped to and the model is
            moved to. Defaults to ``'cpu'``.

    Returns:
        The model in evaluation mode, placed on ``device``.

    Raises:
        KeyError: If the checkpoint has no ``'model_state_dict'`` entry.
    """
    # NOTE(review): torch.load unpickles the file, so only checkpoints from a
    # trusted source should be loaded here.
    state = torch.load(model_path, map_location=device)['model_state_dict']

    network = EnhancedAudioCNN()
    network.load_state_dict(state)
    # Evaluation mode first, then placement on the requested device.
    network.eval()
    return network.to(device)

In [14]:
@lru_cache(maxsize=4)
def _cached_model(model_path, device_name, mtime):
    """Load a checkpoint once per (path, device, modification time) combination.

    ``mtime`` is not read in the body; it only takes part in the cache key so
    that a checkpoint overwritten on disk is loaded again instead of being
    served from the cache.
    """
    return load_model(model_path, device=device_name)


def inference_results(sample, model_path, device):
    """Run the model on every segment of one sample and collect class probabilities.

    The sample is split with ``segment_features``; each segment is fed to the
    model on its own (batch of one) and the softmax over dimension 1 of the
    model output is stored.

    Args:
        sample: Feature array of one scene, in whatever layout
            ``segment_features`` expects (not visible here — TODO confirm).
        model_path: Checkpoint to load. For string / path-like values the
            loaded model is reused across calls instead of being re-read
            from disk every time.
        device: Device on which inference runs.

    Returns:
        A list with one NumPy array per segment, each holding the softmax
        output of the model for that segment.
    """
    if isinstance(model_path, (str, os.PathLike)):
        # Loading the checkpoint is by far the most expensive step and this
        # function is called once per scene, so reuse the model between calls.
        model = _cached_model(
            os.fspath(model_path), str(device), os.path.getmtime(model_path)
        )
    else:
        # Anything that cannot serve as a cache key (e.g. an open file object)
        # is loaded directly, as before.
        model = load_model(model_path, device=device)

    segments = segment_features(sample)
    segments = torch.tensor(segments, dtype=torch.float32).to(device)
    # Insert an axis at position 1, so that iterating over the first axis
    # yields tensors that start with a length-1 axis.
    # NOTE(review): presumably this is the channel axis of the CNN — confirm.
    segments = segments.unsqueeze(1)

    results = []
    # One no_grad context for the whole loop; nothing in here needs gradients.
    with torch.no_grad():
        for segment_tensor in segments:
            # Add the leading batch axis (batch size 1) before the forward pass.
            output = model(segment_tensor.unsqueeze(0))
            results.append(torch.softmax(output, dim=1).cpu().numpy())
    return results

In [13]:
# Run on the MPS backend when PyTorch reports it as available, otherwise on the CPU.
use_mps = torch.backends.mps.is_available()
if use_mps:
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [16]:
# Number of entries in `data`, i.e. how many .npy scene files were loaded.
len(data)

526

In [21]:
model_path = "model_epoch_47.pth"

# A prediction's second element is converted to the reported timestamp as
#     timestamp = value / TIMESTAMP_DIVISOR + TIMESTAMP_OFFSET
# NOTE(review): 39 looks like a frames-per-second rate and 0.5 like a
# half-window offset in seconds — confirm against `utils.predictions`.
TIMESTAMP_DIVISOR = 39
TIMESTAMP_OFFSET = 0.5

# Column-wise accumulator; becomes the predictions DataFrame afterwards.
final_predictions = {'filename': [],
                     'command': [],
                     'timestamp': []
                     }
n_files = len(reshaped_data)
final_pred_last = []
# `i` counts processed files, so the printed progress matches its denominator
# (it used to count emitted predictions while being printed against the file count).
for i, file in enumerate(reshaped_data, start=1):
    sample = reshaped_data[file]
    # `sample[0]` strips the leading length-1 axis that was added when
    # `reshaped_data` was built.
    result = inference_results(sample[0], model_path, device)
    final_prediction = predictions(result)
    print(i, "/", n_files)
    # NOTE(review): a file whose predictions equal those of the previous file
    # is skipped entirely. Kept as is, but this also drops legitimate
    # identical detections from two consecutive files — confirm it is intended.
    if final_prediction == final_pred_last:
        continue
    for pred in final_prediction:
        final_predictions['filename'].append(file)
        final_predictions['command'].append(pred[0])
        final_predictions['timestamp'].append(pred[1] / TIMESTAMP_DIVISOR + TIMESTAMP_OFFSET)
    final_pred_last = final_prediction
    

1 / 526
2 / 526
3 / 526
4 / 526
5 / 526
6 / 526
7 / 526
8 / 526
9 / 526
10 / 526
11 / 526
12 / 526
13 / 526
14 / 526
15 / 526
16 / 526
17 / 526
18 / 526
19 / 526
20 / 526
21 / 526
22 / 526
23 / 526
24 / 526
25 / 526
26 / 526
27 / 526
28 / 526
29 / 526
30 / 526
31 / 526
32 / 526
33 / 526
34 / 526
35 / 526
36 / 526
37 / 526
38 / 526
39 / 526
40 / 526
41 / 526
42 / 526
43 / 526
44 / 526
45 / 526
46 / 526
47 / 526
48 / 526
49 / 526
50 / 526
51 / 526
52 / 526
53 / 526
54 / 526
55 / 526
56 / 526
57 / 526
58 / 526
59 / 526
60 / 526
61 / 526
62 / 526
63 / 526
64 / 526
65 / 526
66 / 526
67 / 526
68 / 526
69 / 526
70 / 526
71 / 526
72 / 526
73 / 526
74 / 526
75 / 526
76 / 526
77 / 526
78 / 526
79 / 526
80 / 526
81 / 526
82 / 526
83 / 526
84 / 526
85 / 526
86 / 526
87 / 526
88 / 526
89 / 526
90 / 526
91 / 526
92 / 526
93 / 526
94 / 526
95 / 526
96 / 526
97 / 526
98 / 526
99 / 526
100 / 526
101 / 526
102 / 526
103 / 526
104 / 526
105 / 526
106 / 526
107 / 526
108 / 526
109 / 526
110 / 526
111 / 52

In [22]:
# Tabulate the collected predictions: one column per key of `final_predictions`.
df = pd.DataFrame.from_dict(final_predictions, orient="columns")
df

Unnamed: 0,filename,command,timestamp
0,806bd2ba88,Alarm an,13.525641
1,e6e6512aa0,Heizung aus,7.756410
2,24411c818e,Lüftung an,1.705128
3,24411c818e,Licht aus,6.294872
4,8178b6eafb,Staubsauger aus,3.653846
...,...,...,...
472,4bbcfcb4ea,Heizung an,4.089744
473,4bbcfcb4ea,Heizung an,9.500000
474,ee8160697e,Alarm aus,6.653846
475,041dc40f49,Licht aus,6.961538


In [32]:
# Write the collected predictions to "data_leakage_fixed.csv" without the index column.
df_1 = pd.DataFrame(data=final_predictions)
df_1.to_csv(path_or_buf="data_leakage_fixed.csv", index=False)

In [23]:
# Write the collected predictions to 'predictions.csv' without the index column.
csv_path = 'predictions.csv'
df = pd.DataFrame.from_dict(final_predictions, orient="columns")
df.to_csv(path_or_buf=csv_path, index=False)

SmartVoiceControl has also released a specification of the costs associated with correct and
incorrect predictions. This new evaluation metric is better aligned with the actual goals of the
application and deviates significantly from the purely classification-based evaluation used in
the previous assignment. The proposed cost function distinguishes between four types of
events:

- True Positives: A speech command was issued and correctly detected by the system.
A detection is considered correct if the predicted timestamp falls between the
annotation’s onset and offset. The associated cost is -1.

- False Negatives: A speech command was issued, but the system did not detect a
command. This is annoying but often a result of sloppy pronunciation; users will
adjust their speech to the system over time. The associated cost is 0.5.


- False Positives: The system wrongfully detects a speech command. Depending on
the command, the consequences of wrongful detections range from spooky and
bothersome to potentially dangerous (luckily, there will be some additional
safeguards to avoid the worst). The associated costs, therefore, depend on the
wrongfully detected command:
 * {“Fernseher”, “Licht”, “Radio”, or “Staubsauger”} + {“an” or “aus”}: 2
 * {“Heizung” or “Lüftung”} + {“an” or “aus”}: 3
 * {“Ofen” or “Alarm”} + {“an” or “aus”}: 4

- Cross-Triggers: Cross-Triggers are a special type of false positives. They occur when
a user intends to interact with the system by issuing a speech command, but the
system does not recognize the correct command. This lowers the perceived quality of
the voice control system and leads to user frustration. However, Cross-Triggers are
less expensive compared to general false positives, as the user receives feedback
from the system and can manually intervene if the system makes a critical mistake
(e.g., detecting "Ofen an", or "Alarm aus" instead of the intended speech command).
In general, Cross-Triggers result in a cost of 1. Luckily, not all Cross-Triggers are
that expensive; the system automatically corrects incorrect action keywords. For
example, if the radio (“Radio”) is on and the user issues “Radio aus” (i.e., radio off),
but the system detects “Radio an” (i.e., radio on), the system will automatically
correct the classification to “Radio aus” (i.e., radio off). The cost for incorrect action
keywords with correct device keywords is, therefore, 0.1.