In [2]:
import numpy as np
import torch
import os
import pandas as pd

from utils import segment_features, predictions
from model_architectures import EnhancedAudioCNN

In [9]:
data_path = "../../Data/development_scenes_npy/development_scenes/"
data_csv = pd.read_csv("../../Data/development_scenes_npy/development_scene_annotations.csv")
data = []
filenames = []
for filename in data_csv['filename']:
    data.append((np.load(os.path.join(data_path, filename + ".npy"))))
    filenames.append(filename)

In [10]:
reshaped_data = []
for file in data:
    reshaped_data.append(np.expand_dims(file, axis=0))

In [11]:
def load_model(model_path, device='cpu'):
    checkpoint = torch.load(model_path, map_location=device)
    model = EnhancedAudioCNN()
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    model.to(device)
    return model

In [13]:
def inference_results(sample, model_path, device):
    # Load the model
    model = load_model(model_path, device=device)
    results = []
    sample = segment_features(sample)
    sample = torch.tensor(sample, dtype=torch.float32).to(device)
    sample = sample.unsqueeze(1)
    for segment_tensor in sample:
        # Each tensor in segments is (1, feature_dim, segment_frames), which should match the expected input shape of your model
        segment_tensor = segment_tensor.unsqueeze(0)
        
        with torch.no_grad():
            output = model(segment_tensor)
            results.append(torch.softmax(output, dim=1).detach().cpu().numpy())
    return results

In [14]:
use_mps = torch.backends.mps.is_available()
device = torch.device("mps" if use_mps else "cpu")

In [15]:
len(filenames)

1190

In [33]:
model_path = "model_epoch_47.pth"
final_predictions = {'filename': [],
                     'command': [],
                     'timestamp': []
                     }
i = 0
final_pred_last = []
for sample, file in zip(reshaped_data, filenames):
    print(sample.shape)
    result = inference_results(sample[0], model_path, device)
    final_prediction = (predictions(result))
    if final_prediction == final_pred_last:
        continue
    for pred in final_prediction:
        final_predictions['filename'].append(file)
        final_predictions['command'].append(pred[0])
        final_predictions['timestamp'].append(pred[1]/39 + 0.5)
        i += 1
        print(i, "/", len(filenames))
    final_pred_last = final_prediction
    

(1, 175, 581)
1 / 1190
(1, 175, 1109)


KeyboardInterrupt: 

In [24]:
df = pd.DataFrame(final_predictions)
df

Unnamed: 0,filename,command,timestamp
0,2_speech_true_Ofen_aus,Heizung aus,6.141026
1,4_speech_true_Alarm_an,Alarm an,15.243590
2,19_speech_false_Lüftung_aus,Lüftung aus,8.884615
3,22_speech_false_Heizung_an,Heizung an,7.217949
4,24_speech_true_Licht_an,Licht an,13.243590
...,...,...,...
976,2019_speech_true_Heizung_an,Heizung an,16.602564
977,2020_speech_true_Lüftung_an,Lüftung an,13.679487
978,2021_speech_false_Radio_an,Radio an,11.166667
979,2022_speech_true_Alarm_an,Alarm an,21.166667


In [32]:
df_1 = pd.DataFrame(final_predictions)
df_1.to_csv("data_leakage_fixed.csv", index=False)

In [25]:
df = pd.DataFrame(final_predictions)
csv_path = 'predictions.csv'
df.to_csv(csv_path, index=False)

SmartVoiceControl has also released a specification of the costs associated with correct and
incorrect predictions. This new evaluation metric is better aligned with the actual goals of the
application and deviates significantly from the purely classification-based evaluation used in
the previous assignment. The proposed cost function distinguishes between four types of
events:

- True Positives: A speech command was issued and correctly detected by the system.
A detection is considered correct if the predicted timestamp is within the
annotation’s onset and offset. The associated cost is -1.

- False Negatives: A speech command was issued, but the system did not detect a
command. This is annoying but often a result of sloppy pronunciation; users will
adjust their speech to the system over time. The associated cost is 0.5


- False Positives: The system wrongfully detects a speech command. Depending on
the command, the consequences of wrongful detections range from spooky and
bothersome to potentially dangerous (luckily, there will be some additional
safeguards to avoid the worst). The associated costs, therefore, depend on the
wrongfully detected command:
 * {“Fernseher”, “Licht”, “Radio”, or “Staubsauger”} + {“an” or “aus”}: 2
 * {“Heizung” or “Lüftung”} + {“an” or “aus”}: 3
 * {“Ofen” or “Alarm” } + {“an” or “aus”}: 4

- Cross-Triggers: Cross-Triggers are a special type of false positives. They occur when
a user intends to interact with the system by issuing a speech command, but the
system does not recognize the correct command. This lowers the perceived quality of
the voice control system and leads to user frustration. However, Cross-Triggers are
less expensive compared to general false positives, as the user receives feedback
from the system and can manually intervene if the system makes a critical mistake
(e.g., detecting "Ofen an", or "Alarm aus" instead of the intended speech command).
In general, Cross-Triggers result in a cost of 1. Luckily, not all Cross-Triggers are
that expensive; the system automatically corrects incorrect action keywords. For
example, if the radio (“Radio”) is on and the user issues “Radio aus” (i.e., radio off),
but the system detects “Radio an” (i.e., radio on), the system will automatically
correct the classification to “Radio aus” (i.e., radio off). The cost for incorrect action
keywords with correct device keywords is, therefore, 0.1.