In [2]:
import numpy as np
import torch
import torch.nn as nn

In [3]:
def segment_features(features, segment_frames=44, hop_frames=1):
    """Cut a feature matrix into fixed-length sliding-window segments.

    Parameters
    ----------
    features : array-like
        Feature array whose axis 1 is the frame (time) axis, e.g. shape
        ``(feature_dim, num_frames)``.
    segment_frames : int, default 44
        Number of frames in each segment. Must be at least 1.
    hop_frames : int, default 1
        Number of frames between the starts of consecutive segments.
        Must be at least 1.

    Returns
    -------
    np.ndarray
        Array of shape ``(num_segments, feature_dim, segment_frames)`` where
        ``num_segments = 1 + (num_frames - segment_frames) // hop_frames``.
        When the input is shorter than one segment the result has
        ``num_segments == 0`` but keeps the trailing dimensions and dtype, so
        downstream shape-dependent code still works.

    Raises
    ------
    ValueError
        If ``segment_frames`` or ``hop_frames`` is smaller than 1.
    """
    if segment_frames < 1:
        raise ValueError(f"segment_frames must be >= 1, got {segment_frames}")
    if hop_frames < 1:
        raise ValueError(f"hop_frames must be >= 1, got {hop_frames}")

    features = np.asarray(features)
    num_frames = features.shape[1]

    # Too short for even one window: return an empty batch with the right
    # trailing shape instead of a shapeless (0,) array.
    if num_frames < segment_frames:
        empty_shape = (0, features.shape[0], segment_frames) + features.shape[2:]
        return np.empty(empty_shape, dtype=features.dtype)

    num_segments = 1 + (num_frames - segment_frames) // hop_frames
    starts = range(0, num_segments * hop_frames, hop_frames)

    # Each window spans [start, start + segment_frames) along the frame axis.
    return np.stack([features[:, start:start + segment_frames] for start in starts])

In [12]:
# Pre-computed feature arrays of the four recorded scenes, in scene order.
scene_files = (
    '../../Data/scene_analysis/2_Florian_Heizung_aus.npy',
    '../../Data/scene_analysis/3_Verena_Staubsauger_an_Alarm_an.npy',
    '../../Data/scene_analysis/5_Lukas_Staubsauger_an_Licht_aus.npy',
    '../../Data/scene_analysis/6_Deepak_Radio_aus_Licht_aus.npy',
)
test_data_1, test_data_2, test_data_3, test_data_4 = (
    np.load(scene_file) for scene_file in scene_files
)

# Report the array shape of every scene, one scene per line.
shape_report = (
    f"Florian Heizung aus (Scene 1): {test_data_1.shape} "
    f"\nVerena Staubsauger an Alarm an (Scene 2): {test_data_2.shape} "
    f"\nLukas Staubsauger an Licht aus (Scene 3): {test_data_3.shape} "
    f"\nDeepak Radio aus Licht aus (Scene 4): {test_data_4.shape}"
)
print(shape_report)

Florian Heizung aus (Scene 1): (1, 175, 854) 
Verena Staubsauger an Alarm an (Scene 2): (1, 175, 762) 
Lukas Staubsauger an Licht aus (Scene 3): (1, 175, 292) 
Deepak Radio aus Licht aus (Scene 4): (1, 175, 243)


In [4]:
# Sanity check: shape of the scene-1 array. Later cells index [0] to drop the
# leading axis and slide windows along the last (frame) axis.
test_data_1.shape

(1, 175, 854)

In [119]:
from model_architectures import EnhancedAudioCNN

def load_model(model_path, device='cpu'):
    """Restore an ``EnhancedAudioCNN`` from a checkpoint, ready for inference.

    Parameters
    ----------
    model_path : str or path-like
        Checkpoint file handed to ``torch.load``. The loaded object must
        provide the weights under the key ``'model_state_dict'``.
    device : str or torch.device, default 'cpu'
        Used both as ``map_location`` while loading and as the device the
        model is moved to.

    Returns
    -------
    EnhancedAudioCNN
        The model with the checkpoint weights loaded, switched to evaluation
        mode and placed on ``device``.
    """
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from trusted sources.
    saved = torch.load(model_path, map_location=device)

    net = EnhancedAudioCNN()
    net.load_state_dict(saved['model_state_dict'])

    net.to(device)
    net.eval()  # inference mode for layers such as dropout / batch norm
    return net

# Checkpoint to evaluate (an alternative checkpoint is kept for reference).
#model_path = 'models/EnhancedAudioCNN_1.pth'
model_path = 'all_3.pth'

# Use the MPS backend when it is available, otherwise fall back to the CPU.
use_mps = torch.backends.mps.is_available()
device = torch.device("mps" if use_mps else "cpu")
model = load_model(model_path, device=device)

# Slide a window over every scene; [0] drops the leading singleton axis of
# the loaded arrays so that axis 1 is the frame axis.
seg_1, seg_2, seg_3, seg_4 = (
    segment_features(scene[0])
    for scene in (test_data_1, test_data_2, test_data_3, test_data_4)
)

# Disabled experiment: keep only feature rows [:22] and [87:] of each segment.
# seg_1 = np.concatenate((seg_1[:, :22, :], seg_1[:, 87:, :]), axis=1)
# seg_2 = np.concatenate((seg_2[:, :22, :], seg_2[:, 87:, :]), axis=1)
# seg_3 = np.concatenate((seg_3[:, :22, :], seg_3[:, 87:, :]), axis=1)
# seg_4 = np.concatenate((seg_4[:, :22, :], seg_4[:, 87:, :]), axis=1)

# Convert to float32 tensors on the target device and insert a channel axis:
# (num_segments, feature_dim, segment_frames)
#   -> (num_segments, 1, feature_dim, segment_frames)
segments_1, segments_2, segments_3, segments_4 = (
    torch.tensor(seg, dtype=torch.float32).to(device).unsqueeze(1)
    for seg in (seg_1, seg_2, seg_3, seg_4)
)

# Scene whose segments are classified below; change this to evaluate another scene.
scene_segments = segments_2

# Classify one segment at a time and collect the softmax probabilities as
# NumPy arrays of shape (1, num_classes).
results = []
with torch.no_grad():
    for segment_tensor in scene_segments:
        # Each item is (1, feature_dim, segment_frames); add the batch axis
        # the model expects.
        output = model(segment_tensor.unsqueeze(0))
        results.append(torch.softmax(output, dim=1).cpu().numpy())

In [6]:
# Collect [segment index, probabilities] for every segment whose highest
# class probability exceeds 0.5, then show how many there are.
words = [
    [segment_index, segment_probs]
    for segment_index, segment_probs in enumerate(results)
    if max(segment_probs[0]) > 0.5
]
len(words)

283

In [6]:
# Shape of the scene-1 model input batch:
# (num_segments, 1, feature_dim, segment_frames).
segments_1.shape

torch.Size([811, 1, 175, 44])

In [121]:
# Class names listed in index order; the position in this list is the class
# index (presumably matching the model's output order - verify against training).
class_names = [
    'Alarm',
    'Brötchen',
    'Fernseher',
    'Haus',
    'Heizung',
    'Leitung',
    'Licht',
    'Lüftung',
    'Ofen',
    'Radio',
    'Schraube',
    'Spiegel',
    'Staubsauger',
    'an',
    'aus',
    'kann',
    'nicht',
    'offen',
    'warm',
    'wunderbar',
]

# Index -> class name lookup.
class_mapping = dict(enumerate(class_names))

# Per-class tally of confident predictions, starting at zero.
prediction_counts = dict.fromkeys(class_mapping.values(), 0)

# Count a segment only if its best class probability is above 0.8.
for result in results:
    probabilities = np.array(result[0])
    if not max(probabilities) > 0.8:
        continue
    predicted_index = np.argmax(probabilities)
    predicted_class = class_mapping[predicted_index]
    prediction_counts[predicted_class] += 1

sum_of_counts = sum(prediction_counts.values())

# Print only the classes that were predicted at least once.
for class_name, count in prediction_counts.items():
    if count <= 0:
        continue
    print(f"{class_name}: {count}")

#print(sum_of_counts)

Alarm: 19
Fernseher: 2
Schraube: 6
Staubsauger: 30
an: 3
warm: 4


In [8]:
# Show the class probabilities of the last processed segment. Indexing the
# last entry directly avoids looping over every result just to keep the final
# one, and fails loudly on an empty `results` instead of silently displaying
# a stale `probabilities` left over from an earlier cell.
result = results[-1]
probabilities = np.array(result[0])
probabilities

array([0.02938659, 0.05264592, 0.07938747, 0.11098366, 0.05376389,
       0.01894658, 0.01557708, 0.00456725, 0.02142099, 0.14485623,
       0.06881715, 0.01790391, 0.05573327, 0.0328542 , 0.07277482,
       0.04881035, 0.02759245, 0.01794075, 0.03978242, 0.08625506],
      dtype=float32)