In [1]:
import torch 
from torch import nn
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np
import os
from glob import glob
import pandas as pd


In [None]:
class CNNNetwork(nn.Module):
    """Four-stage 2-D convolutional classifier that returns raw class logits.

    Every stage is ``Conv2d(kernel_size=3, stride=1, padding=2) -> ReLU ->
    MaxPool2d(2)`` and the channel count grows 1 -> 16 -> 32 -> 64 -> 128.
    The final feature map is flattened and fed to one linear layer.

    With this kernel/padding/pooling combination a spatial size ``n`` becomes
    ``(n + 2) // 2`` after each stage, so a ``(1, 1, 13)`` sample ends up as
    ``128 x 1 x 2 = 256`` flattened features, which is the default
    ``in_features``.

    Args:
        num_classes: Number of output classes (size of the logit vector).
        in_features: Number of flattened features entering the linear layer.
            Must match the input size actually fed to the network.
    """

    def __init__(self, num_classes=10, in_features=256):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.conv4 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

        self.flatten = nn.Flatten()

        # The linear layer must receive exactly the flattened feature count.
        self.linear = nn.Linear(in_features, num_classes)
        # Kept for inference only (see predict_proba); it is deliberately not
        # applied in forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_data):
        """Return unnormalised class scores of shape ``(batch, num_classes)``.

        The scores are raw logits: ``nn.CrossEntropyLoss`` applies
        log-softmax itself, so applying softmax here as well would normalise
        twice and flatten the gradients. ``argmax`` over logits gives the same
        class as ``argmax`` over probabilities.

        Args:
            input_data: Batched input of shape ``(batch, 1, height, width)``.
        """
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)

        x = self.flatten(x)

        logits = self.linear(x)
        return logits

    def predict_proba(self, input_data):
        """Return class probabilities (softmax over the logits) for inference."""
        return self.softmax(self.forward(input_data))

In [3]:
# Disabled smoke test for CNNNetwork, kept as a bare string literal so it never runs.
# NOTE(review): before re-enabling, confirm the following:
#   * `summary` is not imported anywhere in the visible notebook, and it is called on
#     `output` (the forward result) rather than on the model `cnn`.
#   * a (1, 64, 44) input flattens to 128 * 5 * 4 = 2560 features after the four conv
#     stages, not the 256 that the network's linear layer is built for.
'''if __name__ == "__main__":
    cnn = CNNNetwork()
    cnn = cnn.to('cpu')  
    inputs = torch.randn(32, 1, 64, 44)  # Exemplo de um batch com 32 amostras
    print(inputs.shape)  # Verifique a forma da entrada
    output = cnn(inputs)
    
    summary(output, (1, 64, 44))'''

'if __name__ == "__main__":\n    cnn = CNNNetwork()\n    cnn = cnn.to(\'cpu\')  \n    inputs = torch.randn(32, 1, 64, 44)  # Exemplo de um batch com 32 amostras\n    print(inputs.shape)  # Verifique a forma da entrada\n    output = cnn(inputs)\n    \n    summary(output, (1, 64, 44))'

In [4]:
def mfcc(file_path):
    """Summarise an audio file as its time-averaged MFCC vector.

    The file is loaded at its native sampling rate (``sr=None``), 13 MFCCs are
    computed per frame, and the frames are averaged along the time axis.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        1-D array with one mean value per MFCC coefficient (13 entries).
    """
    waveform, sample_rate = librosa.load(file_path, sr=None)
    coefficients = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=13)
    # axis 1 is the frame axis, so the mean collapses time and keeps the coefficients
    return np.mean(coefficients, axis=1)

In [5]:
def pad_or_crop_frames(spec_db, target_length):
    """Force the last (time) axis of a 2-D dB spectrogram to ``target_length``.

    Shorter inputs are padded at the end with the spectrogram's own minimum
    value; longer inputs are truncated. An input that already has the target
    length is returned unchanged.

    Args:
        spec_db: 2-D array whose last axis is time.
        target_length: Desired number of frames.

    Returns:
        Array with ``target_length`` frames along the last axis.
    """
    current_length = spec_db.shape[-1]

    if current_length < target_length:
        # In a dB spectrogram referenced to its maximum, 0 is the loudest
        # possible value, so zero padding would append full-energy frames.
        # Pad with the quietest observed value instead.
        floor_db = spec_db.min() if spec_db.size else 0.0
        return np.pad(
            spec_db,
            ((0, 0), (0, target_length - current_length)),
            mode='constant',
            constant_values=floor_db,
        )
    if current_length > target_length:
        # Crop the spectrogram to the first target_length frames.
        return spec_db[:, :target_length]
    return spec_db


def melspectrogram(file_path, target_length=376):
    """Load an audio file and return a fixed-width log-mel spectrogram.

    The file is loaded at its native sampling rate, converted to a 128-band
    mel power spectrogram, expressed in dB relative to its own maximum
    (so the peak is 0 dB), and then padded or cropped along time.

    Args:
        file_path: Path of the audio file to load.
        target_length: Number of time frames in the returned array.

    Returns:
        Array of shape ``(128, target_length)`` in dB.
    """
    audio, sr = librosa.load(file_path, sr=None)

    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)

    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    return pad_or_crop_frames(mel_spec_db, target_length)


In [6]:
# Disabled scratch code, kept as a bare string literal so it never runs.
# NOTE(review): `librosa.load()` is called without a path and `spectrogram` is not
# defined in the visible notebook — this snippet cannot run as written.
'''file = librosa.load()
mfcc = spectrogram()

mfcc = np.expand_dims(mfcc, axis=0)
mfcc = np.expand_dims(mfcc, axis=0)

mfcc_tensor = torch.tensor(mfcc, dtype=torch.float32)

print(mfcc_tensor.shape)'''

'file = librosa.load()\nmfcc = spectrogram()\n\nmfcc = np.expand_dims(mfcc, axis=0)\nmfcc = np.expand_dims(mfcc, axis=0)\n\nmfcc_tensor = torch.tensor(mfcc, dtype=torch.float32)\n\nprint(mfcc_tensor.shape)'

In [None]:
class AudioDataset(Dataset):
    """Map-style dataset yielding ``(features, label)`` pairs for audio files.

    Features are produced by the notebook's ``mfcc`` function and reshaped to
    ``(1, 1, n_coefficients)`` so that a batched tensor is 4-D
    ``(batch, 1, 1, n_coefficients)``, the layout ``nn.Conv2d`` expects.

    Args:
        file_paths: Sequence of audio file paths.
        labels: Sequence of labels, indexed in parallel with ``file_paths``.
        transform: Optional callable applied to the shaped feature tensor
            just before it is returned.
    """

    def __init__(self, file_paths, labels, transform=None):
        self.file_paths = file_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.file_paths)

    def __getitem__(self, index):
        """Return the feature tensor and the label stored at ``index``."""
        file_path = self.file_paths[index]
        label = self.labels[index]

        features = torch.tensor(mfcc(file_path), dtype=torch.float32)

        # (n,) -> (1, 1, n): add the channel and height axes.
        features = features.unsqueeze(0).unsqueeze(0)

        # The transform used to be stored but silently ignored.
        if self.transform is not None:
            features = self.transform(features)

        return features, label
    
# Quick wiring check of AudioDataset + DataLoader using placeholder labels.
labels = [0, 1, 0]

# Keep only as many clips as there are placeholder labels: pairing every globbed
# file with a three-element label list raised IndexError for any index >= 3.
# Sorting makes the selected clips deterministic.
file_paths = sorted(glob('/Users/gabrielasimon/Desktop/ac2/UrbanSound8K/audio/*/*.wav'))[:len(labels)]

dataset = AudioDataset(file_paths=file_paths, labels=labels)

data_loader = DataLoader(dataset, batch_size=32, shuffle=True)


In [8]:
# Location of the UrbanSound8K metadata table.
csv_path = '/Users/gabrielasimon/Desktop/ac2/UrbanSound8K/metadata/UrbanSound8K.csv'

# Load the metadata table into a DataFrame.
csv = pd.read_csv(csv_path)

# Lookup from each `slice_file_name` value to its `classID` value; load_data()
# uses this mapping to label the audio files it finds on disk.
file_class = dict(zip(csv['slice_file_name'], csv['classID']))

# Every entry directly under the audio directory.
# Note: `folds` is reassigned to an explicit fold1..fold10 list in the
# cross-validation cell further down.
folds = glob('/Users/gabrielasimon/Desktop/ac2/UrbanSound8K/audio/*')


In [9]:
def load_data(folds, file_class):
    """Collect the labelled ``.wav`` files found in the given fold directories.

    Each directory in ``folds`` is scanned for ``*.wav`` files. A file is kept
    when its base name (including the ``.wav`` extension) is a key of
    ``file_class``; otherwise a warning is printed and the file is skipped.

    Args:
        folds: Iterable of directory paths to scan.
        file_class: Mapping from file name to class label.

    Returns:
        Tuple ``(file_paths, labels)`` of two parallel lists.
    """
    matched_paths, matched_labels = [], []

    for fold_dir in folds:
        for wav_path in glob(os.path.join(fold_dir, "*.wav")):
            # Full file name, extension included, is the lookup key.
            file_name = os.path.basename(wav_path)

            if file_name not in file_class:
                print(f"Aviso: Arquivo {file_name} não encontrado no CSV.")
                continue

            matched_paths.append(wav_path)
            matched_labels.append(file_class[file_name])

    return matched_paths, matched_labels

In [10]:
def train(model, data_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``data_loader``.

    The model is put in training mode and updated once per batch. The loss of
    every 50th batch (starting with the first) is printed.

    Args:
        model: Module to optimise.
        data_loader: Sized iterable yielding ``(inputs, labels)`` batches.
        optimizer: Optimizer bound to the model's parameters.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(mean loss per batch, accuracy over all samples)``.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no samples.
    """
    model.train()
    loss_sum, hits, seen = 0.0, 0, 0

    for batch_idx, batch in enumerate(data_loader):
        features, targets = batch
        features = features.to(device)
        targets = targets.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        predicted = torch.max(outputs, dim=1).indices
        hits += (predicted == targets).sum().item()
        seen += targets.size(0)

        if batch_idx % 50 == 0:
            print(f"Batch {batch_idx}/{len(data_loader)} - Loss: {loss.item():.4f}")

    accuracy = hits / seen
    mean_batch_loss = loss_sum / len(data_loader)
    return mean_batch_loss, accuracy


def validate(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    The model is switched to evaluation mode and every batch is processed
    with gradient tracking disabled.

    Args:
        model: Module to evaluate.
        data_loader: Sized iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(mean loss per batch, accuracy over all samples)``.

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no samples.
    """
    model.eval()
    loss_sum, hits, seen = 0.0, 0, 0

    with torch.no_grad():
        for batch in data_loader:
            features, targets = batch
            features = features.to(device)
            targets = targets.to(device)

            outputs = model(features)
            loss_sum += criterion(outputs, targets).item()

            predicted = torch.max(outputs, dim=1).indices
            hits += (predicted == targets).sum().item()
            seen += targets.size(0)

    accuracy = hits / seen
    mean_batch_loss = loss_sum / len(data_loader)
    return mean_batch_loss, accuracy

In [11]:
# Per-fold metrics; one entry is appended for each fold used as validation set.
all_train_losses = []
all_train_accuracies = []
all_val_losses = []
all_val_accuracies = []

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Explicit fold1..fold10 directories (replaces the glob-based `folds` defined earlier).
folds = [f'/Users/gabrielasimon/Desktop/ac2/UrbanSound8K/audio/fold{i}' for i in range(1, 11)]

# Leave-one-fold-out cross-validation: each fold is the validation set once,
# and the remaining folds form the training set.
# NOTE(review): the traceback recorded under this cell reports a batch without a
# channel axis; AudioDataset as written yields (1, 1, n) samples, so presumably
# the dataset cell had not been re-run before this one — confirm on a fresh kernel.
for i, val_fold in enumerate(folds):

    print(f"Validating on {val_fold}...")

    train_folds = [fold for fold in folds if fold != val_fold]
    
    train_files, train_labels = load_data(train_folds, file_class)
    val_files, val_labels = load_data([val_fold], file_class)
    
    train_dataset = AudioDataset(train_files, train_labels)
    val_dataset = AudioDataset(val_files, val_labels)
    
    # Only the training loader is shuffled.
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # A fresh model and optimizer per fold, so no weights carry over between folds.
    model = CNNNetwork().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # train() is called once per fold, i.e. a single pass over the training data.
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = validate(model, val_loader, criterion, device)

    all_train_losses.append(train_loss)
    all_train_accuracies.append(train_acc)
    all_val_losses.append(val_loss)
    all_val_accuracies.append(val_acc)
    
    print(f"Fold {i+1}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f}, Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

Validating on /Users/gabrielasimon/Desktop/ac2/UrbanSound8K/audio/fold1...


RuntimeError: Given groups=1, weight of size [16, 1, 3, 3], expected input[1, 32, 1, 13] to have 1 channels, but got 32 channels instead

In [None]:
def _mean_or_nan(values):
    """Return the arithmetic mean of ``values``, or NaN when it is empty."""
    return sum(values) / len(values) if values else float("nan")


# Cross-validation summary: average of the per-fold metrics. When no fold
# completed (e.g. the training cell raised), the lists are empty and the
# averages print as "nan" instead of raising ZeroDivisionError.
print(f"Average Train Loss: {_mean_or_nan(all_train_losses):.4f}")
print(f"Average Train Accuracy: {_mean_or_nan(all_train_accuracies):.4f}")
print(f"Average Val Loss: {_mean_or_nan(all_val_losses):.4f}")
print(f"Average Val Accuracy: {_mean_or_nan(all_val_accuracies):.4f}")