In [218]:
import torch.nn as nn
import torch

class MaqamCNN1(nn.Module):
    def __init__(self):
        super(MaqamCNN1, self).__init__()
        
        self.conv1 = nn.Conv2d(in_channels=20, out_channels=32, kernel_size=(3,3), padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(kernel_size=(3,3))
        self.dropout1 = nn.Dropout(p=0.1)
        
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3,3), padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.pool2 = nn.MaxPool2d(kernel_size=(3,3))
        self.dropout2 = nn.Dropout(p=0.2)

        self.fc1 = nn.Linear(90016, 512)
        self.dropout3 = nn.Dropout(p=0.2)

        self.fc2 = nn.Linear(512, 265)
        self.dropout4 = nn.Dropout(p=0.2)

        self.fc3 = nn.Linear(265, 100)
        self.dropout5 = nn.Dropout(p=0.2)

    def forward(self, x):
        # print("X.shape1 = ", x.shape)
        # x = x.unsqueeze(-1)
        x = x.unsqueeze(-1)
        print("1x.shape:", x.shape)
        # print("mfcc.shape:", mfcc.shape)
        # x = torch.squeeze(x, 3)
        x = self.conv1(x)
        # x = self.bn1(x)
        # x = self.pool1(x)
        x = self.dropout1(x)

        x = self.conv2(x)
        x = self.bn2(x)
        # x = self.pool2(x)
        x = self.dropout2(x)
        print("2x.shape:", x.shape)
        x = x.view(x.shape[0], -1)
        print("3x.shape:", x.shape)
        x = self.fc1(x)
        print("4x.shape:", x.shape)
        x = self.dropout3(x)

        x = self.fc2(x)
        print("5x.shape:", x.shape)
        x = self.dropout4(x)
        x = self.fc3(x)
        print("6x.shape:", x.shape)
        x = self.dropout5(x)
        print("7x.shape:", x.shape)
        return x


In [219]:
import os
import torchaudio
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import librosa
import numpy as np
import pickle
from sklearn.model_selection import train_test_split

class MaqamDataset(Dataset):
    def __init__(self, mode='train', transform=None, cache_file='maqam_dataset_cache2.pkl', test_size=0.2):
        self.mode = mode
        self.transform = transform
        if mode == 'train':
            self.data_dir = r"C:\Users\USER\Documents\GitHub\trainset_cutten30"
        else:
            self.data_dir = r"C:\Users\USER\Documents\GitHub\testset_cutten30"
        self.maqams = ['Ajam', 'Bayat', 'Hijaz', 'Kurd', 'Nahawand', 'Rast', 'Saba', 'Seka']
        self.audio_list = self._load_audio_list()
        
        # Split the dataset into training and validation sets using train_test_split method
        train_list, val_list = train_test_split(self.audio_list, test_size=test_size, random_state=42, stratify=[label for (_, label) in self.audio_list])
        self.audio_list = train_list if self.mode == 'train' else val_list
        
        self.cache_file = cache_file
        self.data = self._load_data_from_cache_or_compute()
        # self.pad_to_max_length(1440000)

    def _load_audio_list(self):
        audio_list = []
        for i, maqam in enumerate(self.maqams):
            label_dir = os.path.join(self.data_dir, maqam)
            audio_list += [(os.path.join(label_dir, audio_name), i) for audio_name in os.listdir(label_dir) if audio_name.endswith('.wav')]
        return audio_list

    def __len__(self):
        return len(self.audio_list)

    def __getitem__(self, idx):
        audio_path, label_idx = self.audio_list[idx]
        waveform, sample_rate = torchaudio.load(audio_path)
        waveform = waveform[0] # only keep the first channel
        if self.transform:
            waveform = self.transform(waveform)
        mfcc = self.compute_mfcc(waveform).T
        mfcc = torch.from_numpy(mfcc).float()
        return mfcc, label_idx
    
    def pad_to_max_length(self, max_length):
        for i in range(len(self)):
            padded_data = F.pad(self.data[i][0], (0, max_length - len(self.data[i][0])), 'constant', 0)
            self.data[i] = (padded_data, self.data[i][1])

    def compute_mfcc(self, waveform):
        # Compute the MFCC of the waveform
        n_fft = 2048
        hop_length = 512
        n_mels = 128
        sr = 48000
        waveform = waveform.numpy()  # Convert PyTorch tensor to NumPy array
        mfcc = librosa.feature.mfcc(y=waveform, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, n_mfcc=20)
        mfcc = np.transpose(mfcc)
        mfcc = mfcc.astype(np.float32)  # Ensure data type is compatible with np.issubdtype()
        return mfcc
    
    def _load_data_from_cache_or_compute(self):
        if os.path.isfile(self.cache_file):
            print(f'Loading data from cache file: {self.cache_file}')
            with open(self.cache_file, 'rb') as f:
                return pickle.load(f)
        else:
            print(f'Cache file not found. Computing data from scratch and saving to cache file: {self.cache_file}')
            data = [self.__getitem__(i) for i in range(len(self))]
            with open(self.cache_file, 'wb') as f:
                pickle.dump(data, f)
            return data


In [236]:
import torch
from torch.utils.data import DataLoader
import librosa
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

max_length = 1440000

def MFCC_plot(mfcc):
        plt.figure(figsize=(10, 4))
        mfcc = mfcc.detach().numpy()
        mfcc = mfcc.mean(axis=2).T
        librosa.display.specshow(mfcc, x_axis='time')
        plt.colorbar()
        plt.title('MFCC')
        plt.tight_layout()
        plt.show()

def custom_collate(batch):
    inputs, labels = zip(*batch)
    max_frames = max([m.shape[1] for m in inputs])
    padded_mfcc = []
    for m in inputs:
        pad_width = ((0, 0), (0, max_frames - m.shape[1]))
        padded_m = np.pad(m, pad_width=pad_width, mode='constant')
        padded_mfcc.append(padded_m)

    padded_mfcc = torch.from_numpy(np.array(padded_mfcc)).float()
    labels = torch.tensor(labels)
    return padded_mfcc, labels



In [240]:
# Define training and validation datasets with specified test size
train_dataset = MaqamDataset(mode='train', test_size=0.2)
val_dataset = MaqamDataset(mode='val', test_size=0.2)

# Define training and validation data loaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate)

Loading data from cache file: maqam_dataset_cache2.pkl
Loading data from cache file: maqam_dataset_cache2.pkl


In [222]:
torch.cuda.init()
torch.cuda.empty_cache()

In [241]:
torch.cuda.init()
torch.cuda.empty_cache()
# Initialize model and define loss function and optimizer
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MaqamCNN1().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# Train the model for a specified number of epochs
num_epochs = 50

for epoch in range(num_epochs):
    for i, data in enumerate(train_loader):
        inputs, targets = data  # MFCCs and labels

        targets = targets.to(device)
        inputs = inputs[0].cuda()

        print("Inputs.shape[0] = ", inputs.shape[0])
        print("targets.shape[0] = ", targets.shape[0], "\n")
        inputs = inputs.unsqueeze(0).repeat(batch_size, 1, 1)

        optimizer.zero_grad()

        outputs = model(inputs)
        print("outuput.shape[0] = ", outputs.shape[0])
        print("targets.shape[0] = ", targets.shape[0], "\n")
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        # Validation loop
        model.eval()
        with torch.no_grad():
            val_loss = 0.0
            total_correct = 0
            total_samples = 0
            for data in val_loader:
                inputs, targets = data  # MFCCs and labels

                targets = targets.to(device)
                inputs = inputs[0].cuda()

                print("2Inputs.shape[0] = ", inputs.shape[0])
                print("targets.shape[0] = ", targets.shape[0], "\n")
                inputs = inputs.unsqueeze(0).repeat(batch_size, 1, 1)
                outputs = model(inputs)
                print("outuput.shape[0] = ", outputs.shape[0])
                print("targets.shape[0] = ", targets.shape[0], "\n")
                val_loss += criterion(outputs, targets).item() * len(targets)

                _, predicted_labels = torch.max(outputs, 1)
                total_correct += (predicted_labels == targets).sum().item()
                total_samples += len(targets)

            val_loss /= len(val_dataset)
            val_acc = float(total_correct) / total_samples

        print(f'Epoch {epoch + 1:02d}: train_loss={loss.item():.5f}, val_loss={val_loss:.5f}, val_acc={val_acc:.5f}')

# Save the trained model
torch.save(model.state_dict(), 'maqam_cnn_mfcc.pth')

In [None]:
# Test the model on new data
test_dataset = MaqamDataset(mode='test')
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate)

model.eval()
with torch.no_grad():
    total_correct = 0
    total_samples = 0
    for data in test_loader:
        inputs, targets, mfcc = data
        targets = targets.to(device)
        inputs = inputs.unsqueeze(1).unsqueeze(3).cuda()
        outputs = model(inputs)
        _, predicted_labels = torch.max(outputs, 1)
        total_correct += (predicted_labels == targets).sum().item()
        total_samples += len(targets)

    test_acc = float(total_correct) / total_samples

print(f'Test accuracy: {test_acc:.5f}')


Loading data from cache file: maqam_dataset_cache2.pkl
Test accuracy: 0.16667


: 