In [1]:
import numpy as np
# import pandas as pd
import librosa
# import tensorflow as tf
# from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
import torch
import torchaudio
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:

def set_seed(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in ``random`` module, NumPy's legacy global RNG and
    PyTorch (CPU and all CUDA devices), then switches cuDNN to deterministic
    mode so repeated runs with the same seed behave the same way.

    Args:
        seed: Integer seed passed unchanged to each library.
    """
    # Local import: the notebook's import cell does not bring in `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every CUDA device; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)

    # Deterministic cuDNN kernels (optional, can slow things down).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
# Fix all RNG seeds up front so the rest of the notebook is repeatable.
set_seed(seed=42)

In [4]:
# Root directory that holds one sub-folder of .wav files per speaker.
data_file = "data/processed3/50_speakers_audio_data"

In [5]:
def extract_mfcc(parent_dir, sub_folders, n_mfcc=13, max_pad_len=129, mfcc_window_len=43):
    """Extract fixed-size, standardised MFCC windows for each speaker folder.

    Every ``.wav`` file found directly inside ``parent_dir/<folder>`` is loaded
    at its native sample rate, converted to MFCCs plus first- and second-order
    deltas, standardised per file, zero-padded or truncated to ``max_pad_len``
    frames and cut into non-overlapping windows of ``mfcc_window_len`` frames.

    Files are visited in sorted name order so that the order of the returned
    samples does not depend on the file system.

    Args:
        parent_dir: Directory containing one sub-directory per speaker.
        sub_folders: Names of the speaker sub-directories to process. The last
            two characters of each name are parsed as the integer speaker id.
        n_mfcc: Number of MFCC coefficients per frame (before the deltas are
            appended, which triples the feature count).
        max_pad_len: Number of frames each file is padded or truncated to.
        mfcc_window_len: Number of frames in each output window.

    Returns:
        Tuple ``(x, y)`` of NumPy arrays. ``x`` stacks the windows, each of
        shape ``(mfcc_window_len, 3 * n_mfcc)``; ``y`` holds the speaker id of
        each window. Both arrays are empty when no window was produced.

    Raises:
        ValueError: If a folder that yields windows has a name whose last two
            characters cannot be parsed as an integer.
    """
    x = []
    y = []

    for folder in sub_folders:
        folder_path = os.path.join(parent_dir, folder)

        # os.listdir returns entries in arbitrary order; sort them so the
        # sample order (and any seeded split made later) is reproducible.
        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.endswith('.wav'):  # Only process .wav files
                continue

            file_path = os.path.join(folder_path, file_name)
            audio, sr = librosa.load(file_path, sr=None)

            # MFCCs plus delta and delta-delta, stacked along the feature axis.
            org_mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
            delta_mfcc = librosa.feature.delta(org_mfcc)
            delta2_mfcc = librosa.feature.delta(org_mfcc, order=2)
            mfcc = np.concatenate((org_mfcc, delta_mfcc, delta2_mfcc), axis=0)

            # Transpose to (frames, features) and standardise each feature
            # over the frames of this file only.
            mfcc = StandardScaler().fit_transform(mfcc.T)

            # Pad with zeros (the post-scaling mean) or truncate along time.
            n_frames = mfcc.shape[0]
            if n_frames < max_pad_len:
                mfcc = np.pad(
                    mfcc,
                    pad_width=((0, max_pad_len - n_frames), (0, 0)),
                    mode='constant',
                )
            else:
                mfcc = mfcc[:max_pad_len, :]

            # Cut into non-overlapping windows; trailing frames that do not
            # fill a whole window are dropped.
            num_windows = mfcc.shape[0] // mfcc_window_len
            for i in range(num_windows):
                start = i * mfcc_window_len
                x.append(mfcc[start:start + mfcc_window_len, :])
                # NOTE(review): only the last two characters are used, so ids
                # would collide with 100+ speakers — confirm folder naming.
                y.append(int(folder[-2:]))

    return np.array(x), np.array(y)


In [6]:
# How many speaker folders to include in the experiment.
no_speakers_file = 50

def speakers_list(no_speakers_file, data_file):
    """Return the names of the first ``no_speakers_file`` speaker folders.

    Sub-directories of ``data_file`` are listed and sorted by name, so the
    selection is the same on every machine regardless of the order in which
    the file system happens to return directory entries.

    Args:
        no_speakers_file: Number of speaker folders to select.
        data_file: Directory whose immediate sub-directories are the speakers.

    Returns:
        List of sub-directory names (not full paths), sorted by name.

    Raises:
        ValueError: If ``no_speakers_file`` is negative or larger than the
            number of sub-directories available.
    """
    if no_speakers_file < 0:
        raise ValueError(f"Number of speakers must be non-negative, got {no_speakers_file}.")

    # The context manager closes the scandir iterator promptly; sorting makes
    # "the first N" well defined because scandir yields entries in arbitrary order.
    with os.scandir(data_file) as entries:
        subfolders = sorted(entry.name for entry in entries if entry.is_dir())

    # Check if the requested number of speakers is available
    if no_speakers_file > len(subfolders):
        raise ValueError(f"Requested {no_speakers_file} speakers, but only {len(subfolders)} available.")

    return subfolders[:no_speakers_file]

# Names of the speaker folders that will be fed to feature extraction.
speaker_list = speakers_list(no_speakers_file=no_speakers_file, data_file=data_file)


In [None]:
# Build the MFCC windows (x) and speaker ids (y) for every selected speaker.
# This reads and processes every .wav file, so it can take a long time.
x, y = extract_mfcc(parent_dir=data_file, sub_folders=speaker_list)

KeyboardInterrupt: 

: 

In [None]:
# Hold out 20% of the windows for testing. stratify=y keeps each speaker's
# share of windows the same in the training and test sets.
# NOTE(review): the split is made over individual windows, so windows cut from
# the same recording can end up in both sets — consider a per-file (grouped)
# split if a leakage-free evaluation is required.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)
print("Training Data Shape:", x_train.shape)
print("Test Data Shape:", x_test.shape)

Training Data Shape: (117052, 43, 39)
Test Data Shape: (29264, 43, 39)


In [None]:
# Shape of one sample: 43 frames per window x 39 features
# (13 MFCCs plus their delta and delta-delta coefficients).
input_shape = (43, 39)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the labels of both splits so that encoding the test
# labels cannot fail when a speaker happens to be missing from y_train.
# The resulting codes are identical whenever y_train already holds every speaker.
label_encoder = LabelEncoder()
label_encoder.fit(np.concatenate((y_train, y_test)))
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


In [None]:

# Collapse each (frames, features) window into one feature vector:
# (n_samples, 43, 39) -> (n_samples, 43 * 39).
n_samples_train = len(x_train)
x_train_flat = x_train.reshape((n_samples_train, -1))

n_samples_test = len(x_test)
x_test_flat = x_test.reshape((n_samples_test, -1))
