In [1]:
import numpy as np
# import pandas as pd
import librosa
# import tensorflow as tf
# from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os
import torch
import torchaudio
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # For CUDA
    torch.cuda.manual_seed_all(seed)  # For multi-GPU
    np.random.seed(seed)

    # Ensures deterministic behavior (optional, can slow things down)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
set_seed(42)

In [4]:
data_file= "data/processed3/50_speakers_audio_data"

In [5]:
import os
import numpy as np
import librosa
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ProcessPoolExecutor

def process_file(file_path, label, max_pad_len, mfcc_window_len, n_mfcc):
    audio, sr = librosa.load(file_path, sr=None)

    # Extract MFCC features
    org_mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    delta_mfcc = librosa.feature.delta(org_mfcc)
    delta2_mfcc = librosa.feature.delta(org_mfcc, order=2)
    mfcc = np.concatenate((org_mfcc, delta_mfcc, delta2_mfcc), axis=0)

    scaler = StandardScaler()
    mfcc = scaler.fit_transform(mfcc.T)

    # Padding or truncating the MFCC feature array
    if mfcc.shape[0] < max_pad_len:
        pad_width = max_pad_len - mfcc.shape[0]
        mfcc = np.pad(mfcc, pad_width=((0, pad_width), (0, 0)), mode='constant')
    else:
        mfcc = mfcc[:max_pad_len, :]

    # Slice the MFCC into windows of window_len
    num_windows = mfcc.shape[0] // mfcc_window_len
    windows = []
    speaker_id = int(label[-2:])  # Extract speaker ID from the label

    for i in range(num_windows):
        start = i * mfcc_window_len
        end = start + mfcc_window_len
        mfcc_window = mfcc[start:end, :]
        windows.append((mfcc_window, speaker_id))
    
    return windows

def extract_mfcc_parallel(parent_dir, sub_folders, n_mfcc=13, max_pad_len=129, mfcc_window_len=43):
    x = []
    y = []

    tasks = []
    with ProcessPoolExecutor() as executor:
        for label, folder in enumerate(sub_folders):
            folder_path = os.path.join(parent_dir, folder)

            # Loop through each audio file in the speaker's folder
            for file_name in os.listdir(folder_path):
                if file_name.endswith('.wav'):  # Only process .wav files
                    file_path = os.path.join(folder_path, file_name)
                    tasks.append(
                        executor.submit(
                            process_file, file_path, folder, max_pad_len, mfcc_window_len, n_mfcc
                        )
                    )
        
        for future in tasks:
            result = future.result()
            for mfcc_window, speaker_id in result:
                x.append(mfcc_window)
                y.append(speaker_id)

    x = np.array(x)
    y = np.array(y)
    return x, y

In [6]:
no_speakers_file=50

def speakers_list(no_speakers_file ,data_file ):
    speaker_l = []

    # Get all subfolders in the data_file directory
    subfolders = [f.name for f in os.scandir(data_file) if f.is_dir()]

    # Check if the requested number of speakers is available
    if no_speakers_file > len(subfolders):
        raise ValueError(f"Requested {no_speakers_file} speakers, but only {len(subfolders)} available.")

    # Select the first 'no_speakers_file' subfolders
    speaker_l = subfolders[:no_speakers_file]

    return speaker_l

speaker_list = speakers_list(no_speakers_file,data_file )


In [7]:
x,y= extract_mfcc_parallel(data_file,speaker_list)

In [13]:
x.shape

(146316, 43, 39)

In [14]:
np.save("data/features/x_1_3.npy",x)

In [None]:
np.save("data/features/y_1_3.npy",y)

: 

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print("Training Data Shape:", x_train.shape)
print("Test Data Shape:", x_test.shape)

Training Data Shape: (117052, 43, 39)
Test Data Shape: (29264, 43, 39)


In [9]:
input_shape= (43,39)

In [10]:
from sklearn.preprocessing import LabelEncoder

# Initialize encoder and fit on full set of labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)  # Assuming you're predicting on y_test


In [11]:

# Flatten the data: (n_samples, 32, 13) to (n_samples, 32*13)
n_samples_train = x_train.shape[0]
x_train_flat = x_train.reshape(n_samples_train, -1)

n_samples_test = x_test.shape[0]
x_test_flat = x_test.reshape(n_samples_test, -1)


In [12]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Define the parameter grid
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 7, 10, 12],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3, 0.5],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [0.1, 1, 5, 10],
    'min_child_weight': [1, 3, 5, 10],
}

# Instantiate the classifier
xgb_clf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    n_iter=50,  # Number of combinations to try
    scoring='accuracy',
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Run the hyperparameter search
random_search.fit(x_train_flat, y_train_encoded)

# Best estimator and evaluation
best_model = random_search.best_estimator_

print("Best Parameters:\n", random_search.best_params_)

# Predict and evaluate
y_pred_xgb = best_model.predict(x_test_flat)
print("XGBoost Test Accuracy: {:.2f}%".format(accuracy_score(y_test_encoded, y_pred_xgb) * 100))
print("Classification Report:\n", classification_report(y_test_encoded, y_pred_xgb))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


Exception ignored on calling ctypes callback function <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x74b6d44ac590>>:
Traceback (most recent call last):
  File "/data/anaconda3/envs/speech/lib/python3.13/site-packages/xgboost/core.py", line 585, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument
KeyboardInterrupt: 
Exception ignored on calling ctypes callback function <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x738fdf50c590>>:
Traceback (most recent call last):
  File "/data/anaconda3/envs/speech/lib/python3.13/site-packages/xgboost/core.py", line 585, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument
KeyboardInterrupt: 
Exception ignored on calling ctypes callback function <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x7d2c5ef0c590>>:
Traceback (most recent c

[CV] END colsample_bytree=0.8, gamma=0.3, learning_rate=0.05, max_depth=10, min_child_weight=10, n_estimators=100, reg_alpha=1, reg_lambda=0.1, subsample=1.0; total time= 1.6min
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0.01, reg_lambda=0.1, subsample=1.0; total time= 1.6min
[CV] END colsample_bytree=0.8, gamma=0.5, learning_rate=0.2, max_depth=5, min_child_weight=1, n_estimators=500, reg_alpha=0.1, reg_lambda=10, subsample=1.0; total time= 1.6min
[CV] END colsample_bytree=0.8, gamma=0.3, learning_rate=0.05, max_depth=5, min_child_weight=1, n_estimators=200, reg_alpha=0.01, reg_lambda=10, subsample=1.0; total time= 1.6min


Exception ignored on calling ctypes callback function <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x70e42f510590>>:
Traceback (most recent call last):
  File "/data/anaconda3/envs/speech/lib/python3.13/site-packages/xgboost/core.py", line 585, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument
KeyboardInterrupt: 
Exception ignored on calling ctypes callback function <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x7c7cb7a28590>>:
Traceback (most recent call last):
  File "/data/anaconda3/envs/speech/lib/python3.13/site-packages/xgboost/core.py", line 585, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument
KeyboardInterrupt: 


[CV] END colsample_bytree=0.8, gamma=0.5, learning_rate=0.2, max_depth=5, min_child_weight=1, n_estimators=500, reg_alpha=0.1, reg_lambda=10, subsample=1.0; total time= 1.6min
[CV] END colsample_bytree=0.8, gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, reg_alpha=0.01, reg_lambda=0.1, subsample=1.0; total time= 1.6min


KeyboardInterrupt: 