### model training

In [17]:
### model training

import os
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier  # Add this import
from sklearn.decomposition import PCA

# Replace these with your actual folder paths
devesh_folder = "./devesh"
rakshit_folder = "./rakshit"
others_folder = "./not_devesh"

def load_audio_files(folder_path, label):
    embeddings = []
    labels = []
    encoder = VoiceEncoder()
    
    files_count = 0
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.mp3') or file_name.endswith('.wav'):
            files_count += 1
            file_path = os.path.join(folder_path, file_name)
            wav = preprocess_wav(file_path)
            embed = encoder.embed_utterance(wav)
            embeddings.append(embed)
            labels.append(label)
    
    print(f"Loaded {files_count} files from {folder_path}")
    return embeddings, labels

# Load all three classes of audio files
devesh_embeddings, devesh_labels = load_audio_files(devesh_folder, label=0)
rakshit_embeddings, rakshit_labels = load_audio_files(rakshit_folder, label=1)
others_embeddings, others_labels = load_audio_files(others_folder, label=2)

# Combine all data
X = np.vstack((devesh_embeddings, rakshit_embeddings, others_embeddings))
y = np.hstack((devesh_labels, rakshit_labels, others_labels))
n_components = 8  # You can adjust this number
pca = PCA(n_components=n_components)

# Fit and transform the data
X_reduced = pca.fit_transform(X)
X=X_reduced
# Train the classifier
# Add validation checks before training
print(f"Total samples: {len(X)}")
print(f"Number of Devesh samples: {sum(y == 0)}")
print(f"Number of Rakshit samples: {sum(y == 1)}")
print(f"Number of Other samples: {sum(y == 2)}")

if len(X) < 30:  # Increased threshold for 3 classes
    print("Warning: Very small dataset. Results may not be reliable.")

if sum(y == 0) < 5 or sum(y == 1) < 5 or sum(y == 2) < 5:  # Check all three classes
    print("Warning: One or more classes have very few samples.")

# Split data into train/test sets first
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train the classifier
clf = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)
clf.fit(X_train, y_train)

# Evaluate on test set
test_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, test_pred)
print("\nTest set evaluation:")
print(f"Test accuracy: {test_accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, test_pred, target_names=['Devesh', 'Rakshit', 'Others']))

# Perform cross-validation
cv_scores = cross_val_score(clf, X, y, cv=5)
print("\nCross-validation results:")
print(f"CV scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")

# Check for perfect separation
if cv_scores.mean() > 0.99:
    print("\nWarning: Near-perfect accuracy detected. This might indicate:")
    print("1. Data leakage")
    print("2. Duplicate or very similar samples across folders")
    print("3. Insufficient data diversity")
    print("4. Insufficient inter-class variation")
    print("Please verify your dataset and data splitting process.")

### model saving

import joblib

model_path = "./speaker_classifier_model.pkl"
joblib.dump(clf, model_path)
print(f"Model saved to {model_path}")

### model inference

import os
from resemblyzer import VoiceEncoder, preprocess_wav
import numpy as np
import joblib

# Load the trained model
model_path = "./speaker_classifier_model.pkl"
clf = joblib.load(model_path)
print(f"Model loaded from {model_path}")

def predict_speaker(audio_file_path):
    """
    Predict if the speaker in the audio file is Devesh (0), Rakshit (1), or Other (2).

    Parameters:
        audio_file_path (str): Path to the audio file to predict.
    
    Returns:
        int: 0 if Devesh, 1 if Rakshit, 2 if Other
    """
    # Load and preprocess the audio
    encoder = VoiceEncoder()
    wav = preprocess_wav(audio_file_path)
    embed = encoder.embed_utterance(wav)
    
    # Reshape the embedding to match the model's expected input shape
    embed = embed.reshape(1, -1)
    
    # Make the prediction
    prediction = clf.predict(embed)[0]
    return prediction

# Example usage
audio_file_path = "test.mp3"  # Replace with the actual test audio file path

if os.path.exists(audio_file_path):
    prediction = predict_speaker(audio_file_path)
    speaker = "Devesh" if prediction == 0 else "Rakshit" if prediction == 1 else "Other"
    print(f"The speaker in the audio file is: {speaker}")
else:
    print(f"Audio file not found: {audio_file_path}")

  checkpoint = torch.load(weights_fpath, map_location="cpu")


Loaded the voice encoder model on cpu in 0.01 seconds.
Loaded 27 files from ./devesh
Loaded the voice encoder model on cpu in 0.00 seconds.
Loaded 26 files from ./rakshit
Loaded the voice encoder model on cpu in 0.00 seconds.
Loaded 11 files from ./not_devesh
Total samples: 64
Number of Devesh samples: 27
Number of Rakshit samples: 26
Number of Other samples: 11

Test set evaluation:
Test accuracy: 0.85

Classification Report:
              precision    recall  f1-score   support

      Devesh       0.75      1.00      0.86         6
     Rakshit       1.00      0.60      0.75         5
      Others       1.00      1.00      1.00         2

    accuracy                           0.85        13
   macro avg       0.92      0.87      0.87        13
weighted avg       0.88      0.85      0.84        13


Cross-validation results:
CV scores: [1.         0.92307692 0.84615385 1.         1.        ]
Mean CV accuracy: 0.95 (+/- 0.12)
Model saved to ./speaker_classifier_model.pkl
Model loaded 

In [19]:
# Add this after creating embeddings but before training
print("\nEmbedding dimensions:")
print(f"Single embedding shape: {devesh_embeddings[0].shape}")
print(f"Full X matrix shape: {X.shape}")
print(f"Full y vector shape: {y.shape}")

# Optional: Check for any NaN values
print("\nData validation:")
print(f"Any NaN values in X: {np.isnan(X).any()}")
print(f"Any infinite values in X: {np.isinf(X).any()}")


Embedding dimensions:
Single embedding shape: (256,)
Full X matrix shape: (64, 8)
Full y vector shape: (64,)

Data validation:
Any NaN values in X: False
Any infinite values in X: False


### XGBOOST

### model saving

In [18]:
# Save the trained model
import joblib  # Add this line to import joblib

model_path = "./speaker_classifier_model.pkl"
joblib.dump(clf, model_path)
print(f"Model saved to {model_path}")


Model saved to ./speaker_classifier_model.pkl


### model inference

In [7]:
import os
from resemblyzer import VoiceEncoder, preprocess_wav
import numpy as np
import joblib

# Load the trained model
model_path = "./speaker_classifier_model.pkl"
clf = joblib.load(model_path)
print(f"Model loaded from {model_path}")

# Function to load and process a single audio file for inference
def predict_speaker(audio_file_path):
    """
    Predict if the speaker in the audio file is Devesh (1) or not (0).

    Parameters:
        audio_file_path (str): Path to the audio file to predict.
    
    Returns:
        int: 1 if the speaker is Devesh, 0 otherwise.
    """
    # Load and preprocess the audio
    encoder = VoiceEncoder()
    wav = preprocess_wav(audio_file_path)
    embed = encoder.embed_utterance(wav)
    
    # Reshape the embedding to match the model's expected input shape
    embed = embed.reshape(1, -1)
    
    # Make the prediction
    prediction = clf.predict(embed)[0]
    return prediction

# Example usage
audio_file_path = "devesh_test.mp3"  # Replace with the actual test audio file path

if os.path.exists(audio_file_path):
    prediction = predict_speaker(audio_file_path)
    speaker = "Devesh" if prediction == 1 else "Not Devesh"
    print(f"The speaker in the audio file is: {speaker}")
else:
    print(f"Audio file not found: {audio_file_path}")


Model loaded from ./speaker_classifier_model.pkl
Loaded the voice encoder model on cpu in 0.01 seconds.
The speaker in the audio file is: Devesh


  checkpoint = torch.load(weights_fpath, map_location="cpu")
