In [None]:
import librosa
import numpy as np

def extract_voice_characteristics(audio_file, output_file, sr=22050, n_mfcc=13):
    """
    Compute the MFCC matrix of an audio recording and write it to disk.

    The recording is decoded with ``librosa.load`` using ``sr`` as the
    requested sample rate, passed through ``librosa.feature.mfcc``, and the
    resulting array is stored with ``np.save``. Nothing is returned.

    Parameters:
    audio_file (str): Location of the recording to analyse.
    output_file (str): Destination handed to ``np.save`` for the MFCC array.
    sr (int): Sample rate requested from ``librosa.load``. Default is 22050.
    n_mfcc (int): How many MFCCs to compute. Default is 13.
    """
    # Decode the recording; librosa hands back the samples and the rate used
    samples, sample_rate = librosa.load(audio_file, sr=sr)

    # Compute the coefficients and persist them in a single step
    np.save(
        output_file,
        librosa.feature.mfcc(y=samples, sr=sample_rate, n_mfcc=n_mfcc),
    )

# Example usage: write the MFCCs of the sample recording to a .npy file
audio_file = 'inputs/Rec.wav'
output_file = 'voice_characteristics.npy'
extract_voice_characteristics(
    audio_file=audio_file,
    output_file=output_file,
)



In [None]:
import librosa
import numpy as np
import torch

def extract_voice_characteristics(audio_file, output_file, sr=22050, n_mfcc=13):
    """
    Compute the MFCCs of an audio file, store them as .npy and return them.

    Parameters:
    audio_file (str): Path of the recording to read.
    output_file (str): Path passed to ``np.save`` for the MFCC array.
    sr (int): Sample rate requested when loading the audio. Default is 22050.
    n_mfcc (int): Number of MFCCs to compute. Default is 13.

    Returns:
    The array produced by ``librosa.feature.mfcc`` (the same object that
    was written to ``output_file``).
    """
    # Read the recording at the requested rate
    signal, rate = librosa.load(audio_file, sr=sr)

    # Derive the coefficient matrix from the decoded signal
    coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)

    # Persist to disk, then hand the matrix back so callers can inspect it
    np.save(output_file, coefficients)
    return coefficients

# Example usage: compute the MFCCs for the sample recording and keep the array
audio_file = 'inputs/Rec.wav'
output_file = 'voice_characteristics2.npy'
mfccs = extract_voice_characteristics(
    audio_file=audio_file,
    output_file=output_file,
)

# Report the dimensions of the returned MFCC array
print("MFCCs shape:", mfccs.shape)

# Wrap the MFCC array in a PyTorch tensor for inspection
speaker_embeddings = torch.tensor(mfccs)

# Report the dimensions of the resulting tensor
print("Tensor shape:", speaker_embeddings.shape)


In [None]:
import numpy as np
import torch
import torchaudio
import torch.hub

# Load the pre-trained VGGish audio-embedding model.
# 'torchvggish-master' is not in the 'owner/repo' form that torch.hub's default
# GitHub source requires, so it has to be loaded as a local directory.
# NOTE(review): assumes ./torchvggish-master is an unpacked checkout of the
# torchvggish repository (containing hubconf.py) — confirm.
model = torch.hub.load('torchvggish-master', 'vggish', source='local')
model.eval()  # inference only

# Load the audio file
audio_file = 'inputs/Rec.wav'  # Replace with your audio file path
waveform, sample_rate = torchaudio.load(audio_file)

# Preprocess the audio
waveform = waveform.mean(dim=0, keepdim=True)  # If stereo, average channels

# Guarantee at least one second of audio. The length is measured at the file's
# own sample rate; a fixed 16000 samples is only one second for 16 kHz input.
min_samples = sample_rate
if waveform.size(1) < min_samples:
    waveform = torch.nn.functional.pad(waveform, (0, min_samples - waveform.size(1)), "constant", 0)

# Extract embeddings.
# NOTE(review): the public torchvggish forward() pre-processing accepts either
# a NumPy waveform together with its sample rate (`fs`) or a file path, and
# rejects torch tensors — verify against the local checkout.
with torch.no_grad():
    embeddings = model(waveform.squeeze(0).numpy(), fs=sample_rate)

# Save the embeddings as a real .npy file. torch.save would write a PyTorch
# pickle archive under the .npy name, which np.load cannot read.
output_file = 'speaker_embeddings3.npy'
np.save(output_file, embeddings.detach().cpu().numpy())

print(f"Speaker embeddings saved to {output_file} with shape {embeddings.shape}")


In [19]:
import librosa
import numpy as np
import torch

def extract_voice_characteristics(audio_file, output_file, sr=22050, n_mfcc=13, target_dim=512):
    """
    Extracts MFCC features from an audio file, fits them to a fixed-length
    vector of shape (1, target_dim) and saves that vector as a .npy file.

    The MFCC matrix is flattened frame by frame (all coefficients of the
    first frame, then the second, ...). The flattened vector is truncated
    when it is longer than ``target_dim`` and zero-padded at the end when it
    is shorter, so the saved array always has shape ``(1, target_dim)``.

    NOTE(review): this only makes the *shape* match a (1, 512) embedding
    slot; raw MFCCs are not a learned speaker embedding — confirm that this
    is what the downstream consumer expects.

    Parameters:
    audio_file (str): Path to the audio file.
    output_file (str): Path to save the .npy file.
    sr (int): Sample rate for loading the audio file. Default is 22050.
    n_mfcc (int): Number of MFCCs to extract. Default is 13.
    target_dim (int): Length of the saved feature vector. Default is 512.

    Raises:
    ValueError: If ``target_dim`` is not positive.
    """
    if target_dim <= 0:
        raise ValueError("target_dim must be a positive integer")

    # Load the audio file
    y, sr = librosa.load(audio_file, sr=sr)

    # Extract MFCC features; librosa documents the result as (n_mfcc, frames)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

    # Flatten frame-major so that truncation drops whole trailing frames
    # instead of slicing the coefficient axis (which `[:512]` on the first
    # axis of an (n_mfcc, frames) matrix would do).
    flat = np.asarray(mfccs).T.reshape(-1)

    # Truncate or zero-pad to exactly target_dim values, keeping the dtype
    fixed = np.zeros((1, target_dim), dtype=flat.dtype)
    kept = min(flat.size, target_dim)
    fixed[0, :kept] = flat[:kept]

    # Save the fixed-length features to a .npy file
    np.save(output_file, fixed)

# Example usage: process the sample recording and store the result on disk
audio_file = 'inputs/Rec.wav'
output_file = 'voice_characteristics3.npy'
extract_voice_characteristics(
    audio_file=audio_file,
    output_file=output_file,
)
