In [37]:
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
from scipy.signal import butter, filtfilt, sosfiltfilt
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KernelDensity

In [45]:
# Load every .mp3 clip in the clips directory and extract 26 MFCCs per frame.
directory = './data/clips'
mfcc_list = []

# sorted() makes the row order of `all_mfccs` (and the `sr` left over from the
# final clip, which later cells reuse) independent of the OS listing order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".mp3"):
        continue  # skip anything that is not an mp3 clip
    # sr=None keeps each file's native sampling rate.
    # NOTE(review): MFCCs from clips with different native rates are not
    # directly comparable - confirm all clips share one rate, or pass a fixed sr.
    y, sr = librosa.load(os.path.join(directory, filename), sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=26)
    # Append inside the mp3 branch only, so each clip contributes exactly once.
    # Transposed so that rows are frames and columns are coefficients.
    mfcc_list.append(mfccs.T)

if not mfcc_list:
    raise ValueError(f"No .mp3 clips found in {directory!r}")

# Stack the per-clip matrices into one (total_frames, 26) array
all_mfccs = np.vstack(mfcc_list)

In [None]:
# Number of synthetic frames to draw (tune for the desired clip length)
num_frames = 1000

# Mixture model over whole MFCC frames: 8 full-covariance components
# (the component count is a starting point to tune)
gmm = GaussianMixture(n_components=8, covariance_type="full", random_state=42).fit(all_mfccs)

# sample() yields (frames, component_labels); keep the frames and flip them
# into the (features, frames) orientation
gmm_frames, gmm_labels = gmm.sample(num_frames)
random_mfccs = gmm_frames.T

In [47]:
# One independent 1-D Gaussian KDE per MFCC coefficient (column of all_mfccs)
kde_models = []
for coeff_idx in range(all_mfccs.shape[1]):
    coeff_column = all_mfccs[:, coeff_idx].reshape(-1, 1)
    kde_models.append(KernelDensity(kernel="gaussian", bandwidth=0.1).fit(coeff_column))

# Draw num_frames values from each coefficient's KDE; one row per coefficient,
# giving a (features, frames) matrix
random_mfccs = np.stack([kde.sample(num_frames).ravel() for kde in kde_models])

In [48]:
# First- and second-order deltas of the sampled MFCC track, stacked under the
# static coefficients in `generated_mfccs`. The inversion below works from
# `random_mfccs` alone; the stacked matrix is not fed into it.
delta_mfccs, delta2_mfccs = (
    librosa.feature.delta(random_mfccs, order=n) for n in (1, 2)
)
generated_mfccs = np.concatenate((random_mfccs, delta_mfccs, delta2_mfccs), axis=0)

# MFCC -> Mel spectrogram -> STFT magnitude
random_mel_spec = librosa.feature.inverse.mfcc_to_mel(random_mfccs)
random_stft = librosa.feature.inverse.mel_to_stft(random_mel_spec, sr=sr)

# Griffin-Lim turns the STFT magnitude back into a time-domain waveform
random_audio = librosa.griffinlim(random_stft)

# Band edges in Hz for the bandpass filter (simulating vocal tract effects)
lowcut, highcut = 300, 3400

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Apply a zero-phase Butterworth band-pass filter to a signal.

    The filter is designed as second-order sections and run forward and
    backward (``sosfiltfilt``). The SOS form avoids the numerical problems
    that the ``(b, a)`` polynomial form can show for higher-order band-pass
    designs.

    Parameters
    ----------
    data : array_like
        Signal to filter; filtering is applied along the last axis.
    lowcut : float
        Lower band edge, in the same units as ``fs``.
    highcut : float
        Upper band edge, in the same units as ``fs``.
    fs : float
        Sampling rate of ``data``.
    order : int, optional
        Order passed to ``scipy.signal.butter`` (default 5).

    Returns
    -------
    numpy.ndarray
        The filtered signal, same shape as ``data``.

    Raises
    ------
    ValueError
        If the band edges do not satisfy ``0 < lowcut < highcut < fs / 2``.
    """
    nyquist = 0.5 * fs
    # Fail early with a readable message instead of scipy's normalized-frequency error.
    if not 0 < lowcut < highcut < nyquist:
        raise ValueError(
            f"Band edges must satisfy 0 < lowcut < highcut < fs/2; "
            f"got lowcut={lowcut}, highcut={highcut}, fs={fs}"
        )
    # butter() expects critical frequencies normalized to the Nyquist frequency.
    low = lowcut / nyquist
    high = highcut / nyquist
    sos = butter(order, [low, high], btype="band", output="sos")
    return sosfiltfilt(sos, data)

# Band-limit the Griffin-Lim output to [lowcut, highcut] Hz
filtered_audio = butter_bandpass_filter(
    data=random_audio, lowcut=lowcut, highcut=highcut, fs=sr
)

# Save the result as a WAV file at the sampling rate used for synthesis
sf.write(file="generated_speech.wav", data=filtered_audio, samplerate=sr)
