recording audio 

In [228]:
import soundfile as sf
import sounddevice as sd
from scipy.io.wavfile import write

# collect data of small user base, probably using siamese network
def record_audio(file_name, duration = 5, sampling_rate = 16000): # sr = samples taken per second 
    print(f"Recording audio for {duration} seconds...")
    audio = sd.rec(int(duration * sampling_rate), samplerate=sampling_rate, channels=1)
    sd.wait()  # waits until the 5 second recording is done
    sf.write(file_name, audio, sampling_rate) # saves the audio file
    print(f"Finished recording audio for {duration} seconds.")
    
record_audio("5s20.wav") # file name
    
    
    



Recording audio for 5 seconds...
Finished recording audio for 5 seconds.


clean and save audio

In [229]:
import librosa
import librosa.display
from librosa import feature 
import os

def clean_audio(filename, folder_name, duration = 5, target_sr = 16000):
    file_path = os.path.join(folder_name, filename)
    y, sr = librosa.load(filename, sr=target_sr)
    y = librosa.util.fix_length(y, size = target_sr * duration)
    sf.write(file_path, y, target_sr)
    return y

clean_audio("5s20.wav", "/Users/25yoon/PycharmProjects/final_project/audios/5")
    

array([ 3.0517578e-05, -3.0517578e-05,  4.8828125e-03, ...,
       -4.5776367e-03, -4.0893555e-03, -4.5776367e-03], dtype=float32)

In [270]:
fn = "7s20.wav"
record_audio(fn)
clean_audio(fn, "/Users/25yoon/PycharmProjects/final_project/audios/7")

Recording audio for 5 seconds...
Finished recording audio for 5 seconds.


array([-3.0517578e-05,  0.0000000e+00, -6.1035156e-05, ...,
       -9.4604492e-04, -3.9672852e-04, -3.0517578e-05], dtype=float32)

In [272]:
import numpy as np
# function that extracts the mfcc


def extract_mfcc(file_path, duration = 5, sampling_rate = 16000, n_mfcc = 40 ):
    y, sr = librosa.load(file_path, sr=sampling_rate)
    y = librosa.util.fix_length(y, size = sr * duration)
    mfcc = librosa.feature.mfcc(y=y, sr=sampling_rate, n_mfcc=n_mfcc)
    mfcc = mfcc - np.mean(mfcc, axis=1, keepdims=True) / np.std(mfcc, axis=1, keepdims=True)
    mfcc = mfcc.T
    return mfcc 

feature_sample = extract_mfcc("/Users/25yoon/PycharmProjects/final_project/test_audio_folders/test_2.wav")
print(f"shape: {feature_sample.shape}")



shape: (157, 40)


In [24]:
from sklearn.preprocessing import StandardScaler

#standardizing is very important to z_scorizing all the values to make it easier for the machine to read 
scaler_1 = StandardScaler()
def normalize(mfcc):
    norm_mfcc = scaler_1.fit_transform(mfcc)
    return norm_mfcc

# might delete later
def augment_audio(file_path, pitch_shift=2.0, output_folder="augmented_audio"):
    # Load the original audio file
    y, sr = librosa.load(file_path, sr=16000)
    
    # Apply pitch shift
    y_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_shift)
    
    # Ensure the output folder exists (create it if it doesn't exist)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Extract the filename from the original file path
    filename = os.path.basename(file_path)
    
    # Create the new file name (e.g., "user_1_01_augmented.wav")
    new_filename = f"augmented_{filename}"
    # Create the full path to save the file
    save_path = os.path.join(output_folder, new_filename)  
    # Save the augmented audio file to the specified folder
    sf.write(save_path, y_shifted, sr)  
    print(f"Saved augmented audio as {save_path}")
    return save_path


check duration safety checking


In [17]:
import wave

with wave.open('test_2.wav', 'rb') as wav_file:
    frame_rate = wav_file.getframerate()
    n_frames = wav_file.getnframes()
    duration = n_frames / frame_rate
    print(f"Duration: {duration} seconds")

Duration: 10.0 seconds


In [302]:
from sklearn.model_selection import train_test_split

# audio preprocessing, file preprocessing
X = []
y = [] 
AUDIO_DIR = "/Users/25yoon/PycharmProjects/final_project/audios"

for speaker_id, speaker_folder in enumerate(sorted(os.listdir(AUDIO_DIR))):
    speaker_path = os.path.join(AUDIO_DIR, speaker_folder)
    for audio_file in os.listdir(speaker_path):
        file_path = os.path.join(speaker_path, audio_file)
        mfcc = extract_mfcc(file_path, duration = 5, sampling_rate = 16000)
        X.append(mfcc)
        y.append(0 if speaker_id+1 < 7 else 1)
        
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        
        


In [303]:
os.makedirs('processed_data', exist_ok=True)
np.save('processed_data/X_train.npy', X_train)
np.save('processed_data/y_train.npy', y_train)
np.save('processed_data/X_test.npy', X_test)
np.save('processed_data/y_test.npy', y_test)

In [304]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
# lstms, or long short term memories holds important information using three gates of input, forget, output 
def create_rnn_model(input_shape, num_users):
    model = Sequential()
    model.add(LSTM(64, return_sequences=True, input_shape=input_shape))
    model.add(LSTM(32, return_sequences=False))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(num_users, activation='sigmoid'))  # One output neuron per user
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Model summary for input shape (80 time-steps, 40 features) for 10 users
model = create_rnn_model(input_shape=(157, 40), num_users=7)
model.summary()

In [305]:
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 106ms/step - accuracy: 0.0484 - loss: 2.2129 - val_accuracy: 0.1724 - val_loss: 1.9696
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.1289 - loss: 1.9754 - val_accuracy: 0.1724 - val_loss: 1.8343
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.1661 - loss: 1.8427 - val_accuracy: 0.1724 - val_loss: 1.7158
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.1534 - loss: 1.7144 - val_accuracy: 0.2414 - val_loss: 1.5906
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.2545 - loss: 1.5789 - val_accuracy: 0.8276 - val_loss: 1.4571
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.5116 - loss: 1.4412 - val_accuracy: 0.8276 - val_loss: 1.2993
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━

In [306]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.8276 - loss: 0.6381
Test Loss: 0.6380565166473389, Test Accuracy: 0.8275862336158752


In [325]:
def is_target_speaker(model, mfcc_features, precition=None):
    prediction = model.predict(mfcc_features)
    print(prediction)
    prob = np.mean(prediction, axis=-1)
    print(prob)
    return prob[0] > 0.4  # Returns True (1) for target speaker, False (0) otherwise

# Example MFCC extraction for new audio
fn = "test1impo.wav"
record_audio(fn)
clean_audio(fn, "/Users/25yoon/PycharmProjects/final_project/audios/7")



Recording audio for 5 seconds...
Finished recording audio for 5 seconds.


array([ 0.0000000e+00, -3.0517578e-05,  3.0517578e-05, ...,
        9.0026855e-02, -9.5520020e-03,  1.4300537e-01], dtype=float32)

In [284]:
fn = "test1impo.wav"
record_audio(fn)
clean_audio(fn, "/Users/25yoon/PycharmProjects/final_project/audios/7")

(157, 40)

In [326]:
new_mfcc = extract_mfcc('/Users/25yoon/PycharmProjects/final_project/audios/7/test1impo.wav')
new_mfcc = np.expand_dims(new_mfcc, axis=0) 
 # Extract the probability
      # If > 0.5, grant access
# Check if it matches the target speaker
result = is_target_speaker(model, new_mfcc)
if result:
    print("Access granted")
else:
    print("Access denied")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[[0.8605667  0.62293994 0.13989203 0.17855164 0.4769474  0.1813635
  0.33303502]]
[0.39904234]
Access denied


(157, 40)