# Speech Feature Engineering




In [1]:
# pip install librosa
# pip install my-voice-analysis

Some install/package related issues:
- For pydub, I was getting a "No such file or directory: 'ffprobe'" error, so I installed ffmpeg with `conda install -c conda-forge ffmpeg` (rather than pip).
- For my-voice-analysis, I was getting "Try again the sound of the audio was not clear". I ended up copying the code from that package's repo and modifying all the `sourcerun` file paths, and now it works.


In [2]:
import os
from pydub import AudioSegment
import librosa
mysp=__import__("my-voice-analysis")

In [3]:
# Directories holding the source .m4a recordings and the converted .wav files.
# Both keep a trailing '/' because later cells build file paths from them by
# plain string concatenation.
m4a_path = "./data/m4a_files/"
wav_path = "./data/wav_files/"

In [4]:
def convert_m4a_to_wav(file_name, input_dir, output_dir,
                       sample_rate=44100, sample_width=2):
    '''
    Convert an .m4a audio file to a .wav audio file using PyDub.

    The audio is resampled to `sample_rate` and converted to `sample_width`
    bytes per sample before it is exported.

    Inputs:
        file_name (str): name of file (without extension)
        input_dir (str): directory path for input .m4a file
        output_dir (str): directory path for output .wav file
        sample_rate (int): target frame rate in Hz (default 44100)
        sample_width (int): target bytes per sample (default 2, i.e. 16-bit)

    '''

    # os.path.join gives a correct path whether or not the directory
    # argument ends with a path separator.
    source_file = os.path.join(input_dir, file_name + '.m4a')
    target_file = os.path.join(output_dir, file_name + '.wav')

    # Load the m4a file
    audio = AudioSegment.from_file(source_file, format='m4a')

    # Resample first, then set the bit depth
    converted_audio = audio.set_frame_rate(sample_rate).set_sample_width(sample_width)

    # Export the converted audio to a new WAV file
    converted_audio.export(target_file, format='wav')
    



    


In [5]:
# Create the output directory for wav files. exist_ok=True makes this a no-op
# when the directory is already there, so the cell is safe to re-run.
os.makedirs(wav_path, exist_ok=True)


In [6]:
# Base names (without extension) of the .m4a recordings found in m4a_path.
# Only entries that actually end in '.m4a' are kept, so hidden entries
# (e.g. ipynb checkpoints) and stray non-audio files are ignored, and only the
# final extension is removed from each name.
# Sorting makes the order reproducible, since os.listdir order is arbitrary.
file_list = sorted(
    os.path.splitext(entry)[0]
    for entry in os.listdir(m4a_path)
    if entry.endswith('.m4a') and not entry.startswith('.')
)

In [7]:
# Trial subset: keep only the first two recordings
file_list_temp = file_list[:2]

In [8]:
# Convert the selected subset of audio files (file_list_temp) to wav format
for file in file_list_temp:
    convert_m4a_to_wav(file, m4a_path, wav_path)


In [9]:
# Base name (no extension) of the recording analysed in the cells below
file_name = '348th_11.4.21'
# Path of the matching converted .wav file
wav_file = f'{wav_path}{file_name}.wav'

## Librosa Library

In [17]:
# Load the WAV file using librosa.
# NOTE(review): no `sr` argument is passed, so librosa's default target rate
# applies (presumably 22050 Hz) rather than the 44100 Hz written by the
# conversion step — confirm this resampling is intended, or pass sr=None.
y, sr = librosa.load(wav_file)

# Extract pitch, loudness, and spectral centroid.
# `pitch` holds the tuple returned by piptrack (two arrays, as the printed
# output shows), not a single pitch track.
pitch = librosa.pitch.piptrack(y=y, sr=sr)
# Loudness here is the RMS feature converted with amplitude_to_db
loudness = librosa.amplitude_to_db(librosa.feature.rms(y=y))
spec_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)

# Print the features
print("Pitch:", pitch)
print("Loudness:", loudness)
print("Spectral centroid:", spec_centroid)

Pitch: (array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32))
Loudness: [[-48.007576 -39.336563 -32.142986 ... -22.90003  -24.209024 -26.71492 ]]
Spectral centroid: [[1854.40866307  945.04239462  713.05587732 ... 1150.47042849
  1063.63073103  891.03299265]]


## my-voice-analysis Library

In [11]:
# wav_path[:-1] drops the last character of wav_path (its trailing '/')
# before it is passed as the directory argument.
summary_dataset = mysp.mysptotal(file_name, wav_path[:-1])

In [12]:
# Display the summary transposed, so each measure is on its own row
summary_dataset.T

Unnamed: 0,0
number_ of_syllables,736.0
number_of_pauses,203.0
rate_of_speech,2.0
articulation_rate,5.0
speaking_duration,150.8
original_duration,309.3
balance,0.5
f0_mean,224.76
f0_std,62.28
f0_median,222.7


In [13]:
# Gender recognition and mood of speech.
# As the displayed result shows, this is a tuple whose first element is the
# descriptive string; wav_path[:-1] again drops the trailing '/'.
gender_mood = mysp.myspgend(file_name, wav_path[:-1])
gender_mood

('a female, mood of speech: Reading, p-value/sample size= :0.00', 5)

In [14]:
def extract_mood(gender_mood_string):
    '''
    Extract the mood of speech from the gender-and-mood string produced by the
    my-voice-analysis package.

    For example, from the string:
    'a female, mood of speech: Reading, p-value/sample size= :0.00'
    this returns "Reading".

    Inputs:
        gender_mood_string (str): text in which the mood sits between the
            first ':' and the next ','

    Returns:
        str: the mood with surrounding whitespace removed. If no ',' follows
        the first ':', everything after the ':' is returned. If there is no
        ':' at all, an empty string is returned.

    '''

    # Split at the first colon; `colon` is '' when the string has none
    _, colon, after_colon = gender_mood_string.partition(':')
    if not colon:
        return ''

    # Keep the text up to the next comma (or to the end if there is none).
    # strip() copes with any amount of spacing around the mood.
    mood, _, _ = after_colon.partition(',')

    return mood.strip()
    

In [16]:
# Pass only the descriptive string (first element of the tuple) to the parser
mood = extract_mood(gender_mood[0])
mood

'Reading'