In [7]:
import importlib
import extraction_functions_praat
importlib.reload(extraction_functions_praat)

<module 'extraction_functions_praat' from '/home/dene/rp2/extraction_functions_praat.py'>

In [None]:
from pathlib import Path
import pandas as pd
from extraction_functions_praat import (
    PP_f0_mean, PP_f0_median, PP_f0_sd,
    PP_f0_mean_murton, PP_f0_median_murton, PP_f0_sd_murton, 
    PP_jitter,
    PP_lh_ratio,
    PP_cpp_mean_murton, PP_cpp_median_murton, PP_cpp_sd_murton,
    PP_max_phonation,
    PP_harmonics_to_noise,
)

In [2]:
audio_dir = Path('audio_files_pre/')
features_dir = Path('extracted_features/')
segments_dir = Path('audio_files_segments/')
best_segments_dir = Path('audio_files_segments_best/')

In [None]:
AVAILABLE_FEATURES = [
    'PP_F0', 'PP_F02', 'PP_F0_SD', 
    'PP_F0_M', 'PP_F02_M', 'PP_F0_SD_M', 
    'PP_JIT',
    'PP_LHR',
    'PP_CPP_M', 'PP_CPP_M2', 'PP_CPP_SD_M',
    'PP_MAX_PH',
    'PP_HNR',
] ## add more

selected_features_dict = {
    'PP_F0': True, 'PP_F02':True, 'PP_F0_SD': True, 
    'PP_F0_M': True, 'PP_F02_M':True, 'PP_F0_SD_M': True, 
    'PP_JIT': True,
    'PP_LHR': True,
    'PP_CPP_M': True, 'PP_CPP_M2': True, 'PP_CPP_SD_M':True,
    'PP_MAX_PH': True,
    'PP_HNR': True,
} ## enable the ones you want

In [None]:
def extract_features_vowels(audio_file, selected_features, f0_min, f0_max, jitter_type):
    extracted_features = {}
    
    if 'PP_F0' in selected_features:
        extracted_features['PP_F0'] = PP_f0_mean(audio_file, f0_min=f0_min, f0_max=f0_max)
    elif 'PP_F0_M' in selected_features:
        extracted_features['PP_F0_M'] = PP_f0_mean_murton(audio_file, f0_min=f0_min, f0_max=f0_max)
    elif 'PP_F0_SD' in selected_features:
        extracted_features['PP_F0_SD'] = PP_f0_sd(audio_file, f0_min=f0_min, f0_max=f0_max)
    elif 'PP_F0_SD_M' in selected_features:
        extracted_features['PP_F0_SD_M'] = PP_f0_sd_murton(audio_file, f0_min=f0_min, f0_max=f0_max)
    elif 'PP_JIT' in selected_features:
        jitter_values = PP_jitter(audio_file, f0_min=f0_min, f0_max=f0_max, type=jitter_type)
        if jitter_type == 'all':
            extracted_features['PP_JIT_local'] = jitter_values[0]
            extracted_features['PP_JIT_abs'] = jitter_values[1]
            extracted_features['PP_JIT_rap'] = jitter_values[2]
            extracted_features['PP_JIT_ppq5'] = jitter_values[3]
            extracted_features['PP_JIT_ddp'] = jitter_values[4]
        else:
            extracted_features[f'PP_JIT_{jitter_type}'] = jitter_values
    elif 'PP_LHR' in selected_features:
        extracted_features['PP_LHR'] = PP_lh_ratio(audio_file)
    elif 'PP_CPP_M' in selected_features:
        extracted_features['PP_CPP_M'] = PP_cpp_mean_murton(audio_file)
    elif 'PP_CPP_M2' in selected_features:
        extracted_features['PP_CPP_M2'] = PP_cpp_median_murton(audio_file)
    elif 'PP_CPP_SD_M' in selected_features:
        extracted_features['PP_CPP_SD_M'] = PP_cpp_sd_murton(audio_file)
    elif 'PP_HNR' in selected_features:
        extracted_features['PP_HNR'] = PP_harmonics_to_noise(audio_file)
            
    return extracted_features


def extract_features_mpt(audio_file, selected_features, silence_threshold):
    extracted_features = {}
    
    if 'PP_MAX_PH' in selected_features:
        extracted_features['PP_MAX_PH'] = PP_max_phonation(audio_file, silence_threshold)
            
    return extracted_features


def extract_features_sentences(audio_file, selected_features, f0_min, f0_max):
    extracted_features = {}
    
    if 'PP_F02' in selected_features:
        extracted_features['PP_F02'] = PP_f0_median(audio_file, f0_min=f0_min, f0_max=f0_max)    
    elif 'PP_F02_M' in selected_features:
        extracted_features['PP_F02_M'] = PP_f0_median_murton(audio_file, f0_min=f0_min, f0_max=f0_max)  
    elif 'PP_F0_SD' in selected_features:
        extracted_features['PP_F0_SD'] = PP_f0_sd(audio_file, f0_min=f0_min, f0_max=f0_max)
    elif 'PP_F0_SD_M' in selected_features:
        extracted_features['PP_F0_SD_M'] = PP_f0_sd_murton(audio_file, f0_min=f0_min, f0_max=f0_max)
    elif 'PP_CPP_M' in selected_features:
        extracted_features['PP_CPP_M'] = PP_cpp_mean_murton(audio_file)
    elif 'PP_CPP_M2' in selected_features:
        extracted_features['PP_CPP_M2'] = PP_cpp_median_murton(audio_file)
    elif 'PP_CPP_SD_M' in selected_features:
        extracted_features['PP_CPP_SD_M'] = PP_cpp_sd_murton(audio_file)
                        
    return extracted_features

In [None]:
def process_audio_files(directory, selected_features, f0_min, f0_max, jitter_type, silence_threshold):
    patient_dfs = {}

    files = [file for file in directory.rglob('*') if file.is_file()]

    for file in files:
        filename = file.stem.replace("_pre", "")
        parts = filename.split("_")
        if len(parts) != 4:
            print(f"Unexpected named audio file: {file}")
            continue

        patient_id, day, exercise, take_letter = parts

        file_path = str(file)
        
        if exercise == 'VOW':
            if directory == best_segments_dir:
                if file_path[-1].isdigit(): 
                    continue ## we do not analyze the best segment of the entire vowel exercise, only of the best segments
                if file_path[-1].isalpha():
                    features = extract_features_vowels(file_path, selected_features, f0_min, f0_max, jitter_type)
            else:
                features = extract_features_vowels(file_path, selected_features, f0_min, f0_max, jitter_type)
        elif exercise == 'MPT':
            features = extract_features_mpt(file_path, selected_features, silence_threshold)
        elif exercise == 'SEN':
            features = extract_features_sentences(file_path, selected_features, f0_min, f0_max)
            
        df_entry = {'day': int(day), **features}    
            
        df_key = (patient_id, take_letter, exercise)
        if df_key not in patient_dfs:
            patient_dfs[df_key] = pd.DataFrame(columns=['day'] + list(features.keys()))
        
        patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
        
        for key, df in patient_dfs.items():
            patient_dfs[key] = df.sort_values(by='day', ascending=True).reset_index(drop=True)

        for (patient_id, take_letter, exercise), df in patient_dfs.items():
            file_name = f"{patient_id}_{exercise}_{take_letter}.csv"
            file_path = features_dir / exercise / patient_id / file_name

            df.to_csv(file_path, index=False)

In [None]:
process_audio_files(
    directory=audio_dir,
    selected_features=selected_features_dict,
    f0_min=60, f0_max=300, 
    jitter_type='all') ## choose from 'local' 'abs' 'rap' 'ppq5' 'ddp' 'all'

process_audio_files(
    directory=segments_dir,
    selected_features=selected_features_dict,
    f0_min=60, f0_max=300, 
    jitter_type='all') ## choose from 'local' 'abs' 'rap' 'ppq5' 'ddp' 'all'

## will not use the best segments for now, since I might use the mid 500 ms of a vowel segment (Murton 2023)
# process_audio_files(
#     directory=best_segments_dir,
#     selected_features=selected_features_dict,
#     f0_min=60, f0_max=300, 
#     jitter_type='all') ## choose from 'local' 'abs' 'rap' 'ppq5' 'ddp' 'all'

  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])],