In [43]:
import importlib
import extraction_functions_praat
importlib.reload(extraction_functions_praat)

<module 'extraction_functions_praat' from '/home/dene/rp2/extraction_functions_praat.py'>

In [44]:
from paths import *
import pandas as pd
from extraction_functions_praat import (
    PP_f0_mean, PP_f0_median, PP_f0_sd,
    PP_f0_mean_murton, PP_f0_median_murton, PP_f0_sd_murton, 
    PP_jitter, PP_jitter_murton,
    PP_lh_ratio, PP_lh_ratio_murton,
    PP_cpp_mean_murton, PP_cpp_median_murton, PP_cpp_sd_murton,
    PP_max_phonation,
    PP_harmonics_to_noise, PP_harmonics_to_noise_murton,
)

In [45]:
AVAILABLE_FEATURES = [
    'PP_F0', 'PP_F02', 'PP_F0_SD', 
    'PP_F0_M', 'PP_F02_M', 'PP_F0_SD_M', 
    'PP_JIT', 'PP_JIT_M',
    'PP_LHR', 'PP_LHR_M',
    'PP_CPP_M', 'PP_CPP_M2', 'PP_CPP_SD_M',
    'PP_MAX_PH',
    'PP_HNR', 'PP_HNR_M',
] ## add more

selected_features_dict = {
    'PP_F0': True, 'PP_F02':True, 'PP_F0_SD': True, 
    'PP_F0_M': True, 'PP_F02_M':True, 'PP_F0_SD_M': True, 
    'PP_JIT': True, 'PP_JIT_M': True,
    'PP_LHR': True, 'PP_LHR_M': True,
    'PP_CPP_M': True, 'PP_CPP_M2': True, 'PP_CPP_SD_M':True,
    'PP_MAX_PH': True,
    'PP_HNR': True, 'PP_HNR_M': True,
} ## enable the ones you want

In [46]:
def extract_features_vowels(audio_file, selected_features, segment_length, f0_min, f0_max, jitter_type):
    extracted_features = {}
    
    if 'PP_F0' in selected_features:
        extracted_features['PP_F0'] = PP_f0_mean(audio_file, f0_min=f0_min, f0_max=f0_max)
    if 'PP_F0_M' in selected_features:
        extracted_features['PP_F0_M'] = PP_f0_mean_murton(audio_file, f0_min=f0_min, f0_max=f0_max)
    if 'PP_F0_SD' in selected_features:
        extracted_features['PP_F0_SD'] = PP_f0_sd(audio_file, f0_min=f0_min, f0_max=f0_max)
    if 'PP_F0_SD_M' in selected_features:
        extracted_features['PP_F0_SD_M'] = PP_f0_sd_murton(audio_file, f0_min=f0_min, f0_max=f0_max)
    if 'PP_JIT' in selected_features:
        jitter_values = PP_jitter(audio_file, f0_min=f0_min, f0_max=f0_max, type=jitter_type)
        if jitter_type == 'all':
            extracted_features['PP_JIT_local'] = jitter_values[0]
            extracted_features['PP_JIT_abs'] = jitter_values[1]
            extracted_features['PP_JIT_rap'] = jitter_values[2]
            extracted_features['PP_JIT_ppq5'] = jitter_values[3]
            extracted_features['PP_JIT_ddp'] = jitter_values[4]
        else:
            extracted_features[f'PP_JIT_{jitter_type}'] = jitter_values
    if 'PP_JIT_M' in selected_features:
        jitter_values = PP_jitter_murton(audio_file, segment_length=segment_length, f0_min=f0_min, f0_max=f0_max, type=jitter_type)
        if jitter_type == 'all':
            extracted_features['PP_JIT_M_local'] = jitter_values[0]
            extracted_features['PP_JIT_M_abs'] = jitter_values[1]
            extracted_features['PP_JIT_M_rap'] = jitter_values[2]
            extracted_features['PP_JIT_M_ppq5'] = jitter_values[3]
            extracted_features['PP_JIT_M_ddp'] = jitter_values[4]
        else:
            extracted_features[f'PP_JIT_M_{jitter_type}'] = jitter_values
    if 'PP_LHR' in selected_features:
        extracted_features['PP_LHR'] = PP_lh_ratio(audio_file)
    if 'PP_LHR_M' in selected_features:
        extracted_features['PP_LHR_M'] = PP_lh_ratio_murton(audio_file, segment_length=segment_length)
    if 'PP_CPP_M' in selected_features:
        extracted_features['PP_CPP_M'] = PP_cpp_mean_murton(audio_file)
    if 'PP_CPP_M2' in selected_features:
        extracted_features['PP_CPP_M2'] = PP_cpp_median_murton(audio_file)
    if 'PP_CPP_SD_M' in selected_features:
        extracted_features['PP_CPP_SD_M'] = PP_cpp_sd_murton(audio_file)
    if 'PP_HNR' in selected_features:
        extracted_features['PP_HNR'] = PP_harmonics_to_noise(audio_file)
    if 'PP_HNR_M' in selected_features:
        extracted_features['PP_HNR_M'] = PP_harmonics_to_noise_murton(audio_file, segment_length=segment_length)
            
    return extracted_features


def extract_features_mpt(audio_file, selected_features, silence_threshold):
    extracted_features = {}
    
    if 'PP_MAX_PH' in selected_features:
        extracted_features['PP_MAX_PH'] = PP_max_phonation(audio_file, silence_threshold)
            
    return extracted_features


def extract_features_sentences(audio_file, selected_features, f0_min, f0_max):
    extracted_features = {}
    
    if 'PP_F02' in selected_features:
        extracted_features['PP_F02'] = PP_f0_median(audio_file, f0_min=f0_min, f0_max=f0_max)    
    if 'PP_F02_M' in selected_features:
        extracted_features['PP_F02_M'] = PP_f0_median_murton(audio_file, f0_min=f0_min, f0_max=f0_max)  
    if 'PP_F0_SD' in selected_features:
        extracted_features['PP_F0_SD'] = PP_f0_sd(audio_file, f0_min=f0_min, f0_max=f0_max)
    if 'PP_F0_SD_M' in selected_features:
        extracted_features['PP_F0_SD_M'] = PP_f0_sd_murton(audio_file, f0_min=f0_min, f0_max=f0_max)
    if 'PP_CPP_M' in selected_features:
        extracted_features['PP_CPP_M'] = PP_cpp_mean_murton(audio_file)
    if 'PP_CPP_M2' in selected_features:
        extracted_features['PP_CPP_M2'] = PP_cpp_median_murton(audio_file)
    if 'PP_CPP_SD_M' in selected_features:
        extracted_features['PP_CPP_SD_M'] = PP_cpp_sd_murton(audio_file)
                        
    return extracted_features

In [47]:
def process_audio_files_vowels(directory, selected_features, segment_length, f0_min, f0_max, jitter_type):
    patient_dfs = {}

    files = [file for d in directory for file in d.rglob('*') if file.is_file()]

    for file in files:
        filename = file.stem.replace("_pre", "")
        parts = filename.split("_")
        if len(parts) != 4:
            print(f"Unexpected named audio file: {file}")
            continue

        patient_id, day, exercise, take_letter = parts

        features = {}
        file_path = str(file)
        
        if exercise == 'VOW':
            if directory == best_segments_dir:
                if file_path[-1].isdigit(): 
                    continue ## we do not analyze the best segment of the entire vowel exercise, only of the best segments
                if file_path[-1].isalpha():
                    features = extract_features_vowels(file_path, selected_features, f0_min, f0_max, jitter_type)
            else:
                features = extract_features_vowels(file_path, selected_features, segment_length, f0_min, f0_max, jitter_type)
        
        if features:    
            df_entry = {'day': int(day), **features}       
            df_key = (patient_id, take_letter, exercise)
            if df_key not in patient_dfs:
                patient_dfs[df_key] = pd.DataFrame(columns=['day'] + list(features.keys()))
            patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
        
        for key, df in patient_dfs.items():
            patient_dfs[key] = df.sort_values(by='day', ascending=True).reset_index(drop=True)

        for (patient_id, take_letter, exercise), df in patient_dfs.items():
            file_name = f"{patient_id}_{exercise}_{take_letter}.csv"
            file_path = features_dir / exercise / patient_id / file_name

            df.to_csv(file_path, index=False)

def process_audio_files_mpt(directory, selected_features, silence_threshold):
    patient_dfs = {}

    files = [file for d in directory for file in d.rglob('*') if file.is_file()]

    for file in files:
        filename = file.stem.replace("_pre", "")
        parts = filename.split("_")
        if len(parts) != 4:
            print(f"Unexpected named audio file: {file}")
            continue

        patient_id, day, exercise, take_letter = parts

        features = {}
        file_path = str(file)
        
        if exercise == 'MPT':
            features = extract_features_mpt(file_path, selected_features, silence_threshold)
            
        if features:            
            df_entry = {'day': int(day), **features}    
            df_key = (patient_id, take_letter, exercise)
            if df_key not in patient_dfs:
                patient_dfs[df_key] = pd.DataFrame(columns=['day'] + list(features.keys()))
            patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
            
        for key, df in patient_dfs.items():
            patient_dfs[key] = df.sort_values(by='day', ascending=True).reset_index(drop=True)

        for (patient_id, take_letter, exercise), df in patient_dfs.items():
            file_name = f"{patient_id}_{exercise}_{take_letter}.csv"
            file_path = features_dir / exercise / patient_id / file_name

            df.to_csv(file_path, index=False)

def process_audio_files_sentences(directory, selected_features, f0_min, f0_max):
    patient_dfs = {}

    files = [file for d in directory for file in d.rglob('*') if file.is_file()]

    for file in files:
        filename = file.stem.replace("_pre", "")
        parts = filename.split("_")
        if len(parts) != 4:
            print(f"Unexpected named audio file: {file}")
            continue

        patient_id, day, exercise, take_letter = parts
        
        features = {}
        file_path = str(file)
        
        if exercise == 'SEN':
            features = extract_features_sentences(file_path, selected_features, f0_min, f0_max)
            
        if features:
            df_entry = {'day': int(day), **features}    
            df_key = (patient_id, take_letter, exercise)
            if df_key not in patient_dfs:
                patient_dfs[df_key] = pd.DataFrame(columns=['day'] + list(features.keys()))
            patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
        
        for key, df in patient_dfs.items():
            patient_dfs[key] = df.sort_values(by='day', ascending=True).reset_index(drop=True)
            
        for (patient_id, take_letter, exercise), df in patient_dfs.items():
            file_name = f"{patient_id}_{exercise}_{take_letter}.csv"
            file_path = features_dir / exercise / patient_id / file_name

            df.to_csv(file_path, index=False)

In [48]:
dir_to_run_vowels = [
    audio_dir,
    segments_dir,
    # best_segments_dir, ## will not use the best segments for now, since I might use the middle of a vowel segment (Murton 2023)
]
dir_to_run_mpt = [audio_dir, segments_dir]
dir_to_run_sentences = [audio_dir, segments_dir]

process_audio_files_vowels(
    directory=dir_to_run_vowels,
    selected_features=selected_features_dict,
    segment_length=1.0,
    f0_min=60, f0_max=300, 
    jitter_type='all') ## choose from 'local' 'abs' 'rap' 'ppq5' 'ddp' 'all'

process_audio_files_mpt(
    directory=dir_to_run_mpt,
    selected_features=selected_features_dict,
    silence_threshold=50) ## choose from 'local' 'abs' 'rap' 'ppq5' 'ddp' 'all'

process_audio_files_sentences(
    directory=dir_to_run_sentences,
    selected_features=selected_features_dict,
    f0_min=60, f0_max=300) ## choose from 'local' 'abs' 'rap' 'ppq5' 'ddp' 'all'

  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])], ignore_index=True)
  patient_dfs[df_key] = pd.concat([patient_dfs[df_key], pd.DataFrame([df_entry])],