In [None]:
# [HuBERT: 음성 전체 특징 벡터 추출]
import torch
from transformers import Wav2Vec2Processor, HubertModel
import librosa
import numpy as np

# 모델과 프로세서를 로드합니다.
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")

def extract_features(audio_path, min_length=16000):
    # 오디오 파일을 로드하고 샘플링 레이트를 설정합니다.
    audio, sample_rate = librosa.load(audio_path, sr=16000)  # Hubert는 16kHz 샘플링 레이트 사용
    
    # 입력 오디오의 길이가 최소 길이보다 짧으면 패딩을 추가합니다.
    if len(audio) < min_length:
        audio = np.pad(audio, (0, min_length - len(audio)), mode='constant')
    
    # 입력 신호를 모델이 처리할 수 있도록 변환합니다.
    inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt")
    
    # 음성 특징을 추출합니다.
    with torch.no_grad():
        features = model(**inputs).last_hidden_state
    
    return features

# 두 음성 파일의 경로를 지정합니다.
audio_path1 = "./example_tts.wav"
audio_path2 = "./user_audio.wav"

# 각 음성 파일에서 특징을 추출합니다.
features1 = extract_features(audio_path1)
features2 = extract_features(audio_path2)

print("First audio features shape:", features1.shape)
print("Second audio features shape:", features2.shape)

First audio features shape: torch.Size([1, 223, 1024])
Second audio features shape: torch.Size([1, 329, 1024])


In [None]:
# [HuBERT: 음성 전체 특징 벡터 추출] + [DTW: 구간 맞춤] + [코사인 유사도 계산(패딩 제외)]
import torch
from transformers import Wav2Vec2Processor, HubertModel
import librosa
import numpy as np
from scipy.spatial.distance import cosine
from fastdtw import fastdtw

# 모델과 프로세서를 로드합니다.
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")

def extract_features(audio_path, min_length=16000):
    # 오디오 파일을 로드하고 샘플링 레이트를 설정합니다.
    audio, sample_rate = librosa.load(audio_path, sr=16000)  # Hubert는 16kHz 샘플링 레이트 사용
    
    # 입력 오디오의 길이가 최소 길이보다 짧으면 패딩을 추가합니다.
    if len(audio) < min_length:
        audio = np.pad(audio, (0, min_length - len(audio)), mode='constant')
    
    # 입력 신호를 모델이 처리할 수 있도록 변환합니다.
    inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt")
    
    # 음성 특징을 추출합니다.
    with torch.no_grad():
        features = model(**inputs).last_hidden_state
    
    return features

def calculate_dtw_cosine_similarity(features1, features2):
    # DTW 계산
    distance, path = fastdtw(features1, features2, dist=lambda x, y: 1 - cosine(x, y))
    
    # 코사인 유사도 계산 (패딩 구간 제외)
    similarities = []
    for i, j in path:
        if not np.all(features1[i] == 0) and not np.all(features2[j] == 0):
            sim = 1 - cosine(features1[i], features2[j])  # 코사인 유사도 계산
            similarities.append(sim)
    
    # 평균 유사도 계산
    return np.mean(similarities) if similarities else 0

# 두 음성 파일의 경로를 지정합니다.
audio_path1 = "./example_tts.wav"
audio_path2 = "./user_audio.wav"

# 각 음성 파일에서 특징을 추출합니다.
features1 = extract_features(audio_path1).squeeze(0).cpu().numpy()
features2 = extract_features(audio_path2).squeeze(0).cpu().numpy()

# DTW 기반의 코사인 유사도를 계산합니다.
similarity = calculate_dtw_cosine_similarity(features1, features2)
print("DTW-based Cosine Similarity:", similarity)


DTW-based Cosine Similarity: 0.45151969478301957


In [19]:
# [PTAAT: 음성 일부 특징(피치, 포먼트) 벡터 추출]
# 피치: 1차원 벡터   # 포먼트: 2차원 벡터
import parselmouth
import numpy as np
import librosa

def extract_pitch_formant(audio_path, min_length=16000):
    # 오디오 파일을 로드하고 샘플링 레이트를 설정합니다.
    audio, sample_rate = librosa.load(audio_path, sr=16000)

    # 입력 오디오의 길이가 최소 길이보다 짧으면 패딩을 추가합니다.
    if len(audio) < min_length:
        audio = np.pad(audio, (0, min_length - len(audio)), mode='constant')
    
    # parselmouth를 통해 오디오 객체 생성
    snd = parselmouth.Sound(audio_path)
    
    # 피치 추출
    pitch = snd.to_pitch()
    pitch_values = pitch.selected_array['frequency']  # Hz 단위의 피치 값 배열
    pitch_values = pitch_values[pitch_values > 0]     # 유효한 피치 값만 사용
    
    # 포먼트 추출
    formant = snd.to_formant_burg()
    formant_values = []
    for t in np.arange(0, snd.duration, 0.01):  # 매 10ms마다 포먼트 값을 추출
        formants_at_t = []
        for i in range(1, 5):  # 첫 네 개의 포먼트만 추출
            try:
                formants_at_t.append(formant.get_value_at_time(i, t))
            except Exception as e:
                formants_at_t.append(np.nan)  # 값이 없을 경우 NaN 추가
        formant_values.append(formants_at_t)
    
    formant_values = np.array(formant_values)

    return pitch_values, formant_values

# 두 음성 파일의 경로를 지정합니다.
audio_path1 = "./example_tts.wav"
audio_path2 = "./user_audio.wav"

# 각 음성 파일에서 피치와 포먼트를 추출합니다.
pitch1, formants1 = extract_pitch_formant(audio_path1)
pitch2, formants2 = extract_pitch_formant(audio_path2)

print("First audio pitch shape:", pitch1.shape)
print("First audio formant shape:", formants1.shape)
print("Second audio pitch shape:", pitch2.shape)
print("Second audio formant shape:", formants2.shape)


First audio pitch shape: (291,)
First audio formant shape: (447, 4)
Second audio pitch shape: (199,)
Second audio formant shape: (660, 4)


In [24]:
# [PTAAT: 음성 일부 특징(피치, 포먼트) 벡터 추출] + [DTW: 구간 맞춤] + [유클리디안 거리 계산(패딩 제외)]
import numpy as np
from fastdtw import fastdtw
import librosa

# NaN 값을 처리하는 함수
def handle_nan_values(matrix):
    # NaN 및 무한대 값을 0으로 대체
    matrix = np.nan_to_num(matrix, nan=0.0, posinf=0.0, neginf=0.0)
    return matrix

# 피치의 유클리디안 거리 기반 유사도 계산
def calculate_dtw_euclidean_similarity_pitch(pitch1, pitch2):
    # fastdtw를 사용하여 유클리디안 거리로 DTW 계산
    distance, path = fastdtw(pitch1, pitch2, dist=lambda x, y: np.linalg.norm(x - y))
    similarity = 1 / (1 + distance)  # 거리값을 유사도로 변환
    return similarity

# 포먼트의 유클리디안 거리 기반 유사도 계산
def calculate_dtw_euclidean_similarity_formant(formants1, formants2):
    # NaN 값 및 무한대 값 처리
    formants1 = handle_nan_values(formants1)
    formants2 = handle_nan_values(formants2)
    
    # librosa의 DTW 기능을 사용하여 유클리디안 거리를 계산
    cost, wp = librosa.sequence.dtw(X=formants1.T, Y=formants2.T, metric="euclidean")

    # 유사도 계산 (1 / (1 + 거리))
    similarities = [1 / (1 + np.linalg.norm(formants1[i] - formants2[j])) for i, j in wp]
    
    # 평균 유사도 계산
    return np.mean(similarities) if similarities else 0

# 피치와 포먼트를 추출한 두 음성 파일
pitch1, formants1 = extract_pitch_formant("./example_tts.wav")
pitch2, formants2 = extract_pitch_formant("./user_audio.wav")

# 피치의 유클리디안 거리 기반 유사도 계산
pitch_similarity = calculate_dtw_euclidean_similarity_pitch(pitch1, pitch2)
print("Pitch DTW-based Euclidean Similarity:", pitch_similarity)

# 포먼트의 유클리디안 거리 기반 유사도 계산
formant_similarity = calculate_dtw_euclidean_similarity_formant(formants1, formants2)
print("Formant DTW-based Euclidean Similarity:", formant_similarity)

Pitch DTW-based Euclidean Similarity: 0.00031211457150652125
Formant DTW-based Euclidean Similarity: 0.01004809541884612


In [31]:
# [MFCC: 음소 단위 분석을 통해 피치, 포먼트, 길이 등의 세부 피드백 제공]
# 발음 채점 시 단순히 피치와 포먼트만 비교하는 것 보다 더 복합적인 음성 특징 고려
import parselmouth
import numpy as np
import whisper

# Whisper 모델을 로드합니다.
model = whisper.load_model("small")

# 텍스트 추출 함수 (언어를 영어로 강제 설정)
def transcribe_audio(audio_path):
    result = model.transcribe(audio_path, language="en")
    return result["text"]

def extract_phoneme_features(audio_path):
    snd = parselmouth.Sound(audio_path)
    
    # 피치 추출
    pitch = snd.to_pitch()
    pitch_values = pitch.selected_array['frequency']
    
    # 포먼트 추출
    formant = snd.to_formant_burg()
    formant_values = []
    for t in np.arange(0, snd.duration, 0.01):  # 10ms 간격
        formants_at_t = []
        for i in range(1, 5):  # 첫 4개 포먼트 추출
            try:
                formants_at_t.append(formant.get_value_at_time(i, t))
            except Exception:
                formants_at_t.append(np.nan)
        formant_values.append(formants_at_t)
    
    # NaN 값 처리
    formant_values = np.nan_to_num(formant_values, nan=0.0)
    
    return pitch_values, np.array(formant_values)

def compare_phoneme_features(pitch1, pitch2, formant1, formant2):
    differences = []
    min_length = min(len(pitch1), len(pitch2), len(formant1), len(formant2))

    for i in range(min_length):
        # 피치 차이 계산
        if pitch1[i] > 0 and pitch2[i] > 0:  # 피치가 유효한 경우
            pitch_diff = abs(pitch1[i] - pitch2[i])
        else:
            pitch_diff = None  # 피치 값이 유효하지 않으면 None

        # 포먼트 차이 계산 (여기서는 첫 번째 포먼트만 비교 예시)
        formant_diff = np.linalg.norm(formant1[i] - formant2[i])

        differences.append((i * 0.01, pitch_diff, formant_diff))  # 시간(s), 피치 차이, 포먼트 차이

    return differences

# 원어민과 사용자 음성 파일의 피치 및 포먼트 추출
pitch1, formants1 = extract_phoneme_features("./example_tts.wav")
pitch2, formants2 = extract_phoneme_features("./user_audio.wav")

# Whisper 모델로 example_tts.wav와 user_audio.wav에서 영어 텍스트 추출
transcribed_text1 = transcribe_audio("./example_tts.wav")
transcribed_text2 = transcribe_audio("./user_audio.wav")

# 각 음소 단위로 피치 및 포먼트 차이 계산
differences = compare_phoneme_features(pitch1, pitch2, formants1, formants2)

# 결과 출력
print("Transcribed Text from example_tts.wav:", transcribed_text1)
print("Transcribed Text from user_audio.wav:", transcribed_text2)
print("\nTime(s)\tPitch Difference\tFormant Difference")
for time, pitch_diff, formant_diff in differences:
    print(f"{time:.2f}\t{pitch_diff if pitch_diff is not None else 'N/A'}\t\t{formant_diff:.2f}")

100%|███████████████████████████████████████| 461M/461M [00:05<00:00, 87.5MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Transcribed Text from example_tts.wav:  This is a sample sentence for pronunciation evaluation.
Transcribed Text from user_audio.wav:  This is a sample sentence for pronunciation evaluation.

Time(s)	Pitch Difference	Formant Difference
0.00	N/A		0.00
0.01	N/A		0.00
0.02	N/A		0.00
0.03	N/A		122.52
0.04	N/A		233.89
0.05	N/A		434.21
0.06	N/A		541.69
0.07	N/A		1061.67
0.08	N/A		660.93
0.09	N/A		1431.18
0.10	N/A		487.39
0.11	N/A		817.02
0.12	N/A		1004.49
0.13	N/A		1301.89
0.14	N/A		1329.85
0.15	N/A		1053.77
0.16	N/A		1084.25
0.17	N/A		1121.10
0.18	N/A		903.72
0.19	N/A		1131.91
0.20	N/A		972.85
0.21	N/A		713.07
0.22	N/A		238.20
0.23	N/A		934.51
0.24	N/A		1182.05
0.25	N/A		563.26
0.26	N/A		440.09
0.27	N/A		665.83
0.28	N/A		649.45
0.29	N/A		1045.46
0.30	N/A		607.18
0.31	N/A		674.22
0.32	N/A		676.46
0.33	N/A		990.17
0.34	N/A		614.79
0.35	N/A		671.41
0.36	N/A		540.77
0.37	N/A		493.54
0.38	N/A		665.83
0.39	N/A		936.08
0.40	N/A		802.84
0.41	N/A		1052.68
0.42	N/A		846.20
0.43	N/A		855.67
0.44	N/A		

In [35]:
# 피치, 포먼트 차이가 큰 특정 단어를 표시(기준: 음소 전체의 피치, 포먼트 차이의 중앙값)
# 중앙값 사용 이유: 음소별 피치, 포먼트가 측정이 안되어 NaN인 구간이 존재 
import parselmouth
import numpy as np
import whisper

# Whisper 모델 로드
model = whisper.load_model("small")

# 텍스트 및 타임스탬프 추출 함수
def transcribe_audio_with_timestamps(audio_path):
    result = model.transcribe(audio_path, language="en", word_timestamps=True)
    return result["text"], result["segments"]

def extract_phoneme_features(audio_path):
    snd = parselmouth.Sound(audio_path)
    
    # 피치 추출
    pitch = snd.to_pitch()
    pitch_values = pitch.selected_array['frequency']
    
    # 포먼트 추출
    formant = snd.to_formant_burg()
    formant_values = []
    for t in np.arange(0, snd.duration, 0.01):  # 10ms 간격
        formants_at_t = []
        for i in range(1, 5):  # 첫 4개 포먼트 추출
            try:
                formants_at_t.append(formant.get_value_at_time(i, t))
            except Exception:
                formants_at_t.append(np.nan)
        formant_values.append(formants_at_t)
    
    formant_values = np.nan_to_num(formant_values, nan=0.0)
    return pitch_values, np.array(formant_values)

def compare_phoneme_features(pitch1, pitch2, formant1, formant2):
    differences = []
    min_length = min(len(pitch1), len(pitch2), len(formant1), len(formant2))

    for i in range(min_length):
        if pitch1[i] > 0 and pitch2[i] > 0:
            pitch_diff = abs(pitch1[i] - pitch2[i])
        else:
            pitch_diff = None
        formant_diff = np.linalg.norm(formant1[i] - formant2[i])
        differences.append((i * 0.01, pitch_diff, formant_diff))  # 시간(s), 피치 차이, 포먼트 차이

    return differences

def generate_feedback(timestamps, differences, threshold_multiplier=1.5):
    feedback = []

    # 피치와 포먼트 차이에서 중앙값을 기준으로 차이가 큰 구간만 선택
    pitch_diffs = [diff[1] for diff in differences if diff[1] is not None]
    formant_diffs = [diff[2] for diff in differences]
    median_pitch_diff = np.median(pitch_diffs)
    median_formant_diff = np.median(formant_diffs)

    # 중앙값을 기준으로 큰 차이를 가진 구간만 필터링
    for start_time, end_time, word in timestamps:
        relevant_diffs = [diff for diff in differences if start_time <= diff[0] <= end_time]
        
        if not relevant_diffs:
            continue

        avg_pitch_diff = np.mean([diff[1] for diff in relevant_diffs if diff[1] is not None])
        avg_formant_diff = np.mean([diff[2] for diff in relevant_diffs])

        # 중앙값보다 큰 차이를 가진 음소를 필터링
        high_diff_segments = [
            (time, pitch, formant) for time, pitch, formant in relevant_diffs
            if (pitch is not None and pitch > median_pitch_diff * threshold_multiplier) or formant > median_formant_diff * threshold_multiplier
        ]

        if high_diff_segments:
            feedback.append({
                "word": word,
                "start_time": start_time,
                "end_time": end_time,
                "median_based_pitch_difference": avg_pitch_diff,
                "median_based_formant_difference": avg_formant_diff,
                "high_diff_segments": high_diff_segments
            })
    
    return feedback

# 원어민과 사용자 음성 파일의 피치 및 포먼트 추출
pitch1, formants1 = extract_phoneme_features("./example_tts.wav")
pitch2, formants2 = extract_phoneme_features("./user_audio.wav")

# Whisper로 영어 텍스트 및 타임스탬프 추출
_, timestamps1 = transcribe_audio_with_timestamps("./example_tts.wav")
_, timestamps2 = transcribe_audio_with_timestamps("./user_audio.wav")

# 피치 및 포먼트 차이 계산
differences = compare_phoneme_features(pitch1, pitch2, formants1, formants2)

# 피드백 생성
timestamps = [(seg["start"], seg["end"], seg["text"]) for seg in timestamps2]
feedback = generate_feedback(timestamps, differences)

# 피드백 출력
for item in feedback:
    print(f"Word: '{item['word']}' (Time: {item['start_time']:.2f}s - {item['end_time']:.2f}s)")
    print(f" - Median-Based Pitch Difference: {item['median_based_pitch_difference']:.2f}")
    print(f" - Median-Based Formant Difference: {item['median_based_formant_difference']:.2f}")
    
    # 음소별 피드백 출력
    print("   High Difference Phonemes:")
    for time, pitch_diff, formant_diff in item["high_diff_segments"]:
        pitch_output = f"{pitch_diff:.2f}" if pitch_diff is not None else "N/A"
        print(f"     Time: {time:.2f}s - Pitch Diff: {pitch_output} - Formant Diff: {formant_diff:.2f}")
    print()

Word: ' This is a sample sentence for pronunciation evaluation.' (Time: 0.00s - 5.24s)
 - Median-Based Pitch Difference: 29.79
 - Median-Based Formant Difference: 1664.74
   High Difference Phonemes:
     Time: 0.45s - Pitch Diff: N/A - Formant Diff: 2039.87
     Time: 0.47s - Pitch Diff: N/A - Formant Diff: 5083.11
     Time: 0.55s - Pitch Diff: N/A - Formant Diff: 2258.79
     Time: 0.56s - Pitch Diff: N/A - Formant Diff: 5133.40
     Time: 0.71s - Pitch Diff: N/A - Formant Diff: 4174.20
     Time: 0.94s - Pitch Diff: N/A - Formant Diff: 4391.00
     Time: 1.12s - Pitch Diff: N/A - Formant Diff: 4553.94
     Time: 1.13s - Pitch Diff: N/A - Formant Diff: 1829.17
     Time: 1.14s - Pitch Diff: N/A - Formant Diff: 4564.95
     Time: 1.22s - Pitch Diff: N/A - Formant Diff: 4586.56
     Time: 1.43s - Pitch Diff: N/A - Formant Diff: 2473.01
     Time: 1.44s - Pitch Diff: N/A - Formant Diff: 2787.81
     Time: 1.46s - Pitch Diff: 8.34 - Formant Diff: 2544.87
     Time: 1.49s - Pitch Diff: 2

In [36]:
import parselmouth
import numpy as np
import whisper

# Whisper 모델 로드
model = whisper.load_model("small")

# 텍스트 및 타임스탬프 추출 함수
def transcribe_audio_with_timestamps(audio_path):
    result = model.transcribe(audio_path, language="en", word_timestamps=True)
    return result["text"], result["segments"]

def extract_phoneme_features(audio_path):
    snd = parselmouth.Sound(audio_path)
    
    # 피치 추출
    pitch = snd.to_pitch()
    pitch_values = pitch.selected_array['frequency']
    
    # 포먼트 추출
    formant = snd.to_formant_burg()
    formant_values = []
    for t in np.arange(0, snd.duration, 0.01):  # 10ms 간격
        formants_at_t = []
        for i in range(1, 5):  # 첫 4개 포먼트 추출
            try:
                formants_at_t.append(formant.get_value_at_time(i, t))
            except Exception:
                formants_at_t.append(np.nan)
        formant_values.append(formants_at_t)
    
    formant_values = np.nan_to_num(formant_values, nan=0.0)
    return pitch_values, np.array(formant_values)

def compare_phoneme_features(pitch1, pitch2, formant1, formant2):
    differences = []
    min_length = min(len(pitch1), len(pitch2), len(formant1), len(formant2))

    for i in range(min_length):
        if pitch1[i] > 0 and pitch2[i] > 0:
            pitch_diff = abs(pitch1[i] - pitch2[i])
        else:
            pitch_diff = None
        formant_diff = np.linalg.norm(formant1[i] - formant2[i])
        differences.append((i * 0.01, pitch_diff, formant_diff))  # 시간(s), 피치 차이, 포먼트 차이

    return differences

def generate_feedback(timestamps, differences, threshold_multiplier=1.5):
    feedback = []

    # 피치와 포먼트 차이에서 중앙값을 기준으로 차이가 큰 구간만 선택
    pitch_diffs = [diff[1] for diff in differences if diff[1] is not None]
    formant_diffs = [diff[2] for diff in differences]
    median_pitch_diff = np.median(pitch_diffs)
    median_formant_diff = np.median(formant_diffs)

    # 중앙값을 기준으로 큰 차이를 가진 구간만 필터링
    for start_time, end_time, word in timestamps:
        relevant_diffs = [diff for diff in differences if start_time <= diff[0] <= end_time]
        
        if not relevant_diffs:
            continue

        avg_pitch_diff = np.mean([diff[1] for diff in relevant_diffs if diff[1] is not None])
        avg_formant_diff = np.mean([diff[2] for diff in relevant_diffs])

        # 중앙값보다 큰 차이를 가진 음소를 필터링
        high_diff_segments = [
            (time, pitch, formant) for time, pitch, formant in relevant_diffs
            if (pitch is not None and pitch > median_pitch_diff * threshold_multiplier) or formant > median_formant_diff * threshold_multiplier
        ]

        if high_diff_segments:
            feedback.append({
                "word": word,
                "start_time": start_time,
                "end_time": end_time,
                "median_based_pitch_difference": avg_pitch_diff,
                "median_based_formant_difference": avg_formant_diff,
                "high_diff_segments": high_diff_segments
            })
    
    return feedback

# 원어민과 사용자 음성 파일의 피치 및 포먼트 추출
pitch1, formants1 = extract_phoneme_features("./example_tts.wav")
pitch2, formants2 = extract_phoneme_features("./user_audio.wav")

# Whisper로 영어 텍스트 및 타임스탬프 추출
_, timestamps1 = transcribe_audio_with_timestamps("./example_tts.wav")
_, timestamps2 = transcribe_audio_with_timestamps("./user_audio.wav")

# 피치 및 포먼트 차이 계산
differences = compare_phoneme_features(pitch1, pitch2, formants1, formants2)

# 피드백 생성
timestamps = [(seg["start"], seg["end"], seg["text"]) for seg in timestamps2]
feedback = generate_feedback(timestamps, differences)

# 피드백 출력
for item in feedback:
    print(f"Word: '{item['word']}' (Time: {item['start_time']:.2f}s - {item['end_time']:.2f}s)")
    print(f" - Median-Based Pitch Difference: {item['median_based_pitch_difference']:.2f}")
    print(f" - Median-Based Formant Difference: {item['median_based_formant_difference']:.2f}")
    
    # 음소별 피드백 출력
    print("   High Difference Phonemes:")
    for time, pitch_diff, formant_diff in item["high_diff_segments"]:
        pitch_output = f"{pitch_diff:.2f}" if pitch_diff is not None else "N/A"
        print(f"     Time: {time:.2f}s - Pitch Diff: {pitch_output} - Formant Diff: {formant_diff:.2f}")
    print()


Word: ' This is a sample sentence for pronunciation evaluation.' (Time: 0.00s - 5.24s)
 - Median-Based Pitch Difference: 29.79
 - Median-Based Formant Difference: 1664.74
   High Difference Phonemes:
     Time: 0.45s - Pitch Diff: N/A - Formant Diff: 2039.87
     Time: 0.47s - Pitch Diff: N/A - Formant Diff: 5083.11
     Time: 0.55s - Pitch Diff: N/A - Formant Diff: 2258.79
     Time: 0.56s - Pitch Diff: N/A - Formant Diff: 5133.40
     Time: 0.71s - Pitch Diff: N/A - Formant Diff: 4174.20
     Time: 0.94s - Pitch Diff: N/A - Formant Diff: 4391.00
     Time: 1.12s - Pitch Diff: N/A - Formant Diff: 4553.94
     Time: 1.13s - Pitch Diff: N/A - Formant Diff: 1829.17
     Time: 1.14s - Pitch Diff: N/A - Formant Diff: 4564.95
     Time: 1.22s - Pitch Diff: N/A - Formant Diff: 4586.56
     Time: 1.43s - Pitch Diff: N/A - Formant Diff: 2473.01
     Time: 1.44s - Pitch Diff: N/A - Formant Diff: 2787.81
     Time: 1.46s - Pitch Diff: 8.34 - Formant Diff: 2544.87
     Time: 1.49s - Pitch Diff: 2