In [None]:
# Word start/end time extraction: Google Cloud Speech-to-Text API -> compare the per-word duration of the two recordings
# Per-word pitch and intensity (stress) comparison: librosa; per-word formant comparison: Parselmouth

In [None]:
import librosa
import soundfile as sf
from google.cloud import speech
import parselmouth
from scipy.spatial.distance import cosine
import os
import pandas as pd
from dotenv import load_dotenv

# Google Cloud Speech-to-Text API setup.
# Load variables from a local .env file into the process environment.
load_dotenv()
# Point the Google client library at the service-account key named by KEY_PATH.
# os.environ only accepts str values, so assigning the None returned for an
# unset KEY_PATH would raise a TypeError. Only assign when a value is present;
# otherwise whatever credentials are already configured in the environment are
# left for the client to pick up.
_key_path = os.getenv("KEY_PATH")
if _key_path:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = _key_path
speech_client = speech.SpeechClient()

In [None]:
# Check the sampling rate of an audio file.
def check_sampling_rate(audio_path):
    """Print and return the sampling rate of the audio file at *audio_path*.

    Args:
        audio_path: Path of the audio file to inspect.

    Returns:
        The file's sampling rate in Hz.
    """
    # Only the rate is needed here, so query it directly instead of decoding
    # the whole signal with librosa.load and discarding the samples.
    sr = librosa.get_samplerate(audio_path)
    print(f"{audio_path} 샘플링 속도: {sr} Hz")
    return sr


# Resample an audio file to a different sampling rate.
def resample_audio(input_path, output_path, target_sr):
    """Write a copy of *input_path* resampled to *target_sr* at *output_path*.

    Args:
        input_path: Path of the audio file to read.
        output_path: Path the resampled audio is written to.
        target_sr: Desired sampling rate in Hz.
    """
    # sr=None makes librosa keep the file's own sampling rate on load.
    samples, source_sr = librosa.load(input_path, sr=None)
    sf.write(
        output_path,
        librosa.resample(samples, orig_sr=source_sr, target_sr=target_sr),
        target_sr,
    )
    print(f"{input_path} -> {output_path} 샘플링 속도를 {target_sr} Hz로 변경 완료")


# Extract per-word timestamps with Google Speech-to-Text.
def get_word_timestamps(audio_path, sample_rate_hertz=None, language_code="en-US"):
    """Transcribe *audio_path* and return the start/end time of every word.

    Args:
        audio_path: Path of the audio file sent to the recognizer. The request
            declares LINEAR16 encoding.
        sample_rate_hertz: Sampling rate declared in the request. When None,
            the rate is read from the file header; if the header cannot be
            read, 48000 is used.
        language_code: Language tag passed to the recognizer.

    Returns:
        A list of dicts with keys "word", "start_time" and "end_time"
        (times in seconds), in the order the recognizer returned them.
    """
    if sample_rate_hertz is None:
        try:
            # Declare the rate the file actually has rather than assuming one.
            sample_rate_hertz = sf.info(audio_path).samplerate
        except RuntimeError:
            # Header not readable by soundfile: fall back to the rate this
            # function previously always declared.
            sample_rate_hertz = 48000

    with open(audio_path, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
        enable_word_time_offsets=True,
    )

    response = speech_client.recognize(config=config, audio=audio)
    word_timestamps = []
    for result in response.results:
        # Skip results without any alternative instead of raising IndexError.
        if not result.alternatives:
            continue
        # Only the first alternative of each result is used.
        for word_info in result.alternatives[0].words:
            word_timestamps.append(
                {
                    "word": word_info.word,
                    "start_time": word_info.start_time.total_seconds(),
                    "end_time": word_info.end_time.total_seconds(),
                }
            )
    return word_timestamps


# Per-word pitch, formant, and intensity analysis.
def analyze_audio_features(audio_path, word_info):
    """Measure pitch, intensity and the first two formants of one word.

    Args:
        audio_path: Path of the audio file containing the word.
        word_info: Dict with "start_time" and "end_time" in seconds.

    Returns:
        A tuple ``(pitch, intensity, (f1, f2))``: the mean of the positive
        librosa.piptrack values over the word, the mean RMS over the word, and
        the F1/F2 values Praat reports at the word's temporal midpoint.
        Pitch is 0 when no positive pitch value is found; pitch and intensity
        are both 0 when the word segment contains no samples.
    """
    start_time = word_info["start_time"]
    end_time = word_info["end_time"]

    # NOTE: the whole file is decoded on every call; sr=None keeps its own rate.
    y, sr = librosa.load(audio_path, sr=None)
    word_audio = y[int(start_time * sr):int(end_time * sr)]

    if word_audio.size > 0:
        # Pitch: average every positive entry of the piptrack pitch matrix.
        pitches, _ = librosa.piptrack(y=word_audio, sr=sr)
        positive_pitches = pitches[pitches > 0]
        pitch = positive_pitches.mean() if positive_pitches.size > 0 else 0

        # Intensity: mean RMS energy over the word.
        intensity = librosa.feature.rms(y=word_audio).mean()
    else:
        # A zero-length segment (start_time == end_time, or times past the end
        # of the signal) has nothing to analyse; do not hand an empty array to
        # the spectral routines.
        pitch = 0
        intensity = 0.0

    # Formants (via parselmouth), sampled at the midpoint of the word.
    # "To Formant (burg)" arguments: time step 0.025 s, at most 5 formants,
    # 5500 Hz formant ceiling, 0.025 s window length, pre-emphasis from 50 Hz.
    midpoint = (start_time + end_time) / 2
    sound = parselmouth.Sound(audio_path)
    formant = parselmouth.praat.call(
        sound, "To Formant (burg)", 0.025, 5, 5500, 0.025, 50
    )
    f1 = parselmouth.praat.call(
        formant, "Get value at time", 1, midpoint, "Hertz", "Linear"
    )
    f2 = parselmouth.praat.call(
        formant, "Get value at time", 2, midpoint, "Hertz", "Linear"
    )

    return pitch, intensity, (f1, f2)

# Word-by-word comparison of two audio files.
_COMPARISON_COLUMNS = (
    "Word",
    "Pitch Ratio",
    "Intensity Ratio",
    "Formant Ratio (F1)",
    "Formant Ratio (F2)",
    "Duration Difference (s)",
)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is 0."""
    return numerator / denominator if denominator != 0 else 0


def compare_audio_features(audio_path1, audio_path2):
    """Compare the words of two recordings position by position.

    Words are paired purely by their index in each transcript (first with
    first, second with second, ...); surplus words in the longer transcript
    are ignored. Each row is labelled with the word from *audio_path1*.

    Args:
        audio_path1: Path of the reference ("native") recording.
        audio_path2: Path of the recording being evaluated ("user").

    Returns:
        A pandas DataFrame with one row per word pair holding the
        user/native ratios of pitch, intensity, F1 and F2 (0 when the native
        value is 0) and the user-minus-native duration difference in seconds,
        all rounded to two decimals. The columns are present even when no
        word pair was produced.
    """
    native_timestamps = get_word_timestamps(audio_path1)
    user_timestamps = get_word_timestamps(audio_path2)

    results = []
    # zip stops at the shorter transcript, matching the min-length pairing.
    for native_word, user_word in zip(native_timestamps, user_timestamps):
        # Per-word duration.
        native_duration = native_word["end_time"] - native_word["start_time"]
        user_duration = user_word["end_time"] - user_word["start_time"]
        duration_difference = user_duration - native_duration

        # Pitch, intensity and formant analysis.
        native_pitch, native_intensity, native_formants = analyze_audio_features(
            audio_path1, native_word
        )
        user_pitch, user_intensity, user_formants = analyze_audio_features(
            audio_path2, user_word
        )

        # Ratios of the user's value to the native value.
        results.append({
            "Word": native_word["word"],
            "Pitch Ratio": round(_safe_ratio(user_pitch, native_pitch), 2),
            "Intensity Ratio": round(
                _safe_ratio(user_intensity, native_intensity), 2
            ),
            "Formant Ratio (F1)": round(
                _safe_ratio(user_formants[0], native_formants[0]), 2
            ),
            "Formant Ratio (F2)": round(
                _safe_ratio(user_formants[1], native_formants[1]), 2
            ),
            "Duration Difference (s)": round(duration_difference, 2),
        })
    # Explicit columns keep the schema stable when there are no rows.
    return pd.DataFrame(results, columns=list(_COMPARISON_COLUMNS))

# Main entry point.
def main():
    """Compare the native and user recordings and save the result as CSV.

    Reports both sampling rates, resamples any file that is not at 48000 Hz
    into a "_48k" copy, runs the word-by-word comparison on the resulting
    files, prints the table and writes it to ./comparison_results.csv.
    """
    target_sr = 48000

    # Current path of each recording, and where its 48 kHz copy would go.
    audio_paths = {
        "native": "./audio/native_audio.wav",
        "user": "./audio/user_audio.wav",
    }
    resampled_paths = {
        "native": "./audio/native_audio_48k.wav",
        "user": "./audio/user_audio_48k.wav",
    }

    # Report both sampling rates first, then convert where necessary.
    sampling_rates = {
        role: check_sampling_rate(path) for role, path in audio_paths.items()
    }
    for role, rate in sampling_rates.items():
        if rate == target_sr:
            continue
        resample_audio(audio_paths[role], resampled_paths[role], target_sr)
        audio_paths[role] = resampled_paths[role]

    # Compare the two recordings.
    comparison_df = compare_audio_features(
        audio_paths["native"], audio_paths["user"]
    )

    # Show the result.
    print("\n단어별 비교 결과 (DataFrame):")
    print(comparison_df)

    # Save as CSV (optional).
    comparison_df.to_csv("./comparison_results.csv", index=False)
    print("\n결과를 comparison_results.csv로 저장 완료!")


# Run.
if __name__ == "__main__":
    main()

./audio/native_audio.wav 샘플링 속도: 24000 Hz
./audio/user_audio.wav 샘플링 속도: 48000 Hz
./audio/native_audio.wav -> ./audio/native_audio_48k.wav 샘플링 속도를 48000 Hz로 변경 완료

단어별 비교 결과 (DataFrame):
            Word  Pitch Ratio  Intensity Ratio  Formant Ratio (F1)  \
0           this         0.78             0.29                3.00   
1             is         0.95             1.06                0.79   
2              a         2.64             0.24                0.52   
3         sample         1.15             0.79                0.75   
4       sentence         0.81             0.59                0.88   
5            for         0.56             2.87                0.76   
6  pronunciation         1.28             0.43                0.85   

   Formant Ratio (F2)  Duration Difference (s)  
0                1.40                      0.6  
1                1.12                      0.1  
2                0.81                      0.1  
3                0.92                      0.2  
4      

In [None]:
# 1. 애기 목소리랑 tts 랑 비교 -> 성인이랑 얼만큼 차이나는지 분석
# 2. 차이 많이나면 그때 정규화든 뭐든 생각하기
