In [1]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import glob
import json
%matplotlib inline

# Target sampling rate in Hz: this value is passed as `sr=` to the
# librosa.load calls in this notebook, so all clips are resampled to it.
size = 40000

In [None]:
# Load the reference audio clips.
# glob.glob returns paths in arbitrary (filesystem-dependent) order, but the
# evaluation cells convert a position in `sounds` into an answer number
# (index + 1). Sorting makes that mapping deterministic across machines.
# NOTE(review): this assumes lexicographic file-name order matches the clip
# numbering used in metadata.json (e.g. zero-padded names) — confirm.
audio_dir = '../JKspeech/'
file_names = sorted(glob.glob(f'{audio_dir}/J*.wav'))
# librosa.load returns a tuple; only element [0] (the samples) is kept.
sounds = [librosa.load(file_name, sr=size, mono=True)[0]
          for file_name in file_names]

print("Done")

In [None]:
def audio_matching(origin_init, targets_init, trim_divisor=180, max_trim=640):
    """Score candidate clips against a query clip via FFT cross-correlation.

    For every target, both signals are cut to a common window
    ``[start:end]``, the circular cross-correlation is computed as
    ``ifft(conj(fft(origin)) * fft(target))``, and the peak magnitude of
    that correlation is used as the similarity score.

    Parameters
    ----------
    origin_init : sequence of numbers
        Query signal. Assumed to be 1-D (the notebook loads audio with
        ``mono=True``).
    targets_init : iterable of sequences of numbers
        Candidate signals, each compared against ``origin_init``.
    trim_divisor : int or float, default 180
        Must be non-zero. The number of leading samples dropped from both
        signals is ``int(abs(len(origin) - len(target)) / trim_divisor)``.
    max_trim : int, default 640
        Upper bound on the number of leading samples dropped.
        The defaults reproduce the constants that were previously
        hard-coded; their rationale is not documented in the notebook.

    Returns
    -------
    numpy.ndarray
        One non-negative score per target, in input order. A target whose
        comparison window is empty (an empty clip, or ``start >= end``)
        gets a score of 0.0 instead of raising inside ``np.fft.fft``.
    """
    scores = []
    n_origin = len(origin_init)

    for target_init in targets_init:
        n_target = len(target_init)

        # Size adjustment: skip a few leading samples (more when the lengths
        # differ more, capped at max_trim) and cut both to the shorter length.
        start = min(int(abs(n_origin - n_target) / trim_divisor), max_trim)
        end = min(n_origin, n_target)

        if start >= end:
            # Nothing left to compare; an FFT of zero points would raise.
            # 0.0 is the lowest possible score, so this clip ranks last.
            scores.append(0.0)
            continue

        origin = origin_init[start:end]
        target = target_init[start:end]

        # Fourier transforms of both windows.
        fft_O = np.fft.fft(origin)
        fft_T = np.fft.fft(target)

        # Cross-correlation in the frequency domain: conjugate one spectrum,
        # multiply element-wise, then transform back.
        correlation = np.fft.ifft(np.conj(fft_O) * fft_T)

        # Use the peak magnitude over all lags as the score.
        scores.append(np.max(np.abs(correlation)))

    return np.array(scores)

In [None]:
# Load the question metadata; later cells look entries up by question ID.
# The context manager closes the file even if json.load raises.
# The "utf-8_sig" codec skips a leading byte-order mark if one is present.
with open('../questions_ja/metadata.json', 'r', encoding="utf-8_sig") as f:
    data = json.load(f)

In [2]:
# Run the matcher for a single question; change Q_id to inspect another one.
Q_id = "Q017_ja"
fname = f"../questions_ja/{Q_id}.wav"
# librosa.load returns a tuple; only element [0] (the samples) is kept.
sound = librosa.load(fname, sr=size, mono=True)[0]

# One score per entry of `sounds`, in the same order as `sounds`.
result = audio_matching(sound, sounds)
print("Done")

# Optional playback, left disabled.
# NOTE(review): `sound` is the sample array here, so `sound[1]` below is a
# single sample value, not a sampling rate — fix before re-enabling.
# from IPython.display import Audio
# Audio(fname, rate=sound[1])

<class 'numpy.ndarray'>


In [None]:
# Evaluate the single-question run: take the N best-scoring clips as the
# answer and compare them with the expected clip numbers from the metadata.
metadata = data[Q_id]
N = metadata['length']  # how many clips to pick for this question

# Positions of the N highest scores, best first; an answer number is the
# position in `sounds` plus one.
# Convert NumPy integers to plain int so the lists below print as [3, 7, ...]
# rather than [np.int64(3), np.int64(7), ...] under NumPy >= 2.
answer = [int(i) + 1 for i in np.argsort(result)[::-1][:N]]

correct_numbers = metadata['correct_numbers']
matches = [n for n in answer if n in correct_numbers]          # picked and expected
shortages = [n for n in correct_numbers if n not in answer]    # expected but not picked

print("\n問題ナンバー:", Q_id)
print("正答:", correct_numbers)
print("回答:", sorted(answer))
print("一致:", sorted(matches))
print("不足:", sorted(shortages))
print(f"正答率: {(len(matches) / N) * 100:.02f}%")

In [None]:
# Batch simulation over questions Q001_ja .. Q018_ja.
# Requires `data`, `sounds`, `size` and `audio_matching` from earlier cells.
# Only questions that are NOT answered perfectly are reported.

print("Start\n")

# Use a named index: `_` is IPython's "last output" variable, and assigning
# to it in a loop clobbers that feature for the rest of the session.
for q_number in range(1, 19):
    Q_id = f"Q0{q_number:02}_ja"
    metadata = data[Q_id]
    N = metadata['length']  # how many clips to pick for this question
    fname = f"../questions_ja/{Q_id}.wav"
    # librosa.load returns a tuple; only element [0] (the samples) is kept.
    sound = librosa.load(fname, sr=size, mono=True)[0]

    result = audio_matching(sound, sounds)

    # Answer numbers of the N best-scoring clips (position in `sounds` + 1).
    answer = [int(i) + 1 for i in np.argsort(result)[::-1][:N]]

    correct_numbers = metadata['correct_numbers']
    matches = [n for n in answer if n in correct_numbers]

    rate = len(matches) / N
    if rate == 1:
        # Perfect score: nothing to report for this question.
        continue

    print("音声数:", N)
    print(f"正答率: {rate*100:.02f}%\n")

print("Done")

In [None]:
# # サンプリングレートシミュレーション

# print("Start\n")

# for sr in range(30000, 50000, 2000):
    
#     audio_dir = '../JKspeech/'
#     file_names = glob.glob(f'{audio_dir}/J*.wav')
#     sounds = [librosa.load(file_name, sr=sr, mono=True)[0] 
#           for file_name in file_names]
#     print("\nsr:", sr)
    
#     for _ in [5, 9]:
#         Q_id = f"Q0{_+1:02}_ja"
#         metadata = data[Q_id]
#         N = metadata['length']
#         fname = f"../questions_ja/{Q_id}.wav"
#         sound = librosa.load(fname, sr=sr, mono=True)[0]

#         result = audio_matching(sound, sounds)

#         answer = []
#         for i in np.argsort(result)[::-1][:N]:
#             answer.append(i+1)

#         correct_numbers = metadata['correct_numbers']
#         matches = [n for n in answer if n in correct_numbers]

#         rate = len(matches) / N
#         if rate == 1:
#             continue

#         print(f"{N} {rate*100:.02f}%")

# print("Done")