In [18]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from timeit import default_timer as timer
from torch.nn import Transformer
from torch import Tensor
from sklearn.model_selection import train_test_split
import tqdm
import librosa
import seaborn as sns
import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np
import math
import os
import pandas as pd
import matplotlib.pyplot as plt
import textgrid

import jiwer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from transformers import AutoProcessor, AutoModelForCTC
from phonemizer.backend.espeak.wrapper import EspeakWrapper
import soundfile as sf

_ESPEAK_LIBRARY = r"C:\Program Files\eSpeak NG\libespeak-ng.dll"
EspeakWrapper.set_library(_ESPEAK_LIBRARY)
processor_P = AutoProcessor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
model_P = AutoModelForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")




Some weights of the model checkpoint at facebook/wav2vec2-lv-60-espeak-cv-ft were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-lv-60-espeak-cv-ft and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably

In [20]:
def get_pathset(paths):
    return [os.path.join(dir, each_file) for dir, mid, files in os.walk(paths) for each_file in files if each_file.endswith(".wav")]

def CTC_index(processor,outind):
    meaningful_ids = []
    meaningful_indices = []
    previous_id = -1  
    blank_token_id = processor.tokenizer.pad_token_id  
    for i, token_id in enumerate(outind[0]):  
        if token_id != previous_id and token_id != blank_token_id:
            meaningful_ids.append(token_id.item())  
            meaningful_indices.append(i)  
        previous_id = token_id
    
    return meaningful_indices

def get_set_diphone(paths,model,processor):
    out_dict={}
    for each_sentence in tqdm.tqdm(paths):
        tg = textgrid.TextGrid.fromFile(each_sentence[:-3]+"TextGrid")
        tg_sentence = [i for i in tg[0] if i.mark!=""]
        tg_word = [i for i in tg[1] if i.mark!="" and i.mark!="sp"]

        sentence16_end_time=tg_sentence[15].maxTime
        tg_sentence = [i for i in tg_sentence if i.maxTime<=sentence16_end_time]
        tg_word = [i for i in tg_word if i.maxTime<=sentence16_end_time]
        
        wave, sr = librosa.load(each_sentence)
        wave_res = librosa.resample(wave, orig_sr=sr, target_sr=16000)
        wave_res = wave_res[:int(sentence16_end_time*16000)]
        input=processor(wave_res,sampling_rate=16000, return_tensors="pt").input_values
        input=input.to(device)
        model.to(device)
        with torch.no_grad():
            out_encoder1=model(input).logits
        outind=torch.argmax(out_encoder1,dim=-1).cpu().numpy()
        transcription = processor.batch_decode(outind)[0].split(" ")
        phonemeindex = CTC_index(processor,outind)
        out_FE=model.wav2vec2.feature_extractor(input)[0].transpose(1,0).cpu().detach().numpy()
        for i in range(len(transcription)-1):
            key = transcription[i] + transcription[i + 1]
            if key not in out_dict:
                out_dict[key] = []
            out_dict[key].append(np.vstack((out_FE[phonemeindex[i]], out_FE[phonemeindex[i + 1]])))
    return out_dict

ALL_ENG_ENG_path=r"..\data\raw_L1"
ALL_ENG_ENG_pathset=get_pathset(ALL_ENG_ENG_path)
ALL_ENG_ENG_dict = get_set_diphone(ALL_ENG_ENG_pathset, model_P, processor_P)

## Get sentence and keyword

In [24]:
human_result_path=r"..\data\test.xlsx"
human_result = pd.read_excel(human_result_path)

In [25]:
human_result_1a=human_result[human_result["Experiment"]=="1a"]
human_result_1a_set21=human_result_1a[human_result_1a["TrainingTestSet"]=="set2,set1"]

human_result_1a_set12=human_result_1a[human_result_1a["TrainingTestSet"]=="set1,set2"]
set(human_result_1a_set21["Sentence"])

{'A boy fell from the window.',
 'Big dogs can be dangerous.',
 'He grew lots of vegetables.',
 'She argues with her sister.',
 "She's drinking from her own cup.",
 'Somebody stole the money.',
 'The bananas are too ripe.',
 'The car is going too fast.',
 'The family likes fish.',
 'The fire was very hot.',
 'The kitchen window was clean.',
 'The paint dripped on the ground.',
 'The picture came from a book.',
 'The player lost a shoe.',
 'The shoes were very dirty.',
 'The wife helped her husband.'}

In [72]:
sentenceID_1a=set([int(i[-2:])-1 for i in list(set(human_result_1a_set21["SentenceID"]))])
sentenceID_1a

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16}

In [69]:
list(human_result_1a_set21[human_result_1a_set21["SentenceID"]=='HT1_S001']["Keywords"])[0].split(",")

['boy', ' fell', ' window']

In [78]:
sorted(list(set(human_result_1a_set21["SentenceID"])))

['HT1_S001',
 'HT1_S002',
 'HT1_S003',
 'HT1_S004',
 'HT1_S005',
 'HT1_S006',
 'HT1_S007',
 'HT1_S008',
 'HT1_S009',
 'HT1_S010',
 'HT1_S011',
 'HT1_S013',
 'HT1_S014',
 'HT1_S015',
 'HT1_S016',
 'HT1_S017']

In [87]:
list(human_result_1a_set21[human_result_1a_set21["SentenceID"]=="HT1_S001"]["Keywords"])[0].split(",")

['boy', ' fell', ' window']

In [96]:
set21_keywords_list={str(int(each_ID[-2:])-1):list(human_result_1a_set21[human_result_1a_set21["SentenceID"]==each_ID]["Keywords"])[0].split(",") for each_ID in sorted(list(set(human_result_1a_set21["SentenceID"])))}
set21_keywords_list={key:[value.strip() for value in values] for key,values in set21_keywords_list.items()}

set12_keywords_list={str(int(each_ID[-2:])-1):list(human_result_1a_set12[human_result_1a_set12["SentenceID"]==each_ID]["Keywords"])[0].split(",") for each_ID in sorted(list(set(human_result_1a_set12["SentenceID"])))}
set12_keywords_list={key:[value.strip() for value in values] for key,values in set12_keywords_list.items()}

In [91]:
set12_keywords_list["17"][-1].strip()

'raincoat'

In [97]:
list(set12_keywords_list.items())

[('17', ['hung', 'raincoat']),
 ('18', ['mailman', 'brought', 'letter']),
 ('19', ['mother', 'heard', 'baby']),
 ('20', ['found', 'purse', 'trash']),
 ('21', ['table', 'has', 'three', 'legs']),
 ('22', ['children', 'waved', 'train']),
 ('24', ['girl', 'fixing', 'dress']),
 ('25', ['time', 'go', 'bed']),
 ('26', ['mother', 'read', 'instructions']),
 ('27', ['dog', 'eating', 'some', 'meat']),
 ('28', ['father', 'forgot', 'bread']),
 ('29', ['road', 'goes', 'hill']),
 ('30', ['painter', 'uses', 'brush']),
 ('31', ['family', 'bought', 'house']),
 ('37', ['had', 'two', 'empty', 'bottles']),
 ('40', ['house', 'had', 'nine', 'bedrooms'])]

In [29]:
example=r"..\data\raw_L1\ALL_ENG_ENG_HT1\ALL_133_M_ENG_ENG_HT1.wav"
tg = textgrid.TextGrid.fromFile(example[:-3]+"TextGrid")
tg_sentence = [i for i in tg[0] if i.mark!=""]
tg_word = [i for i in tg[1] if i.mark!="" and i.mark!="sp"]

[tg_sentence[each] for each in set12_keywords_list.keys()]

[Interval(32.049, 33.518, HE HUNG UP HIS RAINCOAT),
 Interval(33.964, 35.505, THE MAILMAN BROUGHT A LETTER),
 Interval(35.798, 37.308, THE MOTHER HEARD THE BABY),
 Interval(37.479, 39.429, SHE FOUND HER PURSE IN THE TRASH),
 Interval(39.477, 41.456, THE TABLE HAS THREE LEGS),
 Interval(41.565, 43.304, THE CHILDREN WAVED AT THE TRAIN),
 Interval(45.1, 47.02, THE GIRL IS FIXING HER DRESS),
 Interval(47.081, 48.5, IT'S TIME TO GO TO BED),
 Interval(48.711, 50.62, MOTHER READ THE INSTRUCTIONS),
 Interval(50.762, 52.392, THE DOG IS EATING SOME MEAT),
 Interval(52.751, 54.251, FATHER FORGOT THE BREAD),
 Interval(54.487, 55.926, THE ROAD GOES UP A HILL),
 Interval(56.081, 57.692, THE PAINTER USES A BRUSH),
 Interval(57.865, 59.405, THE FAMILY BOUGHT A HOUSE),
 Interval(68.926, 70.835, THEY HAD TWO EMPTY BOTTLES),
 Interval(74.492, 76.442, THE HOUSE HAD NINE BEDROOMS)]

In [99]:
ALL_CMN_ENG_HT1_pathset_exposure_specific[0]

'..\\data\\raw\\ALL_CMN_ENG_HT1\\ALL_035_M_CMN_ENG_HT1.wav'

In [4]:
for _, (key,value) in enumerate({1:1,2:2}.items()):
    print(_,key,value)

0 1 1
1 2 2


In [60]:
alist={1:[1,2],2:[2,3,4]}
del alist[1][0]
alist

{1: [2], 2: [2, 3, 4]}

In [99]:
def get_test_set_dict(path, keywords_list, model, processor):
    tg = textgrid.TextGrid.fromFile(path[:-3]+"TextGrid")
    tg_sentence = [i for i in tg[0] if i.mark!=""]
    tg_word = [i for i in tg[1] if i.mark!="" and i.mark!="sp"]

    tg_sentences=[tg_sentence[int(each)] for each in keywords_list.keys()]
    
    tg_word_dict={}
    for each_word in tg_word:
        for each_sentence in tg_sentences:
            if each_sentence.mark not in tg_word_dict.keys():
                tg_word_dict[each_sentence.mark]=[]
            if each_sentence.minTime <= each_word.minTime and each_sentence.maxTime >= each_word.maxTime:
                tg_word_dict[each_sentence.mark].append(each_word)
    
    out_dict={}
    for _,(each_sentence, words_tg) in enumerate(tg_word_dict.items()):
        if each_sentence not in out_dict.keys():
            out_dict[each_sentence]=[]
        for __,each_tg in enumerate(words_tg):
            if each_tg.mark.lower() in list(keywords_list.items())[_][-1]:
                out_dict[each_sentence].append(each_tg)
    return out_dict

tg_word_dict=get_test_set_dict(ALL_CMN_ENG_HT1_pathset_exposure_specific[0], set12_keywords_list, model, processor)
tg_word_dict

{'HE HUNG UP HIS RAINCOAT': [Interval(33.096, 33.335, HUNG),
  Interval(34.475, 35.105, RAINCOAT)],
 'THE MAILMAN BROUGHT A LETTER': [Interval(35.428, 35.967, MAILMAN),
  Interval(35.967, 36.288, BROUGHT),
  Interval(36.358, 36.867, LETTER)],
 'THE MOTHER HEARD THE BABY': [Interval(37.178, 37.677, MOTHER),
  Interval(37.857, 38.177, HEARD),
  Interval(38.287, 38.707, BABY)],
 'SHE FOUND HER PURSE IN THE TRASH': [Interval(39.157, 39.467, FOUND),
  Interval(39.546, 40.116, PURSE),
  Interval(40.727, 41.357, TRASH)],
 'THE TABLE HAS THREE LEGS': [Interval(41.655, 42.104, TABLE),
  Interval(42.104, 42.324, HAS),
  Interval(42.364, 42.714, THREE),
  Interval(42.714, 43.214, LEGS)],
 'THE CHILDREN WAVED AT THE TRAIN': [Interval(43.559, 43.888, CHILDREN),
  Interval(43.888, 44.229, WAVED),
  Interval(44.668, 44.989, TRAIN)],
 'THE GIRL IS FIXING HER DRESS': [Interval(47.605, 48.014, GIRL),
  Interval(48.184, 48.594, FIXING),
  Interval(48.755, 49.314, DRESS)],
 "IT'S TIME TO GO TO BED": [Inte

In [104]:
wave, sr = librosa.load(ALL_CMN_ENG_HT1_pathset_exposure_specific[0])
wave_res = librosa.resample(wave, orig_sr=sr, target_sr=16000)
input=processor(wave_res, sampling_rate=16000, return_tensors="pt").input_values
len(input[0])

1787230

In [122]:
english_phonemes = ['<pad>', '<s>', '</s>', '<unk>', 'p', 'b', 't', 'd', 'k', 'g', 'f', 'v', 'θ', 'ð', 's', 'z', 'ʃ', 'ʒ', 
                    'h', 'm', 'n', 'ŋ', 'l', 'ɹ', 'w', 'j', 'tʃ', 'dʒ', 
                    'i', 'ɪ', 'eɪ', 'ɛ', 'æ', 'ɑ', 'ʌ', 'ɔ', 'oʊ', 'ʊ', 'u', 
                    'ɜː', 'ə', 'aɪ', 'aʊ', 'ɔɪ']
english_phoneme_dict = {k: v for k, v in processor_P.tokenizer.get_vocab().items() if k in english_phonemes}
english_phoneme_dict.values()

dict_values([1, 0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 17, 18, 21, 22, 23, 24, 25, 26, 27, 29, 32, 33, 34, 36, 37, 38, 39, 40, 42, 44, 49, 52, 53, 59, 60, 63, 65, 66, 100])

In [118]:
processor_P.tokenizer.get_vocab()

{'<s>': 1,
 '<pad>': 0,
 '</s>': 2,
 '<unk>': 3,
 'n': 4,
 's': 5,
 't': 6,
 'ə': 7,
 'l': 8,
 'a': 9,
 'i': 10,
 'k': 11,
 'd': 12,
 'm': 13,
 'ɛ': 14,
 'ɾ': 15,
 'e': 16,
 'ɪ': 17,
 'p': 18,
 'o': 19,
 'ɐ': 20,
 'z': 21,
 'ð': 22,
 'f': 23,
 'j': 24,
 'v': 25,
 'b': 26,
 'ɹ': 27,
 'ʁ': 28,
 'ʊ': 29,
 'iː': 30,
 'r': 31,
 'w': 32,
 'ʌ': 33,
 'u': 34,
 'ɡ': 35,
 'æ': 36,
 'aɪ': 37,
 'ʃ': 38,
 'h': 39,
 'ɔ': 40,
 'ɑː': 41,
 'ŋ': 42,
 'ɚ': 43,
 'eɪ': 44,
 'β': 45,
 'uː': 46,
 'y': 47,
 'ɑ̃': 48,
 'oʊ': 49,
 'ᵻ': 50,
 'eː': 51,
 'θ': 52,
 'aʊ': 53,
 'ts': 54,
 'oː': 55,
 'ɔ̃': 56,
 'ɣ': 57,
 'ɜ': 58,
 'ɑ': 59,
 'dʒ': 60,
 'əl': 61,
 'x': 62,
 'ɜː': 63,
 'ç': 64,
 'ʒ': 65,
 'tʃ': 66,
 'ɔː': 67,
 'ɑːɹ': 68,
 'ɛ̃': 69,
 'ʎ': 70,
 'ɔːɹ': 71,
 'ʋ': 72,
 'aː': 73,
 'ɕ': 74,
 'œ': 75,
 'ø': 76,
 'oːɹ': 77,
 'ɲ': 78,
 'yː': 79,
 'ʔ': 80,
 'iə': 81,
 'i5': 82,
 's.': 83,
 'tɕ': 84,
 '??': 85,
 'nʲ': 86,
 'ɛː': 87,
 'œ̃': 88,
 'ɭ': 89,
 'ɔø': 90,
 'ʑ': 91,
 'tʲ': 92,
 'ɨ': 93,
 'ɛɹ': 94,
 'ts.': 95

In [107]:
tg_word_dict.values()[0]

TypeError: 'dict_values' object is not subscriptable

In [129]:
mask = np.ones(out_encoder.shape[-1], dtype=bool)
mask[list(english_phoneme_dict.values())] = False
out_encoder[:, :, mask]=0
outind=torch.argmax(out_encoder,dim=-1).cpu().numpy()
phonemeindex = CTC_index(processor,outind)
phonemeindex
#transcription = processor_P.batch_decode(outind)[0].split(" ")
#transcription

[2]

In [116]:
cut=input[:,round(list(tg_word_dict.values())[0][0].minTime*16000):round(list(tg_word_dict.values())[0][0].maxTime*16000)]
with torch.no_grad():
    out_encoder=model_P(cut.to(device)).logits
outind=torch.argmax(out_encoder,dim=-1).cpu().numpy()
transcription = processor_P.batch_decode(outind)[0].split(" ")
transcription

['x', 'iɑ5']

In [None]:
tg_word_dict
def get_test_set_diphone(path, keywords_list, model, processor):
    wave, sr = librosa.load(path)
    wave_res = librosa.resample(wave, orig_sr=sr, target_sr=16000)
    input=processor(wave_res, sampling_rate=16000, return_tensors="pt").input_values.to(device)
    
    return 

In [22]:
def get_test_set_diphone(path, keywords_list, model, processor):
    tg = textgrid.TextGrid.fromFile(path[:-3]+"TextGrid")
    tg_sentence = [i for i in tg[0] if i.mark!=""]
    tg_word = [i for i in tg[1] if i.mark!="" and i.mark!="sp"]

    tg_sentences=[tg_sentence[each] for each in keywords_list.keys()]
    
    tg_word_dict={}
    for each_word in tg_word:
        for each_sentence in tg_sentences:
            if each_sentence.mark not in tg_word_dict.keys():
                tg_word_dict[each_sentence.mark]=[]
            if each_sentence.minTime <= each_word.minTime and each_sentence.maxTime >= each_word.maxTime:
                tg_word_dict[each_sentence.mark].append(each_word)
                
                
                
                
    wave, sr = librosa.load(path)
    wave_res = librosa.resample(wave, orig_sr=sr, target_sr=16000)

    input=processor(wave_res, sampling_rate=16000, return_tensors="pt").input_values.to(device)
    model.to(device)
    with torch.no_grad():
        out_FE=model.wav2vec2.feature_extractor(input)[0].transpose(1,0).cpu().numpy()
        out_encoder=model(input).logits
    #outind=torch.argmax(out_encoder,dim=-1).cpu().numpy()
    
    sentence_dict={}
    for each_sentence in tg_sentence:
        if each_sentence.mark not in sentence_dict.keys():
            sentence_dict[each_sentence.mark] = []
        
        sentence_start=round(each_sentence.minTime/sentence16_end_time*out_FE.shape[0])+1
        sentence_end=round(each_sentence.maxTime/sentence16_end_time*out_FE.shape[0])+1
        
        outind=torch.argmax(out_encoder[:,sentence_start:sentence_end,:],dim=-1).cpu().numpy()
        transcription = processor.batch_decode(outind)[0].split(" ")
        phonemeindex = CTC_index(processor,outind)
        each_FE = out_FE[sentence_start:sentence_end,:]
        for i in range(len(transcription)-1):
            key = transcription[i] + transcription[i + 1]
            sentence_dict[each_sentence.mark].append((key,np.vstack((each_FE[phonemeindex[i]], each_FE[phonemeindex[i + 1]]))))

    return sentence_dict

ALL_ENG_ENG_HT1_path=r"..\data\raw\ALL_ENG_ENG_HT1"
ALL_ENG_ENG_HT1_pathset=get_pathset(ALL_ENG_ENG_HT1_path)
ALL_ENG_ENG_HT2_path=r"..\data\raw\ALL_ENG_ENG_HT2"
ALL_ENG_ENG_HT2_pathset=get_pathset(ALL_ENG_ENG_HT2_path)

ALL_CMN_ENG_HT1_path=r"..\data\raw\ALL_CMN_ENG_HT1"
ALL_CMN_ENG_HT1_pathset=get_pathset(ALL_CMN_ENG_HT1_path)
ALL_CMN_ENG_HT2_path=r"..\data\raw\ALL_CMN_ENG_HT2"
ALL_CMN_ENG_HT2_pathset=get_pathset(ALL_CMN_ENG_HT2_path)


ALL_ENG_ENG_HT1_pathset_exposure_control=['..\\data\\raw\\ALL_ENG_ENG_HT1\\ALL_055_M_ENG_ENG_HT1.wav','..\\data\\raw\\ALL_ENG_ENG_HT1\\ALL_066_M_ENG_ENG_HT1.wav','..\\data\\raw\\ALL_ENG_ENG_HT1\\ALL_070_M_ENG_ENG_HT1.wav','..\\data\\raw\\ALL_ENG_ENG_HT1\\ALL_131_M_ENG_ENG_HT1.wav','..\\data\\raw\\ALL_ENG_ENG_HT1\\ALL_133_M_ENG_ENG_HT1.wav']
ALL_ENG_ENG_HT2_pathset_exposure_control=['..\\data\\raw\\ALL_ENG_ENG_HT2\\ALL_055_M_ENG_ENG_HT2.wav','..\\data\\raw\\ALL_ENG_ENG_HT2\\ALL_066_M_ENG_ENG_HT2.wav','..\\data\\raw\\ALL_ENG_ENG_HT2\\ALL_070_M_ENG_ENG_HT2.wav','..\\data\\raw\\ALL_ENG_ENG_HT2\\ALL_131_M_ENG_ENG_HT2.wav','..\\data\\raw\\ALL_ENG_ENG_HT2\\ALL_133_M_ENG_ENG_HT2.wav']

ALL_CMN_ENG_HT1_pathset_exposure_multi=ALL_CMN_ENG_HT1_pathset[-5:]
ALL_CMN_ENG_HT2_pathset_exposure_multi=ALL_CMN_ENG_HT2_pathset[-5:]

ALL_CMN_ENG_HT1_pathset_exposure_specific=ALL_CMN_ENG_HT1_pathset[-4:]
ALL_CMN_ENG_HT2_pathset_exposure_specific=ALL_CMN_ENG_HT2_pathset[-4:]

ALL_CMN_ENG_HT1_pathset_exposure_single=ALL_CMN_ENG_HT1_pathset[-5:]
ALL_CMN_ENG_HT2_pathset_exposure_single=ALL_CMN_ENG_HT2_pathset[-5:]




ALL_ENG_ENG_HT1_pathset_exposure_control_diphone=get_set_diphone(ALL_ENG_ENG_HT1_pathset_exposure_control,model_P,processor_P)
ALL_ENG_ENG_HT2_pathset_exposure_control_diphone=get_set_diphone(ALL_ENG_ENG_HT2_pathset_exposure_control,model_P,processor_P)

ALL_035_M_CMN_ENG_HT1_test_specific_diphone=get_test_set_diphone(ALL_CMN_ENG_HT1_pathset_exposure_specific[0],model_P,processor_P)
ALL_035_M_CMN_ENG_HT2_test_specific_diphone=get_test_set_diphone(ALL_CMN_ENG_HT2_pathset_exposure_specific[0],model_P,processor_P)

ALL_037_M_CMN_ENG_HT1_test_specific_diphone=get_test_set_diphone(ALL_CMN_ENG_HT1_pathset_exposure_specific[1],model_P,processor_P)
ALL_037_M_CMN_ENG_HT2_test_specific_diphone=get_test_set_diphone(ALL_CMN_ENG_HT2_pathset_exposure_specific[1],model_P,processor_P)

ALL_039_M_CMN_ENG_HT1_test_specific_diphone=get_test_set_diphone(ALL_CMN_ENG_HT1_pathset_exposure_specific[2],model_P,processor_P)
ALL_039_M_CMN_ENG_HT2_test_specific_diphone=get_test_set_diphone(ALL_CMN_ENG_HT2_pathset_exposure_specific[2],model_P,processor_P)

ALL_043_M_CMN_ENG_HT1_test_specific_diphone=get_test_set_diphone(ALL_CMN_ENG_HT1_pathset_exposure_specific[3],model_P,processor_P)
ALL_043_M_CMN_ENG_HT2_test_specific_diphone=get_test_set_diphone(ALL_CMN_ENG_HT2_pathset_exposure_specific[3],model_P,processor_P)

100%|██████████| 5/5 [00:01<00:00,  3.49it/s]
100%|██████████| 5/5 [00:01<00:00,  3.73it/s]
