# Data pre-processing for phonological process error Binary Classification
**Author:** chhsiao<br>
**Date created:** 2022/07/21<br>
**Last modified:** 2022/09/20<br>

## Introduction
利用治療師的標記結果將資料做二分類處理，並且會將單音資料(syllable)重新命名為0和16兩類(e.g. 0=>某種phonological process, 16=>某種phonological process以外的音檔)<br>

注意：<br>
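此程式預設字卡為0327word，需要更改字卡請更改 **WORDCARD**（目前支援 `data_0327word` 與 `data_0820word`）<br>
使用前要更改的部份為：治療師統計好的標記結果(.csv)的路徑 **STATPATH**、要計算的構音錯誤 **TARGET**（填入名稱，例如 `"塞音化"`，其對應代碼為 1；目前支援 `"塞音化"` 與 `"舌根音化"`），以及收音期間 **DATE**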


In [1]:
import pandas as pd
import os
import shutil
import json

## SETUP

In [2]:
# Name of the phonological process treated as the positive class. It is used
# as a key into class_dict, and the selection cell further down only handles
# "塞音化" and "舌根音化".
TARGET = "塞音化"
# Word-card set; used as a directory component when the paths are built and
# to pick the matching syllable list ("data_0327word" or "data_0820word").
WORDCARD = "data_0327word"
# Recording period; used as a directory / file-name component in the paths.
# NOTE(review): looks like MMDD_MMDD (start_end) - confirm.
DATE = "0401_0811"


In [3]:
# CSV holding the therapists' tallied labels for this word-card set / period.
STATPATH = f'/D/TWCC/work/cgh_2022/jsonmv/stat/{WORDCARD}/{DATE}/compareST_{DATE}.csv'
DATASET = "binear_classification"


# Root of the generated dataset. Every directory constant here keeps its
# trailing slash because later cells extend them by plain concatenation.
DATASETPATH = f'/D/TWCC/work/cgh_2022/data/{WORDCARD}/use-three-ST-label-dataset_{DATE}/{DATASET}/'

SYLLABLEPATH = f'{DATASETPATH}syllable/'
WORDPATH = f'{DATASETPATH}word/'

# Directory of the original recordings; the labelling loop reads
# DATAPATH + <case> + '/' + <wordcard file>.
DATAPATH = f'/D/TWCC/data2022_ori/{WORDCARD}/{DATE}/'


In [4]:
# Label names in code order: the entry at list position i has numeric code i + 1.
class_name = [
    "塞音化",        # 1
    "母音化",        # 2
    "母音省略",      # 3
    "舌前音化",      # 4
    "舌根音化",      # 5
    "不送氣音化",    # 6
    "聲隨韻母",      # 7
    "邊音化",        # 8
    "齒間音",        # 9
    "子音省略",      # 10
    "擦音化",        # 11
    "介音省略",      # 12
    "塞擦音化",      # 13
    "複韻母省略",    # 14
    "其他",          # 15
    "正確",          # 16
    "雜訊無法辨識",  # 17
]

# Label name -> numeric code (1-based), derived from the ordered list above so
# the two structures cannot drift apart.
class_dict = {label: code for code, label in enumerate(class_name, start=1)}

In [5]:
# Output directory and counts-file stem for each supported target process.
BACKINGPATH = f'{DATASETPATH}Backing/'
save_Backing = f'class_number_{DATE}_Backing'

STOPINGPATH = f'{DATASETPATH}Stoping/'
save_Stoping = f'class_number_{DATE}_Stoping'


# Syllable recordings, per target and word-card set, that the labelling loop
# files under class 0 even when the label list contains 16 ("正確").
backing_list0327 = [
    "wordcard03_01_2.wav",
    "wordcard03_02_2.wav",
    "wordcard03_07_2.wav",
    "wordcard03_08_1.wav",
]
# The 0820 word-card set currently lists the same syllables as 0327.
backing_list0820 = list(backing_list0327)

stoping_list0327 = [
    "wordcard03_01_1.wav",
    "wordcard03_02_1.wav",
    "wordcard03_04_1.wav",
    "wordcard03_05_2.wav",
    "wordcard03_08_2.wav",
    "wordcard04_01_1.wav",
]
# The 0820 word-card set has the 0327 syllables plus two more.
stoping_list0820 = stoping_list0327 + [
    "wordcard05_05_2.wav",
    "wordcard05_08_1.wav",
]


# Per-class file counters, one dict per (target, age group); the labelling
# loop increments them and the last cells dump them to JSON.
classBackingChildNumber = dict.fromkeys(("非舌根音化", "舌根音化"), 0)
classBackingAdultNumber = dict.fromkeys(("非舌根音化", "舌根音化"), 0)

classStopingChildNumber = dict.fromkeys(("非塞音化", "塞音化"), 0)
classStopingAdultNumber = dict.fromkeys(("非塞音化", "塞音化"), 0)


In [6]:
# Everything that depends on TARGET, keyed by the TARGET name:
# (adult counter, child counter, output directory, counts-file stem,
#  {WORDCARD -> syllable list}).
_target_settings = {
    "舌根音化": (
        classBackingAdultNumber,
        classBackingChildNumber,
        BACKINGPATH,
        save_Backing,
        {"data_0327word": backing_list0327, "data_0820word": backing_list0820},
    ),
    "塞音化": (
        classStopingAdultNumber,
        classStopingChildNumber,
        STOPINGPATH,
        save_Stoping,
        {"data_0327word": stoping_list0327, "data_0820word": stoping_list0820},
    ),
}

# Fail here, with a clear message, instead of leaving the names below
# undefined (or left over from an earlier run of this kernel) and failing, or
# silently mislabelling files, in a later cell.
if TARGET not in _target_settings:
    raise ValueError(
        f"Unsupported TARGET {TARGET!r}; expected one of {list(_target_settings)}")

(classAdultNumber, classChildNumber, ERRORPATH, SAVEERRORNAME,
 _error_lists_by_wordcard) = _target_settings[TARGET]

if WORDCARD not in _error_lists_by_wordcard:
    raise ValueError(
        f"Unsupported WORDCARD {WORDCARD!r} for TARGET {TARGET!r}; "
        f"expected one of {list(_error_lists_by_wordcard)}")
error_list = _error_lists_by_wordcard[WORDCARD]

# Create the output directory for the selected target if it is missing.
os.makedirs(ERRORPATH, exist_ok=True)


In [7]:
# Display the numeric label code that class_dict assigns to the selected TARGET.
class_dict[TARGET]


1

In [8]:
from ast import literal_eval


def _parse_labels(cell):
    """Turn one CSV cell such as ``"[15, 6, 10]"`` into a Python list.

    Args:
        cell: A single value read from the label CSV.

    Returns:
        The result of ``literal_eval`` for a non-empty string; any other
        value (e.g. the NaN pandas produces for an empty field) is returned
        unchanged.
    """
    if isinstance(cell, str) and cell:
        return literal_eval(cell)  # convert to list
    return cell


# Rows are word-card file names, columns are cases (first CSV column is the index).
df_stat_raw = pd.read_csv(STATPATH, index_col=0)

# Parse column by column into a new frame. The previous element-wise
# ``df_stat[case][wordcard] = ...`` was a chained assignment, which pandas
# does not guarantee to write back to the frame (it never does under
# Copy-on-Write).
df_stat = df_stat_raw.apply(lambda column: column.map(_parse_labels))

In [9]:
# Preview the first rows of the label table (rows: word-card files, columns: cases).
df_stat.head(20)

Unnamed: 0,2022.04.01.13.12.28_477493581,2022.04.06.10.08.07_56554239,2022.04.06.14.09.10_42091033,2022.04.06.15.34.12_36988952,2022.04.07.09.35.40_27898863_test,2022.04.07.13.45.02_42860780,2022.04.08.08.44.36_111234296_test,2022.04.08.09.42.57_114736422_test,2022.04.08.10.00.15_115774292_test,2022.04.08.10.28.39_117478380_test,...,2022.08.11.13.47.46_1955476989_adult,2022.08.11.14.03.38_1954525545_adult,2022.08.11.15.06.19_1950764235_adult,2022.08.11.15.24.53_1949650045_adult,2022.08.11.15.35.06_1949037459_adult,2022.08.11.15.44.07_1948496007_adult,2022.08.11.15.59.36_1947567322_adult,2022.08.11.17.11.20_1943263647_adult,2022.08.11.17.19.36_1942766623_adult,2022.08.11.17.27.37_1942285355_adult
wordcard03_01.wav,[16],[16],[4],[16],[16],[16],[4],[16],[6],[16],...,[5],[16],[5],[5],[5],[5],[5],[17],[5],[5]
wordcard03_01_1.wav,[16],[16],[16],[16],[16],[16],[16],[5],[16],[16],...,[5],[5],[5],[5],[5],[5],[5],[17],[5],[5]
wordcard03_01_2.wav,[16],[11],[4],[16],[16],[16],[4],[16],"[15, 6, 10]",[16],...,[16],[16],[16],[16],[16],[16],[16],[17],[16],[16]
wordcard03_02.wav,[16],[16],[4],[16],[16],[16],[4],[16],[16],[16],...,[5],[5],[5],[5],[5],[5],[5],[5],[16],[16]
wordcard03_02_1.wav,[16],[16],[16],[16],[16],[16],[16],[16],[16],[16],...,[5],[5],[5],[5],[5],[5],[5],[17],[5],[5]
wordcard03_02_2.wav,[16],[16],[4],[16],[16],[16],[4],[16],[16],[16],...,[16],[16],[16],[16],[16],[16],[16],[17],[16],[16]
wordcard03_03.wav,[5],[16],,[16],[16],[16],[5],"[16, 5, 1]",[16],[16],...,[5],[5],[5],[5],[5],[5],[5],[17],[5],[5]
wordcard03_03_1.wav,[5],[16],,[16],[5],[16],[5],"[16, 5, 1]",[16],[16],...,[5],[5],[5],[5],[5],[5],[5],[17],[5],[5]
wordcard03_03_2.wav,[16],[16],,[16],[16],[16],[16],[16],[16],[16],...,[5],[16],[5],[5],[5],[16],[5],[17],[16],[16]
wordcard03_04.wav,[16],[16],,[16],[16],"[5, 10, 16]",[16],[5],[16],[16],...,[5],[5],[5],[5],[5],[5],[5],[17],[5],[5]


In [10]:
def isWord(wordcard):
    """Return True when ``wordcard`` names a word-level recording.

    A name counts as word-level when it has exactly one underscore, e.g.
    ``wordcard03_01.wav``; syllable files such as ``wordcard03_01_1.wav``
    carry one more ``_`` separated part and give False.
    """
    return len(wordcard.split('_')) == 2

def isAdult(case):
    """Return True when the last ``_`` separated part of ``case`` is ``adult``.

    Example: ``2022.05.15.19.58.12_946516734_adult`` -> True.
    """
    _, _, suffix = case.rpartition('_')
    return suffix == 'adult'


def classPlusOne(ISADULT, TARGET):
    """Add one to the count stored under ``TARGET`` for one age group.

    Args:
        ISADULT: Truthy to update the module-level ``classAdultNumber``
            dict, falsy to update ``classChildNumber``.
        TARGET: Key of the counter entry to increment; it must already exist
            in the chosen dict.
    """
    tally = classAdultNumber if ISADULT else classChildNumber
    tally[TARGET] += 1


def copy(src, destination):
    """Copy the file ``src`` to ``destination``, creating its directory if needed.

    Args:
        src: Path of the file to copy.
        destination: Full path (including file name) of the copy.

    Returns:
        None. When ``src`` does not exist, a "not found" message is printed
        and nothing is created or copied.
    """
    if not os.path.exists(src):
        print(f"{src} not found")
        return

    # Take the directory part directly from the path. Searching the path for
    # the file's base name (as was done before) picks the wrong prefix when
    # that name also occurs earlier in the path, e.g. ".../clip/clip.wav".
    parent_dir = os.path.dirname(destination)
    if parent_dir:  # empty for a bare file name: nothing to create
        os.makedirs(parent_dir, exist_ok=True)
    shutil.copyfile(src, destination)

def _binary_label(labels, wordcard):
    """Decide the binary class of one syllable recording.

    Args:
        labels: List of numeric label codes given to the recording.
        wordcard: File name of the recording (the row label in ``df_stat``).

    Returns:
        ``None`` when the recording is left out, otherwise a tuple
        ``(class_prefix, counter_key, log_copy)``: the class folder / file
        prefix (``'0'`` or ``'16'``), the counter key to increment, and
        whether the copy is printed.
    """
    if 17 in labels:
        # 17 = "雜訊無法辨識" (noise / unrecognisable): left out.
        return None
    if 16 in labels:
        # 16 = "正確" (correct). Syllables listed in error_list still go to
        # class 0. NOTE(review): presumably because the correctly produced
        # syllable itself contains the target sound - confirm.
        if wordcard in error_list:
            return '0', TARGET, True
        return '16', "非" + TARGET, True
    if class_dict[TARGET] in labels:
        return '0', TARGET, True
    if 15 not in labels:
        # Any other error goes to class 16; 15 = "其他" (other) is excluded.
        # These copies were never printed, which is kept as is.
        return '16', "非" + TARGET, False
    return None


# Start from zero so that re-running this cell does not add to the counts
# left by a previous run.
for _counter in (classAdultNumber, classChildNumber):
    for _key in _counter:
        _counter[_key] = 0

for case in df_stat.columns.tolist():
    # Per-case values, computed once instead of once per row.
    # e.g. an adult case: 2022.05.15.19.58.12_946516734_adult
    ISADULT = isAdult(case)
    group_dir = ERRORPATH + ('adult/' if ISADULT else 'child/')
    case_labels = df_stat[case]

    for wordcard in df_stat.index.tolist():
        ISWORD = isWord(wordcard)
        if ISWORD:
            # Binary classification ignores word-level recordings.
            continue

        labels = case_labels[wordcard]
        if not labels or labels != labels:
            # Empty value, or NaN (the only value that differs from itself).
            continue

        decision = _binary_label(labels, wordcard)
        if decision is None:
            continue
        class_prefix, counter_key, log_copy = decision

        case_number = case.split('_')[1]  # 2022.04.01.13.12.28_477493581 -> 477493581
        # e.g. DATAPATH + '2022.06.16.09.33.28_1780799170_test/wordcard05_07_1.wav'
        oldFilePath = DATAPATH + case + '/' + wordcard
        oldFileName = os.path.splitext(oldFilePath)[0].split('/')[-1]  # 'wordcard05_07_1'
        newFileName = (class_prefix + "_" + str(case_number) + '_'
                       + oldFileName.split('wordcard')[1] + '.wav')

        # NOTE(review): the count is increased even when the source file is
        # missing and copy() only prints "not found" - confirm that is intended.
        classPlusOne(ISADULT, counter_key)

        tmpNewPath = group_dir + class_prefix + '/'
        os.makedirs(tmpNewPath, exist_ok=True)
        newFilePath = os.path.join(tmpNewPath, newFileName)
        if log_copy:
            print(f"{oldFilePath} -> {newFilePath}")
        copy(oldFilePath, newFilePath)



/D/TWCC/data2022_ori/data_0327word/0401_0811/2022.04.01.13.12.28_477493581/wordcard03_01_1.wav -> /D/TWCC/work/cgh_2022/data/data_0327word/use-three-ST-label-dataset_0401_0811/binear_classification/Stoping/child/0/0_477493581_03_01_1.wav
/D/TWCC/data2022_ori/data_0327word/0401_0811/2022.04.01.13.12.28_477493581/wordcard03_01_2.wav -> /D/TWCC/work/cgh_2022/data/data_0327word/use-three-ST-label-dataset_0401_0811/binear_classification/Stoping/child/16/16_477493581_03_01_2.wav
/D/TWCC/data2022_ori/data_0327word/0401_0811/2022.04.01.13.12.28_477493581/wordcard03_02_1.wav -> /D/TWCC/work/cgh_2022/data/data_0327word/use-three-ST-label-dataset_0401_0811/binear_classification/Stoping/child/0/0_477493581_03_02_1.wav
/D/TWCC/data2022_ori/data_0327word/0401_0811/2022.04.01.13.12.28_477493581/wordcard03_02_2.wav -> /D/TWCC/work/cgh_2022/data/data_0327word/use-three-ST-label-dataset_0401_0811/binear_classification/Stoping/child/16/16_477493581_03_02_2.wav
/D/TWCC/data2022_ori/data_0327word/0401_0811

In [11]:
# Display the per-class counts tallied for child recordings.
classChildNumber


{'非塞音化': 1082, '塞音化': 200}

In [12]:
# Display the per-class counts tallied for adult recordings.
classAdultNumber


{'非塞音化': 1219, '塞音化': 102}

In [13]:
# Write the child per-class counts as UTF-8 JSON into the target's output
# directory; ensure_ascii=False keeps the Chinese keys readable in the file.
child_counts_path = ERRORPATH+SAVEERRORNAME+'child.json'
with open(child_counts_path, 'w', encoding='utf-8') as f:
    json.dump(classChildNumber, f, ensure_ascii=False, indent=4)
# The with-block has already closed the file; no explicit close() is needed.
print("Saving"+child_counts_path)


Saving/D/TWCC/work/cgh_2022/data/data_0327word/use-three-ST-label-dataset_0401_0811/binear_classification/Stoping/class_number_0401_0811_Stopingchild.json


In [14]:
# Write the adult per-class counts as UTF-8 JSON into the target's output
# directory; ensure_ascii=False keeps the Chinese keys readable in the file.
adult_counts_path = ERRORPATH+SAVEERRORNAME+'adult.json'
with open(adult_counts_path, 'w', encoding='utf-8') as f:
    json.dump(classAdultNumber, f, ensure_ascii=False, indent=4)
# The with-block has already closed the file; no explicit close() is needed.
print("Saving"+adult_counts_path)


Saving/D/TWCC/work/cgh_2022/data/data_0327word/use-three-ST-label-dataset_0401_0811/binear_classification/Stoping/class_number_0401_0811_Stopingadult.json
