In [1]:
# Relative corpus identifiers, one per speech corpus (or per language subset
# of a corpus).  Every entry ends with a comma: adjacent string literals
# without one are silently concatenated by Python into a single bogus entry.
corpus_list = [
    'BibleTTS/akuapem-twi',
    'BibleTTS/asante-twi',
    'BibleTTS/ewe',
    'BibleTTS/hausa',
    'BibleTTS/lingala',
    'BibleTTS/yoruba',
    'Buckeye',
    'EUROM',
    'LibriSpeech',
    'LibriVox',
    'MediaSpeech/AR',
    'MediaSpeech/ES',
    'MediaSpeech/FR',
    'MediaSpeech/TR',
    'MozillaCommonVoice/ab',
    'MozillaCommonVoice/ar',
    'MozillaCommonVoice/ba',
    'MozillaCommonVoice/be',
    'MozillaCommonVoice/bg',
    'MozillaCommonVoice/bn',
    'MozillaCommonVoice/br',
    'MozillaCommonVoice/ca',
    'MozillaCommonVoice/ckb',
    'MozillaCommonVoice/cnh',
    'MozillaCommonVoice/cs',
    'MozillaCommonVoice/cv',
    'MozillaCommonVoice/cy',
    'MozillaCommonVoice/da',
    'MozillaCommonVoice/de',
    'MozillaCommonVoice/dv',
    'MozillaCommonVoice/el',
    'MozillaCommonVoice/en',
    'MozillaCommonVoice/eo',
    'MozillaCommonVoice/es',
    'MozillaCommonVoice/et',
    'MozillaCommonVoice/eu',
    'MozillaCommonVoice/fa',
    'MozillaCommonVoice/fi',
    'MozillaCommonVoice/fr',
    'MozillaCommonVoice/fy-NL',
    'MozillaCommonVoice/ga-IE',
    'MozillaCommonVoice/gl',
    'MozillaCommonVoice/gn',
    'MozillaCommonVoice/hi',
    'MozillaCommonVoice/hu',
    'MozillaCommonVoice/hy-AM',
    'MozillaCommonVoice/id',
    'MozillaCommonVoice/ig',
    'MozillaCommonVoice/it',
    'MozillaCommonVoice/ja',
    'MozillaCommonVoice/ka',
    'MozillaCommonVoice/kab',
    'MozillaCommonVoice/kk',
    'MozillaCommonVoice/kmr',
    'MozillaCommonVoice/ky',
    'MozillaCommonVoice/lg',
    'MozillaCommonVoice/lt',
    'MozillaCommonVoice/ltg',
    'MozillaCommonVoice/lv',
    'MozillaCommonVoice/mhr',
    'MozillaCommonVoice/ml',
    'MozillaCommonVoice/mn',
    'MozillaCommonVoice/mt',
    'MozillaCommonVoice/nan-tw',
    'MozillaCommonVoice/nl',
    'MozillaCommonVoice/oc',
    'MozillaCommonVoice/or',
    'MozillaCommonVoice/pl',
    'MozillaCommonVoice/pt',
    'MozillaCommonVoice/ro',
    'MozillaCommonVoice/ru',
    'MozillaCommonVoice/rw',
    'MozillaCommonVoice/sr',
    'MozillaCommonVoice/sv-SE',
    'MozillaCommonVoice/sw',
    'MozillaCommonVoice/ta',
    'MozillaCommonVoice/th',
    'MozillaCommonVoice/tr',
    'MozillaCommonVoice/tt',
    'MozillaCommonVoice/ug',
    'MozillaCommonVoice/uk',
    'MozillaCommonVoice/ur',
    'MozillaCommonVoice/uz',
    'MozillaCommonVoice/vi',
    'MozillaCommonVoice/yo',
    'MozillaCommonVoice/yue',
    'MozillaCommonVoice/zh-CN',
    'MozillaCommonVoice/zh-TW',
    'primewords_chinese',
    'room_reader',
    'SpeechClarity',
    'TAT-Vol2',
    'thchs30',
    'TIMIT',
    'TTS_Javanese',
    'zeroth_korean',
]

In [7]:
import scipy.io as sio
import pandas as pd
import numpy as np
import glob
import csv
import json


def _unwrap_cells(frame, passes=2):
    """Replace every list/ndarray cell of ``frame`` by its first element.

    The replacement is applied ``passes`` times (two by default), so a value
    nested two levels deep ends up as a plain scalar.  Cells that are not a
    list or ndarray are left untouched.  Returns a new DataFrame.
    """
    def first_item(cell):
        return cell[0] if isinstance(cell, (list, np.ndarray)) else cell

    for _ in range(passes):
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map; use whichever one the installed pandas provides.
        elementwise = frame.map if hasattr(pd.DataFrame, 'map') else frame.applymap
        frame = elementwise(first_item)
    return frame


def _load_params_table(corpus):
    """Load every parameter .mat file of ``corpus`` into one DataFrame.

    Raises ValueError when no parameter file matches the corpus.
    """
    pattern = 'STM_output/Survey/speech_params_' + corpus + '/*'
    # glob order depends on the filesystem; sort so the row order is reproducible.
    params_files = sorted(glob.glob(pattern))
    if not params_files:
        raise ValueError(
            'No parameter files found for corpus ' + repr(corpus)
            + ' (searched ' + repr(pattern) + ')'
        )

    frames = []
    for params_file in params_files:
        params = sio.loadmat(params_file)['Params']
        # One column per struct field; each column is the first row of that field.
        columns = {field: params[field][0] for field in params.dtype.names}
        frame = pd.DataFrame(columns).drop(columns=['x_axis', 'y_axis'])
        frames.append(_unwrap_cells(frame))

    return pd.concat(frames, ignore_index=True)


def _read_librispeech_speakers(path):
    """Parse a '|'-separated LibriSpeech speaker table into reader_id/gender columns.

    Blank lines and lines starting with ';' are skipped.  ``reader_id`` is
    normalised through ``int`` and stored as a string so it can be merged
    against IDs cut out of file names.
    """
    reader_ids = []
    genders = []
    with open(path, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line or line.startswith(';'):
                continue
            fields = line.split('|')
            reader_ids.append(str(int(fields[0].strip())))
            genders.append(fields[1].strip())
    return pd.DataFrame({'reader_id': reader_ids, 'gender': genders})


def make_meta_file(corpus):
    """Build the per-recording metadata table for one speech corpus.

    The parameter files under ``STM_output/Survey/speech_params_<corpus>/``
    are loaded and concatenated, then a ``speaker/player`` column and a
    ``gender`` column are added using corpus-specific rules (file-name
    slices, or look-ups in metadata files under ``data/speechCorp/``).

    Parameters
    ----------
    corpus : str
        Corpus identifier such as ``'MozillaCommonVoice/hi'`` or ``'TIMIT'``.
        The branch is chosen by substring match on this value.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the parameter files, without the ``x_axis`` and
        ``y_axis`` fields.  Corpora that match none of the branches below
        (e.g. ``'LibriVox'``) are returned without the ``speaker/player`` and
        ``gender`` columns.

    Raises
    ------
    ValueError
        If no parameter file exists for ``corpus``.

    Notes
    -----
    NOTE(review): the LibriSpeech and TTS_Javanese speaker IDs are not
    prefixed with the corpus name, unlike most other branches — confirm that
    this is intended before changing it.
    """
    df_all = _load_params_table(corpus)

    # add the speaker ID and gender info
    if 'MozillaCommonVoice' in corpus:
        valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)
        # regex=False: '.mp3' must be removed literally; as a regex the dot is a wildcard.
        valid_df['path'] = valid_df['path'].str.replace('.mp3', '', regex=False)
        valid_df = valid_df.rename(columns={'client_id': 'speaker/player'})
        df_all = df_all.merge(valid_df[['speaker/player', 'path', 'gender']], how='left', left_on='filename', right_on='path').drop(columns=['path'])
    elif 'BibleTTS' in corpus:
        df_all['speaker/player'] = 'BibleTTS_' + df_all['filename'].str[:3]
        df_all['gender'] = np.nan
    elif 'Buckeye' in corpus:
        # The speaker table is matched on the upper-case form of the 3-character file prefix.
        df_all['speaker/player'] = df_all['filename'].str[:3].str.replace('s', 'S', regex=False)
        df_gender = pd.read_csv('data/speechCorp/Buckeye/Buckeye_speaker_info.csv')
        df_all = df_all.merge(df_gender[['SPEAKER', "SPEAKER'S GENDER"]], how='left', left_on='speaker/player', right_on='SPEAKER').drop(columns=['SPEAKER'])
        df_all = df_all.rename(columns={"SPEAKER'S GENDER": 'gender'})
        df_all['speaker/player'] = 'Buckeye_'+df_all['speaker/player']
    elif 'EUROM' in corpus:
        df_all['speaker/player'] = 'EUROM_' + df_all['LangOrInstru'] +'_'+ df_all['filename'].str[:2]
        df_all['gender'] = np.nan
    elif 'MediaSpeech' in corpus:
        # The whole file name is used as the ID, i.e. one "speaker" per recording.
        df_all['speaker/player'] = 'MediaSpeech_' + df_all['LangOrInstru'] +'_'+ df_all['filename']
        df_all['gender'] = np.nan
    elif 'LibriSpeech' in corpus:
        # Reader ID is the part of the file name before the first '-'.
        df_all['speaker/player'] = df_all['filename'].str.split('-').str[0]
        df_LibriSpeech = _read_librispeech_speakers('data/speechCorp/LibriSpeech/SPEAKERS.TXT')
        df_all = df_all.merge(df_LibriSpeech[['reader_id', 'gender']], how='left', left_on='speaker/player', right_on='reader_id').drop(columns=['reader_id'])
    elif 'primewords_chinese' in corpus:
        with open('data/speechCorp/primewords_chinese/set1_transcript.json', 'r') as file:
            data = json.load(file)
        primewords_df = pd.DataFrame(data)
        # regex=False: strip the literal '.wav' extension only.
        primewords_df['file'] = primewords_df['file'].str.replace('.wav', '', regex=False)
        df_all = df_all.merge(primewords_df[['file', 'user_id']], how='left', left_on='filename', right_on='file').drop(columns=['file'])
        df_all = df_all.rename(columns={"user_id": 'speaker/player'})
        df_all['speaker/player'] = 'primewords_'+df_all['speaker/player']
        df_all['gender'] = np.nan
    elif 'room_reader' in corpus:
        # Participant ID is the second '_'-separated token of the file name.
        df_all['speaker/player'] = df_all['filename'].str.split('_').str[1]
        RR_df = pd.read_excel('data/speechCorp/room_reader/RoomReader_SessionsEvents.xlsx')
        # NOTE(review): if part_ID repeats in the spreadsheet this left merge duplicates rows — confirm it is unique.
        df_all = df_all.merge(RR_df[['part_ID', 'gender']], how='left', left_on='speaker/player', right_on='part_ID').drop(columns=['part_ID'])
        df_all['speaker/player'] = 'RoomReader_'+df_all['speaker/player']
    elif 'SpeechClarity' in corpus:
        df_all['speaker/player'] = 'SpeechClarity_'+df_all['filename'].str[:3]
        df_all['gender'] = np.nan
    elif 'TAT-Vol2' in corpus:
        df_all['speaker/player'] = 'TAT-Vol2_'+df_all['filename'].str[:10]
        # Gender is read from the sixth character of the file name.
        df_all['gender'] = df_all['filename'].str[5]
    elif 'thchs30' in corpus:
        # Speaker ID is the part of the file name before the first '_'.
        df_all['speaker/player'] = 'thchs30_'+df_all['filename'].str.split('_').str[0]
        df_all['gender'] = np.nan
    elif 'TIMIT' in corpus:
        # Speaker ID is the directory that directly contains the file.
        df_all['speaker/player'] = 'TIMIT_'+df_all['filepath'].str.split('/').str[-2]
        # Index 6 is the first character after the 'TIMIT_' prefix, i.e. the
        # first character of the speaker directory name.
        df_all['gender'] = df_all['speaker/player'].str[6]
    elif 'TTS_Javanese' in corpus:
        df_all['speaker/player'] = df_all['filename'].str[:9]
        # Gender is read from the third character of the file name.
        df_all['gender'] = df_all['filename'].str[2]
    elif 'zeroth' in corpus:
        df_all['speaker/player'] = df_all['filename'].str[:3]
        zeroth_df = pd.read_csv('data/speechCorp/zeroth_korean/AUDIO_INFO', sep="|")
        zeroth_df['SPEAKERID'] = zeroth_df['SPEAKERID'].astype(str)
        df_all = df_all.merge(zeroth_df[['SPEAKERID', 'SEX']], how='left', left_on='speaker/player', right_on='SPEAKERID').drop(columns=['SPEAKERID'])
        df_all = df_all.rename(columns={"SEX": 'gender'})
        df_all['speaker/player'] = 'zeroth_'+df_all['speaker/player']

    return df_all

In [16]:
# Build the metadata table for a single corpus (the Hindi Common Voice
# subset) and display it as the cell output.
corpus = 'MozillaCommonVoice/hi'
df_all = make_meta_file(corpus)
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,speaker/player,gender
0,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_26986130,Hindi,1,1,128000,32,4,0.903469,4,d24af5ee15190d3fc14dfc4ee3c75c75fe66fb537d4c7c...,male
1,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_26021896,Hindi,1,1,128000,32,4,0.666188,4,0f018a99663f33afbb7d38aee281fb1afcfd07f9e7acd0...,
2,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_24026258,Hindi,1,1,128000,32,4,0.668438,4,63293ff4cc67280b10da2df2fd2763db65ca8493af191e...,male
3,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_25170240,Hindi,1,1,128000,32,4,0.620031,4,c88824395561074e0eb0ebc1280d4866dbc29f01c02689...,
4,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_23849297,Hindi,1,1,192000,48,4,0.792937,4,46f23ae11e8565e979749342e1097a77fe3ef574253f10...,male
...,...,...,...,...,...,...,...,...,...,...,...,...
5433,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_27761274,Hindi,1,1,128000,32,4,0.164656,4,d5880664e0b201cf24b4972ea9890b801afe05af05cd43...,male
5434,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_25241159,Hindi,1,1,128000,32,4,0.803937,4,3255dc44d5426ad255e14c4db4d12b224dfffd1e594a40...,male
5435,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_27517919,Hindi,1,1,128000,32,4,0.212375,4,d5880664e0b201cf24b4972ea9890b801afe05af05cd43...,male
5436,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_32264979,Hindi,1,1,256000,32,8,0.392625,8,b0900acc53b0b578a071411f8ec0479ebf48be094bdcae...,


## test code ##

In [114]:
# Scratch cell: rebuild the raw parameter table for one corpus step by step,
# before any speaker/gender columns are attached.
corpus = 'zeroth_korean'
# glob order depends on the filesystem; sort so the row order is reproducible.
params_list = sorted(glob.glob('STM_output/Survey/speech_params_'+corpus+'/*'))
df_list = []

for params_file in params_list:
    data_dict = sio.loadmat(params_file)
    # One column per struct field; each column is the first row of that field.
    structure_dict = {field: data_dict['Params'][field][0] for field in data_dict['Params'].dtype.names}
    df = pd.DataFrame(structure_dict).drop(columns=['x_axis','y_axis'])
    # Replace each list/ndarray cell by its first element, twice, so values
    # nested two levels deep become plain scalars.
    for _level in range(2):
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map; use whichever one the installed pandas provides.
        elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
        df = elementwise(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)
    df_list.append(df)

df_all = pd.concat(df_list, ignore_index=True)
df_all.head()

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur
0,data/speechCorp/zeroth_korean/train_data_01/00...,201_003_1112.,Korean,1,1,128000,16,8,0.709688,8
1,data/speechCorp/zeroth_korean/train_data_01/00...,205_003_1133.,Korean,1,1,128000,16,8,0.597,8
2,data/speechCorp/zeroth_korean/train_data_01/00...,114_003_1078.,Korean,1,1,128000,16,8,0.585688,8
3,data/speechCorp/zeroth_korean/train_data_01/00...,183_003_1972.,Korean,1,1,64000,16,4,0.191188,4
4,data/speechCorp/zeroth_korean/train_data_01/00...,199_003_0088.,Korean,1,1,64000,16,4,0.287625,4


In [115]:
# Speaker ID = first three characters of the clip file name.
df_all['speaker/player'] = df_all['filename'].str.slice(stop=3)
df_all.head()

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,speaker/player
0,data/speechCorp/zeroth_korean/train_data_01/00...,201_003_1112.,Korean,1,1,128000,16,8,0.709688,8,201
1,data/speechCorp/zeroth_korean/train_data_01/00...,205_003_1133.,Korean,1,1,128000,16,8,0.597,8,205
2,data/speechCorp/zeroth_korean/train_data_01/00...,114_003_1078.,Korean,1,1,128000,16,8,0.585688,8,114
3,data/speechCorp/zeroth_korean/train_data_01/00...,183_003_1972.,Korean,1,1,64000,16,4,0.191188,4,183
4,data/speechCorp/zeroth_korean/train_data_01/00...,199_003_0088.,Korean,1,1,64000,16,4,0.287625,4,199


In [116]:
# Load the '|'-separated speaker table and cast SPEAKERID to str so it can be
# merged against the string IDs sliced out of the file names.
zeroth_df = pd.read_csv(
    'data/speechCorp/zeroth_korean/AUDIO_INFO',
    sep="|",
).astype({'SPEAKERID': str})
zeroth_df

Unnamed: 0,SPEAKERID,NAME,SEX,SCRIPTID,DATASET
0,104,BaekGeonJong,m,3,test_data_01
1,105,ChoSoYeon,f,3,test_data_01
2,106,KimJiWon,f,3,train_data_01
3,107,KimNamHyung,m,3,train_data_01
4,108,KimYeLim,f,3,train_data_01
...,...,...,...,...,...
110,214,RyuEunBee,f,3,train_data_01
111,215,ShinAYeon,f,3,train_data_01
112,216,SonSamuel,m,3,train_data_01
113,217,WonSeoWoo,f,3,train_data_01


In [117]:
# Attach each speaker's SEX as 'gender' via a left merge on the speaker ID,
# then prefix the ID with the corpus name.
# NOTE: this cell is not idempotent — running it twice merges and prefixes again.
df_all = (
    df_all
    .merge(zeroth_df[['SPEAKERID', 'SEX']], how='left', left_on='speaker/player', right_on='SPEAKERID')
    .drop(columns=['SPEAKERID'])
    .rename(columns={"SEX": 'gender'})
)
df_all['speaker/player'] = 'zeroth_' + df_all['speaker/player']
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,speaker/player,gender
0,data/speechCorp/zeroth_korean/train_data_01/00...,201_003_1112.,Korean,1,1,128000,16,8,0.709688,8,zeroth_201,f
1,data/speechCorp/zeroth_korean/train_data_01/00...,205_003_1133.,Korean,1,1,128000,16,8,0.597000,8,zeroth_205,f
2,data/speechCorp/zeroth_korean/train_data_01/00...,114_003_1078.,Korean,1,1,128000,16,8,0.585688,8,zeroth_114,m
3,data/speechCorp/zeroth_korean/train_data_01/00...,183_003_1972.,Korean,1,1,64000,16,4,0.191188,4,zeroth_183,f
4,data/speechCorp/zeroth_korean/train_data_01/00...,199_003_0088.,Korean,1,1,64000,16,4,0.287625,4,zeroth_199,f
...,...,...,...,...,...,...,...,...,...,...,...,...
22250,data/speechCorp/zeroth_korean/train_data_01/00...,189_003_1568.,Korean,1,1,64000,16,4,0.769563,4,zeroth_189,f
22251,data/speechCorp/zeroth_korean/train_data_01/00...,172_003_1519.,Korean,1,1,128000,16,8,0.480750,8,zeroth_172,m
22252,data/speechCorp/zeroth_korean/train_data_01/00...,114_003_1348.,Korean,1,1,128000,16,8,0.421938,4,zeroth_114,m
22253,data/speechCorp/zeroth_korean/train_data_01/00...,215_003_0183.,Korean,1,1,128000,16,8,0.553562,8,zeroth_215,f
