In [1]:
# Corpora to process. Each entry is the path suffix that is appended to
# 'STM_output/Survey/speech_params_' (parameter files) and, for some corpora,
# to 'data/speechCorp/' (raw data and speaker metadata).
# Every entry must be followed by a comma: two adjacent string literals are
# silently concatenated by Python into a single (wrong) corpus name.
corpus_list = [
    'BibleTTS/akuapem-twi',
    'BibleTTS/asante-twi',
    'BibleTTS/ewe',
    'BibleTTS/hausa',
    'BibleTTS/lingala',
    'BibleTTS/yoruba',
    'Buckeye',
    'EUROM',
    'LibriSpeech',
    'LibriVox',
    'MediaSpeech/AR',
    'MediaSpeech/ES',
    'MediaSpeech/FR',
    'MediaSpeech/TR',
    'MozillaCommonVoice/ab',
    'MozillaCommonVoice/ar',
    'MozillaCommonVoice/ba',
    'MozillaCommonVoice/be',
    'MozillaCommonVoice/bg',
    'MozillaCommonVoice/bn',
    'MozillaCommonVoice/br',
    'MozillaCommonVoice/ca',
    'MozillaCommonVoice/ckb',
    'MozillaCommonVoice/cnh',
    'MozillaCommonVoice/cs',
    'MozillaCommonVoice/cv',
    'MozillaCommonVoice/cy',
    'MozillaCommonVoice/da',
    'MozillaCommonVoice/de',
    'MozillaCommonVoice/dv',
    'MozillaCommonVoice/el',
    'MozillaCommonVoice/en',
    'MozillaCommonVoice/eo',
    'MozillaCommonVoice/es',
    'MozillaCommonVoice/et',
    'MozillaCommonVoice/eu',
    'MozillaCommonVoice/fa',
    'MozillaCommonVoice/fi',
    'MozillaCommonVoice/fr',
    'MozillaCommonVoice/fy-NL',
    'MozillaCommonVoice/ga-IE',
    'MozillaCommonVoice/gl',
    'MozillaCommonVoice/gn',
    'MozillaCommonVoice/hi',
    'MozillaCommonVoice/hu',
    'MozillaCommonVoice/hy-AM',
    'MozillaCommonVoice/id',
    'MozillaCommonVoice/ig',
    'MozillaCommonVoice/it',
    'MozillaCommonVoice/ja',
    'MozillaCommonVoice/ka',
    'MozillaCommonVoice/kab',
    'MozillaCommonVoice/kk',
    'MozillaCommonVoice/kmr',
    'MozillaCommonVoice/ky',
    'MozillaCommonVoice/lg',
    'MozillaCommonVoice/lt',
    'MozillaCommonVoice/ltg',
    'MozillaCommonVoice/lv',
    'MozillaCommonVoice/mhr',
    'MozillaCommonVoice/ml',
    'MozillaCommonVoice/mn',
    'MozillaCommonVoice/mt',
    'MozillaCommonVoice/nan-tw',
    'MozillaCommonVoice/nl',
    'MozillaCommonVoice/oc',
    'MozillaCommonVoice/or',
    'MozillaCommonVoice/pl',
    'MozillaCommonVoice/pt',
    'MozillaCommonVoice/ro',
    'MozillaCommonVoice/ru',
    'MozillaCommonVoice/rw',
    'MozillaCommonVoice/sr',
    'MozillaCommonVoice/sv-SE',
    'MozillaCommonVoice/sw',
    'MozillaCommonVoice/ta',
    'MozillaCommonVoice/th',
    'MozillaCommonVoice/tr',
    'MozillaCommonVoice/tt',
    'MozillaCommonVoice/ug',
    'MozillaCommonVoice/uk',
    'MozillaCommonVoice/ur',
    'MozillaCommonVoice/uz',
    'MozillaCommonVoice/vi',
    'MozillaCommonVoice/yo',
    'MozillaCommonVoice/yue',
    'MozillaCommonVoice/zh-CN',
    'MozillaCommonVoice/zh-TW',
    'primewords_chinese',
    'room_reader',
    'SpeechClarity',
    'TAT-Vol2',
    'thchs30',
    'TIMIT',
    'TTS_Javanese',
    'zeroth_korean',
]

In [7]:
import scipy.io as sio
import pandas as pd
import numpy as np
import glob
import csv

def _unwrap_mat_cell(value):
    """Peel up to two levels of list/array nesting off a single table cell.

    The fields read from the .mat struct arrive wrapped in arrays (the
    original code applied a one-level unwrap twice); this does both levels
    in one pass. An empty container becomes NaN instead of raising
    IndexError.
    """
    for _level in range(2):
        if not isinstance(value, (list, np.ndarray)):
            break
        if np.size(value) == 0:
            return np.nan
        value = value[0]
    return value


def _load_param_frames(corpus):
    """Read every parameter .mat file of `corpus` into one DataFrame.

    Raises:
        ValueError: if no file matches the corpus' parameter directory.
    """
    pattern = 'STM_output/Survey/speech_params_' + corpus + '/*'
    # glob returns files in arbitrary order; sort so the row order is reproducible.
    params_list = sorted(glob.glob(pattern))
    if not params_list:
        raise ValueError(
            "No parameter files found for corpus '" + corpus + "' (pattern: " + pattern + ")"
        )

    frames = []
    for params_file in params_list:
        params = sio.loadmat(params_file)['Params']
        # One column per struct field.
        columns = {field: params[field][0] for field in params.dtype.names}
        frame = pd.DataFrame(columns).drop(columns=['x_axis', 'y_axis'])
        # Series.map is used column by column so this works across pandas
        # versions (DataFrame.applymap is deprecated in recent releases).
        frame = frame.apply(lambda column: column.map(_unwrap_mat_cell))
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)


def _read_librispeech_speakers(path='data/speechCorp/LibriSpeech/SPEAKERS.TXT'):
    """Parse the '|'-separated LibriSpeech speaker table into reader_id/gender columns.

    Lines that are blank or start with ';' are skipped wherever they occur.
    Only the first two fields are read, because only they are used by the
    caller; the remaining fields are ignored.
    """
    reader_ids = []
    genders = []
    with open(path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith(';'):
                continue
            fields = line.split('|')
            # int() validates the id; it is stored as a string because it is
            # merged against ids extracted from filenames.
            reader_ids.append(str(int(fields[0].strip())))
            genders.append(fields[1].strip())
    return pd.DataFrame({'reader_id': reader_ids, 'gender': genders})


def make_meta_file(corpus):
    """Build the per-file metadata table for one corpus.

    Loads all parameter files under
    'STM_output/Survey/speech_params_<corpus>/', drops the 'x_axis' and
    'y_axis' fields, and adds two columns:

    * 'speaker/player' - a speaker identifier derived per corpus family
      (from validated.tsv, the filename, and/or 'LangOrInstru').
    * 'gender' - taken from the corpus' speaker metadata where this function
      reads one (MozillaCommonVoice, Buckeye, LibriSpeech), NaN otherwise.
      Values are kept exactly as they appear in the source files.

    Corpora without a dedicated branch get both columns filled with NaN so
    that every returned frame has the same schema.

    Args:
        corpus: corpus path suffix, e.g. 'MozillaCommonVoice/hi'.

    Returns:
        pandas.DataFrame with one row per entry in the parameter files.

    Raises:
        ValueError: if no parameter files exist for `corpus`.
    """
    df_all = _load_param_frames(corpus)

    # add the speaker ID and gender info
    if 'MozillaCommonVoice' in corpus:
        valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)
        # regex=False: '.mp3' must be matched literally; with regex matching
        # (the default in older pandas) '.' would match any character.
        valid_df['path'] = valid_df['path'].str.replace('.mp3', '', regex=False)
        valid_df = valid_df.rename(columns={'client_id': 'speaker/player'})
        df_all = df_all.merge(
            valid_df[['speaker/player', 'path', 'gender']],
            how='left', left_on='filename', right_on='path',
        ).drop(columns=['path'])
    elif 'BibleTTS' in corpus:
        df_all['speaker/player'] = 'BibleTTS_' + df_all['filename'].str[:3]
        df_all['gender'] = np.nan
    elif 'Buckeye' in corpus:
        # First three filename characters with 's' replaced by 'S', used as
        # the key into the SPEAKER column of the speaker-info table.
        df_all['speaker/player'] = df_all['filename'].str[:3].str.replace('s', 'S')
        df_gender = pd.read_csv('data/speechCorp/Buckeye/Buckeye_speaker_info.csv')
        df_all = df_all.merge(
            df_gender[['SPEAKER', "SPEAKER'S GENDER"]],
            how='left', left_on='speaker/player', right_on='SPEAKER',
        ).drop(columns=['SPEAKER'])
        df_all = df_all.rename(columns={"SPEAKER'S GENDER": 'gender'})
        # Prefix only after the merge, which needs the bare speaker code.
        df_all['speaker/player'] = 'Buckeye_'+df_all['speaker/player']
    elif 'EUROM' in corpus:
        df_all['speaker/player'] = 'EUROM_' + df_all['LangOrInstru'] +'_'+ df_all['filename'].str[:2]
        df_all['gender'] = np.nan
    elif 'MediaSpeech' in corpus:
        # The whole filename is used, so every file gets its own id.
        df_all['speaker/player'] = 'MediaSpeech_' + df_all['LangOrInstru'] +'_'+ df_all['filename']
        df_all['gender'] = np.nan
    elif 'LibriSpeech' in corpus:
        # The reader id is the part of the filename before the first '-'.
        df_all['speaker/player'] = df_all['filename'].str.split('-').str[0]
        df_speakers = _read_librispeech_speakers()
        df_all = df_all.merge(
            df_speakers[['reader_id', 'gender']],
            how='left', left_on='speaker/player', right_on='reader_id',
        ).drop(columns=['reader_id'])
    else:
        # No speaker metadata handling for this corpus yet: still return the
        # same columns as the other branches.
        df_all['speaker/player'] = np.nan
        df_all['gender'] = np.nan

    return df_all

In [16]:
# Try make_meta_file on one corpus and display the resulting table.
# This corpus name contains 'MozillaCommonVoice', so the function merges
# validated.tsv to obtain 'speaker/player' (client_id) and 'gender'.
corpus = 'MozillaCommonVoice/hi'
df_all = make_meta_file(corpus)
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,speaker/player,gender
0,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_26986130,Hindi,1,1,128000,32,4,0.903469,4,d24af5ee15190d3fc14dfc4ee3c75c75fe66fb537d4c7c...,male
1,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_26021896,Hindi,1,1,128000,32,4,0.666188,4,0f018a99663f33afbb7d38aee281fb1afcfd07f9e7acd0...,
2,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_24026258,Hindi,1,1,128000,32,4,0.668438,4,63293ff4cc67280b10da2df2fd2763db65ca8493af191e...,male
3,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_25170240,Hindi,1,1,128000,32,4,0.620031,4,c88824395561074e0eb0ebc1280d4866dbc29f01c02689...,
4,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_23849297,Hindi,1,1,192000,48,4,0.792937,4,46f23ae11e8565e979749342e1097a77fe3ef574253f10...,male
...,...,...,...,...,...,...,...,...,...,...,...,...
5433,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_27761274,Hindi,1,1,128000,32,4,0.164656,4,d5880664e0b201cf24b4972ea9890b801afe05af05cd43...,male
5434,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_25241159,Hindi,1,1,128000,32,4,0.803937,4,3255dc44d5426ad255e14c4db4d12b224dfffd1e594a40...,male
5435,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_27517919,Hindi,1,1,128000,32,4,0.212375,4,d5880664e0b201cf24b4972ea9890b801afe05af05cd43...,male
5436,data/speechCorp/MozillaCommonVoice/hi/clips/co...,common_voice_hi_32264979,Hindi,1,1,256000,32,8,0.392625,8,b0900acc53b0b578a071411f8ec0479ebf48be094bdcae...,


## Test code: scratch cells prototyping the MediaSpeech speaker-ID step ##

In [61]:
# Scratch version of the loading step: read every parameter file of one
# corpus and stack them into a single frame.
corpus = 'MediaSpeech/AR'
params_list = glob.glob('STM_output/Survey/speech_params_' + corpus + '/*')
df_list = []

for params_file in params_list:
    params = sio.loadmat(params_file)['Params']
    # One column per struct field, then discard the two axis fields.
    columns = {name: params[name][0] for name in params.dtype.names}
    frame = pd.DataFrame(columns).drop(columns=['x_axis', 'y_axis'])
    # Values are nested two levels deep, so the one-level unwrap runs twice.
    for unwrap_pass in range(2):
        frame = frame.applymap(
            lambda cell: cell[0] if isinstance(cell, (list, np.ndarray)) else cell
        )
    df_list.append(frame)

df_all = pd.concat(df_list, ignore_index=True)
df_all.head()

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur
0,data/speechCorp/MediaSpeech/AR/4f6373ba-a4d2-4...,4f6373ba-a4d2-427e-9d39-d4026396c296.,Arabic,1,1,192000,16,12,0.5,12
1,data/speechCorp/MediaSpeech/AR/f52a382e-362b-4...,f52a382e-362b-4275-bb64-57c13a35cd24.,Arabic,1,1,192000,16,12,0.539813,12
2,data/speechCorp/MediaSpeech/AR/807c9560-d22b-4...,807c9560-d22b-4f47-9637-cf471edd69c0.,Arabic,1,1,192000,16,12,0.5,12
3,data/speechCorp/MediaSpeech/AR/63edf638-9147-4...,63edf638-9147-4f76-890c-87db14ab3d77.,Arabic,1,1,192000,16,12,0.52025,12
4,data/speechCorp/MediaSpeech/AR/c6104cc5-431c-4...,c6104cc5-431c-4d45-b240-505d5d98d6a2.,Arabic,1,1,192000,16,12,0.502375,12


In [64]:
# Build the speaker id as 'MediaSpeech_<LangOrInstru>_<filename>'; because the
# full filename is part of the id, each file ends up with its own id.
# NOTE(review): the displayed 'filename' values end with a trailing '.', which
# the ids inherit - confirm whether that is intended.
df_all['speaker/player'] = 'MediaSpeech_' + df_all['LangOrInstru'] +'_'+ df_all['filename']
# Gender is left missing for every row of this corpus.
df_all['gender'] = np.nan

In [65]:
# Display the frame to check the newly added 'speaker/player' and 'gender' columns.
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,speaker/player,gender
0,data/speechCorp/MediaSpeech/AR/4f6373ba-a4d2-4...,4f6373ba-a4d2-427e-9d39-d4026396c296.,Arabic,1,1,192000,16,12,0.500000,12,MediaSpeech_Arabic_4f6373ba-a4d2-427e-9d39-d40...,
1,data/speechCorp/MediaSpeech/AR/f52a382e-362b-4...,f52a382e-362b-4275-bb64-57c13a35cd24.,Arabic,1,1,192000,16,12,0.539813,12,MediaSpeech_Arabic_f52a382e-362b-4275-bb64-57c...,
2,data/speechCorp/MediaSpeech/AR/807c9560-d22b-4...,807c9560-d22b-4f47-9637-cf471edd69c0.,Arabic,1,1,192000,16,12,0.500000,12,MediaSpeech_Arabic_807c9560-d22b-4f47-9637-cf4...,
3,data/speechCorp/MediaSpeech/AR/63edf638-9147-4...,63edf638-9147-4f76-890c-87db14ab3d77.,Arabic,1,1,192000,16,12,0.520250,12,MediaSpeech_Arabic_63edf638-9147-4f76-890c-87d...,
4,data/speechCorp/MediaSpeech/AR/c6104cc5-431c-4...,c6104cc5-431c-4d45-b240-505d5d98d6a2.,Arabic,1,1,192000,16,12,0.502375,12,MediaSpeech_Arabic_c6104cc5-431c-4d45-b240-505...,
...,...,...,...,...,...,...,...,...,...,...,...,...
2500,data/speechCorp/MediaSpeech/AR/0b480782-05dc-4...,0b480782-05dc-467b-af27-bdaaa7ad2928.,Arabic,1,1,192000,16,12,0.500000,12,MediaSpeech_Arabic_0b480782-05dc-467b-af27-bda...,
2501,data/speechCorp/MediaSpeech/AR/7c22a447-19d5-4...,7c22a447-19d5-456c-9ce4-943510b162f2.,Arabic,1,1,192000,16,12,0.641875,12,MediaSpeech_Arabic_7c22a447-19d5-456c-9ce4-943...,
2502,data/speechCorp/MediaSpeech/AR/8528d5a9-78d7-4...,8528d5a9-78d7-4080-acc8-a6c0ef207f39.,Arabic,1,1,192000,16,12,0.502375,12,MediaSpeech_Arabic_8528d5a9-78d7-4080-acc8-a6c...,
2503,data/speechCorp/MediaSpeech/AR/488d444e-2e6c-4...,488d444e-2e6c-4fe0-bd59-c38ac76d4584.,Arabic,1,1,192000,16,12,0.588125,12,MediaSpeech_Arabic_488d444e-2e6c-4fe0-bd59-c38...,
