In [1]:
import scipy.io as sio
import pandas as pd
import numpy as np
import glob
import csv
import json
import os

In [2]:
corpus_speech_list = ['BibleTTS/akuapem-twi',
    'BibleTTS/asante-twi',
    'BibleTTS/ewe',
    'BibleTTS/hausa',
    'BibleTTS/lingala',
    'BibleTTS/yoruba',
    'Buckeye',
    'EUROM',
    'LibriSpeech',
    # 'LibriVox',
    'MediaSpeech/AR',
    'MediaSpeech/ES',
    'MediaSpeech/FR',
    'MediaSpeech/TR',
    'MozillaCommonVoice/ab',
    'MozillaCommonVoice/ar',
    'MozillaCommonVoice/ba',
    'MozillaCommonVoice/be',
    'MozillaCommonVoice/bg',
    'MozillaCommonVoice/bn',
    'MozillaCommonVoice/br',
    'MozillaCommonVoice/ca',
    'MozillaCommonVoice/ckb',
    'MozillaCommonVoice/cnh',
    'MozillaCommonVoice/cs',
    'MozillaCommonVoice/cv',
    'MozillaCommonVoice/cy',
    'MozillaCommonVoice/da',
    'MozillaCommonVoice/de',
    'MozillaCommonVoice/dv',
    'MozillaCommonVoice/el',
    'MozillaCommonVoice/en',
    'MozillaCommonVoice/eo',
    'MozillaCommonVoice/es',
    'MozillaCommonVoice/et',
    'MozillaCommonVoice/eu',
    'MozillaCommonVoice/fa',
    'MozillaCommonVoice/fi',
    'MozillaCommonVoice/fr',
    'MozillaCommonVoice/fy-NL',
    'MozillaCommonVoice/ga-IE',
    'MozillaCommonVoice/gl',
    'MozillaCommonVoice/gn',
    'MozillaCommonVoice/hi',
    'MozillaCommonVoice/hu',
    'MozillaCommonVoice/hy-AM',
    'MozillaCommonVoice/id',
    'MozillaCommonVoice/ig',
    'MozillaCommonVoice/it',
    'MozillaCommonVoice/ja',
    'MozillaCommonVoice/ka',
    'MozillaCommonVoice/kab',
    'MozillaCommonVoice/kk',
    'MozillaCommonVoice/kmr',
    'MozillaCommonVoice/ky',
    'MozillaCommonVoice/lg',
    'MozillaCommonVoice/lt',
    'MozillaCommonVoice/ltg',
    'MozillaCommonVoice/lv',
    'MozillaCommonVoice/mhr',
    'MozillaCommonVoice/ml',
    'MozillaCommonVoice/mn',
    'MozillaCommonVoice/mt',
    'MozillaCommonVoice/nan-tw',
    'MozillaCommonVoice/nl',
    'MozillaCommonVoice/oc',
    'MozillaCommonVoice/or',
    'MozillaCommonVoice/pl',
    'MozillaCommonVoice/pt',
    'MozillaCommonVoice/ro',
    'MozillaCommonVoice/ru',
    'MozillaCommonVoice/rw',
    'MozillaCommonVoice/sr',
    'MozillaCommonVoice/sv-SE',
    'MozillaCommonVoice/sw',
    'MozillaCommonVoice/ta',
    'MozillaCommonVoice/th',
    'MozillaCommonVoice/tr',
    'MozillaCommonVoice/tt',
    'MozillaCommonVoice/ug',
    'MozillaCommonVoice/uk',
    'MozillaCommonVoice/ur',
    'MozillaCommonVoice/uz',
    'MozillaCommonVoice/vi',
    'MozillaCommonVoice/yo',
    'MozillaCommonVoice/yue',
    'MozillaCommonVoice/zh-CN',
    'MozillaCommonVoice/zh-TW',
    'primewords_chinese',
    'room_reader',
    'SpeechClarity',
    'TAT-Vol2',
    'thchs30',
    'TIMIT',
    'TTS_Javanese',
    'zeroth_korean'
]

corpus_music_list = [
    'IRMAS',
    'Albouy2020Science',
    'CD',
    'GarlandEncyclopedia',
    'fma_large'
]

In [3]:
def make_meta_file(corpus, corpus_type):
    
    params_list = glob.glob('STM_output/Survey/'+corpus_type+'_params_'+corpus+'/*')
    df_list = []

    # load the data from the mat file
    for params_file in params_list:
        # print(params_file)
        data_dict = sio.loadmat(params_file)
        structure_dict = {field: data_dict['Params'][field][0] for field in data_dict['Params'].dtype.names}
        df = pd.DataFrame(structure_dict)
        df.drop(columns=['x_axis','y_axis'], inplace=True)
        df = df.applymap(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)
        df = df.applymap(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)
        df['corpus'] = corpus
        df['mat_filename'] = params_file.replace('/Survey/','/MATs/').replace('_params_','_mat_wl4_').replace('_Params.mat', '_MS2024.mat')
        df_list.append(df)
    
    df_all = pd.concat(df_list, ignore_index=True)

    # add the speaker ID and gender info
    if 'MozillaCommonVoice' in corpus:
        valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)
        valid_df['path'] = valid_df['path'].str.replace('.mp3', '')
        valid_df.rename(columns={'client_id':'speaker/artist'}, inplace=True)
        df_all = df_all.merge(valid_df[['speaker/artist', 'path', 'gender']], how='left', left_on='filename', right_on='path').drop(columns=['path'])
    elif 'BibleTTS' in corpus:
        df_all['speaker/artist'] = 'BibleTTS_' + df_all['filename'].str[:3]
        df_all['gender'] = np.nan
    elif 'Buckeye' in corpus:
        df_all['speaker/artist'] = df_all['filename'].str[:3].str.replace('s', 'S')
        df_gender = pd.read_csv('data/speechCorp/Buckeye/Buckeye_speaker_info.csv')
        df_all = df_all.merge(df_gender[['SPEAKER', "SPEAKER'S GENDER"]], how='left', left_on='speaker/artist', right_on='SPEAKER').drop(columns=['SPEAKER'])
        df_all.rename(columns={"SPEAKER'S GENDER":'gender'}, inplace=True)
        df_all['speaker/artist'] = 'Buckeye_'+df_all['speaker/artist']
    elif 'EUROM' in corpus:
        df_all['speaker/artist'] = 'EUROM_' + df_all['LangOrInstru'] +'_'+ df_all['filename'].str[:2]
        df_all['gender'] = np.nan
    elif 'MediaSpeech' in corpus:
        df_all['speaker/artist'] = 'MediaSpeech_' + df_all['LangOrInstru'] +'_'+ df_all['filename']
        df_all['gender'] = np.nan
    elif 'LibriSpeech' in corpus:
        split_names = df_all['filename'].str.split('-') # Split the 'name' column by "-"
        first_parts = split_names.str[0] # Extract the first part of the split result
        df_all['speaker/artist'] = first_parts # Add the extracted part as a new column in the DataFrame
        
        ## load LibriSpeech text file
        reader_ids = []
        genders = []
        subsets = []
        durations = []
        names = []
        # Open the file and read line by line
        with open('data/speechCorp/LibriSpeech/SPEAKERS.TXT', 'r') as file:
            # Skip lines starting with ";" (comments) until reaching the data
            while True:
                line = file.readline()
                if not line.startswith(';'):
                    # Start processing data from this line
                    break
            
            # Read the rest of the lines and parse them
            while line:
                # Split each line by "|" character
                data = line.strip().split('|')
                # Extract relevant information
                reader_ids.append(int(data[0].strip()))
                genders.append(data[1].strip())
                subsets.append(data[2].strip())
                durations.append(float(data[3].strip()))
                names.append(data[4].strip())
                # Read the next line
                line = file.readline()
        
        # Create a DataFrame using the lists
        df_LibriSpeech = pd.DataFrame({'reader_id': reader_ids, 'gender': genders, 'subset': subsets, 'duration': durations, 'name': names})
        df_LibriSpeech['reader_id']= df_LibriSpeech['reader_id'].astype(str)
        df_all = df_all.merge(df_LibriSpeech[['reader_id', 'gender']], how='left', left_on='speaker/artist', right_on='reader_id').drop(columns=['reader_id'])
    elif 'primewords_chinese' in corpus:
        with open('data/speechCorp/primewords_chinese/set1_transcript.json', 'r') as file:
            data = json.load(file)
        primewords_df = pd.DataFrame(data)
        primewords_df['file'] = primewords_df['file'].str.replace('.wav', '')
        df_all = df_all.merge(primewords_df[['file', 'user_id']], how='left', left_on='filename', right_on='file').drop(columns=['file'])
        df_all.rename(columns={"user_id":'speaker/artist'}, inplace=True)
        df_all['speaker/artist'] = 'primewords_'+df_all['speaker/artist']
        df_all['gender']=np.nan
    elif 'room_reader' in corpus:
        split_names = df_all['filename'].str.split('_') # Split the 'name' column by "-"
        df_all['speaker/artist'] = split_names.str[1] # Extract the first part of the split result
        RR_df = pd.read_excel('data/speechCorp/room_reader/RoomReader_SessionsEvents.xlsx')
        df_all = df_all.merge(RR_df[['part_ID', 'gender']], how='left', left_on='speaker/artist', right_on='part_ID').drop(columns=['part_ID'])
        df_all['speaker/artist'] = 'RoomReader_'+df_all['speaker/artist']
    elif 'SpeechClarity' in corpus:
        df_all['speaker/artist'] = 'SpeechClarity_'+df_all['filename'].str[:3]
        df_all['gender'] = np.nan
    elif 'TAT-Vol2' in corpus:
        df_all['speaker/artist'] = 'TAT-Vol2_'+df_all['filename'].str[:10]
        df_all['gender'] = df_all['filename'].str[5]
    elif 'thchs30' in corpus:
        split_names = df_all['filename'].str.split('_') # Split the 'name' column by "-"
        df_all['speaker/artist'] = 'thchs30_'+split_names.str[0] # Extract the first part of the split result
        df_all['gender'] = np.nan
    elif 'TIMIT' in corpus:
        split_names = df_all['filepath'].str.split('/') # Split the 'name' column by "-"
        df_all['speaker/artist'] = 'TIMIT_'+split_names.str[-2] # Extract the first part of the split result
        df_all['gender'] = df_all['speaker/artist'].str[6]
    elif 'TTS_Javanese' in corpus:
        df_all['speaker/artist'] = df_all['filename'].str[:9] 
        df_all['gender'] = df_all['filename'].str[2]
    elif 'zeroth' in corpus:
        df_all['speaker/artist'] = df_all['filename'].str[:3] 
        zeroth_df = pd.read_csv('data/speechCorp/zeroth_korean/AUDIO_INFO', sep="|")
        zeroth_df['SPEAKERID'] = zeroth_df['SPEAKERID'].astype(str)
        df_all = df_all.merge(zeroth_df[['SPEAKERID', 'SEX']], how='left', left_on='speaker/artist', right_on='SPEAKERID').drop(columns=['SPEAKERID'])
        df_all.rename(columns={"SEX":'gender'}, inplace=True)
        df_all['speaker/artist'] = 'zeroth_'+df_all['speaker/artist']

    # Music corpora
    elif 'IRMAS' in corpus:
        for nRow in range(len(df_all)):
            txt_path = df_all['filepath'][nRow].replace('.wav', '.txt')
            if os.path.exists(txt_path):
                # Open the file in read mode
                with open(txt_path, 'r') as file:
                    # Read all lines into a list
                    lines = file.readlines()
                converted_lines = [line.strip() for line in lines]
                df_all.loc[nRow,'LangOrInstru'] = '-'.join(converted_lines)
                df_all.loc[nRow,'VoiOrNot'] = int('voi' in converted_lines)
                
        # df_all['speaker/artist'] = '-'.join(df_all['filename'].split('-')[:-1]).strip()
        df_all['speaker/artist'] = df_all['filename'].apply(lambda x: '-'.join(x.split('-')[:-1]).strip())
        df_all['gender'] = np.nan
        df_all['genre'] = np.nan
        
    elif 'Albouy2020Science' in corpus:
        df_all['speaker/artist'] = 'Albouy2020Science'
        df_all['gender'] = 'female'
        df_all['genre'] = 'classical'
        df_all['LangOrInstru'] = 'English'
        df_all.loc[df_all['filename'].str.contains('French', case=False), 'LangOrInstru'] = 'French'
        df_all['VoiOrNot'] = 1

    elif 'CD' in corpus:
        from fuzzywuzzy import process
        df_all['gender'] = np.nan
        def extract_artist(file_path):
            parts = file_path.split('/')
            artist_album_part = parts[3]
            return artist_album_part.split('_')[0]
        
        df_all['speaker/artist'] = df_all['filepath'].apply(extract_artist)
        df_CD = pd.read_excel('data/musicCorp/CD/CD_music_list.xlsx')

        # Function to find the best match for each name in df1 from df2
        def find_best_match(name, choices):
            return process.extractOne(name, choices)
        
        df_all['Best_Match'] = df_all['filename'].apply(lambda x: find_best_match(x, df_CD['Piece'])) # Apply the function to find the best match for each name in df1
        df_all['Matched_Name'] = df_all['Best_Match'].apply(lambda x: x[0]) # Extract matched names and similarity scores
        df_all['Similarity_Score'] = df_all['Best_Match'].apply(lambda x: x[1])
        
        # Join based on matched names
        df_all = pd.merge(df_all, df_CD[['Piece', 'Genre','Instrument']], left_on='Matched_Name', right_on='Piece', how='left')
        df_all['LangOrInstru'] = df_all['Instrument']
        df_all.drop(columns=['Best_Match','Instrument','Matched_Name','Similarity_Score','Piece'], inplace = True)
        df.rename(columns={'Genre': 'genre'}, inplace = True)

        df_all['VoiOrNot'] = 0
        df_all.loc[df_all['LangOrInstru'].str.contains('Voi', case=False), 'VoiOrNot'] = 1

        df_all = df_all[~df_all['filepath'].str.contains('Compilations', case=False)]
        
    elif 'GarlandEncyclopedia' in corpus:
        df_all['speaker/artist'] = df_all['filename']
        df_all['gender'] = np.nan
        df_all['LangOrInstru'] = np.nan
        df_all['genre'] = 'world'
        Garland_novoice_list = list(pd.read_csv('data/musicCorp/GarlandEncyclopedia/Garland_noVoice.csv', header=None)[0])
        df_all['VoiOrNot'] = 1
        df_all.loc[df_all['filename'].isin(Garland_novoice_list), 'VoiOrNot'] = 0
        
    elif 'fma_large' in corpus:
        def revert_numerical_string(num_str):
            return num_str.lstrip('0')
        df_all['filename'] = df_all['filename'].apply(lambda x: revert_numerical_string(x)).astype(int)
        df_tracks = pd.read_csv('data/musicCorp/fma_large/fma_metadata/tracks.csv', low_memory=True, header=1)
        df_tracks.rename(columns={'Unnamed: 0': 'track_id'}, inplace=True)
        df_tracks['track_id'] = pd.to_numeric(df_tracks['track_id'], errors='coerce')
        df_tracks.drop(index=0, inplace=True)
        import ast
        def convert_to_list(string_value):
            return ast.literal_eval(string_value)
        df_tracks['genres_all'] = df_tracks['genres_all'].apply(convert_to_list)
        df_all = pd.merge(df_all, df_tracks[['track_id','name','language_code','genres_all']], left_on='filename', right_on='track_id', how='left')
        df_all['LangOrInstru'] = df_all['language_code']
        df_all.drop(columns=['language_code','track_id'], inplace=True)
        df_all.rename(columns={'genres_all': 'genre', 'name': 'speaker/artist'}, inplace=True)
        df_all['gender'] = np.nan
        df_genres = pd.read_csv('data/musicCorp/fma_large/fma_metadata/genres.csv', index_col='genre_id')
        def replace_genre_names(lst):
            return [df_genres['title'][n] for n in lst]
        df_all['genre'] = df_all['genre'].apply(lambda x: replace_genre_names(x))

    elif 'Homburg' in corpus:
        split_names = df_all['filename'].str.split('-') # Split the 'name' column by "-"
        df_all['speaker/artist'] = split_names.str[0]
        df_all['gender'] = np.nan
        df_all['LangOrInstru'] = np.nan
        split_filepath = df_all['filepath'].str.split('/') # Split the 'name' column by "-"
        df_all['genre'] = split_filepath.str[3].str.capitalize()
        df_all['VoiOrNot'] = np.nan

    elif 'ismir04_genre' in corpus:
        df_track1 = pd.read_csv('data/musicCorp/ismir04_genre/metadata/development/tracklist.csv', header=None)
        df_track2 = pd.read_csv('data/musicCorp/ismir04_genre/metadata/evaluation/tracklist.csv', header=None)
        df_track3 = pd.read_csv('data/musicCorp/ismir04_genre/metadata/training/tracklist.csv', header=None)
        df_track = pd.concat([df_track1,df_track2,df_track3],ignore_index=True)
        df_track.columns = ['genre', 'speaker/artist', 'album', 'track_name', 'track_num', 'file_name', 'nan']
        split_names = df_track['file_name'].str.split('/') # Split the 'name' column by "-"
        df_track['file_name'] = split_names.str[-1].str.replace('.mp3', '')
        df_all = df_all.merge(df_track[['genre','speaker/artist','file_name']], left_on='filename', right_on='file_name', how='left').drop(columns='file_name')
        df_all['gender'] = np.nan

    return df_all

In [4]:
for corpus in corpus_speech_list:
    print(corpus)
    df_all = make_meta_file(corpus, corpus_type='speech')
    df_all.to_csv('STM_output/STM_metaData/metaData_'+corpus.replace('/', '-')+'.csv')


BibleTTS/akuapem-twi
BibleTTS/asante-twi
BibleTTS/ewe
BibleTTS/hausa
BibleTTS/lingala
BibleTTS/yoruba
Buckeye
EUROM
LibriSpeech
MediaSpeech/AR
MediaSpeech/ES
MediaSpeech/FR
MediaSpeech/TR
MozillaCommonVoice/ab
MozillaCommonVoice/ar


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ba


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/be


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/bg
MozillaCommonVoice/bn
MozillaCommonVoice/br
MozillaCommonVoice/ca


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ckb
MozillaCommonVoice/cnh
MozillaCommonVoice/cs
MozillaCommonVoice/cv
MozillaCommonVoice/cy
MozillaCommonVoice/da
MozillaCommonVoice/de


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/dv
MozillaCommonVoice/el
MozillaCommonVoice/en


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/eo


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/es


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/et
MozillaCommonVoice/eu
MozillaCommonVoice/fa


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/fi
MozillaCommonVoice/fr
MozillaCommonVoice/fy-NL
MozillaCommonVoice/ga-IE
MozillaCommonVoice/gl
MozillaCommonVoice/gn
MozillaCommonVoice/hi
MozillaCommonVoice/hu
MozillaCommonVoice/hy-AM
MozillaCommonVoice/id
MozillaCommonVoice/ig
MozillaCommonVoice/it
MozillaCommonVoice/ja
MozillaCommonVoice/ka


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/kab


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/kk
MozillaCommonVoice/kmr
MozillaCommonVoice/ky
MozillaCommonVoice/lg


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/lt
MozillaCommonVoice/ltg
MozillaCommonVoice/lv
MozillaCommonVoice/mhr


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ml
MozillaCommonVoice/mn
MozillaCommonVoice/mt
MozillaCommonVoice/nan-tw
MozillaCommonVoice/nl
MozillaCommonVoice/oc
MozillaCommonVoice/or
MozillaCommonVoice/pl


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/pt


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ro
MozillaCommonVoice/ru
MozillaCommonVoice/rw


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/sr
MozillaCommonVoice/sv-SE
MozillaCommonVoice/sw


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ta


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/th


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/tr


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/tt
MozillaCommonVoice/ug
MozillaCommonVoice/uk


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ur
MozillaCommonVoice/uz


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/vi
MozillaCommonVoice/yo
MozillaCommonVoice/yue
MozillaCommonVoice/zh-CN
MozillaCommonVoice/zh-TW


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


primewords_chinese
room_reader
SpeechClarity
TAT-Vol2
thchs30
TIMIT
TTS_Javanese
zeroth_korean


In [4]:
for corpus in corpus_music_list:
    print(corpus)
    df_all = make_meta_file(corpus, corpus_type='music')
    df_all.to_csv('STM_output/STM_metaData/metaData_'+corpus.replace('/', '-')+'.csv')

IRMAS
Albouy2020Science
CD
GarlandEncyclopedia


## test code ##

In [30]:
corpus = 'ismir04_genre'

corpus_type='music'

params_list = glob.glob('STM_output/Survey/'+corpus_type+'_params_'+corpus+'/*')
df_list = []

# load the data from the mat file
for params_file in params_list:
    # print(params_file)
    data_dict = sio.loadmat(params_file)
    structure_dict = {field: data_dict['Params'][field][0] for field in data_dict['Params'].dtype.names}
    df = pd.DataFrame(structure_dict)
    df.drop(columns=['x_axis','y_axis'], inplace=True)
    df = df.applymap(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)
    df = df.applymap(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)
    df['corpus'] = corpus
    df['mat_filename'] = params_file.replace('/Survey/','/MATs/').replace('_params_','_mat_wl4_').replace('_Params.mat', '_MS2024.mat')
    df_list.append(df)

df_all = pd.concat(df_list, ignore_index=True)
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,corpus,mat_filename
0,data/musicCorp/ismir04_genre/audio/training/ro...,14-you_got_something,,,1,5292000,44.1,120,0.276667,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/14...
1,data/musicCorp/ismir04_genre/audio/training/cl...,17-the_orange_rogue_(trad._arr._r,,,1,5292000,44.1,120,0.215850,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/17...
2,data/musicCorp/ismir04_genre/audio/training/wo...,9-hollow_earth,,,1,5292000,44.1,120,0.279206,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/9-...
3,data/musicCorp/ismir04_genre/audio/evaluation/...,14-table_of_the_sun,,,1,5292000,44.1,120,0.203061,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/14...
4,data/musicCorp/ismir04_genre/audio/training/cl...,26-fantasia_que_contrahaze_la_har,,,1,5292000,44.1,120,0.603923,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/26...
...,...,...,...,...,...,...,...,...,...,...,...,...
2180,data/musicCorp/ismir04_genre/audio/evaluation/...,11-the_way_to_the_prophet_s_head,,,1,3351600,44.1,76,0.280975,76,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/11...
2181,data/musicCorp/ismir04_genre/audio/training/ja...,8-down_the_line,,,1,5292000,44.1,120,0.044127,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/8-...
2182,data/musicCorp/ismir04_genre/audio/evaluation/...,2-permanent,,,1,5292000,44.1,120,0.036553,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/2-...
2183,data/musicCorp/ismir04_genre/audio/training/cl...,15-i_have_a_yong_suster,,,1,5292000,44.1,120,0.445805,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/15...


In [31]:
df_track1 = pd.read_csv('data/musicCorp/ismir04_genre/metadata/development/tracklist.csv', header=None)
df_track2 = pd.read_csv('data/musicCorp/ismir04_genre/metadata/evaluation/tracklist.csv', header=None)
df_track3 = pd.read_csv('data/musicCorp/ismir04_genre/metadata/training/tracklist.csv', header=None)
df_track = pd.concat([df_track1,df_track2,df_track3],ignore_index=True)
df_track.columns = ['genre', 'speaker/artist', 'album', 'track_name', 'track_num', 'file_name', 'nan']
df_track

Unnamed: 0,genre,speaker/artist,album,track_name,track_num,file_name,nan
0,classical,artist_1,album_1,track_1,1,artist_1_album_1_track_1.mp3,
1,classical,artist_1,album_1,track_2,2,artist_1_album_1_track_2.mp3,
2,classical,artist_1,album_1,track_3,3,artist_1_album_1_track_3.mp3,
3,classical,artist_1,album_1,track_4,4,artist_1_album_1_track_4.mp3,
4,classical,artist_1,album_1,track_5,5,artist_1_album_1_track_5.mp3,
...,...,...,...,...,...,...,...
2182,world,touchinggrace,reformation_sessions,taking_flight,2,world/touchinggrace/the_reformation_sessions/2...,
2183,world,touchinggrace,submission,last_night_s_dream_the_remembrance,9,world/touchinggrace/submission/9-last_night's_...,
2184,world,touchinggrace,submission,watching_clouds,6,world/touchinggrace/submission/6-watching_clou...,
2185,world,yakshi,yakshi,chandra,2,world/yakshi/yakshi/2-chandra.mp3,


In [32]:
split_names = df_track['file_name'].str.split('/') # Split the 'name' column by "-"
df_track['file_name'] = split_names.str[-1].str.replace('.mp3', '')
df_track

Unnamed: 0,genre,speaker/artist,album,track_name,track_num,file_name,nan
0,classical,artist_1,album_1,track_1,1,artist_1_album_1_track_1,
1,classical,artist_1,album_1,track_2,2,artist_1_album_1_track_2,
2,classical,artist_1,album_1,track_3,3,artist_1_album_1_track_3,
3,classical,artist_1,album_1,track_4,4,artist_1_album_1_track_4,
4,classical,artist_1,album_1,track_5,5,artist_1_album_1_track_5,
...,...,...,...,...,...,...,...
2182,world,touchinggrace,reformation_sessions,taking_flight,2,2-taking_flight,
2183,world,touchinggrace,submission,last_night_s_dream_the_remembrance,9,9-last_night's_dream_-_the_remembrance,
2184,world,touchinggrace,submission,watching_clouds,6,6-watching_clouds,
2185,world,yakshi,yakshi,chandra,2,2-chandra,


In [33]:
df_all = df_all.merge(df_track[['genre','speaker/artist','file_name']], left_on='filename', right_on='file_name', how='left').drop(columns='file_name')
df_all['gender'] = np.nan
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,corpus,mat_filename,genre,speaker/artist
0,data/musicCorp/ismir04_genre/audio/training/ro...,14-you_got_something,,,1,5292000,44.1,120,0.276667,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/14...,rock_pop,grayson_wray
1,data/musicCorp/ismir04_genre/audio/training/cl...,17-the_orange_rogue_(trad._arr._r,,,1,5292000,44.1,120,0.215850,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/17...,classical,doc_rossi
2,data/musicCorp/ismir04_genre/audio/training/wo...,9-hollow_earth,,,1,5292000,44.1,120,0.279206,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/9-...,world,shiva_in_exile
3,data/musicCorp/ismir04_genre/audio/evaluation/...,14-table_of_the_sun,,,1,5292000,44.1,120,0.203061,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/14...,world,michael_masley
4,data/musicCorp/ismir04_genre/audio/training/cl...,26-fantasia_que_contrahaze_la_har,,,1,5292000,44.1,120,0.603923,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/26...,classical,jacob_heringman
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2182,data/musicCorp/ismir04_genre/audio/evaluation/...,11-the_way_to_the_prophet_s_head,,,1,3351600,44.1,76,0.280975,76,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/11...,rock_pop,jade_leary
2183,data/musicCorp/ismir04_genre/audio/training/ja...,8-down_the_line,,,1,5292000,44.1,120,0.044127,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/8-...,jazz_blues,jag
2184,data/musicCorp/ismir04_genre/audio/evaluation/...,2-permanent,,,1,5292000,44.1,120,0.036553,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/2-...,rock_pop,arthur_yoria
2185,data/musicCorp/ismir04_genre/audio/training/cl...,15-i_have_a_yong_suster,,,1,5292000,44.1,120,0.445805,120,ismir04_genre,STM_output/MATs/music_mat_wl4_ismir04_genre/15...,classical,john_fleagle


2186

In [13]:
split_names = df_all['filename'].str.split('-') # Split the 'name' column by "-"
df_all['speaker/artist'] = split_names.str[0]
df_all['gender'] = np.nan
df_all['LangOrInstru'] = np.nan
split_filepath = df_all['filepath'].str.split('/') # Split the 'name' column by "-"
df_all['genre'] = split_filepath.str[3].str.capitalize()
df_all['VoiOrNot'] = np.nan
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,corpus,mat_filename,speaker/artist,gender,genre
0,data/musicCorp/Homburg/folkcountry/JoshChristi...,JoshChristian-FOR_THE_LOVE_OF_MONEY,,,1,352800,44.1,8,0.025057,8,Homburg,STM_output/MATs/music_mat_wl4_Homburg/JoshChri...,JoshChristian,,Folkcountry
1,data/musicCorp/Homburg/rock/Hanover_Fist-Waiti...,Hanover_Fist-Waiting,,,1,352800,44.1,8,0.025034,8,Homburg,STM_output/MATs/music_mat_wl4_Homburg/Hanover_...,Hanover_Fist,,Rock
2,data/musicCorp/Homburg/electronic/Zerobae-Taus...,Zerobae-Tausende_Herden,,,1,352800,44.1,8,0.025057,8,Homburg,STM_output/MATs/music_mat_wl4_Homburg/Zerobae-...,Zerobae,,Electronic
3,data/musicCorp/Homburg/rock/Royal_Bliss-Busy_B...,Royal_Bliss-Busy_Bee,,,1,352800,44.1,8,0.025057,8,Homburg,STM_output/MATs/music_mat_wl4_Homburg/Royal_Bl...,Royal_Bliss,,Rock
4,data/musicCorp/Homburg/raphiphop/Gym_Class_Her...,Gym_Class_Heroes-Decrepit_Bricks,,,1,352800,44.1,8,0.041179,8,Homburg,STM_output/MATs/music_mat_wl4_Homburg/Gym_Clas...,Gym_Class_Heroes,,Raphiphop
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1880,data/musicCorp/Homburg/jazz/ATMA-Immanuel_s_La...,ATMA-Immanuel_s_Labor,,,1,352800,44.1,8,0.025034,8,Homburg,STM_output/MATs/music_mat_wl4_Homburg/ATMA-Imm...,ATMA,,Jazz
1881,data/musicCorp/Homburg/rock/Doteen-Arrow_to_th...,Doteen-Arrow_to_the_Sun,,,1,352800,44.1,8,0.025057,8,Homburg,STM_output/MATs/music_mat_wl4_Homburg/Doteen-A...,Doteen,,Rock
1882,data/musicCorp/Homburg/rock/Secret_Suburbia-I_...,Secret_Suburbia-I_ve_Been_Around,,,1,352800,44.1,8,0.024739,8,Homburg,STM_output/MATs/music_mat_wl4_Homburg/Secret_S...,Secret_Suburbia,,Rock
1883,data/musicCorp/Homburg/jazz/Waterloo_Jazz_Duo-...,Waterloo_Jazz_Duo-Godnattvisa,,,1,352800,44.1,8,0.025011,8,Homburg,STM_output/MATs/music_mat_wl4_Homburg/Waterloo...,Waterloo_Jazz_Duo,,Jazz


In [None]:
# df_all['speaker/artist'] = df_all['filename']
# df_all['gender'] = np.nan
# df_all['LangOrInstru'] = np.nan
# df_all['genre'] = 'world'
# Garland_novoice_list = list(pd.read_csv('data/musicCorp/GarlandEncyclopedia/Garland_noVoice.csv', header=None)[0])
# df_all['VoiOrNot'] = 1
# df_all.loc[df_all['filename'].isin(Garland_novoice_list), 'VoiOrNot'] = 0