In [1]:
import scipy.io as sio
import pandas as pd
import numpy as np
import glob
import csv
import json
import os
import os.path

In [2]:
# Corpus identifiers, given as paths relative to the corpus root folders.
# Multi-language collections are expanded from their language codes so that
# each entry keeps the '<collection>/<language>' form.
bibletts_languages = ['akuapem-twi', 'asante-twi', 'ewe', 'hausa', 'lingala', 'yoruba']

mediaspeech_languages = ['AR', 'ES', 'FR', 'TR']

commonvoice_languages = [
    'ab', 'ar', 'ba', 'be', 'bg', 'bn', 'br', 'ca', 'ckb', 'cnh',
    'cs', 'cv', 'cy', 'da', 'de', 'dv', 'el', 'en', 'eo', 'es',
    'et', 'eu', 'fa', 'fi', 'fr', 'fy-NL', 'ga-IE', 'gl', 'gn', 'hi',
    'hu', 'hy-AM', 'id', 'ig', 'it', 'ja', 'ka', 'kab', 'kk', 'kmr',
    'ky', 'lg', 'lt', 'ltg', 'lv', 'mhr', 'ml', 'mn', 'mt', 'nan-tw',
    'nl', 'oc', 'or', 'pl', 'pt', 'ro', 'ru', 'rw', 'sr', 'sv-SE',
    'sw', 'ta', 'th', 'tr', 'tt', 'ug', 'uk', 'ur', 'uz', 'vi',
    'yo', 'yue', 'zh-CN', 'zh-TW',
]

corpus_speech_list = (
    ['BibleTTS/' + language for language in bibletts_languages]
    # 'LibriVox' is intentionally left out of this group.
    + ['Buckeye', 'EUROM', 'HiltonMoser2022_speech', 'LibriSpeech']
    + ['MediaSpeech/' + language for language in mediaspeech_languages]
    + ['MozillaCommonVoice/' + language for language in commonvoice_languages]
    + [
        'primewords_chinese',
        'room_reader',
        'SpeechClarity',
        'TAT-Vol2',
        'thchs30',
        'TIMIT',
        'TTS_Javanese',
        'zeroth_korean',
    ]
)

# Music corpora handled by make_meta_file with corpus_type='music'.
corpus_music_list = 'IRMAS Albouy2020Science CD GarlandEncyclopedia fma_large ismir04_genre MTG-Jamendo HiltonMoser2022_song NHS2'.split(' ')

In [3]:
def _unwrap_mat_cell(value):
    """Strip up to two levels of list/ndarray nesting from one loaded MAT cell.

    Non-container values are returned unchanged. An empty container yields
    NaN instead of raising IndexError.
    """
    for _ in range(2):
        if not isinstance(value, (list, np.ndarray)):
            break
        is_empty = (value.size == 0) if isinstance(value, np.ndarray) else (len(value) == 0)
        if is_empty:
            return np.nan
        value = value[0]
    return value


def _load_params_frame(params_file, corpus):
    """Load one params MAT file into a DataFrame with one row per struct entry."""
    params = sio.loadmat(params_file)['Params']
    df = pd.DataFrame({field: params[field][0] for field in params.dtype.names})
    df = df.drop(columns=['x_axis', 'y_axis'])
    # Column-wise Series.map is used instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    for column in df.columns:
        df[column] = df[column].map(_unwrap_mat_cell)
    df['corpus'] = corpus
    # Path of the matching MAT output, derived from the params file path.
    df['mat_filename'] = params_file.replace('/Survey/','/MATs/').replace('_params_','_mat_wl4_').replace('_Params.mat', '_MS2024.mat')
    return df


def _read_librispeech_speakers(path):
    """Parse a '|'-separated speaker table into reader_id (str) and gender columns.

    Lines starting with ';' and blank lines are skipped.
    """
    reader_ids = []
    genders = []
    with open(path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith(';'):
                continue
            # maxsplit keeps a trailing name field intact even if it contains '|'.
            data = line.split('|', 4)
            # int() round-trip normalises the id (e.g. strips padding) before
            # it is compared with the filename prefix as a string.
            reader_ids.append(str(int(data[0].strip())))
            genders.append(data[1].strip())
    return pd.DataFrame({'reader_id': reader_ids, 'gender': genders})


def make_meta_file(corpus, corpus_type):
    """Build the per-file metadata table for one corpus.

    Every file matching ``STM_output/Survey/<corpus_type>_params_<corpus>/*``
    is loaded with ``scipy.io.loadmat``; its ``Params`` struct is flattened
    into rows (``x_axis`` / ``y_axis`` dropped) and the rows of all files are
    concatenated. Corpus-specific columns such as ``speaker/artist`` and
    ``gender`` (and for some corpora ``genre``, ``LangOrInstru``,
    ``VoiOrNot``) are then filled in from filenames or from side-car metadata
    files under ``data/``.

    Parameters
    ----------
    corpus : str
        Corpus identifier; also selects the corpus-specific branch by
        substring match.
    corpus_type : str
        Prefix of the survey folder name (callers in this notebook pass
        'speech', 'music' or 'env').

    Returns
    -------
    pandas.DataFrame
        One row per params entry. If no corpus-specific branch matches, the
        concatenated params table is returned as is.

    Raises
    ------
    ValueError
        If no params file is found for the corpus.
    """
    survey_glob = 'STM_output/Survey/'+corpus_type+'_params_'+corpus+'/*'
    # Sorted so that the row order does not depend on directory listing order.
    params_list = sorted(glob.glob(survey_glob))
    if not params_list:
        raise ValueError('No params files found for corpus ' + repr(corpus) + ' (pattern: ' + survey_glob + ')')

    # load the data from the mat files
    df_all = pd.concat([_load_params_frame(params_file, corpus) for params_file in params_list], ignore_index=True)

    # add the speaker ID and gender info
    if 'MozillaCommonVoice' in corpus:
        valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)
        # regex=False: '.mp3' must be removed literally, not as a pattern.
        valid_df['path'] = valid_df['path'].str.replace('.mp3', '', regex=False)
        valid_df = valid_df.rename(columns={'client_id':'speaker/artist'})
        df_all = df_all.merge(valid_df[['speaker/artist', 'path', 'gender']], how='left', left_on='filename', right_on='path').drop(columns=['path'])
    elif 'BibleTTS' in corpus:
        df_all['speaker/artist'] = 'BibleTTS_' + df_all['filename'].str[:3]
        df_all['gender'] = np.nan
    elif 'Buckeye' in corpus:
        df_all['speaker/artist'] = df_all['filename'].str[:3].str.replace('s', 'S', regex=False)
        df_gender = pd.read_csv('data/speechCorp/Buckeye/Buckeye_speaker_info.csv')
        df_all = df_all.merge(df_gender[['SPEAKER', "SPEAKER'S GENDER"]], how='left', left_on='speaker/artist', right_on='SPEAKER').drop(columns=['SPEAKER'])
        df_all = df_all.rename(columns={"SPEAKER'S GENDER":'gender'})
        df_all['speaker/artist'] = 'Buckeye_'+df_all['speaker/artist']
    elif 'EUROM' in corpus:
        df_all['speaker/artist'] = 'EUROM_' + df_all['LangOrInstru'] +'_'+ df_all['filename'].str[:2]
        df_all['gender'] = np.nan
    elif 'HiltonMoser2022_speech' in corpus:
        df_all = preprocHM2022(df_all)
    elif 'MediaSpeech' in corpus:
        df_all['speaker/artist'] = 'MediaSpeech_' + df_all['LangOrInstru'] +'_'+ df_all['filename']
        df_all['gender'] = np.nan
    elif 'LibriSpeech' in corpus:
        # The speaker id is the part of the filename before the first '-'.
        df_all['speaker/artist'] = df_all['filename'].str.split('-').str[0]
        df_LibriSpeech = _read_librispeech_speakers('data/speechCorp/LibriSpeech/SPEAKERS.TXT')
        df_all = df_all.merge(df_LibriSpeech[['reader_id', 'gender']], how='left', left_on='speaker/artist', right_on='reader_id').drop(columns=['reader_id'])
    elif 'primewords_chinese' in corpus:
        with open('data/speechCorp/primewords_chinese/set1_transcript.json', 'r') as file:
            data = json.load(file)
        primewords_df = pd.DataFrame(data)
        primewords_df['file'] = primewords_df['file'].str.replace('.wav', '', regex=False)
        df_all = df_all.merge(primewords_df[['file', 'user_id']], how='left', left_on='filename', right_on='file').drop(columns=['file'])
        df_all = df_all.rename(columns={"user_id":'speaker/artist'})
        df_all['speaker/artist'] = 'primewords_'+df_all['speaker/artist']
        df_all['gender'] = np.nan
    elif 'room_reader' in corpus:
        # The participant id is the second '_'-separated token of the filename.
        df_all['speaker/artist'] = df_all['filename'].str.split('_').str[1]
        RR_df = pd.read_excel('data/speechCorp/room_reader/RoomReader_SessionsEvents.xlsx')
        df_all = df_all.merge(RR_df[['part_ID', 'gender']], how='left', left_on='speaker/artist', right_on='part_ID').drop(columns=['part_ID'])
        df_all['speaker/artist'] = 'RoomReader_'+df_all['speaker/artist']
    elif 'SpeechClarity' in corpus:
        df_all['speaker/artist'] = 'SpeechClarity_'+df_all['filename'].str[:3]
        df_all['gender'] = np.nan
    elif 'TAT-Vol2' in corpus:
        df_all['speaker/artist'] = 'TAT-Vol2_'+df_all['filename'].str[:10]
        df_all['gender'] = df_all['filename'].str[5]
    elif 'thchs30' in corpus:
        # The speaker id is the part of the filename before the first '_'.
        df_all['speaker/artist'] = 'thchs30_'+df_all['filename'].str.split('_').str[0]
        df_all['gender'] = np.nan
    elif 'TIMIT' in corpus:
        # The speaker id is the parent folder of the audio file.
        df_all['speaker/artist'] = 'TIMIT_'+df_all['filepath'].str.split('/').str[-2]
        # Character 6 is the first character after the 'TIMIT_' prefix.
        df_all['gender'] = df_all['speaker/artist'].str[6]
    elif 'TTS_Javanese' in corpus:
        df_all['speaker/artist'] = df_all['filename'].str[:9]
        df_all['gender'] = df_all['filename'].str[2]
    elif 'zeroth' in corpus:
        df_all['speaker/artist'] = df_all['filename'].str[:3]
        zeroth_df = pd.read_csv('data/speechCorp/zeroth_korean/AUDIO_INFO', sep="|")
        zeroth_df['SPEAKERID'] = zeroth_df['SPEAKERID'].astype(str)
        df_all = df_all.merge(zeroth_df[['SPEAKERID', 'SEX']], how='left', left_on='speaker/artist', right_on='SPEAKERID').drop(columns=['SPEAKERID'])
        df_all = df_all.rename(columns={"SEX":'gender'})
        df_all['speaker/artist'] = 'zeroth_'+df_all['speaker/artist']

    # Music corpora
    elif 'IRMAS' in corpus:
        # Where a .txt file sits next to the .wav file, its lines become the
        # instrument label and decide the voice flag.
        for row_label, wav_path in df_all['filepath'].items():
            txt_path = wav_path.replace('.wav', '.txt')
            if os.path.exists(txt_path):
                with open(txt_path, 'r') as file:
                    converted_lines = [line.strip() for line in file]
                df_all.loc[row_label,'LangOrInstru'] = '-'.join(converted_lines)
                df_all.loc[row_label,'VoiOrNot'] = int('voi' in converted_lines)

        # Everything before the last '-' in the filename.
        df_all['speaker/artist'] = df_all['filename'].apply(lambda x: '-'.join(x.split('-')[:-1]).strip())
        df_all['gender'] = np.nan
        df_all['genre'] = np.nan

    elif 'Albouy2020Science' in corpus:
        df_all['speaker/artist'] = 'Albouy2020Science'
        df_all['gender'] = 'female'
        df_all['genre'] = 'classical'
        df_all['LangOrInstru'] = 'English'
        df_all.loc[df_all['filename'].str.contains('French', case=False), 'LangOrInstru'] = 'French'
        df_all['VoiOrNot'] = 1

    elif 'CD' in corpus:
        # Third-party dependency needed only by this branch, so imported here.
        from fuzzywuzzy import process
        df_all['gender'] = np.nan

        def extract_artist(file_path):
            # Fourth path component, up to its first '_'.
            return file_path.split('/')[3].split('_')[0]

        df_all['speaker/artist'] = df_all['filepath'].apply(extract_artist)
        df_CD = pd.read_excel('data/musicCorp/CD/CD_music_list.xlsx')

        # Fuzzy-match every filename against the 'Piece' column of the CD list.
        best_matches = df_all['filename'].apply(lambda x: process.extractOne(x, df_CD['Piece']))
        df_all['Matched_Name'] = best_matches.apply(lambda match: match[0])

        # Join based on matched names
        df_all = pd.merge(df_all, df_CD[['Piece', 'Genre','Instrument']], left_on='Matched_Name', right_on='Piece', how='left')
        df_all['LangOrInstru'] = df_all['Instrument']
        df_all = df_all.drop(columns=['Instrument','Matched_Name','Piece'])
        # Fix: the rename must target df_all (it used to hit the leftover
        # per-file frame, leaving the column named 'Genre').
        df_all = df_all.rename(columns={'Genre': 'genre'})

        df_all['VoiOrNot'] = 0
        # na=False keeps the mask boolean when an instrument entry is missing.
        df_all.loc[df_all['LangOrInstru'].str.contains('Voi', case=False, na=False), 'VoiOrNot'] = 1

        df_all = df_all[~df_all['filepath'].str.contains('Compilations', case=False)].copy()

    elif 'GarlandEncyclopedia' in corpus:
        df_all['speaker/artist'] = df_all['filename']
        df_all['gender'] = np.nan
        df_all['LangOrInstru'] = np.nan
        df_all['genre'] = 'world'
        Garland_novoice_list = list(pd.read_csv('data/musicCorp/GarlandEncyclopedia/Garland_noVoice.csv', header=None)[0])
        df_all['VoiOrNot'] = 1
        df_all.loc[df_all['filename'].isin(Garland_novoice_list), 'VoiOrNot'] = 0

    elif 'fma_large' in corpus:
        import ast
        # int() already ignores zero padding and, unlike lstrip('0'), also
        # handles an all-zero name.
        df_all['filename'] = df_all['filename'].apply(int).astype(int)
        df_tracks = pd.read_csv('data/musicCorp/fma_large/fma_metadata/tracks.csv', low_memory=True, header=1)
        df_tracks = df_tracks.rename(columns={'Unnamed: 0': 'track_id'})
        df_tracks['track_id'] = pd.to_numeric(df_tracks['track_id'], errors='coerce')
        df_tracks = df_tracks.drop(index=0)
        # 'genres_all' is stored as the text of a Python list literal.
        df_tracks['genres_all'] = df_tracks['genres_all'].apply(ast.literal_eval)
        df_all = pd.merge(df_all, df_tracks[['track_id','name','language_code','genres_all']], left_on='filename', right_on='track_id', how='left')
        df_all['LangOrInstru'] = df_all['language_code']
        df_all = df_all.drop(columns=['language_code','track_id'])
        df_all = df_all.rename(columns={'genres_all': 'genre', 'name': 'speaker/artist'})
        df_all['gender'] = np.nan
        df_genres = pd.read_csv('data/musicCorp/fma_large/fma_metadata/genres.csv', index_col='genre_id')

        def replace_genre_names(lst):
            # Rows without a track match carry NaN instead of a list.
            if not isinstance(lst, list):
                return lst
            return [df_genres['title'][n] for n in lst]

        df_all['genre'] = df_all['genre'].apply(replace_genre_names)

    elif 'Homburg' in corpus:
        df_all['speaker/artist'] = df_all['filename'].str.split('-').str[0]
        df_all['gender'] = np.nan
        df_all['LangOrInstru'] = np.nan
        # The genre is the fourth path component, capitalised.
        df_all['genre'] = df_all['filepath'].str.split('/').str[3].str.capitalize()
        df_all['VoiOrNot'] = np.nan

    elif 'ismir04_genre' in corpus:
        df_track1 = pd.read_csv('data/musicCorp/ismir04_genre/metadata/development/tracklist.csv', header=None)
        df_track2 = pd.read_csv('data/musicCorp/ismir04_genre/metadata/evaluation/tracklist.csv', header=None)
        df_track3 = pd.read_csv('data/musicCorp/ismir04_genre/metadata/training/tracklist.csv', header=None)
        df_track = pd.concat([df_track1,df_track2,df_track3],ignore_index=True)
        df_track.columns = ['genre', 'speaker/artist', 'album', 'track_name', 'track_num', 'file_name', 'nan']
        # Keep only the basename, without the literal '.mp3' extension.
        df_track['file_name'] = df_track['file_name'].str.split('/').str[-1].str.replace('.mp3', '', regex=False)
        df_all = df_all.merge(df_track[['genre','speaker/artist','file_name']], left_on='filename', right_on='file_name', how='left').drop(columns='file_name')
        df_all['gender'] = np.nan

    elif 'MTG-Jamendo' in corpus:
        df_all['gender'] = np.nan
        df_all['VoiOrNot'] = np.nan
        # `commons` is a script shipped with the dataset checkout, so its
        # folder has to be put on sys.path before it can be imported.
        import sys
        sys.path.append('data/musicCorp/MTG-Jamendo/mtg-jamendo-dataset-github/scripts')
        import commons
        tracks, tags, extra = commons.read_file('data/musicCorp/MTG-Jamendo/mtg-jamendo-dataset-github/data/raw_30s_cleantags_50artists.tsv')

        def get_track_info(filename, tracks):
            # Best effort: files without a usable entry in `tracks` get NaN.
            try:
                track_dict = tracks[int(filename[:-4])]
                return 'MTG-Jamendo_'+str(track_dict['artist_id']), '/'.join(track_dict['genre']), '/'.join(track_dict['instrument'])
            except (KeyError, ValueError, TypeError):
                return np.nan, np.nan, np.nan

        track_info = [get_track_info(filename, tracks) for filename in df_all['filename']]
        df_all['speaker/artist'] = [info[0] for info in track_info]
        df_all['genre'] = [info[1] for info in track_info]
        df_all['LangOrInstru'] = [info[2] for info in track_info]

    elif 'HiltonMoser2022_song' in corpus:
        df_all = preprocHM2022(df_all)

    elif 'NHS2' in corpus:
        df_NHS2_meta = pd.read_csv('data/musicCorp/NHS2/NHS2-metadata.csv')
        df_languoid = pd.read_csv('data/musicCorp/NHS2/glottolog_languoid/languoid.csv')
        # Resolve each song's glottocode to a language name.
        df_NHS2_meta = df_NHS2_meta.merge(df_languoid[['id', 'name']], left_on='glottocode', right_on='id')
        df_all = df_all.merge(df_NHS2_meta[['song','name']], left_on='filename', right_on='song', how='left')
        df_all['LangOrInstru'] = df_all['name']
        df_all = df_all.drop(columns=['song','name'])
        df_all['genre'] = 'world'
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
        df_all['gender'] = np.nan
        df_all['VoiOrNot'] = 1
        df_all['speaker/artist'] = df_all['filename']

    elif 'SONYC' in corpus:
        df_annot = pd.read_csv('data/envCorp/SONYC/annotations.csv')
        # Annotation columns whose name contains '6' or '7'.
        music_speech_columns = [column_name for column_name in df_annot.columns if '6' in column_name or '7' in column_name]
        df_rate_columns = df_annot[music_speech_columns].replace('notsure', np.nan).replace(['near','far'], 1).astype(float).replace(-1, np.nan)
        df_annot_MusicSpeech = pd.concat([df_annot['audio_filename'],df_rate_columns], axis=1)
        df_annot_MusicSpeech['audio_filename'] = df_annot_MusicSpeech['audio_filename'].str.replace('.wav', '', regex=False)
        # A file is excluded when the mean rating of any selected column is >= 0.5.
        mean_by_group = pd.DataFrame(np.sum(df_annot_MusicSpeech.groupby('audio_filename').mean()>=0.5, axis=1)>0, columns=['exclude'])
        files_drop = list(mean_by_group[mean_by_group['exclude'] == True].index)
        # .copy() so the assignments below do not write into a slice of the old frame.
        df_all = df_all[~df_all['filename'].isin(files_drop)].copy()
        df_all['speaker/artist'] = np.nan
        df_all['gender'] = np.nan
        df_all['genre'] = np.nan
        df_all['VoiOrNot'] = 0

    return df_all

In [4]:
def preprocHM2022(df_all, meta_path='data/speechCorp/HiltonMoser2022_speech/stimuli-rawMetadata.csv'):
    """Add genre, speaker, language and gender columns for HiltonMoser2022 files.

    Used for both the speech and the song corpus. All derived columns come
    from the ``filename`` column:

    * ``genre``          - last character mapped A/B -> 'infant-directed',
                           C/D -> 'adult-directed' (other characters are kept).
    * ``speaker/artist`` - first five characters.
    * ``LangOrInstru``   - first three characters mapped to a language label
                           (unmapped codes are kept).
    * ``gender``         - looked up by ``speaker/artist`` in the metadata CSV
                           and recoded to 'F' / 'M' / NaN.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Table with a string ``filename`` column. It is not modified.
    meta_path : str, optional
        CSV holding the subject-identifier and gender question columns.
        Defaults to the path that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same number of rows as ``df_all``.
    """
    # Work on a copy so the caller's frame is left untouched.
    df_all = df_all.copy()

    df_all['genre'] = df_all['filename'].str[-1].replace({
        'A': 'infant-directed',
        'B': 'infant-directed',
        'C': 'adult-directed',
        'D': 'adult-directed'
    })
    df_all['speaker/artist'] = df_all['filename'].str.slice(0, 5)
    df_all['LangOrInstru'] = df_all['filename'].str.slice(0, 3).replace({
        'MBE': 'Mbendjele',
        'HAD': 'Hadza',
        'NYA': 'Nyangatom',
        'TOP': 'Toposa',
        'BEJ': 'Mandarin-China',
        'JEN': 'Kannada',
        'MEN': 'Mentawai',
        'KRA': 'Polish',
        'LIM': 'Polish',
        'TUR': 'Finnish/Swedish',
        'USD': 'English-US',
        'TOR': 'English-Canada',
        'VAN': 'Bislama',
        'PNG': 'Enga',
        'WEL': 'English-NZ',
        'ARA': 'English-Creole',
        'TSI': 'Tsimane',
        'SPA': 'Quechua/Achuar',
        'QUE': 'Spanish',
        'ACO': 'Spanish',
        'MES': 'Spanish',
    })

    id_question = "What is this subject's unique identifier?"
    gender_question = "What is the gender of the singer/speaker?"
    df_HMmeta = pd.read_csv(meta_path, usecols=[id_question, gender_question]).rename(columns={
        id_question: 'speaker/artist',
        gender_question: 'gender'
    })
    df_HMmeta['gender'] = df_HMmeta['gender'].replace({
        'Female': 'F',
        'Male': 'M',
        'Not specified': np.nan
    })
    # One metadata row per subject: repeated identifiers would otherwise
    # multiply rows of df_all in the left merge below.
    df_HMmeta = df_HMmeta.drop_duplicates(subset='speaker/artist')

    return df_all.merge(df_HMmeta, on='speaker/artist', how='left')

In [5]:
# Build one metadata CSV per speech corpus. A corpus whose CSV already exists
# is skipped, so re-running the cell only processes what is missing.
for corpus in corpus_speech_list:
    savename = 'STM_output/STM_metaData/metaData_'+corpus.replace('/', '-')+'.csv'
    if os.path.isfile(savename):
        print('**skipping: '+corpus)
    else:
        print(corpus)
        df_all = make_meta_file(corpus, corpus_type='speech')
        # to_csv does not create missing folders, so make sure it exists.
        os.makedirs(os.path.dirname(savename), exist_ok=True)
        df_all.to_csv(savename)


**skipping: BibleTTS/akuapem-twi
**skipping: BibleTTS/asante-twi
**skipping: BibleTTS/ewe
**skipping: BibleTTS/hausa
**skipping: BibleTTS/lingala
**skipping: BibleTTS/yoruba
**skipping: Buckeye
**skipping: EUROM
HiltonMoser2022_speech
**skipping: LibriSpeech
**skipping: MediaSpeech/AR
**skipping: MediaSpeech/ES
**skipping: MediaSpeech/FR
**skipping: MediaSpeech/TR
**skipping: MozillaCommonVoice/ab
**skipping: MozillaCommonVoice/ar
**skipping: MozillaCommonVoice/ba
**skipping: MozillaCommonVoice/be
**skipping: MozillaCommonVoice/bg
**skipping: MozillaCommonVoice/bn
**skipping: MozillaCommonVoice/br
**skipping: MozillaCommonVoice/ca
**skipping: MozillaCommonVoice/ckb
**skipping: MozillaCommonVoice/cnh
**skipping: MozillaCommonVoice/cs
**skipping: MozillaCommonVoice/cv
**skipping: MozillaCommonVoice/cy
**skipping: MozillaCommonVoice/da
**skipping: MozillaCommonVoice/de
**skipping: MozillaCommonVoice/dv
**skipping: MozillaCommonVoice/el
**skipping: MozillaCommonVoice/en
**skipping: Mozilla

In [6]:
# Build one metadata CSV per music corpus. A corpus whose CSV already exists
# is skipped, so re-running the cell only processes what is missing.
for corpus in corpus_music_list:
    savename = 'STM_output/STM_metaData/metaData_'+corpus.replace('/', '-')+'.csv'
    if os.path.isfile(savename):
        print('**skipping: '+corpus)
    else:
        print(corpus)
        df_all = make_meta_file(corpus, corpus_type='music')
        # to_csv does not create missing folders, so make sure it exists.
        os.makedirs(os.path.dirname(savename), exist_ok=True)
        df_all.to_csv(savename)

**skipping: IRMAS
**skipping: Albouy2020Science
**skipping: CD
**skipping: GarlandEncyclopedia
**skipping: fma_large
**skipping: ismir04_genre
**skipping: MTG-Jamendo
HiltonMoser2022_song
**skipping: NHS2


In [7]:
# Build the metadata CSV for the SONYC corpus (corpus_type 'env'), unless the
# file already exists.
corpus = 'SONYC'
savename = 'STM_output/STM_metaData/metaData_'+corpus.replace('/', '-')+'.csv'
if os.path.isfile(savename):
    print('**skipping: '+corpus)
else:
    df_all = make_meta_file(corpus, corpus_type='env')
    # to_csv does not create missing folders, so make sure it exists.
    os.makedirs(os.path.dirname(savename), exist_ok=True)
    df_all.to_csv(savename)

**skipping: SONYC


## test code ##

In [10]:
# Scratch cell: load the raw params table for a single corpus, using the same
# loading steps as make_meta_file, without any corpus-specific processing.
corpus = 'HiltonMoser2022_speech'
corpus_type = 'speech'

params_list = glob.glob('STM_output/Survey/'+corpus_type+'_params_'+corpus+'/*')

# Take the first element of a list/array cell; leave anything else alone.
first_if_nested = lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x

df_list = []
for params_file in params_list:
    data_dict = sio.loadmat(params_file)
    params_struct = data_dict['Params']
    structure_dict = {field: params_struct[field][0] for field in params_struct.dtype.names}
    # Applied twice to peel two levels of nesting.
    df = (
        pd.DataFrame(structure_dict)
        .drop(columns=['x_axis','y_axis'])
        .applymap(first_if_nested)
        .applymap(first_if_nested)
        .assign(corpus=corpus)
    )
    mat_path = params_file.replace('/Survey/','/MATs/')
    mat_path = mat_path.replace('_params_','_mat_wl4_')
    df['mat_filename'] = mat_path.replace('_Params.mat', '_MS2024.mat')
    df_list.append(df)

df_all = pd.concat(df_list, ignore_index=True)
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,corpus,mat_filename
0,data/speechCorp/HiltonMoser2022_speech/IDS-cor...,TOR02B,??,1,1,882000,44.1,20,0.009683,20,HiltonMoser2022_speech,STM_output/MATs/speech_mat_wl4_HiltonMoser2022...
1,data/speechCorp/HiltonMoser2022_speech/IDS-cor...,MEN13B,??,1,1,529200,44.1,12,0.009864,12,HiltonMoser2022_speech,STM_output/MATs/speech_mat_wl4_HiltonMoser2022...
2,data/speechCorp/HiltonMoser2022_speech/IDS-cor...,USD26D,??,1,1,1940400,44.1,44,0.305533,44,HiltonMoser2022_speech,STM_output/MATs/speech_mat_wl4_HiltonMoser2022...
3,data/speechCorp/HiltonMoser2022_speech/IDS-cor...,MEN04D,??,1,1,1764000,44.1,40,0.469751,40,HiltonMoser2022_speech,STM_output/MATs/speech_mat_wl4_HiltonMoser2022...
4,data/speechCorp/HiltonMoser2022_speech/IDS-cor...,TOR15D,??,1,1,1587600,44.1,36,0.119705,36,HiltonMoser2022_speech,STM_output/MATs/speech_mat_wl4_HiltonMoser2022...
...,...,...,...,...,...,...,...,...,...,...,...,...
780,data/speechCorp/HiltonMoser2022_speech/IDS-cor...,TUR05D,??,1,1,705600,44.1,16,0.031905,16,HiltonMoser2022_speech,STM_output/MATs/speech_mat_wl4_HiltonMoser2022...
781,data/speechCorp/HiltonMoser2022_speech/IDS-cor...,WEL16B,??,1,1,1058400,44.1,24,0.146372,24,HiltonMoser2022_speech,STM_output/MATs/speech_mat_wl4_HiltonMoser2022...
782,data/speechCorp/HiltonMoser2022_speech/IDS-cor...,TOR19B,??,1,1,705600,44.1,16,0.229773,16,HiltonMoser2022_speech,STM_output/MATs/speech_mat_wl4_HiltonMoser2022...
783,data/speechCorp/HiltonMoser2022_speech/IDS-cor...,BEJ16D,??,1,1,1411200,44.1,32,0.010000,32,HiltonMoser2022_speech,STM_output/MATs/speech_mat_wl4_HiltonMoser2022...


In [13]:
# Map the last character of each filename to a directedness label.
directedness_by_code = {
    'A': 'infant-directed',
    'B': 'infant-directed',
    'C': 'adult-directed',
    'D': 'adult-directed'
}
genre_code = df_all['filename'].str[-1].replace(directedness_by_code)
df_all['genre'] = genre_code
df_all['genre']

0      infant-directed
1      infant-directed
2       adult-directed
3       adult-directed
4       adult-directed
            ...       
780     adult-directed
781    infant-directed
782    infant-directed
783     adult-directed
784    infant-directed
Name: genre, Length: 785, dtype: object

In [14]:
# Display the `genre` column of df_all.
df_all['genre']

0      infant-directed
1      infant-directed
2       adult-directed
3       adult-directed
4       adult-directed
            ...       
780     adult-directed
781    infant-directed
782    infant-directed
783     adult-directed
784    infant-directed
Name: genre, Length: 785, dtype: object

In [16]:
# Load the NHS2 song metadata table and display it.
nhs2_metadata_csv = 'data/musicCorp/NHS2/NHS2-metadata.csv'
df_NHS2_meta = pd.read_csv(nhs2_metadata_csv)
df_NHS2_meta

Unnamed: 0,song,type,region,glottocode
0,NHS2-SZHN,Mourning,Amazon and Orinoco,gali1262
1,NHS2-YYR4,Mourning,Amazon and Orinoco,boro1282
2,NHS2-DYV5,Work,Amazon and Orinoco,boro1282
3,NHS2-LFHL,Healing,Amazon and Orinoco,gali1262
4,NHS2-INOD,Mourning,Amazon and Orinoco,gali1262
...,...,...,...,...
1002,NHS2-JDTL,Lullaby,Western Europe,stan1290
1003,NHS2-OU7B,Lullaby,Western Europe,stan1290
1004,NHS2-ZJ3C,Procession,Western Europe,stan1290
1005,NHS2-2PWX,Work,Western Europe,stan1290


In [17]:
# Load the languoid table that sits next to the NHS2 metadata and display it.
languoid_csv = 'data/musicCorp/NHS2/glottolog_languoid/languoid.csv'
df_languoid = pd.read_csv(languoid_csv)
df_languoid

Unnamed: 0,id,family_id,parent_id,name,bookkeeping,level,latitude,longitude,iso639P3code,description,markup_description,child_family_count,child_language_count,child_dialect_count,country_ids
0,3adt1234,afro1255,nort3292,3Ad-Tekles,False,dialect,,,,,,0,0,0,
1,aala1237,aust1307,ramo1244,Aalawa,False,dialect,,,,,,0,0,0,
2,aant1238,nucl1709,nort2920,Aantantara,False,dialect,,,,,,0,0,0,
3,aari1238,sout2845,ahkk1235,Aari-Gayil,False,family,,,aiz,,,0,2,0,
4,aari1239,sout2845,aari1238,Aari,False,language,5.95034,36.5721,aiw,,,0,0,0,ET
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26874,zuti1239,tupi1275,guaj1255,Guajajára of Zutiua,False,dialect,,,,,,0,0,0,
26875,zuwa1238,koia1260,omie1241,Zuwadza,False,dialect,,,,,,0,0,0,
26876,zwal1238,atla1278,shal1242,Zwall,False,dialect,,,,,,0,0,0,
26877,zyph1238,sino1245,nucl1757,Zyphe,False,language,22.52400,93.2640,zyp,,,0,0,2,IN MM


In [18]:
# Attach the language name to each song by matching its glottocode to the
# languoid id. Note: this cell overwrites df_NHS2_meta, so it should be run
# only once after the table is loaded.
languoid_names = df_languoid[['id', 'name']]
df_NHS2_meta = pd.merge(df_NHS2_meta, languoid_names, left_on='glottocode', right_on='id')
df_NHS2_meta

Unnamed: 0,song,type,region,glottocode,id,name
0,NHS2-SZHN,Mourning,Amazon and Orinoco,gali1262,gali1262,Galibi Carib
1,NHS2-LFHL,Healing,Amazon and Orinoco,gali1262,gali1262,Galibi Carib
2,NHS2-INOD,Mourning,Amazon and Orinoco,gali1262,gali1262,Galibi Carib
3,NHS2-YYR4,Mourning,Amazon and Orinoco,boro1282,boro1282,Bororo
4,NHS2-DYV5,Work,Amazon and Orinoco,boro1282,boro1282,Bororo
...,...,...,...,...,...,...
1002,NHS2-JDTL,Lullaby,Western Europe,stan1290,stan1290,French
1003,NHS2-OU7B,Lullaby,Western Europe,stan1290,stan1290,French
1004,NHS2-ZJ3C,Procession,Western Europe,stan1290,stan1290,French
1005,NHS2-2PWX,Work,Western Europe,stan1290,stan1290,French


In [20]:
# Copy each song's language name into LangOrInstru and tag the genre.
# NOTE(review): the displayed output holds NHS2 rows, so df_all presumably
# came from a params-loading cell run with corpus='NHS2' - confirm.
song_language = df_NHS2_meta[['song','name']]
df_all = pd.merge(df_all, song_language, how='left', left_on='filename', right_on='song')
df_all['LangOrInstru'] = df_all['name']
df_all = df_all.drop(columns=['song','name'])
df_all['genre'] = 'World'
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,corpus,mat_filename,genre
0,data/musicCorp/NHS2/NHS2-UDOB.mp3,NHS2-UDOB,Mezquital Otomi,,1,352800,44.1,8,0.141542,8,NHS2,STM_output/MATs/music_mat_wl4_NHS2/NHS2-UDOB_M...,World
1,data/musicCorp/NHS2/NHS2-CQH0.mp3,NHS2-CQH0,Central Guerrero Nahuatl,,1,352800,44.1,8,0.141542,8,NHS2,STM_output/MATs/music_mat_wl4_NHS2/NHS2-CQH0_M...,World
2,data/musicCorp/NHS2/NHS2-YXHK.mp3,NHS2-YXHK,Selkup,,1,352800,44.1,8,0.141701,8,NHS2,STM_output/MATs/music_mat_wl4_NHS2/NHS2-YXHK_M...,World
3,data/musicCorp/NHS2/NHS2-CN26.mp3,NHS2-CN26,Chicham,,1,352800,44.1,8,0.148662,8,NHS2,STM_output/MATs/music_mat_wl4_NHS2/NHS2-CN26_M...,World
4,data/musicCorp/NHS2/NHS2-KR0Z.mp3,NHS2-KR0Z,Nsenga,,1,352800,44.1,8,0.191156,8,NHS2,STM_output/MATs/music_mat_wl4_NHS2/NHS2-KR0Z_M...,World
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1002,data/musicCorp/NHS2/NHS2-B0XT.mp3,NHS2-B0XT,Malayalam,,1,352800,44.1,8,0.144240,8,NHS2,STM_output/MATs/music_mat_wl4_NHS2/NHS2-B0XT_M...,World
1003,data/musicCorp/NHS2/NHS2-VXRD.mp3,NHS2-VXRD,Tonga (Tonga Islands),,1,352800,44.1,8,0.118866,8,NHS2,STM_output/MATs/music_mat_wl4_NHS2/NHS2-VXRD_M...,World
1004,data/musicCorp/NHS2/NHS2-18M3.mp3,NHS2-18M3,Nsenga,,1,352800,44.1,8,0.509116,8,NHS2,STM_output/MATs/music_mat_wl4_NHS2/NHS2-18M3_M...,World
1005,data/musicCorp/NHS2/NHS2-D3XG.mp3,NHS2-D3XG,Antankarana Malagasy,,1,352800,44.1,8,0.145669,8,NHS2,STM_output/MATs/music_mat_wl4_NHS2/NHS2-D3XG_M...,World


In [None]:
# TODO: how to fill in `genre` is still an open question.

# TODO: MagnaTagATune metadata extraction is unresolved.

In [9]:
# Scratch cell: load the raw params table for MagnaTagATune, using the same
# loading steps as make_meta_file, without any corpus-specific processing.
corpus = 'MagnaTagATune'
corpus_type = 'music'

params_list = glob.glob('STM_output/Survey/'+corpus_type+'_params_'+corpus+'/*')

# Take the first element of a list/array cell; leave anything else alone.
first_if_nested = lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x

df_list = []
for params_file in params_list:
    data_dict = sio.loadmat(params_file)
    params_struct = data_dict['Params']
    structure_dict = {field: params_struct[field][0] for field in params_struct.dtype.names}
    # Applied twice to peel two levels of nesting.
    df = (
        pd.DataFrame(structure_dict)
        .drop(columns=['x_axis','y_axis'])
        .applymap(first_if_nested)
        .applymap(first_if_nested)
        .assign(corpus=corpus)
    )
    mat_path = params_file.replace('/Survey/','/MATs/')
    mat_path = mat_path.replace('_params_','_mat_wl4_')
    df['mat_filename'] = mat_path.replace('_Params.mat', '_MS2024.mat')
    df_list.append(df)

df_all = pd.concat(df_list, ignore_index=True)
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,corpus,mat_filename
0,data/musicCorp/MagnaTagATune/d/shiva_in_exile-...,shiva_in_exile-ethnic-07-shaman_fever-59-88,,,1,448000,16,28,0.069062,28,MagnaTagATune,STM_output/MATs/music_mat_wl4_MagnaTagATune/sh...
1,data/musicCorp/MagnaTagATune/1/suzanne_teng-my...,suzanne_teng-mystic_journey-05-lhasa_love-204-233,,,1,448000,16,28,0.069062,28,MagnaTagATune,STM_output/MATs/music_mat_wl4_MagnaTagATune/su...
2,data/musicCorp/MagnaTagATune/2/magnatune_compi...,magnatune_compilation-electronica-03-cargo_cul...,,,1,448000,16,28,0.069062,28,MagnaTagATune,STM_output/MATs/music_mat_wl4_MagnaTagATune/ma...
3,data/musicCorp/MagnaTagATune/f/hans_christian-...,hans_christian-surrender-04-shrouded-146-175,,,1,448000,16,28,0.069750,28,MagnaTagATune,STM_output/MATs/music_mat_wl4_MagnaTagATune/ha...
4,data/musicCorp/MagnaTagATune/b/ehren_starks-th...,ehren_starks-the_depths_of_a_year-05-dads_song...,,,1,448000,16,28,0.069062,28,MagnaTagATune,STM_output/MATs/music_mat_wl4_MagnaTagATune/eh...
...,...,...,...,...,...,...,...,...,...,...,...,...
25854,data/musicCorp/MagnaTagATune/8/magnatune-red_h...,magnatune-red_hat_summit_compilation-03-norine...,,,1,448000,16,28,0.069062,28,MagnaTagATune,STM_output/MATs/music_mat_wl4_MagnaTagATune/ma...
25855,data/musicCorp/MagnaTagATune/c/strojovna_07-nu...,strojovna_07-number_1-11-elektra-175-204,,,1,448000,16,28,0.072750,28,MagnaTagATune,STM_output/MATs/music_mat_wl4_MagnaTagATune/st...
25856,data/musicCorp/MagnaTagATune/a/tilopa-by_the_w...,tilopa-by_the_way-03-amigos_de_viaje-291-320,,,1,448000,16,28,0.069125,28,MagnaTagATune,STM_output/MATs/music_mat_wl4_MagnaTagATune/ti...
25857,data/musicCorp/MagnaTagATune/3/dj_cary-eastern...,dj_cary-eastern_grooves-10-waddi_jahrapoon-204...,,,1,448000,16,28,0.069125,28,MagnaTagATune,STM_output/MATs/music_mat_wl4_MagnaTagATune/dj...


In [10]:
# Read the two tab-separated MagnaTagATune metadata tables.
df_clip = pd.read_csv('data/musicCorp/MagnaTagATune/clip_info_final.csv', delimiter='\t')
df_annot = pd.read_csv('data/musicCorp/MagnaTagATune/annotations_final.csv', delimiter='\t')

# No explicit keys are given, so pandas inner-joins on the columns the two tables share.
df_info = pd.merge(df_clip, df_annot)

In [14]:
# Display the annotation table read from annotations_final.csv.
df_annot

Unnamed: 0,clip_id,no voice,singer,duet,plucking,hard rock,world,bongos,harpsichord,female singing,...,rap,metal,hip hop,quick,water,baroque,women,fiddle,english,mp3_path
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
1,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
2,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
3,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
4,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25858,58899,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8/jacob_heringman-blame_not_my_lute-56-la_bres...
25859,58906,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8/jacob_heringman-blame_not_my_lute-57-lost_is...
25860,58907,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8/jacob_heringman-blame_not_my_lute-57-lost_is...
25861,58908,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8/jacob_heringman-blame_not_my_lute-57-lost_is...


In [23]:
# Summary statistics (count, mean, std, quartiles, min/max) of the annotation table.
df_annot.describe()

Unnamed: 0,clip_id,no voice,singer,duet,plucking,hard rock,world,bongos,harpsichord,female singing,...,female singer,rap,metal,hip hop,quick,water,baroque,women,fiddle,english
count,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,...,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0,25863.0
mean,28487.427058,0.022155,0.005684,0.002861,0.002668,0.010594,0.002513,0.002011,0.042261,0.001779,...,0.005645,0.00348,0.019526,0.002475,0.001276,0.003209,0.011484,0.003867,0.003325,0.002359
std,16587.462929,0.147191,0.075178,0.053415,0.051584,0.102384,0.05007,0.044795,0.201188,0.042137,...,0.074923,0.058889,0.138367,0.049684,0.035698,0.05656,0.106546,0.062062,0.05757,0.048509
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,14073.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,28350.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,42663.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,58915.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [28]:
# Number of tags set on each clip (row-wise sum of the 0/1 tag columns).
# The non-tag columns are dropped by name: the previous positional slice 1:-2
# also cut off the last tag column ('english', which sits just before 'mp3_path').
df_annot.drop(columns=['clip_id', 'mp3_path']).sum(axis=1)

0        4
1        7
2        3
3        2
4        5
        ..
25858    2
25859    7
25860    4
25861    8
25862    3
Length: 25863, dtype: int64

In [56]:
# Peek at the first 30 rows of the columns at positions 180-189 of the annotation table.
df_annot.iloc[:30,180:190]

Unnamed: 0,rap,metal,hip hop,quick,water,baroque,women,fiddle,english,mp3_path
0,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
1,0,0,0,0,0,1,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
2,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
3,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
4,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
5,0,0,0,0,0,0,0,0,0,c/lvx_nova-lvx_nova-01-contimune-30-59.mp3
6,0,0,0,0,0,0,0,0,0,c/lvx_nova-lvx_nova-01-contimune-175-204.mp3
7,0,0,0,0,0,0,0,0,0,c/lvx_nova-lvx_nova-01-contimune-233-262.mp3
8,0,0,0,0,0,0,0,0,0,c/lvx_nova-lvx_nova-01-contimune-291-320.mp3
9,0,0,0,0,0,0,0,0,0,0/american_bach_soloists-j_s__bach__cantatas_v...


In [None]:
# Tag names treated as genres (each listed once; 'metal' was previously duplicated).
genre_list = ['classical', 'new age', 'electronica', 'rock', 'pop', 'world', 'jazz', 'blues', 'metal', 'punk', 'funk', 'country', 'rap', 'hip hop']

In [12]:
# Split 'mp3_path' on "/" and keep the last component (the file name).
split_names = df_info['mp3_path'].str.split('/')
# Strip the trailing ".mp3" extension. The pattern is escaped, anchored and
# passed with an explicit regex flag: the default of `regex` differs between
# pandas versions, and an unescaped '.' would match any character in regex mode.
df_info['filename'] = split_names.str[-1].str.replace(r'\.mp3$', '', regex=True)
df_info

Unnamed: 0,clip_id,track_number,title,artist,album,url,segmentStart,segmentEnd,original_url,mp3_path,...,rap,metal,hip hop,quick,water,baroque,women,fiddle,english,filename
0,2,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,30,59,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...,...,0,0,0,0,0,0,0,0,0,american_bach_soloists-j_s__bach_solo_cantatas...
1,6,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,146,175,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...,...,0,0,0,0,0,1,0,0,0,american_bach_soloists-j_s__bach_solo_cantatas...
2,10,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,262,291,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...,...,0,0,0,0,0,0,0,0,0,american_bach_soloists-j_s__bach_solo_cantatas...
3,11,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,291,320,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...,...,0,0,0,0,0,0,0,0,0,american_bach_soloists-j_s__bach_solo_cantatas...
4,12,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,320,349,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...,...,0,0,0,0,0,0,0,0,0,american_bach_soloists-j_s__bach_solo_cantatas...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25858,58899,56,La Bressanina,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,88,117,http://he3.magnatune.com/all/56-La%20Bressanin...,8/jacob_heringman-blame_not_my_lute-56-la_bres...,...,0,0,0,0,0,0,0,0,0,jacob_heringman-blame_not_my_lute-56-la_bressa...
25859,58906,57,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,0,29,http://he3.magnatune.com/all/57-Lost%20is%20my...,8/jacob_heringman-blame_not_my_lute-57-lost_is...,...,0,0,0,0,0,0,0,0,0,jacob_heringman-blame_not_my_lute-57-lost_is_m...
25860,58907,57,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,30,59,http://he3.magnatune.com/all/57-Lost%20is%20my...,8/jacob_heringman-blame_not_my_lute-57-lost_is...,...,0,0,0,0,0,0,0,0,0,jacob_heringman-blame_not_my_lute-57-lost_is_m...
25861,58908,57,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,59,88,http://he3.magnatune.com/all/57-Lost%20is%20my...,8/jacob_heringman-blame_not_my_lute-57-lost_is...,...,0,0,0,0,0,0,0,0,0,jacob_heringman-blame_not_my_lute-57-lost_is_m...


In [11]:
# Display the merged clip-info / annotation table.
df_info

Unnamed: 0,clip_id,track_number,title,artist,album,url,segmentStart,segmentEnd,original_url,mp3_path,...,female singer,rap,metal,hip hop,quick,water,baroque,women,fiddle,english
0,2,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,30,59,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...,...,0,0,0,0,0,0,0,0,0,0
1,6,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,146,175,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...,...,0,0,0,0,0,0,1,0,0,0
2,10,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,262,291,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...,...,0,0,0,0,0,0,0,0,0,0
3,11,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,291,320,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...,...,0,0,0,0,0,0,0,0,0,0
4,12,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,320,349,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25858,58899,56,La Bressanina,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,88,117,http://he3.magnatune.com/all/56-La%20Bressanin...,8/jacob_heringman-blame_not_my_lute-56-la_bres...,...,0,0,0,0,0,0,0,0,0,0
25859,58906,57,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,0,29,http://he3.magnatune.com/all/57-Lost%20is%20my...,8/jacob_heringman-blame_not_my_lute-57-lost_is...,...,0,0,0,0,0,0,0,0,0,0
25860,58907,57,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,30,59,http://he3.magnatune.com/all/57-Lost%20is%20my...,8/jacob_heringman-blame_not_my_lute-57-lost_is...,...,0,0,0,0,0,0,0,0,0,0
25861,58908,57,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,59,88,http://he3.magnatune.com/all/57-Lost%20is%20my...,8/jacob_heringman-blame_not_my_lute-57-lost_is...,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# df_all['speaker/artist'] = df_all['filename']
# df_all['gender'] = np.nan
# df_all['LangOrInstru'] = np.nan
# df_all['genre'] = 'world'
# Garland_novoice_list = list(pd.read_csv('data/musicCorp/GarlandEncyclopedia/Garland_noVoice.csv', header=None)[0])
# df_all['VoiOrNot'] = 1
# df_all.loc[df_all['filename'].isin(Garland_novoice_list), 'VoiOrNot'] = 0

In [13]:
import pandas as pd

# Toy one-hot table: column k holds a 1 in row k and 0 everywhere else.
labels = ['A', 'B', 'C', 'D']
data = {
    label: [int(row == pos) for row in range(len(labels))]
    for pos, label in enumerate(labels)
}
df = pd.DataFrame(data)

# For every row, take the name of the column that holds the row maximum.
combined = df.idxmax(axis='columns')

# Wrap the resulting Series in a one-column DataFrame.
result_df = combined.rename('Encoded_Column').to_frame()

print(result_df)


  Encoded_Column
0              A
1              B
2              C
3              D
