In [1]:
import scipy.io as sio
import pandas as pd
import numpy as np
import glob
import csv
import json
import os

In [2]:
corpus_speech_list = ['BibleTTS/akuapem-twi',
    'BibleTTS/asante-twi',
    'BibleTTS/ewe',
    'BibleTTS/hausa',
    'BibleTTS/lingala',
    'BibleTTS/yoruba',
    'Buckeye',
    'EUROM',
    'LibriSpeech',
    # 'LibriVox',
    'MediaSpeech/AR',
    'MediaSpeech/ES',
    'MediaSpeech/FR',
    'MediaSpeech/TR',
    'MozillaCommonVoice/ab',
    'MozillaCommonVoice/ar',
    'MozillaCommonVoice/ba',
    'MozillaCommonVoice/be',
    'MozillaCommonVoice/bg',
    'MozillaCommonVoice/bn',
    'MozillaCommonVoice/br',
    'MozillaCommonVoice/ca',
    'MozillaCommonVoice/ckb',
    'MozillaCommonVoice/cnh',
    'MozillaCommonVoice/cs',
    'MozillaCommonVoice/cv',
    'MozillaCommonVoice/cy',
    'MozillaCommonVoice/da',
    'MozillaCommonVoice/de',
    'MozillaCommonVoice/dv',
    'MozillaCommonVoice/el',
    'MozillaCommonVoice/en',
    'MozillaCommonVoice/eo',
    'MozillaCommonVoice/es',
    'MozillaCommonVoice/et',
    'MozillaCommonVoice/eu',
    'MozillaCommonVoice/fa',
    'MozillaCommonVoice/fi',
    'MozillaCommonVoice/fr',
    'MozillaCommonVoice/fy-NL',
    'MozillaCommonVoice/ga-IE',
    'MozillaCommonVoice/gl',
    'MozillaCommonVoice/gn',
    'MozillaCommonVoice/hi',
    'MozillaCommonVoice/hu',
    'MozillaCommonVoice/hy-AM',
    'MozillaCommonVoice/id',
    'MozillaCommonVoice/ig',
    'MozillaCommonVoice/it',
    'MozillaCommonVoice/ja',
    'MozillaCommonVoice/ka',
    'MozillaCommonVoice/kab',
    'MozillaCommonVoice/kk',
    'MozillaCommonVoice/kmr',
    'MozillaCommonVoice/ky',
    'MozillaCommonVoice/lg',
    'MozillaCommonVoice/lt',
    'MozillaCommonVoice/ltg',
    'MozillaCommonVoice/lv',
    'MozillaCommonVoice/mhr',
    'MozillaCommonVoice/ml',
    'MozillaCommonVoice/mn',
    'MozillaCommonVoice/mt',
    'MozillaCommonVoice/nan-tw',
    'MozillaCommonVoice/nl',
    'MozillaCommonVoice/oc',
    'MozillaCommonVoice/or',
    'MozillaCommonVoice/pl',
    'MozillaCommonVoice/pt',
    'MozillaCommonVoice/ro',
    'MozillaCommonVoice/ru',
    'MozillaCommonVoice/rw',
    'MozillaCommonVoice/sr',
    'MozillaCommonVoice/sv-SE',
    'MozillaCommonVoice/sw',
    'MozillaCommonVoice/ta',
    'MozillaCommonVoice/th',
    'MozillaCommonVoice/tr',
    'MozillaCommonVoice/tt',
    'MozillaCommonVoice/ug',
    'MozillaCommonVoice/uk',
    'MozillaCommonVoice/ur',
    'MozillaCommonVoice/uz',
    'MozillaCommonVoice/vi',
    'MozillaCommonVoice/yo',
    'MozillaCommonVoice/yue',
    'MozillaCommonVoice/zh-CN',
    'MozillaCommonVoice/zh-TW',
    'primewords_chinese',
    'room_reader',
    'SpeechClarity',
    'TAT-Vol2',
    'thchs30',
    'TIMIT',
    'TTS_Javanese',
    'zeroth_korean',]

corpus_music_list = ['IRMAS',
                     'Albouy2020Science',
                     'CD',
                     'GarlandEncyclopedia']

In [3]:
def make_meta_file(corpus, corpus_type):
    
    params_list = glob.glob('STM_output/Survey/'+corpus_type+'_params_'+corpus+'/*')
    df_list = []

    # load the data from the mat file
    for params_file in params_list:
        # print(params_file)
        data_dict = sio.loadmat(params_file)
        structure_dict = {field: data_dict['Params'][field][0] for field in data_dict['Params'].dtype.names}
        df = pd.DataFrame(structure_dict)
        df.drop(columns=['x_axis','y_axis'], inplace=True)
        df = df.applymap(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)
        df = df.applymap(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)
        df['corpus'] = corpus
        df['mat_filename'] = params_file.replace('/Survey/','/MATs/').replace('_params_','_mat_wl4_').replace('_Params.mat', '_MS2024.mat')
        df_list.append(df)
    
    df_all = pd.concat(df_list, ignore_index=True)

    # add the speaker ID and gender info
    if 'MozillaCommonVoice' in corpus:
        valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)
        valid_df['path'] = valid_df['path'].str.replace('.mp3', '')
        valid_df.rename(columns={'client_id':'speaker/artist'}, inplace=True)
        df_all = df_all.merge(valid_df[['speaker/artist', 'path', 'gender']], how='left', left_on='filename', right_on='path').drop(columns=['path'])
    elif 'BibleTTS' in corpus:
        df_all['speaker/artist'] = 'BibleTTS_' + df_all['filename'].str[:3]
        df_all['gender'] = np.nan
    elif 'Buckeye' in corpus:
        df_all['speaker/artist'] = df_all['filename'].str[:3].str.replace('s', 'S')
        df_gender = pd.read_csv('data/speechCorp/Buckeye/Buckeye_speaker_info.csv')
        df_all = df_all.merge(df_gender[['SPEAKER', "SPEAKER'S GENDER"]], how='left', left_on='speaker/artist', right_on='SPEAKER').drop(columns=['SPEAKER'])
        df_all.rename(columns={"SPEAKER'S GENDER":'gender'}, inplace=True)
        df_all['speaker/artist'] = 'Buckeye_'+df_all['speaker/artist']
    elif 'EUROM' in corpus:
        df_all['speaker/artist'] = 'EUROM_' + df_all['LangOrInstru'] +'_'+ df_all['filename'].str[:2]
        df_all['gender'] = np.nan
    elif 'MediaSpeech' in corpus:
        df_all['speaker/artist'] = 'MediaSpeech_' + df_all['LangOrInstru'] +'_'+ df_all['filename']
        df_all['gender'] = np.nan
    elif 'LibriSpeech' in corpus:
        split_names = df_all['filename'].str.split('-') # Split the 'name' column by "-"
        first_parts = split_names.str[0] # Extract the first part of the split result
        df_all['speaker/artist'] = first_parts # Add the extracted part as a new column in the DataFrame
        
        ## load LibriSpeech text file
        reader_ids = []
        genders = []
        subsets = []
        durations = []
        names = []
        # Open the file and read line by line
        with open('data/speechCorp/LibriSpeech/SPEAKERS.TXT', 'r') as file:
            # Skip lines starting with ";" (comments) until reaching the data
            while True:
                line = file.readline()
                if not line.startswith(';'):
                    # Start processing data from this line
                    break
            
            # Read the rest of the lines and parse them
            while line:
                # Split each line by "|" character
                data = line.strip().split('|')
                # Extract relevant information
                reader_ids.append(int(data[0].strip()))
                genders.append(data[1].strip())
                subsets.append(data[2].strip())
                durations.append(float(data[3].strip()))
                names.append(data[4].strip())
                # Read the next line
                line = file.readline()
        
        # Create a DataFrame using the lists
        df_LibriSpeech = pd.DataFrame({'reader_id': reader_ids, 'gender': genders, 'subset': subsets, 'duration': durations, 'name': names})
        df_LibriSpeech['reader_id']= df_LibriSpeech['reader_id'].astype(str)
        df_all = df_all.merge(df_LibriSpeech[['reader_id', 'gender']], how='left', left_on='speaker/artist', right_on='reader_id').drop(columns=['reader_id'])
    elif 'primewords_chinese' in corpus:
        with open('data/speechCorp/primewords_chinese/set1_transcript.json', 'r') as file:
            data = json.load(file)
        primewords_df = pd.DataFrame(data)
        primewords_df['file'] = primewords_df['file'].str.replace('.wav', '')
        df_all = df_all.merge(primewords_df[['file', 'user_id']], how='left', left_on='filename', right_on='file').drop(columns=['file'])
        df_all.rename(columns={"user_id":'speaker/artist'}, inplace=True)
        df_all['speaker/artist'] = 'primewords_'+df_all['speaker/artist']
        df_all['gender']=np.nan
    elif 'room_reader' in corpus:
        split_names = df_all['filename'].str.split('_') # Split the 'name' column by "-"
        df_all['speaker/artist'] = split_names.str[1] # Extract the first part of the split result
        RR_df = pd.read_excel('data/speechCorp/room_reader/RoomReader_SessionsEvents.xlsx')
        df_all = df_all.merge(RR_df[['part_ID', 'gender']], how='left', left_on='speaker/artist', right_on='part_ID').drop(columns=['part_ID'])
        df_all['speaker/artist'] = 'RoomReader_'+df_all['speaker/artist']
    elif 'SpeechClarity' in corpus:
        df_all['speaker/artist'] = 'SpeechClarity_'+df_all['filename'].str[:3]
        df_all['gender'] = np.nan
    elif 'TAT-Vol2' in corpus:
        df_all['speaker/artist'] = 'TAT-Vol2_'+df_all['filename'].str[:10]
        df_all['gender'] = df_all['filename'].str[5]
    elif 'thchs30' in corpus:
        split_names = df_all['filename'].str.split('_') # Split the 'name' column by "-"
        df_all['speaker/artist'] = 'thchs30_'+split_names.str[0] # Extract the first part of the split result
        df_all['gender'] = np.nan
    elif 'TIMIT' in corpus:
        split_names = df_all['filepath'].str.split('/') # Split the 'name' column by "-"
        df_all['speaker/artist'] = 'TIMIT_'+split_names.str[-2] # Extract the first part of the split result
        df_all['gender'] = df_all['speaker/artist'].str[6]
    elif 'TTS_Javanese' in corpus:
        df_all['speaker/artist'] = df_all['filename'].str[:9] 
        df_all['gender'] = df_all['filename'].str[2]
    elif 'zeroth' in corpus:
        df_all['speaker/artist'] = df_all['filename'].str[:3] 
        zeroth_df = pd.read_csv('data/speechCorp/zeroth_korean/AUDIO_INFO', sep="|")
        zeroth_df['SPEAKERID'] = zeroth_df['SPEAKERID'].astype(str)
        df_all = df_all.merge(zeroth_df[['SPEAKERID', 'SEX']], how='left', left_on='speaker/artist', right_on='SPEAKERID').drop(columns=['SPEAKERID'])
        df_all.rename(columns={"SEX":'gender'}, inplace=True)
        df_all['speaker/artist'] = 'zeroth_'+df_all['speaker/artist']

    # Music corpora
    elif 'IRMAS' in corpus:
        for nRow in range(len(df_all)):
            txt_path = df_all['filepath'][nRow].replace('.wav', '.txt')
            if os.path.exists(txt_path):
                # Open the file in read mode
                with open(txt_path, 'r') as file:
                    # Read all lines into a list
                    lines = file.readlines()
                converted_lines = [line.strip() for line in lines]
                df_all.loc[nRow,'LangOrInstru'] = '-'.join(converted_lines)
                df_all.loc[nRow,'VoiOrNot'] = int('voi' in converted_lines)
                
        # df_all['speaker/artist'] = '-'.join(df_all['filename'].split('-')[:-1]).strip()
        df_all['speaker/artist'] = df_all['filename'].apply(lambda x: '-'.join(x.split('-')[:-1]).strip())
        df_all['gender'] = np.nan
        df_all['genre'] = np.nan
        
    elif 'Albouy2020Science' in corpus:
        df_all['speaker/artist'] = 'Albouy2020Science'
        df_all['gender'] = 'female'
        df_all['genre'] = 'classical'
        df_all['LangOrInstru'] = 'English'
        df_all.loc[df_all['filename'].str.contains('French', case=False), 'LangOrInstru'] = 'French'
        df_all['VoiOrNot'] = 1

    elif 'CD' in corpus:
        from fuzzywuzzy import process
        df_all['gender'] = np.nan
        def extract_artist(file_path):
            parts = file_path.split('/')
            artist_album_part = parts[3]
            return artist_album_part.split('_')[0]
        
        df_all['speaker/artist'] = df_all['filepath'].apply(extract_artist)
        df_CD = pd.read_excel('data/musicCorp/CD/CD_music_list.xlsx')

        # Function to find the best match for each name in df1 from df2
        def find_best_match(name, choices):
            return process.extractOne(name, choices)
        
        df_all['Best_Match'] = df_all['filename'].apply(lambda x: find_best_match(x, df_CD['Piece'])) # Apply the function to find the best match for each name in df1
        df_all['Matched_Name'] = df_all['Best_Match'].apply(lambda x: x[0]) # Extract matched names and similarity scores
        df_all['Similarity_Score'] = df_all['Best_Match'].apply(lambda x: x[1])
        
        # Join based on matched names
        df_all = pd.merge(df_all, df_CD[['Piece', 'Genre','Instrument']], left_on='Matched_Name', right_on='Piece', how='left')
        df_all['LangOrInstru'] = df_all['Instrument']
        df_all.drop(columns=['Best_Match','Instrument','Matched_Name','Similarity_Score','Piece'], inplace = True)
        df.rename(columns={'Genre': 'genre'}, inplace = True)

        df_all['VoiOrNot'] = 0
        df_all.loc[df_all['LangOrInstru'].str.contains('Voi', case=False), 'VoiOrNot'] = 1

        df_all = df_all[~df_all['filepath'].str.contains('Compilations', case=False)]
        
    elif 'GarlandEncyclopedia' in corpus:
        df_all['speaker/artist'] = df_all['filename']
        df_all['gender'] = np.nan
        df_all['LangOrInstru'] = np.nan
        df_all['genre'] = 'world'
        Garland_novoice_list = list(pd.read_csv('data/musicCorp/GarlandEncyclopedia/Garland_noVoice.csv', header=None)[0])
        df_all['VoiOrNot'] = 1
        df_all.loc[df_all['filename'].isin(Garland_novoice_list), 'VoiOrNot'] = 0
        
    elif 'fma_large' in corpus:
        def revert_numerical_string(num_str):
            return num_str.lstrip('0')
        df_all['filename'] = df_all['filename'].apply(lambda x: revert_numerical_string(x)).astype(int)
        df_tracks = pd.read_csv('data/musicCorp/fma_large/fma_metadata/tracks.csv', low_memory=True, header=1)
        df_tracks.rename(columns={'Unnamed: 0': 'track_id'}, inplace=True)
        df_tracks['track_id'] = pd.to_numeric(df_tracks['track_id'], errors='coerce')
        df_tracks.drop(index=0, inplace=True)
        import ast
        def convert_to_list(string_value):
            return ast.literal_eval(string_value)
        df_tracks['genres_all'] = df_tracks['genres_all'].apply(convert_to_list)
        df_all = pd.merge(df_all, df_tracks[['track_id','name','language_code','genres_all']], left_on='filename', right_on='track_id', how='left')
        df_all['LangOrInstru'] = df_all['language_code']
        df_all.drop(columns=['language_code','track_id'], inplace=True)
        df_all.rename(columns={'genres_all': 'genre', 'name': 'speaker/artist'}, inplace=True)
        df_all['gender'] = np.nan
        df_genres = pd.read_csv('data/musicCorp/fma_large/fma_metadata/genres.csv', index_col='genre_id')
        def replace_genre_names(lst):
            return [df_genres['title'][n] for n in lst]
        df_all['genre'] = df_all['genre'].apply(lambda x: replace_genre_names(x))


    return df_all

In [4]:
for corpus in corpus_speech_list:
    print(corpus)
    df_all = make_meta_file(corpus, corpus_type='speech')
    df_all.to_csv('STM_output/STM_metaData/metaData_'+corpus.replace('/', '-')+'.csv')


BibleTTS/akuapem-twi
BibleTTS/asante-twi
BibleTTS/ewe
BibleTTS/hausa
BibleTTS/lingala
BibleTTS/yoruba
Buckeye
EUROM
LibriSpeech
MediaSpeech/AR
MediaSpeech/ES
MediaSpeech/FR
MediaSpeech/TR
MozillaCommonVoice/ab
MozillaCommonVoice/ar


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ba


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/be


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/bg
MozillaCommonVoice/bn
MozillaCommonVoice/br
MozillaCommonVoice/ca


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ckb
MozillaCommonVoice/cnh
MozillaCommonVoice/cs
MozillaCommonVoice/cv
MozillaCommonVoice/cy
MozillaCommonVoice/da
MozillaCommonVoice/de


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/dv
MozillaCommonVoice/el
MozillaCommonVoice/en


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/eo


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/es


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/et
MozillaCommonVoice/eu
MozillaCommonVoice/fa


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/fi
MozillaCommonVoice/fr
MozillaCommonVoice/fy-NL
MozillaCommonVoice/ga-IE
MozillaCommonVoice/gl
MozillaCommonVoice/gn
MozillaCommonVoice/hi
MozillaCommonVoice/hu
MozillaCommonVoice/hy-AM
MozillaCommonVoice/id
MozillaCommonVoice/ig
MozillaCommonVoice/it
MozillaCommonVoice/ja
MozillaCommonVoice/ka


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/kab


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/kk
MozillaCommonVoice/kmr
MozillaCommonVoice/ky
MozillaCommonVoice/lg


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/lt
MozillaCommonVoice/ltg
MozillaCommonVoice/lv
MozillaCommonVoice/mhr


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ml
MozillaCommonVoice/mn
MozillaCommonVoice/mt
MozillaCommonVoice/nan-tw
MozillaCommonVoice/nl
MozillaCommonVoice/oc
MozillaCommonVoice/or
MozillaCommonVoice/pl


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/pt


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ro
MozillaCommonVoice/ru
MozillaCommonVoice/rw


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/sr
MozillaCommonVoice/sv-SE
MozillaCommonVoice/sw


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ta


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/th


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/tr


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/tt
MozillaCommonVoice/ug
MozillaCommonVoice/uk


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ur
MozillaCommonVoice/uz


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/vi
MozillaCommonVoice/yo
MozillaCommonVoice/yue
MozillaCommonVoice/zh-CN
MozillaCommonVoice/zh-TW


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


primewords_chinese
room_reader
SpeechClarity
TAT-Vol2
thchs30
TIMIT
TTS_Javanese
zeroth_korean


In [4]:
for corpus in corpus_music_list:
    print(corpus)
    df_all = make_meta_file(corpus, corpus_type='music')
    df_all.to_csv('STM_output/STM_metaData/metaData_'+corpus.replace('/', '-')+'.csv')

IRMAS
Albouy2020Science
CD
GarlandEncyclopedia


In [None]:
corpus = 'fma_large'
df_all = make_meta_file('fma_large', corpus_type='music')

In [6]:
df_all.to_csv('STM_output/STM_metaData/metaData_'+corpus.replace('/', '-')+'.csv')

In [10]:
df_all['LangOrInstru'].unique()

array([nan, 'en', 'es', 'pt', 'fr', 'sr', 'de', 'ru', 'vi', 'ja', 'he',
       'ee', 'tr', 'ar', 'pl', 'cs', 'it', 'el', 'hy', 'my', 'zh', 'ka',
       'ms', 'gu', 'uz', 'fi', 'bg', 'nl', 'uk', 'sw', 'hi', 'az', 'id',
       'ko', 'la', 'ha', 'tl', 'eu', 'bm', 'sk', 'lt', 'ty', 'th', 'tw',
       'no'], dtype=object)

## test code ##

In [2]:
corpus = 'fma_large'

corpus_type='music'

params_list = glob.glob('STM_output/Survey/'+corpus_type+'_params_'+corpus+'/*')
df_list = []

# load the data from the mat file
for params_file in params_list:
    # print(params_file)
    data_dict = sio.loadmat(params_file)
    structure_dict = {field: data_dict['Params'][field][0] for field in data_dict['Params'].dtype.names}
    df = pd.DataFrame(structure_dict)
    df.drop(columns=['x_axis','y_axis'], inplace=True)
    df = df.applymap(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)
    df = df.applymap(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)
    df['mat_filename'] = params_file.replace('/Survey/','/MATs/').replace('_params_','_mat_wl4_').replace('_Params.mat', '_MS2024.mat')
    df_list.append(df)

df_all = pd.concat(df_list, ignore_index=True)
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,mat_filename
0,data/musicCorp/fma_large/083/083813.mp3,083813,,,1,1234800,44.1,28,0.032880,28,STM_output/MATs/music_mat_wl4_fma_large/083813...
1,data/musicCorp/fma_large/034/034059.mp3,034059,,,1,1234800,44.1,28,0.034807,28,STM_output/MATs/music_mat_wl4_fma_large/034059...
2,data/musicCorp/fma_large/092/092378.mp3,092378,,,1,1234800,44.1,28,0.020907,28,STM_output/MATs/music_mat_wl4_fma_large/092378...
3,data/musicCorp/fma_large/013/013935.mp3,013935,,,1,1234800,44.1,28,0.006553,28,STM_output/MATs/music_mat_wl4_fma_large/013935...
4,data/musicCorp/fma_large/148/148436.mp3,148436,,,1,1234800,44.1,28,0.006735,28,STM_output/MATs/music_mat_wl4_fma_large/148436...
...,...,...,...,...,...,...,...,...,...,...,...
9995,data/musicCorp/fma_large/135/135403.mp3,135403,,,1,1234800,44.1,28,0.767574,24,STM_output/MATs/music_mat_wl4_fma_large/135403...
9996,data/musicCorp/fma_large/052/052418.mp3,052418,,,1,1234800,44.1,28,0.044943,28,STM_output/MATs/music_mat_wl4_fma_large/052418...
9997,data/musicCorp/fma_large/103/103604.mp3,103604,,,1,1234800,44.1,28,0.032653,28,STM_output/MATs/music_mat_wl4_fma_large/103604...
9998,data/musicCorp/fma_large/118/118270.mp3,118270,,,1,1234800,44.1,28,0.033061,28,STM_output/MATs/music_mat_wl4_fma_large/118270...


In [3]:
def revert_numerical_string(num_str):
    return num_str.lstrip('0')
df_all['filename'] = df_all['filename'].apply(lambda x: revert_numerical_string(x)).astype(int)
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,mat_filename
0,data/musicCorp/fma_large/083/083813.mp3,83813,,,1,1234800,44.1,28,0.032880,28,STM_output/MATs/music_mat_wl4_fma_large/083813...
1,data/musicCorp/fma_large/034/034059.mp3,34059,,,1,1234800,44.1,28,0.034807,28,STM_output/MATs/music_mat_wl4_fma_large/034059...
2,data/musicCorp/fma_large/092/092378.mp3,92378,,,1,1234800,44.1,28,0.020907,28,STM_output/MATs/music_mat_wl4_fma_large/092378...
3,data/musicCorp/fma_large/013/013935.mp3,13935,,,1,1234800,44.1,28,0.006553,28,STM_output/MATs/music_mat_wl4_fma_large/013935...
4,data/musicCorp/fma_large/148/148436.mp3,148436,,,1,1234800,44.1,28,0.006735,28,STM_output/MATs/music_mat_wl4_fma_large/148436...
...,...,...,...,...,...,...,...,...,...,...,...
9995,data/musicCorp/fma_large/135/135403.mp3,135403,,,1,1234800,44.1,28,0.767574,24,STM_output/MATs/music_mat_wl4_fma_large/135403...
9996,data/musicCorp/fma_large/052/052418.mp3,52418,,,1,1234800,44.1,28,0.044943,28,STM_output/MATs/music_mat_wl4_fma_large/052418...
9997,data/musicCorp/fma_large/103/103604.mp3,103604,,,1,1234800,44.1,28,0.032653,28,STM_output/MATs/music_mat_wl4_fma_large/103604...
9998,data/musicCorp/fma_large/118/118270.mp3,118270,,,1,1234800,44.1,28,0.033061,28,STM_output/MATs/music_mat_wl4_fma_large/118270...


In [7]:
df_tracks = pd.read_csv('data/musicCorp/fma_large/fma_metadata/tracks.csv', low_memory=True, header=1)
df_tracks.rename(columns={'Unnamed: 0': 'track_id'}, inplace=True)
df_tracks['track_id'] = pd.to_numeric(df_tracks['track_id'], errors='coerce')
df_tracks.drop(index=0, inplace=True)
df_tracks

  df_tracks = pd.read_csv('data/musicCorp/fma_large/fma_metadata/tracks.csv', low_memory=True, header=1)


Unnamed: 0,track_id,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,...,information.1,interest,language_code,license,listens.1,lyricist,number,publisher,tags.2,title.1
1,2.0,0.0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4.0,1.0,<p></p>,6073.0,,...,,4656.0,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293.0,,3.0,,[],Food
2,3.0,0.0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4.0,1.0,<p></p>,6073.0,,...,,1470.0,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514.0,,4.0,,[],Electric Ave
3,5.0,0.0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4.0,1.0,<p></p>,6073.0,,...,,1933.0,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151.0,,6.0,,[],This World
4,10.0,0.0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4.0,6.0,,47632.0,,...,,54881.0,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135.0,,1.0,,[],Freeway
5,20.0,0.0,2008-11-26 01:45:05,2009-01-06 00:00:00,,2.0,4.0,"<p> ""spiritual songs"" from Nicky Cook</p>",2710.0,,...,,978.0,en,Attribution-NonCommercial-NoDerivatives (aka M...,361.0,,3.0,,[],Spiritual Level
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106570,155316.0,0.0,2017-03-30 15:20:35,2017-02-17 00:00:00,,0.0,22940.0,"<p>A live performance at Monty Hall on Feb 17,...",1506.0,Monty Hall,...,,122.0,,Creative Commons Attribution-NonCommercial-NoD...,102.0,,3.0,,[],The Auger
106571,155317.0,0.0,2017-03-30 15:20:35,2017-02-17 00:00:00,,0.0,22940.0,"<p>A live performance at Monty Hall on Feb 17,...",1506.0,Monty Hall,...,,194.0,,Creative Commons Attribution-NonCommercial-NoD...,165.0,,4.0,,[],Let's Skin Ruby
106572,155318.0,0.0,2017-03-30 15:20:35,2017-02-17 00:00:00,,0.0,22940.0,"<p>A live performance at Monty Hall on Feb 17,...",1506.0,Monty Hall,...,,214.0,,Creative Commons Attribution-NonCommercial-NoD...,168.0,,6.0,,[],My House Smells Like Kim Deal/Pulp
106573,155319.0,0.0,2017-03-30 15:20:35,2017-02-17 00:00:00,,0.0,22940.0,"<p>A live performance at Monty Hall on Feb 17,...",1506.0,Monty Hall,...,,336.0,,Creative Commons Attribution-NonCommercial-NoD...,294.0,,5.0,,[],The Man With Two Mouths


In [22]:
sum(df_tracks['track_id'] == 83813)

1

In [8]:
import ast
def convert_to_list(string_value):
    return ast.literal_eval(string_value)
df_tracks['genres_all'] = df_tracks['genres_all'].apply(convert_to_list)

In [17]:
df_tracks['genres_all'].count()

106574

In [6]:
df_tracks.columns

Index(['track_id', 'comments', 'date_created', 'date_released', 'engineer',
       'favorites', 'id', 'information', 'listens', 'producer', 'tags',
       'title', 'tracks', 'type', 'active_year_begin', 'active_year_end',
       'associated_labels', 'bio', 'comments.1', 'date_created.1',
       'favorites.1', 'id.1', 'latitude', 'location', 'longitude', 'members',
       'name', 'related_projects', 'tags.1', 'website', 'wikipedia_page',
       'split', 'subset', 'bit_rate', 'comments.2', 'composer',
       'date_created.2', 'date_recorded', 'duration', 'favorites.2',
       'genre_top', 'genres', 'genres_all', 'information.1', 'interest',
       'language_code', 'license', 'listens.1', 'lyricist', 'number',
       'publisher', 'tags.2', 'title.1'],
      dtype='object')

In [9]:
df_all = pd.merge(df_all, df_tracks[['track_id','name','language_code','genres_all']], left_on='filename', right_on='track_id', how='left')
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,mat_filename,track_id,name,language_code,genres_all
0,data/musicCorp/fma_large/083/083813.mp3,83813,,,1,1234800,44.1,28,0.032880,28,STM_output/MATs/music_mat_wl4_fma_large/083813...,83813.0,Brandon Ross,,"[4, 37]"
1,data/musicCorp/fma_large/034/034059.mp3,34059,,,1,1234800,44.1,28,0.034807,28,STM_output/MATs/music_mat_wl4_fma_large/034059...,34059.0,Sound Of Ground,,"[58, 3, 12, 45]"
2,data/musicCorp/fma_large/092/092378.mp3,92378,,,1,1234800,44.1,28,0.020907,28,STM_output/MATs/music_mat_wl4_fma_large/092378...,92378.0,M-PeX,,"[4, 21, 15]"
3,data/musicCorp/fma_large/013/013935.mp3,13935,,,1,1234800,44.1,28,0.006553,28,STM_output/MATs/music_mat_wl4_fma_large/013935...,13935.0,Milton Cross,en,"[1235, 224, 107, 38]"
4,data/musicCorp/fma_large/148/148436.mp3,148436,,,1,1234800,44.1,28,0.006735,28,STM_output/MATs/music_mat_wl4_fma_large/148436...,148436.0,Poder Fantasma,,"[2, 46]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,data/musicCorp/fma_large/135/135403.mp3,135403,,,1,1234800,44.1,28,0.767574,24,STM_output/MATs/music_mat_wl4_fma_large/135403...,135403.0,Kosta T,,"[1, 514, 38, 107, 18, 1235, 250]"
9996,data/musicCorp/fma_large/052/052418.mp3,52418,,,1,1234800,44.1,28,0.044943,28,STM_output/MATs/music_mat_wl4_fma_large/052418...,52418.0,Delmore fx,,"[10, 12, 76, 17, 27, 94]"
9997,data/musicCorp/fma_large/103/103604.mp3,103604,,,1,1234800,44.1,28,0.032653,28,STM_output/MATs/music_mat_wl4_fma_large/103604...,103604.0,Meso,,"[456, 42, 38, 15]"
9998,data/musicCorp/fma_large/118/118270.mp3,118270,,,1,1234800,44.1,28,0.033061,28,STM_output/MATs/music_mat_wl4_fma_large/118270...,118270.0,luciana bass,,"[74, 250, 4, 38]"


In [15]:
df_all['LangOrInstru'] = df_all['language_code']
df_all.drop(columns=['language_code','track_id'], inplace=True)
df_all.rename(columns={'genres_all': 'genre', 'name': 'speaker/artist'}, inplace=True)
df_all['gender'] = np.nan
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,mat_filename,speaker/artist,genre,gender
0,data/musicCorp/fma_large/083/083813.mp3,83813,,,1,1234800,44.1,28,0.032880,28,STM_output/MATs/music_mat_wl4_fma_large/083813...,Brandon Ross,"[4, 37]",
1,data/musicCorp/fma_large/034/034059.mp3,34059,,,1,1234800,44.1,28,0.034807,28,STM_output/MATs/music_mat_wl4_fma_large/034059...,Sound Of Ground,"[58, 3, 12, 45]",
2,data/musicCorp/fma_large/092/092378.mp3,92378,,,1,1234800,44.1,28,0.020907,28,STM_output/MATs/music_mat_wl4_fma_large/092378...,M-PeX,"[4, 21, 15]",
3,data/musicCorp/fma_large/013/013935.mp3,13935,en,,1,1234800,44.1,28,0.006553,28,STM_output/MATs/music_mat_wl4_fma_large/013935...,Milton Cross,"[1235, 224, 107, 38]",
4,data/musicCorp/fma_large/148/148436.mp3,148436,,,1,1234800,44.1,28,0.006735,28,STM_output/MATs/music_mat_wl4_fma_large/148436...,Poder Fantasma,"[2, 46]",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,data/musicCorp/fma_large/135/135403.mp3,135403,,,1,1234800,44.1,28,0.767574,24,STM_output/MATs/music_mat_wl4_fma_large/135403...,Kosta T,"[1, 514, 38, 107, 18, 1235, 250]",
9996,data/musicCorp/fma_large/052/052418.mp3,52418,,,1,1234800,44.1,28,0.044943,28,STM_output/MATs/music_mat_wl4_fma_large/052418...,Delmore fx,"[10, 12, 76, 17, 27, 94]",
9997,data/musicCorp/fma_large/103/103604.mp3,103604,,,1,1234800,44.1,28,0.032653,28,STM_output/MATs/music_mat_wl4_fma_large/103604...,Meso,"[456, 42, 38, 15]",
9998,data/musicCorp/fma_large/118/118270.mp3,118270,,,1,1234800,44.1,28,0.033061,28,STM_output/MATs/music_mat_wl4_fma_large/118270...,luciana bass,"[74, 250, 4, 38]",


In [12]:
df_genres = pd.read_csv('data/musicCorp/fma_large/fma_metadata/genres.csv', index_col='genre_id')
df_genres

Unnamed: 0_level_0,#tracks,parent,title,top_level
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,8693,38,Avant-Garde,38
2,5271,0,International,2
3,1752,0,Blues,3
4,4126,0,Jazz,4
5,4106,0,Classical,5
...,...,...,...,...
1032,60,102,Turkish,2
1060,30,46,Tango,2
1156,26,130,Fado,2
1193,72,763,Christmas,38


In [16]:
def replace_genre_names(lst):
    return [df_genres['title'][n] for n in lst]

df_all['genre'] = df_all['genre'].apply(lambda x: replace_genre_names(x))
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,mat_filename,speaker/artist,genre,gender
0,data/musicCorp/fma_large/083/083813.mp3,83813,,,1,1234800,44.1,28,0.032880,28,STM_output/MATs/music_mat_wl4_fma_large/083813...,Brandon Ross,"[Jazz, Jazz: Vocal]",
1,data/musicCorp/fma_large/034/034059.mp3,34059,,,1,1234800,44.1,28,0.034807,28,STM_output/MATs/music_mat_wl4_fma_large/034059...,Sound Of Ground,"[Psych-Rock, Blues, Rock, Loud-Rock]",
2,data/musicCorp/fma_large/092/092378.mp3,92378,,,1,1234800,44.1,28,0.020907,28,STM_output/MATs/music_mat_wl4_fma_large/092378...,M-PeX,"[Jazz, Hip-Hop, Electronic]",
3,data/musicCorp/fma_large/013/013935.mp3,13935,en,,1,1234800,44.1,28,0.006553,28,STM_output/MATs/music_mat_wl4_fma_large/013935...,Milton Cross,"[Instrumental, Sound Collage, Ambient, Experim...",
4,data/musicCorp/fma_large/148/148436.mp3,148436,,,1,1234800,44.1,28,0.006735,28,STM_output/MATs/music_mat_wl4_fma_large/148436...,Poder Fantasma,"[International, Latin America]",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,data/musicCorp/fma_large/135/135403.mp3,135403,,,1,1234800,44.1,28,0.767574,24,STM_output/MATs/music_mat_wl4_fma_large/135403...,Kosta T,"[Avant-Garde, Sound Art, Experimental, Ambient...",
9996,data/musicCorp/fma_large/052/052418.mp3,52418,,,1,1234800,44.1,28,0.044943,28,STM_output/MATs/music_mat_wl4_fma_large/052418...,Delmore fx,"[Pop, Rock, Experimental Pop, Folk, Lo-Fi, Fre...",
9997,data/musicCorp/fma_large/103/103604.mp3,103604,,,1,1234800,44.1,28,0.032653,28,STM_output/MATs/music_mat_wl4_fma_large/103604...,Meso,"[Minimalism, Ambient Electronic, Experimental,...",
9998,data/musicCorp/fma_large/118/118270.mp3,118270,,,1,1234800,44.1,28,0.033061,28,STM_output/MATs/music_mat_wl4_fma_large/118270...,luciana bass,"[Free-Jazz, Improv, Jazz, Experimental]",
