In [1]:
import scipy.io as sio
import pandas as pd
import numpy as np
import glob
import csv
import json
import os

In [2]:
corpus_speech_list = ['BibleTTS/akuapem-twi',
    'BibleTTS/asante-twi',
    'BibleTTS/ewe',
    'BibleTTS/hausa',
    'BibleTTS/lingala',
    'BibleTTS/yoruba',
    'Buckeye',
    'EUROM',
    'LibriSpeech',
    # 'LibriVox',
    'MediaSpeech/AR',
    'MediaSpeech/ES',
    'MediaSpeech/FR',
    'MediaSpeech/TR',
    'MozillaCommonVoice/ab',
    'MozillaCommonVoice/ar',
    'MozillaCommonVoice/ba',
    'MozillaCommonVoice/be',
    'MozillaCommonVoice/bg',
    'MozillaCommonVoice/bn',
    'MozillaCommonVoice/br',
    'MozillaCommonVoice/ca',
    'MozillaCommonVoice/ckb',
    'MozillaCommonVoice/cnh',
    'MozillaCommonVoice/cs',
    'MozillaCommonVoice/cv',
    'MozillaCommonVoice/cy',
    'MozillaCommonVoice/da',
    'MozillaCommonVoice/de',
    'MozillaCommonVoice/dv',
    'MozillaCommonVoice/el',
    'MozillaCommonVoice/en',
    'MozillaCommonVoice/eo',
    'MozillaCommonVoice/es',
    'MozillaCommonVoice/et',
    'MozillaCommonVoice/eu',
    'MozillaCommonVoice/fa',
    'MozillaCommonVoice/fi',
    'MozillaCommonVoice/fr',
    'MozillaCommonVoice/fy-NL',
    'MozillaCommonVoice/ga-IE',
    'MozillaCommonVoice/gl',
    'MozillaCommonVoice/gn',
    'MozillaCommonVoice/hi',
    'MozillaCommonVoice/hu',
    'MozillaCommonVoice/hy-AM',
    'MozillaCommonVoice/id',
    'MozillaCommonVoice/ig',
    'MozillaCommonVoice/it',
    'MozillaCommonVoice/ja',
    'MozillaCommonVoice/ka',
    'MozillaCommonVoice/kab',
    'MozillaCommonVoice/kk',
    'MozillaCommonVoice/kmr',
    'MozillaCommonVoice/ky',
    'MozillaCommonVoice/lg',
    'MozillaCommonVoice/lt',
    'MozillaCommonVoice/ltg',
    'MozillaCommonVoice/lv',
    'MozillaCommonVoice/mhr',
    'MozillaCommonVoice/ml',
    'MozillaCommonVoice/mn',
    'MozillaCommonVoice/mt',
    'MozillaCommonVoice/nan-tw',
    'MozillaCommonVoice/nl',
    'MozillaCommonVoice/oc',
    'MozillaCommonVoice/or',
    'MozillaCommonVoice/pl',
    'MozillaCommonVoice/pt',
    'MozillaCommonVoice/ro',
    'MozillaCommonVoice/ru',
    'MozillaCommonVoice/rw',
    'MozillaCommonVoice/sr',
    'MozillaCommonVoice/sv-SE',
    'MozillaCommonVoice/sw',
    'MozillaCommonVoice/ta',
    'MozillaCommonVoice/th',
    'MozillaCommonVoice/tr',
    'MozillaCommonVoice/tt',
    'MozillaCommonVoice/ug',
    'MozillaCommonVoice/uk',
    'MozillaCommonVoice/ur',
    'MozillaCommonVoice/uz',
    'MozillaCommonVoice/vi',
    'MozillaCommonVoice/yo',
    'MozillaCommonVoice/yue',
    'MozillaCommonVoice/zh-CN',
    'MozillaCommonVoice/zh-TW',
    'primewords_chinese',
    'room_reader',
    'SpeechClarity',
    'TAT-Vol2',
    'thchs30',
    'TIMIT',
    'TTS_Javanese',
    'zeroth_korean',]

corpus_music_list = ['IRMAS',
                     'Albouy2020Science',
                     'CD',]

In [3]:
def make_meta_file(corpus, corpus_type):
    
    params_list = glob.glob('STM_output/Survey/'+corpus_type+'_params_'+corpus+'/*')
    df_list = []

    # load the data from the mat file
    for params_file in params_list:
        # print(params_file)
        data_dict = sio.loadmat(params_file)
        structure_dict = {field: data_dict['Params'][field][0] for field in data_dict['Params'].dtype.names}
        df = pd.DataFrame(structure_dict)
        df.drop(columns=['x_axis','y_axis'], inplace=True)
        df = df.applymap(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)
        df = df.applymap(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)
        df['mat_filename'] = params_file.replace('/Survey/','/MATs/').replace('_params_','_mat_wl4_').replace('_Params.mat', '_MS2024.mat')
        df_list.append(df)
    
    df_all = pd.concat(df_list, ignore_index=True)

    # add the speaker ID and gender info
    if 'MozillaCommonVoice' in corpus:
        valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)
        valid_df['path'] = valid_df['path'].str.replace('.mp3', '')
        valid_df.rename(columns={'client_id':'speaker/artist'}, inplace=True)
        df_all = df_all.merge(valid_df[['speaker/artist', 'path', 'gender']], how='left', left_on='filename', right_on='path').drop(columns=['path'])
    elif 'BibleTTS' in corpus:
        df_all['speaker/artist'] = 'BibleTTS_' + df_all['filename'].str[:3]
        df_all['gender'] = np.nan
    elif 'Buckeye' in corpus:
        df_all['speaker/artist'] = df_all['filename'].str[:3].str.replace('s', 'S')
        df_gender = pd.read_csv('data/speechCorp/Buckeye/Buckeye_speaker_info.csv')
        df_all = df_all.merge(df_gender[['SPEAKER', "SPEAKER'S GENDER"]], how='left', left_on='speaker/artist', right_on='SPEAKER').drop(columns=['SPEAKER'])
        df_all.rename(columns={"SPEAKER'S GENDER":'gender'}, inplace=True)
        df_all['speaker/artist'] = 'Buckeye_'+df_all['speaker/artist']
    elif 'EUROM' in corpus:
        df_all['speaker/artist'] = 'EUROM_' + df_all['LangOrInstru'] +'_'+ df_all['filename'].str[:2]
        df_all['gender'] = np.nan
    elif 'MediaSpeech' in corpus:
        df_all['speaker/artist'] = 'MediaSpeech_' + df_all['LangOrInstru'] +'_'+ df_all['filename']
        df_all['gender'] = np.nan
    elif 'LibriSpeech' in corpus:
        split_names = df_all['filename'].str.split('-') # Split the 'name' column by "-"
        first_parts = split_names.str[0] # Extract the first part of the split result
        df_all['speaker/artist'] = first_parts # Add the extracted part as a new column in the DataFrame
        
        ## load LibriSpeech text file
        reader_ids = []
        genders = []
        subsets = []
        durations = []
        names = []
        # Open the file and read line by line
        with open('data/speechCorp/LibriSpeech/SPEAKERS.TXT', 'r') as file:
            # Skip lines starting with ";" (comments) until reaching the data
            while True:
                line = file.readline()
                if not line.startswith(';'):
                    # Start processing data from this line
                    break
            
            # Read the rest of the lines and parse them
            while line:
                # Split each line by "|" character
                data = line.strip().split('|')
                # Extract relevant information
                reader_ids.append(int(data[0].strip()))
                genders.append(data[1].strip())
                subsets.append(data[2].strip())
                durations.append(float(data[3].strip()))
                names.append(data[4].strip())
                # Read the next line
                line = file.readline()
        
        # Create a DataFrame using the lists
        df_LibriSpeech = pd.DataFrame({'reader_id': reader_ids, 'gender': genders, 'subset': subsets, 'duration': durations, 'name': names})
        df_LibriSpeech['reader_id']= df_LibriSpeech['reader_id'].astype(str)
        df_all = df_all.merge(df_LibriSpeech[['reader_id', 'gender']], how='left', left_on='speaker/artist', right_on='reader_id').drop(columns=['reader_id'])
    elif 'primewords_chinese' in corpus:
        with open('data/speechCorp/primewords_chinese/set1_transcript.json', 'r') as file:
            data = json.load(file)
        primewords_df = pd.DataFrame(data)
        primewords_df['file'] = primewords_df['file'].str.replace('.wav', '')
        df_all = df_all.merge(primewords_df[['file', 'user_id']], how='left', left_on='filename', right_on='file').drop(columns=['file'])
        df_all.rename(columns={"user_id":'speaker/artist'}, inplace=True)
        df_all['speaker/artist'] = 'primewords_'+df_all['speaker/artist']
        df_all['gender']=np.nan
    elif 'room_reader' in corpus:
        split_names = df_all['filename'].str.split('_') # Split the 'name' column by "-"
        df_all['speaker/artist'] = split_names.str[1] # Extract the first part of the split result
        RR_df = pd.read_excel('data/speechCorp/room_reader/RoomReader_SessionsEvents.xlsx')
        df_all = df_all.merge(RR_df[['part_ID', 'gender']], how='left', left_on='speaker/artist', right_on='part_ID').drop(columns=['part_ID'])
        df_all['speaker/artist'] = 'RoomReader_'+df_all['speaker/artist']
    elif 'SpeechClarity' in corpus:
        df_all['speaker/artist'] = 'SpeechClarity_'+df_all['filename'].str[:3]
        df_all['gender'] = np.nan
    elif 'TAT-Vol2' in corpus:
        df_all['speaker/artist'] = 'TAT-Vol2_'+df_all['filename'].str[:10]
        df_all['gender'] = df_all['filename'].str[5]
    elif 'thchs30' in corpus:
        split_names = df_all['filename'].str.split('_') # Split the 'name' column by "-"
        df_all['speaker/artist'] = 'thchs30_'+split_names.str[0] # Extract the first part of the split result
        df_all['gender'] = np.nan
    elif 'TIMIT' in corpus:
        split_names = df_all['filepath'].str.split('/') # Split the 'name' column by "-"
        df_all['speaker/artist'] = 'TIMIT_'+split_names.str[-2] # Extract the first part of the split result
        df_all['gender'] = df_all['speaker/artist'].str[6]
    elif 'TTS_Javanese' in corpus:
        df_all['speaker/artist'] = df_all['filename'].str[:9] 
        df_all['gender'] = df_all['filename'].str[2]
    elif 'zeroth' in corpus:
        df_all['speaker/artist'] = df_all['filename'].str[:3] 
        zeroth_df = pd.read_csv('data/speechCorp/zeroth_korean/AUDIO_INFO', sep="|")
        zeroth_df['SPEAKERID'] = zeroth_df['SPEAKERID'].astype(str)
        df_all = df_all.merge(zeroth_df[['SPEAKERID', 'SEX']], how='left', left_on='speaker/artist', right_on='SPEAKERID').drop(columns=['SPEAKERID'])
        df_all.rename(columns={"SEX":'gender'}, inplace=True)
        df_all['speaker/artist'] = 'zeroth_'+df_all['speaker/artist']

    # Music corpora
    elif 'IRMAS' in corpus:
        for nRow in range(len(df_all)):
            txt_path = df_all['filepath'][nRow].replace('.wav', '.txt')
            if os.path.exists(txt_path):
                # Open the file in read mode
                with open(txt_path, 'r') as file:
                    # Read all lines into a list
                    lines = file.readlines()
                converted_lines = [line.strip() for line in lines]
                df_all.loc[nRow,'LangOrInstru'] = '-'.join(converted_lines)
                df_all.loc[nRow,'VoiOrNot'] = int('voi' in converted_lines)
                
        # df_all['speaker/artist'] = '-'.join(df_all['filename'].split('-')[:-1]).strip()
        df_all['speaker/artist'] = df_all['filename'].apply(lambda x: '-'.join(x.split('-')[:-1]).strip())
        df_all['gender'] = np.nan
        df_all['genre'] = np.nan
        
    elif 'Albouy2020Science' in corpus:
        df_all['speaker/artist'] = 'Albouy2020Science'
        df_all['gender'] = 'female'
        df_all['genre'] = 'classical'

    elif 'CD' in corpus:
        from fuzzywuzzy import process
        df_all['genre'] = 'classical'
        def extract_artist(file_path):
            parts = file_path.split('/')
            artist_album_part = parts[3]
            return artist_album_part.split('_')[0]
        
        df_all['speaker/artist'] = df_all['filepath'].apply(extract_artist)
        df_CD = pd.read_excel('data/musicCorp/CD/CD_music_list.xlsx')

        # Function to find the best match for each name in df1 from df2
        def find_best_match(name, choices):
            return process.extractOne(name, choices)
        
        df_all['Best_Match'] = df_all['filename'].apply(lambda x: find_best_match(x, df_CD['Piece'])) # Apply the function to find the best match for each name in df1
        df_all['Matched_Name'] = df_all['Best_Match'].apply(lambda x: x[0]) # Extract matched names and similarity scores
        df_all['Similarity_Score'] = df_all['Best_Match'].apply(lambda x: x[1])
        
        # Join based on matched names
        df_all = pd.merge(df_all, df_CD[['Piece', 'Genre','Instrument']], left_on='Matched_Name', right_on='Piece', how='left')
        df_all['LangOrInstru'] = df_all['Instrument']
        df_all.drop(columns=['Best_Match','Instrument','Matched_Name','Similarity_Score','Piece'], inplace = True)
        df.rename(columns={'Genre': 'genre'}, inplace = True)


    return df_all

In [4]:
for corpus in corpus_speech_list:
    print(corpus)
    df_all = make_meta_file(corpus, corpus_type='speech')
    df_all.to_csv('STM_output/STM_metaData/metaData_'+corpus.replace('/', '-')+'.csv')


BibleTTS/akuapem-twi
BibleTTS/asante-twi
BibleTTS/ewe
BibleTTS/hausa
BibleTTS/lingala
BibleTTS/yoruba
Buckeye
EUROM
LibriSpeech
MediaSpeech/AR
MediaSpeech/ES
MediaSpeech/FR
MediaSpeech/TR
MozillaCommonVoice/ab
MozillaCommonVoice/ar


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ba


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/be


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/bg
MozillaCommonVoice/bn
MozillaCommonVoice/br
MozillaCommonVoice/ca


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ckb
MozillaCommonVoice/cnh
MozillaCommonVoice/cs
MozillaCommonVoice/cv
MozillaCommonVoice/cy
MozillaCommonVoice/da
MozillaCommonVoice/de


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/dv
MozillaCommonVoice/el
MozillaCommonVoice/en


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/eo


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/es


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/et
MozillaCommonVoice/eu
MozillaCommonVoice/fa


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/fi
MozillaCommonVoice/fr
MozillaCommonVoice/fy-NL
MozillaCommonVoice/ga-IE
MozillaCommonVoice/gl
MozillaCommonVoice/gn
MozillaCommonVoice/hi
MozillaCommonVoice/hu
MozillaCommonVoice/hy-AM
MozillaCommonVoice/id
MozillaCommonVoice/ig
MozillaCommonVoice/it
MozillaCommonVoice/ja
MozillaCommonVoice/ka


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/kab


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/kk
MozillaCommonVoice/kmr
MozillaCommonVoice/ky
MozillaCommonVoice/lg


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/lt
MozillaCommonVoice/ltg
MozillaCommonVoice/lv
MozillaCommonVoice/mhr


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ml
MozillaCommonVoice/mn
MozillaCommonVoice/mt
MozillaCommonVoice/nan-tw
MozillaCommonVoice/nl
MozillaCommonVoice/oc
MozillaCommonVoice/or
MozillaCommonVoice/pl


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/pt


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ro
MozillaCommonVoice/ru
MozillaCommonVoice/rw


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/sr
MozillaCommonVoice/sv-SE
MozillaCommonVoice/sw


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ta


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/th


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/tr


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/tt
MozillaCommonVoice/ug
MozillaCommonVoice/uk


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/ur
MozillaCommonVoice/uz


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


MozillaCommonVoice/vi
MozillaCommonVoice/yo
MozillaCommonVoice/yue
MozillaCommonVoice/zh-CN
MozillaCommonVoice/zh-TW


  valid_df = pd.read_csv('data/speechCorp/'+corpus+'/validated.tsv', sep='\t', quoting=csv.QUOTE_NONE)


primewords_chinese
room_reader
SpeechClarity
TAT-Vol2
thchs30
TIMIT
TTS_Javanese
zeroth_korean


In [None]:
for corpus in corpus_music_list:
    print(corpus)
    df_all = make_meta_file(corpus, corpus_type='music')
    df_all.to_csv('STM_output/STM_metaData/metaData_'+corpus.replace('/', '-')+'.csv')

## test code ##

In [61]:
corpus = 'CD'

corpus_type='music'

params_list = glob.glob('STM_output/Survey/'+corpus_type+'_params_'+corpus+'/*')
df_list = []

# load the data from the mat file
for params_file in params_list:
    # print(params_file)
    data_dict = sio.loadmat(params_file)
    structure_dict = {field: data_dict['Params'][field][0] for field in data_dict['Params'].dtype.names}
    df = pd.DataFrame(structure_dict)
    df.drop(columns=['x_axis','y_axis'], inplace=True)
    df = df.applymap(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)
    df = df.applymap(lambda x: x[0] if isinstance(x, (list, np.ndarray)) else x)
    df['mat_filename'] = params_file.replace('/Survey/','/MATs/').replace('_params_','_mat_wl4_').replace('_Params.mat', '_MS2024.mat')
    df_list.append(df)

df_all = pd.concat(df_list, ignore_index=True)
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,mat_filename
0,data/musicCorp/CD/Frans Brüggen_ Orchestra Of...,"04 Mozart_ Symphony #41 In C, K 551, _Jupiter_...",nonIRMAS,,1,5292000,44.1,120,0.042404,120,STM_output/MATs/music_mat_wl4_CD/04 Mozart_ Sy...
1,data/musicCorp/CD/Yo-Yo Ma/Bach_ Cello Suites ...,"1-18 Bach_ Cello Suite #5 In C Minor, BWV 1011...",nonIRMAS,,1,5292000,44.1,120,0.157029,120,STM_output/MATs/music_mat_wl4_CD/1-18 Bach_ Ce...
2,data/musicCorp/CD/The Dave Brubeck Quartet/Tim...,07 Pick Up Sticks,nonIRMAS,,1,5292000,44.1,120,0.069320,120,STM_output/MATs/music_mat_wl4_CD/07 Pick Up St...
3,data/musicCorp/CD/Charles Mackerras_ Scottish ...,"3-05 Beethoven_ Symphony #6 In F, Op. 68, _Pas...",nonIRMAS,,1,5292000,44.1,120,0.552585,120,STM_output/MATs/music_mat_wl4_CD/3-05 Beethove...
4,data/musicCorp/CD/Edgar Meyer/Bach_ Unaccompan...,"09 Bach_ Cello Suite #1 In G, BWV 1007 - 3. Co...",nonIRMAS,,1,5292000,44.1,120,0.041179,120,STM_output/MATs/music_mat_wl4_CD/09 Bach_ Cell...
...,...,...,...,...,...,...,...,...,...,...,...
417,data/musicCorp/CD/Arcade Fire/Arcade Fire/09 S...,09 Suburban War,nonIRMAS,,1,5292000,44.1,120,0.147664,120,STM_output/MATs/music_mat_wl4_CD/09 Suburban W...
418,data/musicCorp/CD/The Beatles/1962-1966 [Disc ...,1-05 I Want To Hold Your Hand,nonIRMAS,,1,5292000,44.1,120,0.405828,120,STM_output/MATs/music_mat_wl4_CD/1-05 I Want T...
419,data/musicCorp/CD/Yo-Yo Ma/Bach_ Cello Suites ...,"1-14 Bach_ Cello Suite #5 In C Minor, BWV 1011...",nonIRMAS,,1,5292000,44.1,120,0.126644,120,STM_output/MATs/music_mat_wl4_CD/1-14 Bach_ Ce...
420,data/musicCorp/CD/Charles Mackerras_ Scottish ...,"2-02 Beethoven_ Symphony #3 In E Flat, Op. 55,...",nonIRMAS,,1,5644800,44.1,128,0.956100,120,STM_output/MATs/music_mat_wl4_CD/2-02 Beethove...


In [62]:
import os
import scipy.io as sio
import pandas as pd
import numpy as np
import glob
import csv
import json


        
# df_all['speaker/artist'] = '-'.join(df_all['filename'].split('-')[:-1]).strip()
df_all['speaker/artist'] = df_all['filename']
df_all['gender'] = np.nan
df_all.head()

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,mat_filename,speaker/song,gender
0,data/musicCorp/CD/Frans Brüggen_ Orchestra Of...,"04 Mozart_ Symphony #41 In C, K 551, _Jupiter_...",nonIRMAS,,1,5292000,44.1,120,0.042404,120,STM_output/MATs/music_mat_wl4_CD/04 Mozart_ Sy...,"04 Mozart_ Symphony #41 In C, K 551, _Jupiter_...",
1,data/musicCorp/CD/Yo-Yo Ma/Bach_ Cello Suites ...,"1-18 Bach_ Cello Suite #5 In C Minor, BWV 1011...",nonIRMAS,,1,5292000,44.1,120,0.157029,120,STM_output/MATs/music_mat_wl4_CD/1-18 Bach_ Ce...,"1-18 Bach_ Cello Suite #5 In C Minor, BWV 1011...",
2,data/musicCorp/CD/The Dave Brubeck Quartet/Tim...,07 Pick Up Sticks,nonIRMAS,,1,5292000,44.1,120,0.06932,120,STM_output/MATs/music_mat_wl4_CD/07 Pick Up St...,07 Pick Up Sticks,
3,data/musicCorp/CD/Charles Mackerras_ Scottish ...,"3-05 Beethoven_ Symphony #6 In F, Op. 68, _Pas...",nonIRMAS,,1,5292000,44.1,120,0.552585,120,STM_output/MATs/music_mat_wl4_CD/3-05 Beethove...,"3-05 Beethoven_ Symphony #6 In F, Op. 68, _Pas...",
4,data/musicCorp/CD/Edgar Meyer/Bach_ Unaccompan...,"09 Bach_ Cello Suite #1 In G, BWV 1007 - 3. Co...",nonIRMAS,,1,5292000,44.1,120,0.041179,120,STM_output/MATs/music_mat_wl4_CD/09 Bach_ Cell...,"09 Bach_ Cello Suite #1 In G, BWV 1007 - 3. Co...",


In [64]:
def extract_artist(file_path):
    parts = file_path.split('/')
    artist_album_part = parts[3]
    return artist_album_part.split('_')[0]
    
df_all['speaker/artist'] = df_all['filepath'].apply(extract_artist)
df_all

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,mat_filename,speaker/song,gender
0,data/musicCorp/CD/Frans Brüggen_ Orchestra Of...,"04 Mozart_ Symphony #41 In C, K 551, _Jupiter_...",nonIRMAS,,1,5292000,44.1,120,0.042404,120,STM_output/MATs/music_mat_wl4_CD/04 Mozart_ Sy...,Frans Brüggen,
1,data/musicCorp/CD/Yo-Yo Ma/Bach_ Cello Suites ...,"1-18 Bach_ Cello Suite #5 In C Minor, BWV 1011...",nonIRMAS,,1,5292000,44.1,120,0.157029,120,STM_output/MATs/music_mat_wl4_CD/1-18 Bach_ Ce...,Yo-Yo Ma,
2,data/musicCorp/CD/The Dave Brubeck Quartet/Tim...,07 Pick Up Sticks,nonIRMAS,,1,5292000,44.1,120,0.069320,120,STM_output/MATs/music_mat_wl4_CD/07 Pick Up St...,The Dave Brubeck Quartet,
3,data/musicCorp/CD/Charles Mackerras_ Scottish ...,"3-05 Beethoven_ Symphony #6 In F, Op. 68, _Pas...",nonIRMAS,,1,5292000,44.1,120,0.552585,120,STM_output/MATs/music_mat_wl4_CD/3-05 Beethove...,Charles Mackerras,
4,data/musicCorp/CD/Edgar Meyer/Bach_ Unaccompan...,"09 Bach_ Cello Suite #1 In G, BWV 1007 - 3. Co...",nonIRMAS,,1,5292000,44.1,120,0.041179,120,STM_output/MATs/music_mat_wl4_CD/09 Bach_ Cell...,Edgar Meyer,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,data/musicCorp/CD/Arcade Fire/Arcade Fire/09 S...,09 Suburban War,nonIRMAS,,1,5292000,44.1,120,0.147664,120,STM_output/MATs/music_mat_wl4_CD/09 Suburban W...,Arcade Fire,
418,data/musicCorp/CD/The Beatles/1962-1966 [Disc ...,1-05 I Want To Hold Your Hand,nonIRMAS,,1,5292000,44.1,120,0.405828,120,STM_output/MATs/music_mat_wl4_CD/1-05 I Want T...,The Beatles,
419,data/musicCorp/CD/Yo-Yo Ma/Bach_ Cello Suites ...,"1-14 Bach_ Cello Suite #5 In C Minor, BWV 1011...",nonIRMAS,,1,5292000,44.1,120,0.126644,120,STM_output/MATs/music_mat_wl4_CD/1-14 Bach_ Ce...,Yo-Yo Ma,
420,data/musicCorp/CD/Charles Mackerras_ Scottish ...,"2-02 Beethoven_ Symphony #3 In E Flat, Op. 55,...",nonIRMAS,,1,5644800,44.1,128,0.956100,120,STM_output/MATs/music_mat_wl4_CD/2-02 Beethove...,Charles Mackerras,


In [65]:
df_CD = pd.read_excel('data/musicCorp/CD/CD_music_list.xlsx')
df_CD

Unnamed: 0,Artist,Album,Piece,Duration,Genre,Instrument
0,Andrés Segovia,The Art Of Segovia [Disc 1],Tárrega: Recuerdos de la Alhambra,05:16:00,Classical,guitar
1,Andrés Segovia,The Art Of Segovia [Disc 1],Tárrega: Capricho arabe,05:27:00,Classical,guitar
2,Andrés Segovia,The Art Of Segovia [Disc 1],Tárrega: Marieta,02:30:00,Classical,guitar
3,Andrés Segovia,The Art Of Segovia [Disc 1],Torroba: Romance De Los Pinos,01:46:00,Classical,guitar
4,Andrés Segovia,The Art Of Segovia [Disc 1],Torroba: Madronos,02:57:00,Classical,guitar
...,...,...,...,...,...,...
521,Ysaye Crumb Kodaly,Cello Sonatas,Son: II. Tema pastorale con variazioni,04:05:00,Classical,cello
522,Ysaye Crumb Kodaly,Cello Sonatas,Son: III. Toccata,02:19:00,Classical,cello
523,Ysaye Crumb Kodaly,Cello Sonatas,"Son, Op.8: I. Allegro maestoso ma appassionato",07:14:00,Classical,cello
524,Ysaye Crumb Kodaly,Cello Sonatas,"Son, Op.8: II. Adagio",08:52:00,Classical,cello


In [66]:
from fuzzywuzzy import process

# Function to find the best match for each name in df1 from df2
def find_best_match(name, choices):
    return process.extractOne(name, choices)

# Apply the function to find the best match for each name in df1
df_all['Best_Match'] = df_all['filename'].apply(lambda x: find_best_match(x, df_CD['Piece']))

# Extract matched names and similarity scores
df_all['Matched_Name'] = df_all['Best_Match'].apply(lambda x: x[0])
df_all['Similarity_Score'] = df_all['Best_Match'].apply(lambda x: x[1])

# Join based on matched names
merged_df = pd.merge(df_all, df_CD[['Piece', 'Genre','Instrument']], left_on='Matched_Name', right_on='Piece', how='left')
merged_df['LangOrInstru'] = merged_df['Instrument']
merged_df.drop(columns=['Best_Match', 'Instrument','Matched_Name','Similarity_Score','Piece'], inplace = True)
merged_df

Unnamed: 0,filepath,filename,LangOrInstru,VoiOrNot,startPoint,endPoint,FSkhz,expdur,maxsil,totalLengCur,mat_filename,speaker/song,gender,Piece,Genre
0,data/musicCorp/CD/Frans Brüggen_ Orchestra Of...,"04 Mozart_ Symphony #41 In C, K 551, _Jupiter_...",multi,,1,5292000,44.1,120,0.042404,120,STM_output/MATs/music_mat_wl4_CD/04 Mozart_ Sy...,Frans Brüggen,,"Mozart: Symphony #41 In C, K 551, ""Jupiter"" - ...",Classical
1,data/musicCorp/CD/Yo-Yo Ma/Bach_ Cello Suites ...,"1-18 Bach_ Cello Suite #5 In C Minor, BWV 1011...",cello,,1,5292000,44.1,120,0.157029,120,STM_output/MATs/music_mat_wl4_CD/1-18 Bach_ Ce...,Yo-Yo Ma,,"Bach: Cello Suite #5 In C Minor, BWV 1011 - 6....",Classical
2,data/musicCorp/CD/Yo-Yo Ma/Bach_ Cello Suites ...,"1-18 Bach_ Cello Suite #5 In C Minor, BWV 1011...",cello,,1,5292000,44.1,120,0.157029,120,STM_output/MATs/music_mat_wl4_CD/1-18 Bach_ Ce...,Yo-Yo Ma,,"Bach: Cello Suite #5 In C Minor, BWV 1011 - 6....",Classical
3,data/musicCorp/CD/The Dave Brubeck Quartet/Tim...,07 Pick Up Sticks,multi,,1,5292000,44.1,120,0.069320,120,STM_output/MATs/music_mat_wl4_CD/07 Pick Up St...,The Dave Brubeck Quartet,,Pick Up Sticks,Jazz
4,data/musicCorp/CD/Charles Mackerras_ Scottish ...,"3-05 Beethoven_ Symphony #6 In F, Op. 68, _Pas...",multi,,1,5292000,44.1,120,0.552585,120,STM_output/MATs/music_mat_wl4_CD/3-05 Beethove...,Charles Mackerras,,"Beethoven: Symphony #6 In F, Op. 68, ""Pastoral...",Classical
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
453,data/musicCorp/CD/The Beatles/1962-1966 [Disc ...,1-05 I Want To Hold Your Hand,multi_Voi,,1,5292000,44.1,120,0.405828,120,STM_output/MATs/music_mat_wl4_CD/1-05 I Want T...,The Beatles,,I Want To Hold Your Hand,Rock
454,data/musicCorp/CD/Yo-Yo Ma/Bach_ Cello Suites ...,"1-14 Bach_ Cello Suite #5 In C Minor, BWV 1011...",cello,,1,5292000,44.1,120,0.126644,120,STM_output/MATs/music_mat_wl4_CD/1-14 Bach_ Ce...,Yo-Yo Ma,,"Bach: Cello Suite #5 In C Minor, BWV 1011 - 2....",Classical
455,data/musicCorp/CD/Yo-Yo Ma/Bach_ Cello Suites ...,"1-14 Bach_ Cello Suite #5 In C Minor, BWV 1011...",cello,,1,5292000,44.1,120,0.126644,120,STM_output/MATs/music_mat_wl4_CD/1-14 Bach_ Ce...,Yo-Yo Ma,,"Bach: Cello Suite #5 In C Minor, BWV 1011 - 2....",Classical
456,data/musicCorp/CD/Charles Mackerras_ Scottish ...,"2-02 Beethoven_ Symphony #3 In E Flat, Op. 55,...",multi,,1,5644800,44.1,128,0.956100,120,STM_output/MATs/music_mat_wl4_CD/2-02 Beethove...,Charles Mackerras,,"Beethoven: Symphony #3 In E Flat, Op. 55, ""Ero...",Classical
