In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from os.path import join, splitext, isdir, exists, basename
from collections import OrderedDict
import re
import os

### Dataset

In [2]:
def _parse_speaker_info(data_root):
    speaker_info_path = join(data_root, "speaker-info.txt")
    if not exists(speaker_info_path):
        raise RuntimeError(
            "speaker-info.txt doesn't exist at \"{}\"".format(speaker_info_path))
    speaker_info = OrderedDict()
    filed_names = ["ID", "AGE", "GENDER", "ACCENTS", "REGION"]
    with open(speaker_info_path, "rb") as f:
        for line in f:
            line = line.decode("utf-8")
            fields = line.split()
            if fields[0] == "ID":
                continue
            assert len(fields) == 4 or len(fields) == 5 or len(fields) == 6
            ID = fields[0]
            speaker_info[ID] = {}
            speaker_info[ID]["AGE"] = int(fields[1])
            speaker_info[ID]["GENDER"] = fields[2]
            speaker_info[ID]["ACCENTS"] = fields[3]
            if len(fields) > 4:
                speaker_info[ID]["REGION"] = " ".join(fields[4:])
            else:
                speaker_info[ID]["REGION"] = ""
    return speaker_info

In [3]:
# The project root is taken to be the parent of the current working directory.
ROOT_DIR = os.path.dirname(os.path.abspath('.'))
# Speaker metadata keyed by speaker ID, read from <ROOT_DIR>/rawData/VCTK-Corpus.
diz = _parse_speaker_info(os.path.join(ROOT_DIR, 'rawData', 'VCTK-Corpus'))
# One row per speaker; the ID is available both as the index and as an 'ID' column.
df = pd.DataFrame.from_dict(diz, orient='index').assign(ID=lambda frame: frame.index)
#df.to_csv('df_info.csv', index = False)

In [8]:
# Speaker IDs to inspect ("lista esclusa" = excluded list).
# NOTE(review): why these speakers are excluded is not recorded here - confirm.
lista_esclusa = ['234','249','266','303','253','238','314']
# Show the metadata rows whose ID is in that list.
df.loc[df['ID'].isin(lista_esclusa)]

Unnamed: 0,AGE,GENDER,ACCENTS,REGION,ID
234,22,F,Scottish,West Dumfries,234
238,22,F,NorthernIrish,Belfast,238
249,22,F,Scottish,Aberdeen,249
253,22,F,Welsh,Cardiff,253
266,22,F,Irish,Athlone,266
303,24,F,Canadian,Toronto,303
314,26,F,SouthAfrican,Cape Town,314


### Sample speakers for train-val-test

In [47]:
def from_txt_to_list(txt):
    """Return the lines of the text file ``txt`` as a list of strings.

    Leading and trailing whitespace (including the newline) is stripped from
    every line; blank lines are kept as empty strings.
    """
    with open(txt) as handle:
        return [raw_line.strip() for raw_line in handle]

In [48]:
txtfiles_dir = os.path.join(ROOT_DIR, 'processedData', 'txtfiles')
id_list_test = from_txt_to_list(os.path.join(txtfiles_dir, 'test-speakers.txt'))
# Entries are expected to look like "p225"; group 2 is the numeric part, which
# is the form used for the index labels of `df`. `\d+` (rather than `\d*`)
# rejects a bare "p" instead of silently producing an empty ID.
regex = re.compile(r'(p)(\d+)')
id_test = []
for entry in id_list_test:
    if not entry:
        # from_txt_to_list keeps blank lines as ''; they carry no speaker.
        continue
    match = regex.search(entry)
    if match is None:
        raise ValueError(
            "unrecognised speaker entry in test-speakers.txt: {!r}".format(entry))
    id_test.append(match[2])

# Split off the test speakers, then keep only the remaining ones in `df`.
# This rebinds `df`, so re-running this cell without rebuilding `df` first
# fails: the test labels are no longer present.
df_test = df.loc[id_test]
df = df.drop(id_test)
# NOTE(review): speaker 225 is removed unconditionally; the reason is not
# recorded in this notebook - confirm.
df = df.drop('225')

In [59]:
def get_elements_from_dict(df, diz, gender):
    """Take up to a fixed number of speakers per accent for one gender.

    Args:
        df: Speaker table with the columns 'AGE', 'GENDER', 'ACCENTS',
            'REGION' and 'ID'. The caller's frame is not modified.
        diz: Mapping accent -> maximum number of speakers to take.
        gender: Value of the 'GENDER' column to select.

    Returns:
        A tuple ``(remaining, picked)``: ``remaining`` is ``df`` without the
        selected rows; ``picked`` is a frame of the selected speakers whose
        index labels are the original labels prefixed with 'p'.
    """
    columns = ('AGE', 'GENDER', 'ACCENTS', 'REGION', 'ID')
    picked = {}
    for accent, quota in diz.items():
        wanted = (df['GENDER'] == gender) & (df['ACCENTS'] == accent)
        # First `quota` matching rows in table order (fewer if not enough exist).
        chosen = list(df[wanted].head(quota).index)
        for label in chosen:
            row = df.loc[label]
            picked['p' + str(label)] = {column: row[column] for column in columns}
        # Drop the selected speakers so later accents/calls cannot reuse them.
        df = df.drop(chosen)
    return df, pd.DataFrame.from_dict(picked, 'index')

In [60]:
def create_sampled_train_val_txt(df, tr_diz_M, tr_diz_F, val_diz_M , val_diz_F):
    """Sample train and validation speaker IDs from ``df`` by accent quota.

    Quotas are consumed in a fixed order - train male, train female,
    validation male, validation female - and every step only sees the speakers
    left over by the previous ones, so no speaker is selected twice.

    Despite the name, nothing is written to disk here.

    Returns:
        A tuple ``(train_ids, val_ids)`` of lists of 'p'-prefixed speaker IDs.
    """
    plan = (
        (tr_diz_M, 'M'),
        (tr_diz_F, 'F'),
        (val_diz_M, 'M'),
        (val_diz_F, 'F'),
    )
    selections = []
    for quotas, gender in plan:
        df, selected = get_elements_from_dict(df, quotas, gender)
        selections.append(selected)
    train_frame = pd.concat(selections[:2])
    val_frame = pd.concat(selections[2:])
    return list(train_frame.index), list(val_frame.index)

In [61]:
# Accent distribution of the female speakers currently in `df`.
df.loc[df['GENDER'] == 'F', 'ACCENTS'].value_counts()

English          17
American         15
Irish             6
Canadian          5
Scottish          5
SouthAfrican      3
NorthernIrish     3
Welsh             1
NewZealand        1
Indian            1
Name: ACCENTS, dtype: int64

In [62]:
# Accent distribution of the male speakers currently in `df`.
df.loc[df['GENDER'] == 'M', 'ACCENTS'].value_counts()

English          15
Scottish         14
American          4
Canadian          2
Irish             2
NorthernIrish     2
Australian        1
Indian            1
Name: ACCENTS, dtype: int64

In [63]:
# Accent distribution of the male speakers in the test split.
df_test.loc[df_test['GENDER'] == 'M', 'ACCENTS'].value_counts()

Irish           1
Indian          1
Canadian        1
American        1
SouthAfrican    1
Australian      1
Name: ACCENTS, dtype: int64

In [64]:
# Accent distribution of the female speakers in the test split.
df_test.loc[df_test['GENDER'] == 'F', 'ACCENTS'].value_counts()

American         2
NorthernIrish    1
Name: ACCENTS, dtype: int64

In [65]:
# Per-accent speaker quotas (accent -> number of speakers) for each split and
# gender. Totals: train F = 19, train M = 19, val F = 2, val M = 2.
# Key order matters: get_elements_from_dict walks each dict in order, and that
# order carries through to the returned ID lists.
train_sampling_F = {'English': 9, 'American': 3, 'Scottish': 2, 
                    'Irish': 1, 'Canadian': 1, 'Welsh': 1, 'NorthernIrish': 1, 'SouthAfrican': 1}   
train_sampling_M = {'English': 9, 'American': 3, 'Scottish': 3,
                    'Irish': 1, 'Canadian': 1, 'Indian': 1, 'Australian': 1}
# Validation quotas are filled from the speakers left over after the training
# quotas have been taken.
val_sampling_F = {'English': 1, 'NewZealand': 1}
val_sampling_M = {'English': 1, 'American': 1}
# tr_ids / val_ids are lists of 'p'-prefixed speaker IDs.
tr_ids, val_ids = create_sampled_train_val_txt(df,
                                   tr_diz_M = train_sampling_M,
                                   tr_diz_F = train_sampling_F,
                                   val_diz_M =val_sampling_M,
                                   val_diz_F = val_sampling_F)

In [67]:
def from_list_to_txt(id_list, name, path_to_save):
    """Write every entry of ``id_list`` on its own line to a text file.

    The file is ``<path_to_save>/<name>.txt``; an existing file is overwritten.

    Args:
        id_list: Iterable of strings to write, one per line.
        name: File name without the ``.txt`` extension.
        path_to_save: Directory in which the file is created.
    """
    target = os.path.join(path_to_save, "{}.txt".format(name))
    # The context manager closes the file; no explicit close() is needed.
    with open(target, "w") as sink:
        sink.writelines(entry + '\n' for entry in id_list)

In [68]:
# Persist the sampled speaker IDs next to the other split files in txtfiles_dir.
from_list_to_txt(id_list=tr_ids, name='train-speakers-sampled', path_to_save=txtfiles_dir)
from_list_to_txt(id_list=val_ids, name='val-speakers-sampled', path_to_save=txtfiles_dir)