In [1]:
import os
import pandas as pd
from nn import corpus
from nn import utils

In [2]:
# Paths used throughout the notebook.
# NOTE(review): both are absolute, machine-specific Windows paths - adjust them before running elsewhere.
timit_path = 'W:/timit/CDdata/timit/' # to extract corpus, features, labels
root_path = 'D:/gitlab/psi/compi1234/nn/' # to read/write corpus, features, labels
# Make root_path the working directory so that the relative 'data/...' paths
# used by the later cells resolve against it.
os.chdir(root_path)

## Make TIMIT corpus

In [7]:
# TIMIT - corpus filenames, collected from the TIMIT location configured in timit_path
timit_fnames = corpus.get_corpus_timit(timit_path)

# TIMIT - corpus subsets
#   train / test: selected on the 'split' criterion only
#   dummy: a small subset for quick experiments, narrowed down by
#          split, region, a speaker pattern and a sentence pattern
timit_train = corpus.filter_list_timit(timit_fnames, split='train')
timit_test = corpus.filter_list_timit(timit_fnames, split='test')
timit_dummy = corpus.filter_list_timit(timit_fnames, split='train', region='dr1', speaker='(mcpm0|mdac0|mdpk0)', sentence='si')

# TIMIT - write corpus to disk (no meta data)
# The dummy subset is stored as 'data/timit_dummy.corpus': this is the file name
# the "Read TIMIT corpus" cell loads, and it follows the timit_<subset>.corpus
# naming used for train and test. (It was 'data/dummy.corpus' before, which the
# read cell never looks for.)
utils.write_txt(timit_train, 'data/timit_train.corpus')
utils.write_txt(timit_test, 'data/timit_test.corpus')
utils.write_txt(timit_dummy, 'data/timit_dummy.corpus')

In [4]:
# TIMIT - meta data, derived from the corpus filenames.
# Requires timit_fnames from the "Make TIMIT corpus" cell, so run that cell first.
timit_meta = corpus.get_timit_metadata(timit_fnames)

# TIMIT - store the meta data on disk as a tab-separated table;
# neither the index nor a header row is written.
meta_file = "data/timit.meta"
timit_meta.to_csv(
    meta_file,
    sep='\t',
    header=False,
    index=False,
)

NameError: name 'timit_fnames' is not defined

## Read TIMIT corpus

In [3]:
# TIMIT - load the three corpus listings (train, test, dummy) back from disk
timit_train, timit_test, timit_dummy = [
    utils.read_txt(corpus_file)
    for corpus_file in (
        'data/timit_train.corpus',
        'data/timit_test.corpus',
        'data/timit_dummy.corpus',
    )
]

# TIMIT - load the meta data table; the file was saved without a header row,
# so header=None keeps the first line as data
timit_meta = pd.read_csv(
    'data/timit.meta',
    sep='\t',
    header=None,
)

## Extract TIMIT data (features and labels) into SpchData

In [4]:
# SpchData: the object that will hold features and labels, built here for the dummy corpus
spchdata = corpus.SpchData(timit_dummy)

# Writer and reader are configured with the same mode and file extension,
# so whatever the writer stores can be loaded again by the reader
mode, extension = 'numpy', '.npy'
writer = corpus.ArrayWriter(mode, extension)
reader = corpus.ArrayReader(mode, extension)

In [5]:
# Feature extraction
#   feature_path / feature_extension are handed to extract_features,
#   write_feature_path is where the features are written to and read back from
feature_path, feature_extension = 'W:/timit/CDdata/timit/', ".wav"
write_feature_path = 'data/dummy/mfcc39'

# Extraction arguments (the label cells further down pass the same dict)
feature_args = dict(
    spg=None, Deltas="delta", Norm='mean',
    sample_rate=16000, f_shift=0.01, f_length=0.03,
    preemp=0.97, window='hamm',
    n_mels=39, mode='dB',
)

# Switch: set to False to skip extraction and only reload what is already on disk
doExtractAndWrite = True
if doExtractAndWrite:

    # SpchData - compute the features and store them through the writer
    spchdata.extract_features(feature_path, feature_args, feature_extension)
    spchdata.write_features(write_feature_path, writer)
    print(spchdata.features[0].shape)

    # Keep the extraction arguments next to the features as JSON
    feature_args_fname = os.path.join(write_feature_path, 'feature_args.json')
    utils.write_json(feature_args, feature_args_fname)

# SpchData - reload the features through the reader and show the shape of the first item
spchdata.read_features(write_feature_path, reader)
print(spchdata.features[0].shape)

(78, 279)
(78, 279)


In [7]:
# Phone segmentation -> labels
#   seg_path / seg_extension are handed to extract_labels,
#   write_seg_path is where the labels are written to and read back from
seg_path, seg_extension = 'W:/timit/CDdata/timit/', ".phn"
write_seg_path = 'data/dummy/phn'

# The extraction arguments are the feature arguments (feature_args)

# Switch: set to False to skip extraction and only reload what is already on disk
doExtractAndWrite = True
if doExtractAndWrite:

    # SpchData - derive the labels and store them through the writer
    spchdata.extract_labels(seg_path, feature_args, seg_extension)
    spchdata.write_labels(write_seg_path, writer)
    print(spchdata.labels[0].shape)

# SpchData - reload the labels through the reader and show the shape of the first item
spchdata.read_labels(write_seg_path, reader)
print(spchdata.labels[0].shape)


(279,)
(279,)


In [8]:
# Word segmentation -> labels
#   seg_path / seg_extension are handed to extract_labels,
#   write_seg_path is where the labels are written to and read back from
seg_path, seg_extension = 'W:/timit/CDdata/timit/', ".wrd"
write_seg_path = 'data/dummy/wrd'

# The extraction arguments are the feature arguments (feature_args)

# Switch: set to False to skip extraction and only reload what is already on disk
doExtractAndWrite = True
if doExtractAndWrite:

    # SpchData - derive the labels and store them through the writer
    spchdata.extract_labels(seg_path, feature_args, seg_extension)
    spchdata.write_labels(write_seg_path, writer)
    print(spchdata.labels[0].shape)

# SpchData - reload the labels through the reader and show the shape of the first item
spchdata.read_labels(write_seg_path, reader)
print(spchdata.labels[0].shape)


(242,)
(242,)


In [9]:
# Word segmentation -> labels with padding

# Since the 'end' time in TIMIT segmentations doesn't always match the length of the audio,
# the number of labels can differ from the number of (feature) frames.
# We solve this by padding the labels.

# This cell reuses seg_path, seg_extension and write_seg_path as set by the
# word segmentation cell, and expects the features to be loaded already.

doExtractAndWrite = True
if doExtractAndWrite:
    
    # SpchData - extract labels (unpadded; first item is (242,) in the recorded output)
    spchdata.extract_labels(seg_path, feature_args, seg_extension)
    print(spchdata.labels[0].shape)
    
    # Add padding: the target lengths are taken from the features
    # (first item becomes (279,) in the recorded output)
    lengths = spchdata.get_length_features()
    spchdata.pad_labels(lengths)

    # SpchData - write labels
    # NOTE(review): same write_seg_path as the unpadded word labels - presumably replaces them; confirm.
    spchdata.write_labels(write_seg_path, writer)
    print(spchdata.labels[0].shape) 
    
# SpchData - read labels back through the reader and show the shape of the first item
spchdata.read_labels(write_seg_path, reader) 
print(spchdata.labels[0].shape)

(242,)
(279,)
(279,)


In [12]:
# Padding can also be done with a custom token.

# Extract features and word labels again, so that the labels start out unpadded.
# Uses feature_path, feature_args, feature_extension and seg_path from the earlier cells.
spchdata.extract_features(feature_path, feature_args, feature_extension)
spchdata.extract_labels(seg_path, feature_args, '.wrd')
print(spchdata.labels[0].shape)

# Pad the labels to the feature lengths, using 'sil' as the padding token;
# the last five labels of the first item are printed to show the token.
lengths = spchdata.get_length_features()
spchdata.pad_labels_with_token(lengths, 'sil')
print(spchdata.labels[0].shape)
print(spchdata.labels[0][-5:])


(242,)
(279,)
['sil' 'sil' 'sil' 'sil' 'sil']


In [None]:
# Alternatively, one could pad the features.

# Extract features and labels again so that both start from their original lengths.
# Uses seg_extension as left by the last segmentation cell that was run.
spchdata.extract_features(feature_path, feature_args, feature_extension)
spchdata.extract_labels(seg_path, feature_args, seg_extension)
print(spchdata.features[0].shape)

# Here the target lengths come from the labels instead of the features.
# NOTE(review): in the recorded output the first item goes from (78, 279) to (78, 242),
# i.e. the features get shorter rather than longer - confirm that pad_features is
# meant to cut down to the given length as well.
lengths = spchdata.get_length_labels()
spchdata.pad_features(lengths)
print(spchdata.features[0].shape)

(78, 279)
(78, 242)


## Use TIMIT meta data 

In [None]:
# Meta data -> labels 
# Read the tab-separated meta data file (stored without a header row) and name its six columns.
meta_file = 'data/timit.meta'
timit_meta = pd.read_csv(meta_file, sep='\t', header=None)
timit_meta.columns = ['fname', 'split', 'region', 'gender', 'speaker', 'sentence']

# Lengths (number of frames -> depends on feature extraction)
# The features are extracted again so that the lengths reflect the unmodified features.
spchdata.extract_features(feature_path, feature_args, feature_extension)
lengths = spchdata.get_length_features()

# SpchData - extract labels
# NOTE(review): presumably 'fname' is the column matched against the corpus entries and
# 'region' the column that supplies the label values - confirm against extract_labels_from_meta.
spchdata.extract_labels_from_meta(timit_meta, lengths, 'fname', 'region')
print(spchdata.labels[0].shape)

(279,)


In [18]:
# check seg2labels
# Print the feature lengths and the label lengths; the two lists are expected to be
# identical (they are in the recorded output).
print(spchdata.get_length_features())
print(spchdata.get_length_labels())

[279, 419, 383, 273, 222, 513, 361, 193, 165]
[279, 419, 383, 273, 222, 513, 361, 193, 165]


## Make subsets of SpchData 

In [10]:
# Subset with regex: build a new SpchData from the regular expression below
rgx = ".*/dr1/mcpm0/.*"
spchdata_subset = spchdata.subset_with_regex(rgx)

# Sizes of corpus and features, first for the full object and then for the subset,
# one number per line
print(
    len(spchdata.corpus),
    len(spchdata.features),
    len(spchdata_subset.corpus),
    len(spchdata_subset.features),
    sep='\n',
)

9
9
3
3
