In [1]:
import os
import pyspch
import pyspch.nn
import pandas as pd



In [2]:
# paths

# public = /users/spraak/spchlab/public_html/pyspch/timit/ (final project)
# private = /esat/spchtemp/scratch/bvandyck/timit/ (dev project, training models on HTCondor)
remote_path = 'https://homes.esat.kuleuven.be/~spchlab/pyspch/timit/'

# One entry per machine/account. Every entry provides:
#   timit_path : to extract corpus, features, labels
#   write_path : to write corpus, features, labels
#   read_path  : to read corpus, features, labels
path_profiles = {
    # (@remote, personal machine, bvandyck)
    'remote_bvandyck': {
        'timit_path': 'W:/timit/CDdata/timit/',
        'write_path': 'Z:/scratch/bvandyck/timit/',
        'read_path': 'Z:/scratch/bvandyck/timit/',
    },
    # (@esat, bvandyck)
    'esat_bvandyck': {
        'timit_path': '/users/spraak/spchdata/timit/CDdata/timit/',
        'write_path': '/esat/spchtemp/scratch/bvandyck/timit/',
        'read_path': '/esat/spchtemp/scratch/bvandyck/timit/',
    },
    # (@esat, spchlab)
    'esat_spchlab': {
        'timit_path': '/users/spraak/spchdata/timit/CDdata/timit/',
        'write_path': '/users/spraak/spchlab/public_html/pyspch/timit/',
        'read_path': '/users/spraak/spchlab/public_html/pyspch/timit/',
    },
}

# Select exactly one profile here. A single selector avoids the situation where
# no profile (undefined paths) or several profiles (last one wins) are active.
path_profile = 'remote_bvandyck'

if path_profile not in path_profiles:
    raise KeyError(
        f'Unknown path_profile {path_profile!r}; choose one of {sorted(path_profiles)}'
    )

timit_path = path_profiles[path_profile]['timit_path']
write_path = path_profiles[path_profile]['write_path']
read_path = path_profiles[path_profile]['read_path']

# make the write location the working directory
os.chdir(write_path)

## Prepare (or read) TIMIT corpus 

In [3]:
# prepare TIMIT corpus
# prepare_corpus: build the corpus lists and meta data from timit_path and write them to disk
# read_corpus: read a previously written corpus and its meta data back from disk
prepare_corpus = False
read_corpus = True
write_corpus_path = write_path + 'data/'
read_corpus_path = read_path + 'data/'

# At least one of the two flags is needed: otherwise `timit_corpus` is never
# defined by this cell and the summary print below fails (or, worse, silently
# reports a stale value left in the kernel by an earlier run).
if not (prepare_corpus or read_corpus):
    raise ValueError('Set prepare_corpus and/or read_corpus to True')

if prepare_corpus:

    # get corpus from directory
    timit_corpus = pyspch.timit.get_timit_corpus(timit_path)

    # corpus subsets (train/test, additional)
    timit_train = pyspch.timit.filter_list_timit(timit_corpus, split='train')
    timit_test = pyspch.timit.filter_list_timit(timit_corpus, split='test')
    timit_train_dr1 = pyspch.timit.filter_list_timit(timit_corpus, split='train', region='dr1')
    timit_test_dr1 = pyspch.timit.filter_list_timit(timit_corpus, split='test', region='dr1')

    # write corpus to disk
    os.makedirs(write_corpus_path, exist_ok=True)
    pyspch.write_txt(timit_corpus, write_corpus_path + 'timit.corpus')

    # write corpus subsets to disk
    pyspch.write_txt(timit_train, write_corpus_path + 'timit_train.corpus')
    pyspch.write_txt(timit_test, write_corpus_path + 'timit_test.corpus')
    pyspch.write_txt(timit_train_dr1, write_corpus_path + 'timit_train_dr1.corpus')
    pyspch.write_txt(timit_test_dr1, write_corpus_path + 'timit_test_dr1.corpus')

    # extract meta data and write to disk
    timit_meta = pyspch.timit.get_timit_metadata(timit_corpus)
    timit_meta.to_csv(write_corpus_path + 'timit.meta', sep='\t', index=False, header=False)

if read_corpus:

    # read corpus and meta data
    timit_corpus = pyspch.read_data_file(read_corpus_path + 'timit.corpus')
    # same name as in the prepare branch, so later code can use `timit_meta`
    # regardless of which branch ran
    timit_meta = pyspch.read_dataframe(read_corpus_path + "timit.meta")
    # alias kept for code that still refers to the name `meta`
    meta = timit_meta

# print
print(f'Corpus contains {len(timit_corpus)} files')  

Corpus contains 6300 files


## Read TIMIT data (wav)

In [6]:
# sample rate handed to the signal reader
sample_rate_wav = 16000

# wrap the corpus in a SpchData object
timit_data = pyspch.nn.SpchData(timit_corpus)

# load the signals (wav-data) found under timit_path from disk ~ 25min
timit_data.read_signals(timit_path, sample_rate_wav, extension='.wav')

## Extract TIMIT features (for exercise sessions)

### Mel frequency cepstral coefficients (mfcc13)

In [7]:
# A. Mel Frequency Cepstral Coefficients (mfcc13)
write_feature_path = write_path + 'data/mfcc13/'
feature_args_fname = os.path.join(write_feature_path, 'feature_args.json')

# settings for the mfcc13 features
feature_args = dict(
    spg=None, Deltas=None, Norm=None,
    sample_rate=16000, f_shift=0.01, f_length=0.03,
    preemp=0.97, window='hamm', mode='dB',
    n_mels=24, n_cep=13,
)

# prepare the output location, compute the features from the loaded signals, store them
pyspch.timit.make_dirs_for_corpus(write_feature_path, timit_corpus)
timit_data.extract_features_from_signals(feature_args)
timit_data.write_features(write_feature_path) # ~ 25min

# keep the settings next to the features so they can be read back later
pyspch.write_json(feature_args, feature_args_fname)

In [None]:
# Feature extraction can also be done while reading the signals (wav-data).
# This requires less memory (since the signals are not kept in memory).
# However, here we first load the signals and then extract the features, so that
# different feature extractions can be performed without re-reading the signals.
if False:
    # disabled example: the on-the-fly variant looks like this
    timit_data.extract_features(timit_path, feature_args, extension='.wav')

### Mel filterbanks (mel80)

In [9]:
# B. Mel filterbanks (mel80)
write_feature_path = write_path + 'data/mel80/'
feature_args_fname = os.path.join(write_feature_path, 'feature_args.json')

# settings for the mel80 features
feature_args = dict(
    spg=None, Deltas=None, Norm=None,
    sample_rate=16000, f_shift=0.01, f_length=0.03,
    preemp=0.97, window='hamm', mode='dB',
    n_mels=80, n_cep=None,
)

# prepare the output location, compute the features from the loaded signals, store them
pyspch.timit.make_dirs_for_corpus(write_feature_path, timit_corpus)
timit_data.extract_features_from_signals(feature_args)
timit_data.write_features(write_feature_path)

# keep the settings next to the features so they can be read back later
pyspch.write_json(feature_args, feature_args_fname)

### Filterbanks (fb)

In [12]:
# C. Filterbanks (fb)
write_feature_path = write_path + 'data/fb/'
feature_args_fname = os.path.join(write_feature_path, 'feature_args.json')

# settings for the fb features
feature_args = dict(
    spg=None, Deltas=None, Norm=None,
    sample_rate=16000, f_shift=0.01, f_length=0.03,
    preemp=0.97, window='hamm', mode='dB',
    n_mels=None, n_cep=None,
)

# prepare the output location, compute the features from the loaded signals, store them
pyspch.timit.make_dirs_for_corpus(write_feature_path, timit_corpus)
timit_data.extract_features_from_signals(feature_args)
timit_data.write_features(write_feature_path)

# keep the settings next to the features so they can be read back later
pyspch.write_json(feature_args, feature_args_fname)

## Default setup for exercise sessions

Setup saved as pickled dataframes for fast loading:
- mfcc13 features
- TIMIT61 phoneme labels

Modified in the exercise session after reading to:
- mfcc39 features (by adding delta_ddelta and variance normalisation)
- TIMIT41 phoneme labels (by predefined mapping)

The setup is split into train/test; smaller subsets can be defined analogously.

In [15]:
# Mel Frequency Cepstral Coefficients (mfcc13)
# The signals are still in memory, so the features are recomputed from the
# stored settings instead of being read back from disk ~ faster
read_feature_path = read_path + 'data/mfcc13/'
feature_args = pyspch.read_json(os.path.join(read_feature_path, 'feature_args.json'))
timit_data.extract_features_from_signals(feature_args)

In [16]:
# TIMIT61 phoneme labels (phn)
label_args = dict(pad='h#', extension='.phn')
read_label_path = read_path + 'data/segmentation/'

# frame shift scaled by the sample rate
# (presumably f_shift is in seconds, making this a shift in samples - TODO confirm)
shift = feature_args['f_shift'] * feature_args['sample_rate']

timit_data.extract_alligned_labels(
    read_label_path,
    shift,
    label_args['pad'],
    label_args['extension'],
)

In [17]:
# split SpchData into train/test with a regex on the split name
train_data = timit_data.subset_with_regex('.*(train)/.*')
test_data = timit_data.subset_with_regex('.*(test)/.*')

# convert both splits to dataframes
train_df, test_df = train_data.to_dataframe(), test_data.to_dataframe()

# drop signals (wav-data) in place from both dataframes
for split_df in (train_df, test_df):
    split_df.drop(columns=['signals'], inplace=True)

In [18]:
# write setup to disk: one pickle per split
write_setup_path = write_path + 'data/mfcc13/'
for pkl_name, split_df in (('train.pkl', train_df), ('test.pkl', test_df)):
    split_df.to_pickle(write_setup_path + pkl_name)

In [None]:
if False:

    # read from disk
    # The setup is read back from the directory it was written to (data/mfcc13/),
    # addressed through read_path like the other read locations in this notebook.
    read_setup_path = read_path + 'data/mfcc13/'
    # NOTE: unpickling can execute arbitrary code; only load pickles written by this notebook
    train_df = pd.read_pickle(read_setup_path + 'train.pkl')
    test_df = pd.read_pickle(read_setup_path + 'test.pkl')

    # dataframe to SpchData
    train_data = pyspch.nn.DataFrame_to_SpchData(train_df)
    test_data = pyspch.nn.DataFrame_to_SpchData(test_df)
    print(train_data.corpus)
    print(test_data.corpus)

## Other setups

In [None]:
# Mel Frequency Filterbanks (mel80)
read_feature_path = read_path + 'data/mel80/'
write_setup_path = write_path + 'data/mel80/'

if False:

    # recompute the features from the stored settings
    feature_args = pyspch.read_json(read_feature_path + 'feature_args.json')
    timit_data.extract_features_from_signals(feature_args)

    # split into train/test and convert to dataframes
    train_data = timit_data.subset_with_regex('.*(train)/.*')
    test_data = timit_data.subset_with_regex('.*(test)/.*')
    train_df, test_df = train_data.to_dataframe(), test_data.to_dataframe()

    # drop signals (wav-data) in place
    for split_df in (train_df, test_df):
        split_df.drop(columns=['signals'], inplace=True)

    # write setup to disk
    for pkl_name, split_df in (('train.pkl', train_df), ('test.pkl', test_df)):
        split_df.to_pickle(write_setup_path + pkl_name)

In [None]:
# Filterbanks (fb)
read_feature_path = read_path + 'data/fb/'
write_setup_path = write_path + 'data/fb/'

if False:

    # recompute the features from the stored settings
    feature_args = pyspch.read_json(read_feature_path + 'feature_args.json')
    timit_data.extract_features_from_signals(feature_args)

    # split into train/test and convert to dataframes
    train_data = timit_data.subset_with_regex('.*(train)/.*')
    test_data = timit_data.subset_with_regex('.*(test)/.*')
    train_df, test_df = train_data.to_dataframe(), test_data.to_dataframe()

    # drop signals (wav-data) in place
    for split_df in (train_df, test_df):
        split_df.drop(columns=['signals'], inplace=True)

    # write setup to disk
    for pkl_name, split_df in (('train.pkl', train_df), ('test.pkl', test_df)):
        split_df.to_pickle(write_setup_path + pkl_name)