# base.yml
# Model setup
# Options for `DAC`. Keys throughout this file use the flat `Name.option`
# form; presumably the loader binds each value to the matching argument of
# `Name` - confirm against the code that reads this file.
# NOTE(review): 44100 is presumably a sample rate in Hz - confirm.
DAC.sample_rate: 44100
DAC.encoder_dim: 64
# The encoder and decoder rate lists hold the same four factors in opposite
# order (2 * 4 * 8 * 8 = 512).
DAC.encoder_rates: [2, 4, 8, 8]
DAC.decoder_dim: 1536
DAC.decoder_rates: [8, 8, 4, 2]
# Quantization
# Codebook options for `DAC`: number of codebooks, size of each codebook and
# codebook dimension.
DAC.n_codebooks: 9
DAC.codebook_size: 1024
DAC.codebook_dim: 8
# NOTE(review): presumably a probability, which would make 1.0 mean "always" -
# confirm the intended semantics against the model code.
DAC.quantizer_dropout: 1.0
# Discriminator
# Options for `Discriminator`. Its sample rate is set to the same value as
# `DAC.sample_rate`.
Discriminator.sample_rate: 44100
# Explicitly empty list; a bare `key:` would load as null instead.
Discriminator.rates: []
# The listed periods are the first five primes.
Discriminator.periods: [2, 3, 5, 7, 11]
Discriminator.fft_sizes: [2048, 1024, 512]
# Five contiguous [low, high] pairs that together span 0.0 to 1.0
# (presumably fractions of the frequency range - confirm).
Discriminator.bands:
  - [0.0, 0.1]
  - [0.1, 0.25]
  - [0.25, 0.5]
  - [0.5, 0.75]
  - [0.75, 1.0]
# Optimization
# Optimizer (`AdamW`) and learning-rate schedule (`ExponentialLR`) options.
AdamW.betas: [0.8, 0.99]
AdamW.lr: 0.0001
# NOTE(review): if the schedule is stepped once per iteration, this factor
# compounds to roughly 0.37 (about 1/e) over `num_iters: 250000` - confirm how
# often the scheduler is stepped.
ExponentialLR.gamma: 0.999996
# Run-level settings. Unlike the entries above, these keys carry no `Name.`
# prefix.
amp: false
val_batch_size: 100
device: cuda
num_iters: 250000
# Every listed value is below `num_iters` (presumably the iterations at which
# an extra checkpoint is kept - confirm).
save_iters: [10000, 50000, 100000, 200000]
valid_freq: 1000
sample_freq: 10000
num_workers: 32
# Eight consecutive indices, 0 through 7 (presumably validation items to
# sample from - confirm).
val_idx: [0, 1, 2, 3, 4, 5, 6, 7]
seed: 0
# Weight per named loss term (presumably the factor applied to each term when
# the total loss is summed - confirm against the training script).
# The entries must stay indented under `lambdas`: at column 0 they load as
# separate top-level keys and `lambdas` itself becomes null.
lambdas:
  mel/loss: 15.0
  adv/feat_loss: 2.0
  adv/gen_loss: 1.0
  vq/commitment_loss: 0.25
  vq/codebook_loss: 1.0
# Option for `VolumeNorm`, which appears in the postprocess list below.
# NOTE(review): presumably a constant target of -16 dB - confirm.
VolumeNorm.db: [const, -16]

# Transforms
# Names of the transforms used at each stage of `build_transform`.
build_transform.preprocess:
  - Identity
build_transform.augment_prob: 0.0
build_transform.augment:
  - Identity
build_transform.postprocess:
  - VolumeNorm
  - RescaleAudio
  - ShiftPhase
# Loss setup
MultiScaleSTFTLoss.window_lengths: [2048, 512]
# The four per-scale lists below (n_mels, window_lengths, mel_fmin, mel_fmax)
# each have seven entries; presumably entry i of every list configures scale i,
# so they should stay the same length - confirm against the loss code.
MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320]
MelSpectrogramLoss.window_lengths: [32, 64, 128, 256, 512, 1024, 2048]
MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0]
# Explicit nulls (presumably "no upper frequency limit" - confirm).
MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null]
MelSpectrogramLoss.pow: 1.0
# Written with a decimal point and a signed exponent so that YAML 1.1 loaders
# also resolve it as a float rather than a string.
MelSpectrogramLoss.clamp_eps: 1.0e-5
MelSpectrogramLoss.mag_weight: 0.0
# Data
batch_size: 72
# Keys prefixed with `train/`, `val/` or `test/` presumably apply only to that
# split - confirm against the config loader.
# NOTE(review): durations are presumably in seconds - confirm.
train/AudioDataset.duration: 0.38
train/AudioDataset.n_examples: 10000000
val/AudioDataset.duration: 5.0
# The `val/` and `test/` entries set augment_prob to 1.0, whereas the
# unprefixed `build_transform.augment_prob` in this file is 0.0.
val/build_transform.augment_prob: 1.0
val/AudioDataset.n_examples: 250
test/AudioDataset.duration: 10.0
test/build_transform.augment_prob: 1.0
test/AudioDataset.n_examples: 1000
AudioLoader.shuffle: true
AudioDataset.without_replacement: true
# Dataset folders, one mapping per split: category name -> list of directories.
# The categories must stay indented under their `*/build_dataset.folders` key.
# At column 0 they load as top-level keys, each `folders` key becomes null, and
# `speech_hq`, `music_hq` and `general` collide as duplicate keys.
train/build_dataset.folders:
  speech_fb:
    - /data/daps/train
  speech_hq:
    - /data/vctk
    - /data/vocalset
    - /data/read_speech
    - /data/french_speech
  speech_uq:
    - /data/emotional_speech/
    - /data/common_voice/
    - /data/german_speech/
    - /data/russian_speech/
    - /data/spanish_speech/
  music_hq:
    - /data/musdb/train
  music_uq:
    - /data/jamendo
  general:
    - /data/audioset/data/unbalanced_train_segments/
    - /data/audioset/data/balanced_train_segments/

# NOTE(review): val and test list the same `music_hq` and `general` directories
# (/data/musdb/test and /data/audioset/data/eval_segments/); only `speech_hq`
# differs between them - confirm this overlap is intended.
val/build_dataset.folders:
  speech_hq:
    - /data/daps/val
  music_hq:
    - /data/musdb/test
  general:
    - /data/audioset/data/eval_segments/

test/build_dataset.folders:
  speech_hq:
    - /data/daps/test
  music_hq:
    - /data/musdb/test
  general:
    - /data/audioset/data/eval_segments/