The majority of the code in this notebook is taken from:
--------------------------------------------------------
 https://github.com/coqui-ai/STT/blob/main/notebooks/train_personal_model_with_common_voice.ipynb

In [1]:
# One-time setup: uncomment and run this cell on a fresh runtime, then comment it out again for later runs.

# # install necessary packages and clone the STT directory from COQUI
# ! apt-get install sox libsox-fmt-mp3 libopusfile0 libopus-dev libopusfile-dev
# ! pip install --upgrade pip
# ! pip install gdown
# ! pip install coqui_stt_training
# ! pip uninstall -y tensorflow; pip install "tensorflow-gpu==1.15"
# ! git clone --depth=1 https://github.com/coqui-ai/STT.git

In [None]:
# import necessary packages
from coqui_stt_training.util.config import initialize_globals_from_args
from coqui_stt_training.train import train
from coqui_stt_training.evaluate import test
from coqui_stt_training.util.downloader import maybe_download
import glob
import os
import tarfile
import gdown

In [None]:
# Pull my own metadata file and audio archive straight from Google Drive with
# gdown, so the Drive does not have to be mounted.
urls = [
    "https://drive.google.com/uc?id=17-ZmhHrI9sNM2kgkihBL3XyVgi3y31cf",
    "https://drive.google.com/uc?id=1UyFPRsW5BFwy7cOVj0JqtpDjhVOkopxf",
]
outputs = ["takeout_452_metadata.txt", "takeout_452_pt_0.zip"]

# Each URL is paired with the local file name at the same position.
for url, output in zip(urls, outputs):
    gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=17-ZmhHrI9sNM2kgkihBL3XyVgi3y31cf
To: /content/takeout_452_metadata.txt
100%|██████████| 30.4k/30.4k [00:00<00:00, 32.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1UyFPRsW5BFwy7cOVj0JqtpDjhVOkopxf
To: /content/takeout_452_pt_0.zip
100%|██████████| 7.83M/7.83M [00:00<00:00, 13.8MB/s]


In [None]:
# Just to be transparent: Victoria Ivanova helped me with this specific code block; in the tutorial it wasn't clear that we had to specify 
# load_checkpoint_dir and save_checkpoint_dir here already, so she pointed that out to me.

# obtain data and split the dataset
! python STT/bin/import_cv_personal.py --normalize takeout_*.txt takeout_*.zip

DATA_CSV=glob.glob("/content/takeout_*/data.csv")[0]

initialize_globals_from_args(
    load_checkpoint_dir="english/coqui-stt-1.1.0-checkpoint",
    save_checkpoint_dir="my-model/checkpoints",
    auto_input_dataset=DATA_CSV
)

Loading TSV file:  /content/takeout_452_metadata.txt
Importing mp3 files...
Imported 229 samples.
Skipped 1 samples that failed on transcript validation.
Final amount of imported audio: 0:21:51 from 0:21:56.
Saving new Coqui STT-formatted CSV file to:  /content/takeout_452_pt_0/data.csv
Writing CSV file for train.py as:  /content/takeout_452_pt_0/data.csv
INFO: compiled /content/data.csv
INFO: formatted data located in  /content/takeout_452_pt_0
INFO: you now should decide {train,test,dev} splits on your own
INFO: or you can use --auto_input_dataset flag from our training code
I Processing --auto_input_dataset input: /content/takeout_452_pt_0/data.csv...
I Saved generated alphabet with characters ([' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']) into /content/takeout_452_pt_0/alphabet.txt
I Generated train set size: 77 samples.
I Generated validation set size: 76 samples.
I Generated test set s

In [None]:
# download pre-trained STT model from COQUI

def download_pretrained_model():
    """Download and unpack the pre-trained English checkpoint if it is missing.

    When ``english/coqui-stt-1.1.0-checkpoint`` already exists, only a notice
    is printed. Otherwise the v1.1.0 release archive is fetched to
    ``english/model.tar.gz`` through ``maybe_download`` and extracted into
    ``english/``.

    Returns:
        None
    """
    model_dir = "english/"
    checkpoint_dir = os.path.join(model_dir, "coqui-stt-1.1.0-checkpoint")
    archive_name = "model.tar.gz"

    if not os.path.exists(checkpoint_dir):
        maybe_download(archive_name, model_dir, "https://github.com/coqui-ai/STT/releases/download/v1.1.0/coqui-stt-1.1.0-checkpoint.tar.gz")
        print('\nNo extracted pre-trained model found. Extracting now...')
        # The context manager closes the archive even if extraction fails
        # part-way (e.g. disk full), which the manual open/close did not.
        with tarfile.open(os.path.join(model_dir, archive_name)) as tar:
            tar.extractall(model_dir)
    else:
        print('Found pre-trained 🐸STT model, skipping download.')

def download_language_model():
    """Fetch the huge-vocabulary scorer into ``english/`` unless it is already there.

    Prints a notice and returns without downloading when
    ``english/huge-vocabulary.scorer`` exists; otherwise hands the file name,
    target directory and release URL to ``maybe_download``.
    """
    target_dir = "english/"
    scorer_name = "huge-vocabulary.scorer"

    # Guard clause: nothing to do when the scorer is already on disk.
    if os.path.exists(target_dir + scorer_name):
        print('Found 🐸STT language model, skipping download.')
        return

    maybe_download(
        scorer_name,
        target_dir,
        "https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v1.0.0-huge-vocab/huge-vocabulary.scorer",
    )

# Fetch (and extract) the pre-trained English checkpoint first, then the
# language-model scorer; each step skips itself when its files already exist.
for fetch_step in (download_pretrained_model, download_language_model):
    fetch_step()

No path "english/" - creating ...
No archive "english/model.tar.gz" - downloading...


100%|██████████| 647314932/647314932 [01:32<00:00, 7035414.50it/s]



No extracted pre-trained model found. Extracting now...
No archive "english/huge-vocabulary.scorer" - downloading...


100%|██████████| 978407904/978407904 [03:01<00:00, 5401164.82it/s]


In [None]:
# fine-tune model with my own training and validation sets

# Collect the train/dev split files found under the extracted takeout folder(s).
TRAIN_CSV = glob.glob("/content/takeout_*/train.csv")
DEV_CSV = glob.glob("/content/takeout_*/dev.csv")

# Load weights from the released English checkpoint and save the fine-tuned
# ones to a separate directory, keeping the checkpoint's own alphabet.
finetune_args = {
    "load_checkpoint_dir": "english/coqui-stt-1.1.0-checkpoint",
    "save_checkpoint_dir": "my-model/checkpoints",
    "alphabet_config_path": "english/coqui-stt-1.1.0-checkpoint/alphabet.txt",
    "train_files": TRAIN_CSV,
    "dev_files": DEV_CSV,
    "epochs": 10,
    "load_cudnn": True,
    "train_batch_size": 32,
    "dev_batch_size": 32,
}
initialize_globals_from_args(**finetune_args)

train()

I Performing dummy training to check for memory problems.
I If the following process crashes, you likely have batch sizes that are too big for your available system memory (or GPU memory).
I Loading best validating checkpoint from english/coqui-stt-1.1.0-checkpoint/best_dev-3663881
W CUDNN variable not found: cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/kernel/Adam_1
W CUDNN variable not found: cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/bias/Adam
W CUDNN variable not found: cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/bias/Adam_1
W CUDNN variable not found: cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/kernel/Adam
I Loading variable from checkpoint: beta1_power
I Loading variable from checkpoint: beta2_power
I Loading variable from checkpoint: cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/bias
I Loading variable from checkpoint: cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/

In [None]:
# evaluate COQUI's pre-trained model on the test set

TEST_CSV = glob.glob("/content/takeout_*/test.csv")

# Baseline run: load and save directories both point at the untouched
# released checkpoint, and its alphabet file lives inside that directory.
pretrained_checkpoint_dir = "english/coqui-stt-1.1.0-checkpoint"
baseline_eval_args = dict(
    load_checkpoint_dir=pretrained_checkpoint_dir,
    save_checkpoint_dir=pretrained_checkpoint_dir,
    scorer_path="english/huge-vocabulary.scorer",
    alphabet_config_path=f"{pretrained_checkpoint_dir}/alphabet.txt",
    test_files=TEST_CSV,
    test_batch_size=32,
)
initialize_globals_from_args(**baseline_eval_args)

test()

I Loading best validating checkpoint from english/coqui-stt-1.1.0-checkpoint/best_dev-3663881
I Loading variable from checkpoint: cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/bias
I Loading variable from checkpoint: cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/kernel
I Loading variable from checkpoint: global_step
I Loading variable from checkpoint: layer_1/bias
I Loading variable from checkpoint: layer_1/weights
I Loading variable from checkpoint: layer_2/bias
I Loading variable from checkpoint: layer_2/weights
I Loading variable from checkpoint: layer_3/bias
I Loading variable from checkpoint: layer_3/weights
I Loading variable from checkpoint: layer_5/bias
I Loading variable from checkpoint: layer_5/weights
I Loading variable from checkpoint: layer_6/bias
I Loading variable from checkpoint: layer_6/weights
Testing model on /content/takeout_452_pt_0/test.csv
Test epoch | Steps: 3 | Elapsed Time: 0:01:02                                  
Test on /

In [None]:
# evaluate my own fine-tuned model on the test set

# Same scorer, alphabet and test files as the baseline evaluation; only the
# checkpoint directory changes. TEST_CSV comes from the baseline evaluation cell.
finetuned_eval_args = {
    "load_checkpoint_dir": "my-model/checkpoints",
    "save_checkpoint_dir": "my-model/checkpoints",
    "scorer_path": "english/huge-vocabulary.scorer",
    "alphabet_config_path": "english/coqui-stt-1.1.0-checkpoint/alphabet.txt",
    "test_files": TEST_CSV,
    "test_batch_size": 32,
}
initialize_globals_from_args(**finetuned_eval_args)

test()

I Loading best validating checkpoint from my-model/checkpoints/best_dev-3663901
I Loading variable from checkpoint: cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/bias
I Loading variable from checkpoint: cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/kernel
I Loading variable from checkpoint: global_step
I Loading variable from checkpoint: layer_1/bias
I Loading variable from checkpoint: layer_1/weights
I Loading variable from checkpoint: layer_2/bias
I Loading variable from checkpoint: layer_2/weights
I Loading variable from checkpoint: layer_3/bias
I Loading variable from checkpoint: layer_3/weights
I Loading variable from checkpoint: layer_5/bias
I Loading variable from checkpoint: layer_5/weights
I Loading variable from checkpoint: layer_6/bias
I Loading variable from checkpoint: layer_6/weights
Testing model on /content/takeout_452_pt_0/test.csv
Test epoch | Steps: 3 | Elapsed Time: 0:00:57                                  
Test on /content/takeou