In [2]:
import time
import yaml
import numpy as np
import torchaudio
from tqdm.auto import tqdm
from torch.utils.data import DataLoader

from s3prl.hub import distilhubert
from modules.model import CustomStudentModelConfig, CustomStudentModel

# Measure # of parameters in model
def nparams(model):
    #print(f"{sum(p.numel() for p in model.parameters())} ({type(model).__name__})")
    return (sum(p.numel() for p in model.parameters()))

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [3]:
librosa_mel_cfg_path = "./fitnet-torchaudio.yaml"
ch512_cfg_path = "./fitnet-default.yaml"
ch256_cfg_path = "./fitnet-ch256.yaml"
ch128_cfg_path = "./fitnet-ch128.yaml"

with open(librosa_mel_cfg_path) as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)
    model_config = cfg['distiller']
    librosa_mel_cfg = CustomStudentModelConfig(**model_config)
    librosa_mel_model = CustomStudentModel(librosa_mel_cfg)
    librosa_mel_model._disable_projection_heads()
    
with open(ch512_cfg_path) as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)
    model_config = cfg['distiller']
    ch512_cfg = CustomStudentModelConfig(**model_config)
    ch512_model = CustomStudentModel(ch512_cfg)
    ch512_model._disable_projection_heads()
    
with open(ch256_cfg_path) as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)
    model_config = cfg['distiller']
    ch256_cfg = CustomStudentModelConfig(**model_config)
    ch256_model = CustomStudentModel(ch256_cfg)
    ch256_model._disable_projection_heads()
    
with open(ch128_cfg_path) as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)
    model_config = cfg['distiller']
    ch128_cfg = CustomStudentModelConfig(**model_config)
    ch128_model = CustomStudentModel(ch128_cfg)
    ch128_model._disable_projection_heads()

In [4]:
print ("DistilHuBERT has # of params: " + str(23491200))
print ("80dim mel fitnet has # of params: " + str(nparams(librosa_mel_model)))
print ("ch512 fitnet has # of params: " + str(nparams(ch512_model)))
print ("ch256 fitnet has # of params: " + str(nparams(ch256_model)))
print ("ch128 fitnet has # of params: " + str(nparams(ch128_model)))

DistilHuBERT has # of params: 23491200
80dim mel fitnet has # of params: 19452768
ch512 fitnet has # of params: 23861440
ch256 fitnet has # of params: 20589248
ch128 fitnet has # of params: 19739584


In [None]:
!pip install ipywidgets

In [4]:
# Measure inference time and compare with DistilHuBERT for test data
from s3prl.hub import distilhubert

LS_PATH = "../db/"

def inference_time(model):

    test_data = torchaudio.datasets.LIBRISPEECH(LS_PATH, "test-clean", download=True)
    test_dataloader = DataLoader(test_data, batch_size=1, num_workers=4)

    inf_time = []

    for i in range(5):
        start_time = time.time()
        pbar = tqdm(range(50))
        for i, batch in enumerate(test_dataloader):
            model(batch[0][0])
            if i == 50:
                break
            pbar.update(1)

        end_time = time.time()
        inf_time.append(end_time - start_time)

    print(f"Inference Time: {np.mean(inf_time)}")

    # Compare with DistilHuBERT's inference time
    inf_time_distilhubert = []
    dh_model = distilhubert()
    
    for i in range(5):
        start_time = time.time()
        pbar = tqdm(range(50))
        for i, batch in enumerate(test_dataloader):
            dh_model([batch[0][0][0]])
            if i == 50:
                break
            pbar.update(1)

        end_time = time.time()
        inf_time_distilhubert.append(end_time - start_time)

    print(f"Inference Time: {np.mean(inf_time_distilhubert)}")
    speed_up = np.mean(inf_time) / np.mean(inf_time_distilhubert)
    print(f"Speed Up: {speed_up}")

In [5]:
inference_time(librosa_mel_model)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Inference Time: 6.922431230545044
Using cache found in /home/kangwook/.cache/torch/hub/s3prl_cache/b6df359653d40d65eccc3f921afac202d81adb697418d1fe04c09fd440489ec3
for https://www.dropbox.com/s/hcfczqo5ao8tul3/disilhubert_ls960_4-8-12.ckpt?dl=0
[UpstreamExpert] - Using the default upstream expert config
[DistillerModel] - Expands the output dimension by 3 times
[DistillerModel] - Pred layers: [4, 8, 12]
[TransformerEncoder] - Attention type = original
[DistillerModel] - Out layer type: expand-last
[DistillerModel] - Inter dim = 768


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Inference Time: 21.953469467163085
Speed Up: 0.31532288055422286


In [8]:
inference_time(ch512_model)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Inference Time: 22.1890558719635
Using cache found in /home/kangwook/.cache/torch/hub/s3prl_cache/b6df359653d40d65eccc3f921afac202d81adb697418d1fe04c09fd440489ec3
for https://www.dropbox.com/s/hcfczqo5ao8tul3/disilhubert_ls960_4-8-12.ckpt?dl=0
[UpstreamExpert] - Using the default upstream expert config
[DistillerModel] - Expands the output dimension by 3 times
[DistillerModel] - Pred layers: [4, 8, 12]
[TransformerEncoder] - Attention type = original
[DistillerModel] - Out layer type: expand-last
[DistillerModel] - Inter dim = 768


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Inference Time: 21.64878454208374
Speed Up: 1.0249561969093235


In [9]:
inference_time(ch256_model)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Inference Time: 12.282356405258179
Using cache found in /home/kangwook/.cache/torch/hub/s3prl_cache/b6df359653d40d65eccc3f921afac202d81adb697418d1fe04c09fd440489ec3
for https://www.dropbox.com/s/hcfczqo5ao8tul3/disilhubert_ls960_4-8-12.ckpt?dl=0
[UpstreamExpert] - Using the default upstream expert config
[DistillerModel] - Expands the output dimension by 3 times
[DistillerModel] - Pred layers: [4, 8, 12]
[TransformerEncoder] - Attention type = original
[DistillerModel] - Out layer type: expand-last
[DistillerModel] - Inter dim = 768


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Inference Time: 22.343941974639893
Speed Up: 0.5496951441781628
