In [1]:
from load_fsq_model import load_model

In [2]:
model = load_model('../wav2vec_small_960h.pt')

<class 'fairseq.tasks.audio_pretraining.AudioPretrainingTask'>


In [3]:
model = model.w2v_encoder.w2v_model

## Examine number of parameters(base)

In [4]:
# Examine # of parameters in model
def nparams(model):
    print(f"{sum(p.numel() for p in model.parameters())} ({type(model).__name__})")

nparams(model)
nparams(model.encoder)
nparams(model.feature_extractor)
nparams(model.post_extract_proj)
nparams(model.layer_norm)

94371712 (Wav2Vec2Model)
89775488 (TransformerEncoder)
4200448 (ConvFeatureExtractionModel)
393984 (Linear)
1024 (FusedLayerNorm)


In [5]:
nparams(model.encoder.pos_conv)
nparams(model.encoder.layers[0])

4719488 (Sequential)
7087872 (TransformerSentenceEncoderLayer)


## Examine inference speed(base)

In [6]:
# Get one audio sample from librispeech
import torchaudio
test_data = torchaudio.datasets.LIBRISPEECH("../", "test-clean", download=True)
sample = test_data[0][0]
sample.shape

torch.Size([1, 166960])

In [8]:
# Compare inference time of each component
import time
import numpy as np
import torch.nn.functional as F
from tqdm.auto import tqdm

sample = test_data[0][0]

ckpt1 = []
ckpt2 = []
ckpt3 = []
ckpt4 = []
ckpt5 = []
whole = []

pbar = tqdm(range(100))

for i in pbar:
    start_time = time.time()
    #conv feature extractor
    features = model.feature_extractor(sample)
    features = features.transpose(1, 2)
    features = model.layer_norm(features)
    ckpt1_time = time.time()
    ckpt1.append(ckpt1_time - start_time)

    #post extract proj
    features = model.post_extract_proj(features)
    ckpt2_time = time.time()
    ckpt2.append(ckpt2_time - ckpt1_time)

    #conv position embedding
    x_conv = model.encoder.pos_conv(features.transpose(1, 2))
    x_conv = x_conv.transpose(1, 2)
    x = features + x_conv
    x = model.encoder.layer_norm(x)
    x = F.dropout(x, p=0.1, training=True)
    x = x.transpose(0, 1)
    ckpt3_time = time.time()
    ckpt3.append(ckpt3_time - ckpt2_time)

    #Transformer Layers
    for i, layer in enumerate(model.encoder.layers):
        x, z = layer(x)
        if i==1:
            ckpt4_time = time.time()
            ckpt4.append(ckpt4_time - ckpt3_time)
    x = x.transpose(0, 1)
    ckpt5_time = time.time()
    ckpt5.append(ckpt5_time - ckpt3_time)
    whole.append(ckpt5_time - start_time)
    
print(f"Checkpoint 1: {np.mean(ckpt1)}")
print(f"Checkpoint 2: {np.mean(ckpt2)}")
print(f"Checkpoint 3: {np.mean(ckpt3)}")
print(f"Checkpoint 4: {np.mean(ckpt4)}")
print(f"Checkpoint 5: {np.mean(ckpt5)}")
print(f"Whole inference time: {np.mean(whole)}")

  0%|          | 0/100 [00:00<?, ?it/s]

Checkpoint 1: 0.2713434028625488
Checkpoint 2: 0.0017110848426818849
Checkpoint 3: 0.026907129287719725
Checkpoint 4: 0.07916448831558227
Checkpoint 5: 0.47131627321243286
Whole inference time: 0.7712778902053833


# Wav2Vec2 Large

In [9]:
model = load_model('../wav2vec_big_960h.pt')

<class 'fairseq.tasks.audio_pretraining.AudioPretrainingTask'>


In [10]:
model = model.w2v_encoder.w2v_model

## Examine number of parameters(large)

In [11]:
# Examine # of parameters in model
def nparams(model):
    print(f"{sum(p.numel() for p in model.parameters())} ({type(model).__name__})")

nparams(model)
nparams(model.encoder)
nparams(model.feature_extractor)
nparams(model.post_extract_proj)
nparams(model.layer_norm)

315428992 (Wav2Vec2Model)
310701184 (TransformerEncoder)
4200448 (ConvFeatureExtractionModel)
525312 (Linear)
1024 (FusedLayerNorm)


In [12]:
nparams(model.encoder.pos_conv)
nparams(model.encoder.layers[0])

8389760 (Sequential)
12596224 (TransformerSentenceEncoderLayer)


## Examine inference speed(large)

In [13]:
# Get one audio sample from librispeech
import torchaudio
test_data = torchaudio.datasets.LIBRISPEECH("../", "test-clean", download=True)
sample = test_data[0][0]
sample.shape

torch.Size([1, 166960])

In [14]:
# Compare inference time of each component
import time
import numpy as np
import torch.nn.functional as F
from tqdm.auto import tqdm

sample = test_data[0][0]

ckpt1 = []
ckpt2 = []
ckpt3 = []
ckpt4 = []
ckpt5 = []
whole = []

pbar = tqdm(range(100))

for i in pbar:
    start_time = time.time()
    #conv feature extractor
    features = model.feature_extractor(sample)
    features = features.transpose(1, 2)
    features = model.layer_norm(features)
    ckpt1_time = time.time()
    ckpt1.append(ckpt1_time - start_time)

    #post extract proj
    features = model.post_extract_proj(features)
    ckpt2_time = time.time()
    ckpt2.append(ckpt2_time - ckpt1_time)

    #conv position embedding
    x_conv = model.encoder.pos_conv(features.transpose(1, 2))
    x_conv = x_conv.transpose(1, 2)
    x = features + x_conv
    x = model.encoder.layer_norm(x)
    x = F.dropout(x, p=0.1, training=True)
    x = x.transpose(0, 1)
    ckpt3_time = time.time()
    ckpt3.append(ckpt3_time - ckpt2_time)

    #Transformer Layers
    for i, layer in enumerate(model.encoder.layers):
        x, z = layer(x)
        if i==1:
            ckpt4_time = time.time()
            ckpt4.append(ckpt4_time - ckpt3_time)
    x = x.transpose(0, 1)
    ckpt5_time = time.time()
    ckpt5.append(ckpt5_time - ckpt3_time)
    whole.append(ckpt5_time - start_time)
    
print(f"Checkpoint 1: {np.mean(ckpt1)}")
print(f"Checkpoint 2: {np.mean(ckpt2)}")
print(f"Checkpoint 3: {np.mean(ckpt3)}")
print(f"Checkpoint 4: {np.mean(ckpt4)}")
print(f"Checkpoint 5: {np.mean(ckpt5)}")
print(f"Whole inference time: {np.mean(whole)}")

  0%|          | 0/100 [00:00<?, ?it/s]

Checkpoint 1: 0.2549955487251282
Checkpoint 2: 0.0022010564804077148
Checkpoint 3: 0.055412154197692874
Checkpoint 4: 0.12740011692047118
Checkpoint 5: 1.4390701627731324
Whole inference time: 1.7516789221763611
