In [1]:
from speechbrain.inference.ASR import StreamingASR
from speechbrain.utils.dynamic_chunk_training import DynChunkTrainConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained streaming ASR model named by `source`.
# NOTE(review): `savedir` is presumably the local directory where the fetched
# model files are kept — confirm against the SpeechBrain documentation.
asr_model = StreamingASR.from_hparams(
    source="speechbrain/asr-streaming-conformer-librispeech",
    savedir="pretrained_models/asr-streaming-conformer-librispeech"
)

# Transcribe "harvard.wav" (presumably a file next to this notebook) chunk by chunk.
asr_model.transcribe_file(
    "harvard.wav",
    # select a chunk size of ~960ms with 4 chunks of left context
    DynChunkTrainConfig(24, 4),
    # torchaudio streaming is ENABLED here, which is the setting meant for your own
    # files or streams (streaming file decoding); it only has to be set to False
    # when the audio must be fetched from HuggingFace
    use_torchaudio_streaming=True,
)

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


"THE STALE SMELL OF OLD BEER LINGERS IT TAKES HEAT TO BRING OUT THE ODOR A COLD DIP RESTORES HEALTH AND ZEST A SALT PICKLE TASTES FINE WITH HAM TACKLE'S AL PASTORA ARE MY FAVOURITE A ZESTFUL FOOD IS THE HOT CROSS BUN"

In [7]:
# List the names of the sub-modules held in the loaded model's `mods` container.
asr_model.mods.keys()

dict_keys(['CNN', 'enc', 'emb', 'dec', 'Tjoint', 'transducer_lin', 'normalize', 'proj_ctc', 'proj_dec', 'proj_enc'])

In [8]:
# Show the layer structure of the `CNN` entry of `mods` (used as the reference
# for the hand-written re-implementation later in this notebook).
asr_model.mods.CNN

ConvolutionFrontEnd(
  (convblock_0): ConvBlock(
    (convs): Sequential(
      (conv_0): Conv2d(
        (conv): Conv2d(1, 64, kernel_size=(3, 3), stride=(2, 2))
      )
      (norm_0): LayerNorm(
        (norm): LayerNorm((40, 64), eps=1e-05, elementwise_affine=True)
      )
      (act_0): LeakyReLU(negative_slope=0.01)
      (dropout_0): Dropout(p=0.1, inplace=False)
    )
  )
  (convblock_1): ConvBlock(
    (convs): Sequential(
      (conv_0): Conv2d(
        (conv): Conv2d(64, 32, kernel_size=(3, 3), stride=(2, 2))
      )
      (norm_0): LayerNorm(
        (norm): LayerNorm((20, 32), eps=1e-05, elementwise_affine=True)
      )
      (act_0): LeakyReLU(negative_slope=0.01)
      (dropout_0): Dropout(p=0.1, inplace=False)
    )
  )
)

In [28]:
import torch
import torch.nn as nn

class ConvBlock(nn.Module):
    """Strided 3x3 convolution followed by LayerNorm, LeakyReLU and dropout.

    Input is expected as (batch, in_channels, time_dim, freq_dim); the output is
    (batch, out_channels, T_out, F_out) where each spatial size is given by
    :meth:`output_size`.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the convolution.
        time_dim: Size of the time axis of the *input* to this block.
        freq_dim: Size of the frequency axis of the *input* to this block.

    Note:
        The LayerNorm spans (C, T, F), so the block only accepts inputs whose
        time and frequency sizes equal the values given at construction.
    """

    # Fixed convolution geometry; `output_size` is derived from these values.
    KERNEL_SIZE = 3
    STRIDE = 2
    PADDING = 1

    @classmethod
    def output_size(cls, size):
        """Return the length of an axis of length ``size`` after the convolution.

        Uses the standard convolution arithmetic
        ``floor((size + 2 * padding - kernel) / stride) + 1``, which for the
        geometry above equals ``ceil(size / 2)``.
        """
        return (size + 2 * cls.PADDING - cls.KERNEL_SIZE) // cls.STRIDE + 1

    def __init__(self, in_channels, out_channels, time_dim, freq_dim):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=(self.KERNEL_SIZE, self.KERNEL_SIZE),
            stride=(self.STRIDE, self.STRIDE),
            padding=self.PADDING,
        )
        # Size the norm from the real convolution output. The previous
        # `dim // 2` under-counted by one for odd sizes (e.g. 41 -> 21, not 20),
        # which made the forward pass fail with a shape mismatch.
        self.norm = nn.LayerNorm(
            [out_channels, self.output_size(time_dim), self.output_size(freq_dim)]
        )  # Normalize across (C, T, F)
        self.activation = nn.LeakyReLU(negative_slope=0.01)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        """Apply conv -> LayerNorm -> LeakyReLU -> dropout to ``x``."""
        x = self.conv(x)  # Convolution
        x = self.norm(x)  # Apply LayerNorm across feature maps
        x = self.activation(x)
        x = self.dropout(x)
        return x


class ConvolutionFrontEnd(nn.Module):
    """Two stacked :class:`ConvBlock` layers (1 -> 64 -> 32 channels).

    Args:
        time_dim: Size of the time axis of the input (default 80).
        freq_dim: Size of the frequency axis of the input (default 80).

    The input is expected as (batch, 1, time_dim, freq_dim). Each block reduces
    both spatial axes according to ``ConvBlock.output_size``, so the default
    80 x 80 input yields a (batch, 32, 20, 20) output.
    """

    def __init__(self, time_dim=80, freq_dim=80):
        super().__init__()
        self.convblock_0 = ConvBlock(1, 64, time_dim, freq_dim)  # (1 -> 64)
        # The second block must be sized from what the first block actually
        # emits; `dim // 2` is only correct when `dim` is even.
        mid_time = ConvBlock.output_size(time_dim)
        mid_freq = ConvBlock.output_size(freq_dim)
        self.convblock_1 = ConvBlock(64, 32, mid_time, mid_freq)  # (64 -> 32)

    def forward(self, x):
        """Run ``x`` through both convolution blocks in order."""
        x = self.convblock_0(x)
        x = self.convblock_1(x)
        return x

# Instantiate the front end for 80 x 80 inputs
model = ConvolutionFrontEnd(time_dim=80, freq_dim=80)

# Generate random stand-in input data
fake_audio_input = torch.randn(2, 1, 80, 80)  # (batch=2, channels=1, time=80, freq=80)

# Forward pass
output = model(fake_audio_input)

# Check the shape instead of only describing it in a comment, so that a re-run
# fails loudly if the architecture drifts: 32 output channels, and each 80-long
# axis is halved by both stride-2 blocks (80 -> 40 -> 20).
expected_shape = (2, 32, 20, 20)
assert tuple(output.shape) == expected_shape, (
    f"expected output shape {expected_shape}, got {tuple(output.shape)}"
)

# Print output shape
print("Output shape:", output.shape)


Output shape: torch.Size([2, 32, 20, 20])
