In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/audios/zero.wav
/kaggle/input/audios/backward.wav
/kaggle/input/speech_commands_wav/pytorch/default/1/best_model.pth


In [2]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.13.2-py3-none-any.whl.metadata (16 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.6.0 (from gradio)
  Downloading gradio_client-1.6.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.9.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.1

In [3]:
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import IPython.display as ipd
from transformers import AutoModel, AutoFeatureExtractor
import gradio as gr

In [4]:
labels = ['backward', 'bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 
          'follow', 'forward', 'four', 'go', 'happy', 'house', 'learn', 'left',
          'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 
          'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'visual',
          'wow', 'yes', 'zero']

In [5]:
class SpeechCommandsClassifier(nn.Module):
    def __init__(self, base_model, num_labels, dropout_rate=0.3):
        super().__init__()
        self.base_model = base_model
        
        # Congelar todo o modelo base
        for param in self.base_model.parameters():
            param.requires_grad = False
            
        # Descongelar apenas as 2 últimas camadas do encoder
        for i in range(2):
            for param in self.base_model.encoder.layers[-(i+1)].parameters():
                param.requires_grad = True
        
        self.dropout1 = nn.Dropout(dropout_rate)
        self.batch_norm1 = nn.BatchNorm1d(768)
        self.dense1 = nn.Linear(768, 384)
        
        self.dropout2 = nn.Dropout(dropout_rate)
        self.batch_norm2 = nn.BatchNorm1d(384)
        self.dense2 = nn.Linear(384, num_labels)

    def forward(self, input_values):
        outputs = self.base_model(input_values)
        hidden_states = outputs.last_hidden_state
        
        pooled = torch.mean(hidden_states, dim=1)
        
        x = self.dropout1(pooled)
        x = self.batch_norm1(x)
        x = F.relu(self.dense1(x))
        
        x = self.dropout2(x)
        x = self.batch_norm2(x)
        logits = self.dense2(x)
        
        return logits

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [7]:
base_model = AutoModel.from_pretrained("facebook/wav2vec2-base")
model = SpeechCommandsClassifier(
    base_model=base_model,
    num_labels=35,
    dropout_rate=0.3
).to(device)
model.load_state_dict(torch.load('/kaggle/input/speech_commands_wav/pytorch/default/1/best_model.pth', weights_only=True))
model.eval()

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

SpeechCommandsClassifier(
  (base_model): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
 

In [8]:
def predict_audio(audio_file):
    waveform, sample_rate = torchaudio.load(audio_file)
    
    print(f'shape inicial: {waveform.shape}')
    print(f'sample rate inicial: {sample_rate}')

    if sample_rate != 16000:
        transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = transform(waveform)

    print(f'shape após resample: {waveform.shape}')
    print(f'sample rate após resample: {16000}')
    
    waveform = waveform.to(device)
    
    with torch.no_grad():
        outputs = model(waveform)
        predictions = torch.softmax(outputs.squeeze(), dim=-1)
        predicted_idx = torch.argmax(predictions).item()
        confidence = predictions[predicted_idx].item()

    predicted_label = labels[predicted_idx]

    return f"{predicted_label} (Confidence: {confidence:.2%})"

In [9]:
iface = gr.Interface(
    fn=predict_audio,
    inputs=gr.Audio(type="filepath", label="Upload de Áudio"),
    outputs=gr.Textbox(label="Predição"),
    title="Classificador de Comandos de Voz",
    description="Faça upload de um arquivo de áudio para classificar o comando.",
    examples=[
        ["exemplo1.wav"],
        ["exemplo2.wav"]
    ]
)

In [10]:
iface.launch()

* Running on local URL:  http://127.0.0.1:7860
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://bb1039f771cd638d72.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


