In [1]:
%cd "/content/drive/MyDrive/Colab Notebooks/TensorflowLite/yamnet"

/content/drive/MyDrive/Colab Notebooks/TensorflowLite/yamnet


In [2]:
!pwd

/content/drive/MyDrive/Colab Notebooks/TensorflowLite/yamnet


In [3]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv

import matplotlib.pyplot as plt
from IPython.display import Audio
from scipy.io import wavfile

In [4]:
# Load the model.
model = hub.load('https://tfhub.dev/google/yamnet/1')

In [5]:
# Find the name of the class with the top score when mean-aggregated across frames.
def class_names_from_csv(class_map_csv_text):
  """Returns list of class names corresponding to score vector."""
  class_names = []
  with tf.io.gfile.GFile(class_map_csv_text) as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
      class_names.append(row['display_name'])

  return class_names

class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)

In [6]:
def ensure_sample_rate(original_sample_rate, waveform,
                       desired_sample_rate=16000):
  """Resample waveform if required."""
  if original_sample_rate != desired_sample_rate:
    desired_length = int(round(float(len(waveform)) /
                               original_sample_rate * desired_sample_rate))
    waveform = scipy.signal.resample(waveform, desired_length)
  return desired_sample_rate, waveform

In [7]:
# wav_file_name = 'speech_whistling2.wav'
wav_file_name = 'data/baby_1.wav'
sample_rate, wav_data = wavfile.read(wav_file_name, 'rb')
sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)

# Show some basic information about the audio.
duration = len(wav_data)/sample_rate
print(f'Sample rate: {sample_rate} Hz')
print(f'Total duration: {duration:.2f}s')
print(f'Size of the input: {len(wav_data)}')

# Listening to the wav file.
Audio(wav_data, rate=sample_rate)

Sample rate: 16000 Hz
Total duration: 5.00s
Size of the input: 80000


In [8]:
waveform = wav_data / tf.int16.max

In [9]:
# Run the model, check the output.
scores, embeddings, spectrogram = model(waveform)

In [10]:
scores_np = scores.numpy()
spectrogram_np = spectrogram.numpy()
infered_class = class_names[scores_np.mean(axis=0).argmax()]
print(f'The main sound is: {infered_class}')

The main sound is: Crying, sobbing


In [17]:
import os
 
path_dir = '/content/drive/MyDrive/Colab Notebooks/TensorflowLite/yamnet/data' 
raw_file_list = os.listdir(path_dir)

In [18]:
file_list = []

for file in raw_file_list:
  temp = 'data/' + file
  file_list.append(temp)

In [20]:
for file in file_list:
  wav_file_name = file
  sample_rate, wav_data = wavfile.read(wav_file_name, 'rb')
  sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)

  waveform = wav_data / tf.int16.max

  scores, embeddings, spectrogram = model(waveform)

  scores_np = scores.numpy()
  spectrogram_np = spectrogram.numpy()
  infered_class = class_names[scores_np.mean(axis=0).argmax()]
  print(file)
  print(f'The main sound is: {infered_class}')

data/vaccum_1.wav
The main sound is: Vehicle
data/glass_1.wav
The main sound is: Silence
data/alarm_1.wav
The main sound is: Alarm clock
data/siren_1.wav
The main sound is: Emergency vehicle
data/helicopter_1.wav
The main sound is: Vehicle
data/sheep_1.wav
The main sound is: Livestock, farm animals, working animals
data/baby_1.wav
The main sound is: Crying, sobbing
data/dog_1.wav
The main sound is: Animal
data/pig_1.wav
The main sound is: Speech
data/cat_1.wav
The main sound is: Cat
data/train_1.wav
The main sound is: Rail transport
