# Convert .wav to spectrograms

Based on https://medium.com/analytics-vidhya/how-to-classify-sounds-using-pytorch-27c9f2d4d714


# Final code

Credits: https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/audio_preprocessing_tutorial.ipynb#scrollTo=RIl-u3baEABA

In [1]:
# Install torchaudio and torch into the notebook runtime.
# NOTE(review): per the recorded cell output, installing torchaudio replaced the
# preinstalled torch 1.8.0+cu101 with torch 1.8.1, and pip reported torchvision
# and torchtext as incompatible with it — consider pinning versions.
!pip3 install torchaudio
!pip install torch

Collecting torchaudio
  Downloading https://files.pythonhosted.org/packages/aa/55/01ad9244bcd595e39cea5ce30726a7fe02fd963d07daeb136bfe7e23f0a5/torchaudio-0.8.1-cp37-cp37m-manylinux1_x86_64.whl (1.9MB)
     |████████████████████████████████| 1.9MB 16.9MB/s 
Collecting torch==1.8.1
  Downloading https://files.pythonhosted.org/packages/56/74/6fc9dee50f7c93d6b7d9644554bdc9692f3023fa5d1de779666e6bf8ae76/torch-1.8.1-cp37-cp37m-manylinux1_x86_64.whl (804.1MB)
     |████████████████████████████████| 804.1MB 22kB/s 
ERROR: torchvision 0.9.0+cu101 has requirement torch==1.8.0, but you'll have torch 1.8.1 which is incompatible.
ERROR: torchtext 0.9.0 has requirement torch==1.8.0, but you'll have torch 1.8.1 which is incompatible.
Installing collected packages: torch, torchaudio
  Found existing installation: torch 1.8.0+cu101
    Uninstalling torch-1.8.0+cu101:
      Successfully uninstalled torch-1.8.0+cu101
Successfully installed torch-1.8.1 torchaudio-0.8.1

In [2]:
# Mount Google Drive inside the Colab runtime at /content/gdrive; the dataset
# paths used later in this notebook live under that mount point.
from google.colab import drive
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted — confirm against the google.colab documentation.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
import torch
import torchaudio
import matplotlib.pyplot as plt
import os

Code from https://pytorch.org/tutorials/beginner/audio_preprocessing_tutorial.html


In [4]:
def plot_waveform(waveform, sample_rate, path, filename, xlim=None, ylim=None):
  """Plot every channel of a waveform against time and save the figure as a PNG.

  Args:
    waveform: Tensor that converts via ``.numpy()`` to a 2-D array of shape
      (num_channels, num_frames).
    sample_rate: Number of frames per second; used to convert frame indices
      into the time axis.
    path: Directory the image is written to.
    filename: Base name used in the plot title and in the output file name
      (``<filename>_waveform.png``).
    xlim: Optional x-axis limits applied to every channel's axes.
    ylim: Optional y-axis limits applied to every channel's axes.
  """
  title = "Waveform: {}".format(filename)
  waveform = waveform.numpy()

  num_channels, num_frames = waveform.shape
  time_axis = torch.arange(0, num_frames) / sample_rate

  figure, axes = plt.subplots(num_channels, 1)
  if num_channels == 1:
    # plt.subplots returns a bare Axes for a single row; wrap it so the loop
    # below can index it like the multi-channel case.
    axes = [axes]
  for c in range(num_channels):
    axes[c].plot(time_axis, waveform[c], linewidth=1)
    axes[c].grid(True)
    if num_channels > 1:
      axes[c].set_ylabel(f'Channel {c+1}')
    if xlim:
      axes[c].set_xlim(xlim)
    if ylim:
      axes[c].set_ylim(ylim)
  figure.suptitle(title)

  # Save through the figure we created rather than plt.gcf(), and do it before
  # showing so the file never depends on what the backend does in show().
  figure.savefig(os.path.join(path, f'{filename}_waveform.png'))
  plt.show(block=False)
  # Release the figure: this function is called once per recording, and
  # unclosed figures otherwise pile up in memory over a whole folder.
  plt.close(figure)

In [5]:
def plot_specgram(waveform, sample_rate, path, filename, xlim=None):
  """Plot a spectrogram for every channel of a waveform and save it as a PNG.

  Args:
    waveform: Tensor that converts via ``.numpy()`` to a 2-D array of shape
      (num_channels, num_frames).
    sample_rate: Sampling frequency, forwarded to ``Axes.specgram`` as ``Fs``.
    path: Directory the image is written to.
    filename: Base name used in the plot title and in the output file name
      (``<filename>_specgram.png``).
    xlim: Optional x-axis limits applied to every channel's axes.
  """
  title = "Spectrogram: {}".format(filename)
  waveform = waveform.numpy()

  # Only the channel count is needed; the unpacking still enforces a 2-D input.
  num_channels, _ = waveform.shape

  figure, axes = plt.subplots(num_channels, 1)
  if num_channels == 1:
    # plt.subplots returns a bare Axes for a single row; wrap it so the loop
    # below can index it like the multi-channel case.
    axes = [axes]
  for c in range(num_channels):
    axes[c].specgram(waveform[c], Fs=sample_rate)
    if num_channels > 1:
      axes[c].set_ylabel(f'Channel {c+1}')
    if xlim:
      axes[c].set_xlim(xlim)
  figure.suptitle(title)

  # Save through the figure we created rather than plt.gcf(), and do it before
  # showing so the file never depends on what the backend does in show().
  figure.savefig(os.path.join(path, f'{filename}_specgram.png'))
  plt.show(block=False)
  # Release the figure: this function is called once per recording, and
  # unclosed figures otherwise pile up in memory over a whole folder.
  plt.close(figure)


In [6]:
def process_recording(base_path, wav_filename, plot_waveform=False):
  """Load one audio file and save its spectrogram (and optionally its waveform).

  Args:
    base_path: Directory that contains the audio file; the plots are written
      to the same directory.
    wav_filename: File name of the recording inside ``base_path``.
    plot_waveform: When true, also save the waveform plot.
  """
  wav_path = f'{base_path}/{wav_filename}'
  waveform, sample_rate = torchaudio.load(wav_path)

  # Strip only a trailing .wav/.mp3 extension; str.replace would also remove
  # those characters from the middle of a name such as "take.wav.final.wav".
  stem, extension = os.path.splitext(wav_filename)
  filename = stem if extension in ('.wav', '.mp3') else wav_filename

  if plot_waveform:
    # The boolean parameter shadows the module-level plot_waveform() function,
    # so calling the bare name here would raise "'bool' object is not
    # callable". Fetch the function from module globals instead; the parameter
    # name is kept for backward compatibility with keyword callers.
    render_waveform = globals()['plot_waveform']
    render_waveform(waveform=waveform, sample_rate=sample_rate, path=base_path, filename=filename)
  plot_specgram(waveform=waveform, sample_rate=sample_rate, path=base_path, filename=filename)

In [7]:
def process_recordings_folder(folder_name):
  """Run process_recording on every .wav/.mp3 file directly inside a folder.

  Args:
    folder_name: Directory to scan. Sub-directories are not descended into,
      and files with any other extension are skipped.
  """
  audio_suffixes = ('.wav', '.mp3')
  with os.scandir(folder_name) as listing:
    for item in listing:
      # Guard clauses: ignore anything that is not a regular audio file.
      if not item.is_file():
        continue
      recording = item.name
      if not recording.endswith(audio_suffixes):
        continue

      print(f"processing: {recording}")
      process_recording(base_path=folder_name, wav_filename=recording)

In [8]:
# Root of the dataset on the mounted Google Drive.
base_path = '/content/gdrive/MyDrive/CMU/11785_Intro_to_Deep_Learning/DL_Group_Project/Dataset/'

# Recording folders to process, given relative to base_path.
folders = [
  'Marta_Recordings',
  'Mansi_Recordings/Complex',
  'Mansi_Recordings/Steady_State',
  'Sreedhar_Recordings',
  'Sreenidhi_Recordings',
]

In [9]:
# Generate and save the plots for every recording in each configured folder.
for folder in folders:
  # os.path.join copes with base_path ending in a separator, so the resulting
  # path no longer contains a doubled "//".
  process_recordings_folder(os.path.join(base_path, folder))

Output hidden; open in https://colab.research.google.com to view.

# Other Resources

Some other resources checked to generate the plots

**1. First we will load the audio file.** From the directory containing the sound files, we will load 2–3 of them using torchaudio.load. torchaudio supports sound files in ‘.wav’ and ‘.mp3’ format and returns the waveform and the sample rate of the sound file. The waveform is an array holding the amplitude of the sound at each frame (sample), whereas the sample rate is the number of frames per second at which the waveform was recorded.

In [None]:
import torchaudio
# Load the audio file; the result is unpacked into the waveform and its sample
# rate. Replace the placeholder string with the real path of the audio file.
waveform, sample_rate = torchaudio.load("_PATH OF THE AUDIO FILE_")

**2. Normalize the shape of all waveforms to one size.** After loading the file, check the number of channels of the waveform using waveform.size()[0]. If its value is more than 1, we will have to normalize it to a single channel using

In [None]:
import torch
from pydub import AudioSegment

# Decode the MP3 (replace the placeholder string with the real path), mix it
# down to a single channel, and turn the raw samples into a float tensor.
waveform = AudioSegment.from_mp3("_PATH OF THE AUDIO FILE_")
waveform = waveform.set_channels(1)
waveform = waveform.get_array_of_samples()
waveform = torch.tensor(waveform, dtype=torch.float)
# Add a leading channel dimension so the tensor has shape (1, num_samples).
waveform = torch.reshape(waveform, (1, waveform.shape[0]))

**3. Change the waveform to a Spectrogram, Mel Spectrogram, or MFCC.** Now we will change the waveform into a Spectrogram (a visual representation of the spectrum of frequencies of a signal as it varies with time) using

In [None]:
# Build the Spectrogram transform with its default settings (no constructor
# arguments are passed) and apply it to the waveform.
Spectrogram = torchaudio.transforms.Spectrogram()(waveform)

or, a mel spectrogram (a spectrogram whose frequency axis is mapped onto the nonlinear mel scale, which approximates how humans perceive pitch) using

In [None]:
# Build the MelSpectrogram transform with its default settings (no constructor
# arguments are passed) and apply it to the waveform.
Mel_Spectrogram = torchaudio.transforms.MelSpectrogram()(waveform)

or, MFCC (mel-frequency cepstral coefficients are the coefficients that collectively make up a mel-frequency cepstrum, which is a representation of the short-term power spectrum of a sound, based on a linear cosine transform of a log power spectrum on a nonlinear mel scale of frequency) using

In [None]:
# Compute MFCC features through torchaudio's Kaldi-compatibility API.
# NOTE(review): n_fft is presumably a window size in samples, since it is
# divided by sample_rate below — confirm.
n_fft = 400.0
# Window duration: n_fft / sample_rate, scaled by 1000 (milliseconds, assuming
# sample_rate is in Hz — TODO confirm).
frame_length = n_fft / sample_rate * 1000.0
# Hop between consecutive frames is half a window.
frame_shift = frame_length / 2.0

# Keyword arguments forwarded unchanged to kaldi.mfcc below.
params = {
    "channel": 0,
    "dither": 0.0,
    "window_type": "hanning",
    "frame_length": frame_length,
    "frame_shift": frame_shift,
    "remove_dc_offset": False,
    "round_to_power_of_two": False,
    "sample_frequency": sample_rate,
}
mfcc = torchaudio.compliance.kaldi.mfcc(waveform, **params)