In [1]:
import os
import torch
import torchaudio
import IPython.display as ipd
# import matplotlib
# matplotlib.use('qt5agg')
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import soundfile as sf
import pandas as pd
from torchaudio.compliance.kaldi import spectrogram

In [6]:
# Close every matplotlib figure that is currently open.
# NOTE(review): this cell was last run as In [6], i.e. out of order with the cells around it.
plt.close('all')

# CNN with images of spectrograms

## Explore NoWhale

In [2]:
# Load one example recording: file 0, which is labelled NoWhale.
# The path is assembled with os.path.join so it does not depend on the
# Windows-only '\' separator.
waveform_noWhale, sr_noWhale = torchaudio.load(os.path.join('.', 'train', 'train', '0.wav'))

print(type(waveform_noWhale))
print(type(sr_noWhale))
# the waveform is a (channels, samples) tensor
print(waveform_noWhale.shape)
# sample rate: 2000 data points per second; the clip holds 4000 data points in total
print(sr_noWhale)


<class 'torch.Tensor'>
<class 'int'>
torch.Size([1, 4000])
2000


In [None]:
# Plot the raw NoWhale waveform: amplitude against sample index.
fig, ax = plt.subplots()
# .t() transposes (channels, samples) to (samples, channels) so each channel is drawn as one line
ax.plot(waveform_noWhale.t().numpy())
ax.set(title='NoWhale waveform', xlabel='Sample index', ylabel='Amplitude')
# show the figure explicitly instead of echoing the Line2D repr as the cell output
plt.show()

[<matplotlib.lines.Line2D at 0x206616c5f10>]

In [5]:
# Spectrogram of the NoWhale example, computed with torchaudio's default settings.
spectrogram_noWhale = torchaudio.transforms.Spectrogram()(waveform_noWhale)
# Floor the values first so an exactly-zero bin cannot turn into log2(0) = -inf,
# then scale with log2 and select the first channel.
log_spectrogram_noWhale = spectrogram_noWhale.clamp(min=1e-10).log2()[0, :, :].numpy()
# plot the spectrogram (rows are frequency bins, columns are time frames)
fig, ax = plt.subplots()
ax.imshow(log_spectrogram_noWhale, cmap='viridis')
ax.set(title='NoWhale log2 spectrogram', xlabel='Time frame', ylabel='Frequency bin')
plt.show()

<matplotlib.image.AxesImage at 0x1933c6f37d0>

## Explore RightWhale

In [6]:
# Load one example recording: file 1, which is labelled RightWhale.
# The path is assembled with os.path.join so it does not depend on the
# Windows-only '\' separator.
waveform_rightWhale, sr_rightWhale = torchaudio.load(os.path.join('.', 'train', 'train', '1.wav'))

print(type(waveform_rightWhale))
print(type(sr_rightWhale))
# the waveform is a (channels, samples) tensor
print(waveform_rightWhale.shape)
# sample rate: 2000 data points per second; the clip holds 4000 data points in total
print(sr_rightWhale)



<class 'torch.Tensor'>
<class 'int'>
torch.Size([1, 4000])
2000


In [7]:
# Plot the raw RightWhale waveform: amplitude against sample index.
fig, ax = plt.subplots()
# .t() transposes (channels, samples) to (samples, channels) so each channel is drawn as one line
ax.plot(waveform_rightWhale.t().numpy())
ax.set(title='RightWhale waveform', xlabel='Sample index', ylabel='Amplitude')
# show the figure explicitly instead of echoing the Line2D repr as the cell output
plt.show()


[<matplotlib.lines.Line2D at 0x1933fa65460>]

In [8]:
# Spectrogram of the RightWhale example, computed with torchaudio's default settings.
spectrogram_rightWhale = torchaudio.transforms.Spectrogram()(waveform_rightWhale)
# Floor the values first so an exactly-zero bin cannot turn into log2(0) = -inf,
# then scale with log2 and select the first channel.
log_spectrogram_rightWhale = spectrogram_rightWhale.clamp(min=1e-10).log2()[0, :, :].numpy()
# plot the spectrogram (rows are frequency bins, columns are time frames)
fig, ax = plt.subplots()
ax.imshow(log_spectrogram_rightWhale, cmap='viridis')
ax.set(title='RightWhale log2 spectrogram', xlabel='Time frame', ylabel='Frequency bin')
plt.show()


<matplotlib.image.AxesImage at 0x1933fa96bd0>

## Load data

In [9]:
# Collect the names of all files in the training audio folder and read the label table.
# Paths are built with os.path.join so they do not rely on the Windows '\' separator.
sound_files = os.listdir(os.path.join('.', 'train', 'train'))
df_labels = pd.read_csv(os.path.join('.', 'train.csv'))
# number of files found in the folder (valid or not)
print(len(sound_files))

10944


In [10]:
# Pull the 'class' column out of the label table as a NumPy array.
train_labels = df_labels['class'].to_numpy()
# preview the first ten labels, then show how many labels there are
print(train_labels[:10], len(train_labels), sep='\n')

['NoWhale' 'RightWhale' 'NoWhale' 'NoWhale' 'NoWhale' 'NoWhale'
 'RightWhale' 'NoWhale' 'NoWhale' 'RightWhale']
10934


In [11]:
# Keep only the audio files whose name (the part before the first '.') is a
# plain integer index, e.g. '123.wav'; everything else is reported and skipped.
idx_labeled_audio_files = df_labels['idx'].to_numpy()
available_audio_files = []
for file in sound_files:
    stem = file.split('.')[0]
    try:
        idx = int(stem)
    except ValueError:
        # Only a non-numeric name counts as "invalid". Catching ValueError
        # specifically lets any other error (or Ctrl-C) surface instead of
        # being silently swallowed.
        print("Invalid file: ", file)
    else:
        available_audio_files.append(idx)

Invalid file:  8675(1).wav
Invalid file:  8676(1).wav
Invalid file:  8677(1).wav
Invalid file:  8678(1).wav
Invalid file:  8679(1).wav
Invalid file:  868(1).wav
Invalid file:  8680(1).wav
Invalid file:  8681(1).wav
Invalid file:  8682(1).wav
Invalid file:  8684(1).wav


In [12]:
# Put the valid file indices in ascending order and peek at the first five.
available_audio_files = list(available_audio_files)
available_audio_files.sort()
print(available_audio_files[:5])

[0, 1, 2, 3, 4]


In [13]:
# Split the valid audio files into NoWhale and RightWhale.
#
# Labels are looked up through the CSV's own 'idx' column rather than by array
# position, so the split stays correct even if the CSV rows are not stored in
# 0..N-1 order, and a file without a label is reported instead of raising
# IndexError.
# NOTE(review): assumes df_labels['idx'] holds the integer file index that
# appears in the file name — confirm against train.csv.
label_by_idx = dict(zip(df_labels['idx'].to_numpy().tolist(),
                        df_labels['class'].to_numpy().tolist()))

noWhale_paths = []
rightWhale_paths = []
# assign each file index to the list of its label
for idx in available_audio_files:
    label = label_by_idx.get(idx)
    if label == 'NoWhale':
        noWhale_paths.append(idx)
    elif label == 'RightWhale':
        rightWhale_paths.append(idx)
    elif label is None:
        print("Missing label for file index: ", idx)
    else:
        print("Invalid label: ", label)

print(len(noWhale_paths))
print(len(rightWhale_paths))

5467
5467


In [14]:
# Turn the integer file indices into the full paths of the audio files.
# The folder is built once with os.path.join so the separator matches the OS.
# NOTE: this cell overwrites the index lists with path lists, so re-run the
# split cell before running this one a second time.
train_audio_dir = os.path.join('.', 'train', 'train')
noWhale_paths = [os.path.join(train_audio_dir, str(file_idx) + '.wav') for file_idx in noWhale_paths]
rightWhale_paths = [os.path.join(train_audio_dir, str(file_idx) + '.wav') for file_idx in rightWhale_paths]
print(noWhale_paths[:5])
print(rightWhale_paths[:5])

['.\\train\\train\\0.wav', '.\\train\\train\\2.wav', '.\\train\\train\\3.wav', '.\\train\\train\\4.wav', '.\\train\\train\\5.wav']
['.\\train\\train\\1.wav', '.\\train\\train\\6.wav', '.\\train\\train\\9.wav', '.\\train\\train\\10.wav', '.\\train\\train\\14.wav']


## Process data: get images

In [15]:
# Make sure both per-class output folders exist.
# exist_ok=True creates whichever folder is missing (also when the parent
# folder already exists but a class sub-folder does not) and does nothing
# when a folder is already there.
for class_dir in ('noWhale', 'rightWhale'):
    os.makedirs(os.path.join('.', 'train_images_spectrogram', class_dir), exist_ok=True)

In [16]:
# Save one log2-scaled spectrogram image per NoWhale recording.

noWhale_image_dir = os.path.join('.', 'train_images_spectrogram', 'noWhale')
# the transform is created with the same default arguments for every file,
# so build it once instead of once per iteration
to_spectrogram = torchaudio.transforms.Spectrogram()

for j, path in enumerate(noWhale_paths):
    # progress marker every 1000 files (enumerate advances j on its own)
    if j % 1000 == 0:
        print(j)
    # file index = file name without its extension; '\' is normalised to '/'
    # first so the parsing works whichever separator the path was built with
    file_name = os.path.basename(path.replace('\\', '/'))
    idx = int(os.path.splitext(file_name)[0])
    waveform, sr = torchaudio.load(path)

    spectrogram_noWhale = to_spectrogram(waveform)
    spectrogram_path = os.path.join(noWhale_image_dir, str(idx) + '.png')
    # floor the values so an exactly-zero bin cannot become log2(0) = -inf,
    # then scale with log2 and select the first channel
    log_spectrogram = spectrogram_noWhale.clamp(min=1e-10).log2()[0, :, :].numpy()
    plt.imsave(spectrogram_path, log_spectrogram, cmap='viridis')


0
1000
2000
3000
4000
5000


In [17]:

# Save one log2-scaled spectrogram image per RightWhale recording.

rightWhale_image_dir = os.path.join('.', 'train_images_spectrogram', 'rightWhale')
# the transform is created with the same default arguments for every file,
# so build it once instead of once per iteration
to_spectrogram = torchaudio.transforms.Spectrogram()

for j, path in enumerate(rightWhale_paths):
    # progress marker every 1000 files (enumerate advances j on its own)
    if j % 1000 == 0:
        print(j)
    # file index = file name without its extension; '\' is normalised to '/'
    # first so the parsing works whichever separator the path was built with
    file_name = os.path.basename(path.replace('\\', '/'))
    idx = int(os.path.splitext(file_name)[0])
    waveform, sr = torchaudio.load(path)

    spectrogram_rightWhale = to_spectrogram(waveform)
    spectrogram_path = os.path.join(rightWhale_image_dir, str(idx) + '.png')
    # floor the values so an exactly-zero bin cannot become log2(0) = -inf,
    # then scale with log2 and select the first channel
    log_spectrogram = spectrogram_rightWhale.clamp(min=1e-10).log2()[0, :, :].numpy()
    plt.imsave(spectrogram_path, log_spectrogram, cmap='viridis')



0
1000
2000
3000
4000
5000


In [None]:
# save in labeled directories to apply ImageFolder