## Unzip, read sound files, convert to image, store in folder

In [3]:
# Run once to unpack the dataset archive (uncomment the next line on the first run)
# !unzip ./data/archive.zip -d ./data/

Archive:  ./data/archive.zip
  inflating: ./data/cats_dogs/cat_1.wav  
  inflating: ./data/cats_dogs/cat_10.wav  
  inflating: ./data/cats_dogs/cat_100.wav  
  inflating: ./data/cats_dogs/cat_101.wav  
  inflating: ./data/cats_dogs/cat_102.wav  
  inflating: ./data/cats_dogs/cat_103.wav  
  inflating: ./data/cats_dogs/cat_105.wav  
  inflating: ./data/cats_dogs/cat_106.wav  
  inflating: ./data/cats_dogs/cat_107.wav  
  inflating: ./data/cats_dogs/cat_108.wav  
  inflating: ./data/cats_dogs/cat_109.wav  
  inflating: ./data/cats_dogs/cat_11.wav  
  inflating: ./data/cats_dogs/cat_110.wav  
  inflating: ./data/cats_dogs/cat_112.wav  
  inflating: ./data/cats_dogs/cat_113.wav  
  inflating: ./data/cats_dogs/cat_114.wav  
  inflating: ./data/cats_dogs/cat_115.wav  
  inflating: ./data/cats_dogs/cat_116.wav  
  inflating: ./data/cats_dogs/cat_117.wav  
  inflating: ./data/cats_dogs/cat_118.wav  
  inflating: ./data/cats_dogs/cat_119.wav  
  inflating: ./data/cats_dogs/cat_12.wav  
  inflat

In [1]:
!pip install librosa

Collecting librosa
  Downloading librosa-0.10.0.post2-py3-none-any.whl (253 kB)
     |████████████████████████████████| 253 kB 1.2 MB/s            
Collecting soxr>=0.3.2
  Downloading soxr-0.3.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
     |████████████████████████████████| 1.3 MB 1.2 MB/s            
Collecting typing-extensions>=4.1.1
  Downloading typing_extensions-4.6.2-py3-none-any.whl (31 kB)
Collecting soundfile>=0.12.1
  Downloading soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl (1.2 MB)
     |████████████████████████████████| 1.2 MB 113.5 MB/s            
Collecting lazy-loader>=0.1
  Downloading lazy_loader-0.2-py3-none-any.whl (8.6 kB)
Collecting audioread>=2.1.9
  Downloading audioread-3.0.0.tar.gz (377 kB)
     |████████████████████████████████| 377 kB 1.2 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: audioread
  Building wheel for audioread (setup.py) ... [?25ldone
[?25h 

In [2]:
# Import Libraries
import os

import librosa
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.image import imread
import matplotlib.image as mpimg

import torch
from PIL import Image
import torchvision.transforms as transforms

In [39]:
# Load the table that lists which wav files belong to each train/test split
df = pd.read_csv(filepath_or_buffer='./data/train_test_split.csv')

In [40]:
# Show the split table; each column lists the wav files assigned to one split/class
df

Unnamed: 0.1,Unnamed: 0,test_cat,test_dog,train_cat,train_dog
0,0,cat_22.wav,dog_barking_97.wav,cat_99.wav,dog_barking_33.wav
1,1,cat_116.wav,dog_barking_0.wav,cat_54.wav,dog_barking_86.wav
2,2,cat_155.wav,dog_barking_93.wav,cat_34.wav,dog_barking_45.wav
3,3,cat_58.wav,dog_barking_10.wav,cat_132.wav,dog_barking_76.wav
4,4,cat_77.wav,dog_barking_26.wav,cat_124.wav,dog_barking_4.wav
...,...,...,...,...,...
110,110,,,cat_15.wav,
111,111,,,cat_88.wav,
112,112,,,cat_73.wav,
113,113,,,cat_32.wav,


In [41]:
# Create the folder tree that will hold the spectrogram images.
# os.makedirs is used instead of `!mkdir -p .../{cat,dog}` because brace
# expansion is a bash feature and is not guaranteed for the shell that `!`
# invokes; the pure-Python form behaves the same on every platform.
for split in ('train', 'test'):
    for label in ('cat', 'dog'):
        os.makedirs(os.path.join('img_dataset', split, label), exist_ok=True)

# Destination of the inference spectrograms written later in the notebook.
os.makedirs(os.path.join('img_dataset', 'inferences'), exist_ok=True)

In [3]:
def create_spectogram(audio_file_name,source_path,save_path):
    """Render one audio file as a spectrogram image and save it as a JPEG.

    The audio is loaded with librosa, transformed with a short-time Fourier
    transform, converted to decibels and drawn without axes, so the saved
    picture contains only the spectrogram itself.

    Parameters
    ----------
    audio_file_name : str
        File name of the recording, e.g. ``'cat_1.wav'``.
    source_path : str
        Directory that contains ``audio_file_name``.
    save_path : str
        Directory the image is written to; it is created if it is missing.
        The image is named after the audio file with a trailing ``.wav``
        replaced by ``.jpg``.

    Raises
    ------
    Whatever ``librosa.load`` raises when the file is missing or cannot be
    decoded; the exception is not swallowed.
    """
    # Imported explicitly: not every librosa release exposes the display
    # submodule through a bare `import librosa`.
    import librosa.display

    samples, sample_rate = librosa.load(os.path.join(source_path, audio_file_name))
    spectrum = librosa.stft(samples)
    spectrum_db = librosa.amplitude_to_db(abs(spectrum))

    # Strip only a trailing '.wav'; str.replace would also remove the
    # substring from the middle of a file name.
    if audio_file_name.endswith('.wav'):
        file_name = audio_file_name[:-len('.wav')]
    else:
        file_name = audio_file_name

    # savefig does not create missing folders, so make sure the target exists.
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    image_path = os.path.join(save_path, file_name + '.jpg')

    fig = plt.figure(figsize=(14, 5))
    try:
        librosa.display.specshow(spectrum_db, sr=sample_rate, y_axis='hz')
        plt.ylabel('')
        plt.axis('off')
        fig.savefig(image_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even when plotting or saving fails, so
        # bulk conversion does not accumulate open figures. Remove this close
        # call temporarily if you want the image displayed inline.
        plt.close(fig)


## Convert one example to an image

In [48]:
# Convert a single wav file into a spectrogram image.
# The keyword names spell out what each argument is: the sound file,
# the directory holding it, and the destination directory for the image.
create_spectogram(
    audio_file_name='cat_1.wav',
    source_path='./data/cats_dogs/train/cat/',
    save_path='./img_dataset/train/cat/',
)

In [49]:
# Load the spectrogram generated for the example clip as a PIL image
image = Image.open(fp='./img_dataset/train/cat/cat_1.jpg')

In [50]:
# Build the preprocessing pipeline: PIL image -> torch tensor -> 256x256 resize.
transform = transforms.Compose(
    [transforms.PILToTensor(), transforms.Resize(size=(256, 256))]
)

# Run the example spectrogram through the pipeline and show the resulting tensor.
img_tensor = transform(image)
print(img_tensor)


tensor([[[ 59,  59,  59,  ...,  59,  59,  59],
         [ 59,  59,  59,  ...,  59,  59,  59],
         [ 59,  59,  59,  ...,  59,  59,  59],
         ...,
         [191, 214, 216,  ..., 210, 206, 187],
         [187, 205, 213,  ..., 179, 197, 206],
         [174, 164, 149,  ..., 184, 164, 190]],

        [[ 76,  76,  76,  ...,  76,  76,  76],
         [ 76,  76,  76,  ...,  76,  76,  76],
         [ 76,  76,  76,  ...,  76,  76,  76],
         ...,
         [210, 214, 218,  ..., 215, 201, 187],
         [217, 217, 223,  ..., 192, 200, 213],
         [212, 181, 162,  ..., 201, 169, 199]],

        [[192, 192, 192,  ..., 192, 192, 192],
         [192, 192, 192,  ..., 192, 192, 192],
         [192, 192, 192,  ..., 192, 192, 192],
         ...,
         [241, 228, 229,  ..., 241, 225, 209],
         [241, 226, 237,  ..., 219, 219, 218],
         [250, 210, 204,  ..., 252, 211, 216]]], dtype=torch.uint8)


## Bulk-convert WAV files to images for cats/dogs and train/test

In [51]:
# Source folders of the raw recordings.
CAT_TRAIN = './data/cats_dogs/train/cat/'
DOG_TRAIN = './data/cats_dogs/train/dog/'
# NOTE(review): the dog test clips are read from test/test and the cat test
# clips from test/cats — confirm this matches the unpacked archive layout.
DOG_TEST = './data/cats_dogs/test/test/'
CAT_TEST = './data/cats_dogs/test/cats/'

# (source folder, image folder) pairs, processed in this order.
conversion_jobs = [
    (CAT_TRAIN, './img_dataset/train/cat/'),
    (DOG_TRAIN, './img_dataset/train/dog/'),
    (DOG_TEST, './img_dataset/test/dog/'),
    (CAT_TEST, './img_dataset/test/cat/'),
]

for source_dir, image_dir in conversion_jobs:
    # sorted() makes the processing order deterministic across runs.
    for sound in sorted(os.listdir(source_dir)):
        # Skip hidden entries (e.g. .ipynb_checkpoints) and sub-folders;
        # they are not audio files and would make the loader fail.
        if sound.startswith('.') or not os.path.isfile(os.path.join(source_dir, sound)):
            continue
        create_spectogram(sound, source_dir, image_dir)

In [5]:
# Folder with the recordings to run inference on, and where their images go.
Inferences = './data/cats_dogs/inferences/'
inference_image_dir = './img_dataset/inferences/'

# Make sure the destination exists before saving: savefig does not create
# missing folders.
os.makedirs(inference_image_dir, exist_ok=True)

for sound in sorted(os.listdir(Inferences)):
    # Skip hidden entries (e.g. .ipynb_checkpoints) and sub-folders;
    # they are not audio files and would make the loader fail.
    if sound.startswith('.') or not os.path.isfile(os.path.join(Inferences, sound)):
        continue
    create_spectogram(sound, Inferences, inference_image_dir)

Note: Illegal Audio-MPEG-Header 0x4c495354 at offset 38626.
Note: Trying to resync...
Note: Hit end of (available) data during resync.


In [53]:
# Collect metadata for the torch Dataset: for every split/class folder, the
# image file names and their relative paths.
image_names_ls = []
file_location = []

# Order matters: the lists are indexed positionally afterwards as
# 0 = test/cat, 1 = test/dog, 2 = train/cat, 3 = train/dog.
for split in ['test', 'train']:
    for label in ['cat', 'dog']:
        folder = f'./img_dataset/{split}/{label}/'
        # List the folder once so names and locations are guaranteed to line up.
        images = os.listdir(folder)
        image_names_ls.append(images)
        file_location.append([f'{folder}{img}' for img in images])
    

In [30]:
def _labelled_frame(cat_idx, dog_idx):
    """Combine the cat and dog listings at the given positions into one labelled table.

    The cat rows come first, followed by the dog rows; `target` holds the
    class name for each row.
    """
    cat_paths = file_location[cat_idx]
    dog_paths = file_location[dog_idx]
    return pd.DataFrame({
        'image_name': image_names_ls[cat_idx] + image_names_ls[dog_idx],
        'image_location': cat_paths + dog_paths,
        'target': ['cat'] * len(cat_paths) + ['dog'] * len(dog_paths),
    })


# Positions 0/1 hold the test cat/dog listings, 2/3 the train cat/dog listings.
test_set = _labelled_frame(0, 1)
train_set = _labelled_frame(2, 3)


In [23]:
# Preview the first rows of the test metadata table
test_set.head()

Unnamed: 0,image_name,image_location
0,cat_28.jpg,./img_dataset/test/cat/cat_28.jpg
1,cat_82.jpg,./img_dataset/test/cat/cat_82.jpg
2,cat_55.jpg,./img_dataset/test/cat/cat_55.jpg
3,cat_110.jpg,./img_dataset/test/cat/cat_110.jpg
4,cat_20.jpg,./img_dataset/test/cat/cat_20.jpg


In [31]:
# Persist the metadata tables next to the images they describe.
# index=True is the pandas default, spelled out here because it means the row
# index is written as an extra, unnamed first column of each CSV file.
test_set.to_csv(path_or_buf='./img_dataset/test/test.csv', index=True)
train_set.to_csv(path_or_buf='./img_dataset/train/train.csv', index=True)

# Research/References
1. Data Source: https://www.kaggle.com/datasets/mmoreaux/audio-cats-and-dogs?select=cats_dogs
2. Samples
    - https://www.kaggle.com/code/thanht02/audio-classification-cnn-864d1f
    - https://www.kaggle.com/code/kanncaa1/pytorch-tutorial-for-deep-learning-lovers
3. Papers
    - 
4. Data Loader
    - https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
    - https://www.kaggle.com/code/pinocookie/pytorch-dataset-and-dataloader
5. Image Processing
    - https://www.geeksforgeeks.org/converting-an-image-to-a-torch-tensor-in-python/
    - https://www.tutorialspoint.com/pytorch-how-to-resize-an-image-to-a-given-size