In [1]:
# Fetch the UrbanSound8K archive (~5.6 GB). `-c` resumes an interrupted
# download and does not re-fetch a file that is already complete, so this cell
# is cheap to re-run.
!wget -c https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz
# Unpack into ./UrbanSound8K/ (metadata/ and audio/ are read from there later).
!tar -xf UrbanSound8K.tar.gz

--2021-08-11 09:44:56--  https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz
Resolving zenodo.org (zenodo.org)... 137.138.76.77
Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6023741708 (5.6G) [application/octet-stream]
Saving to: ‘UrbanSound8K.tar.gz’


2021-08-11 09:54:47 (9.74 MB/s) - ‘UrbanSound8K.tar.gz’ saved [6023741708/6023741708]



In [2]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel rather than whatever `pip` a subshell resolves to.
# The recorded run resolved torchaudio 0.9.0; append `==0.9.0` to reproduce it.
%pip install torchaudio -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torchaudio
  Downloading torchaudio-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 4.8 MB/s 
Installing collected packages: torchaudio
Successfully installed torchaudio-0.9.0


In [3]:
from google.colab import drive
import os
# Mount Google Drive at /content/gdrive; the store_dir configured further down
# lives under this mount point, so this cell must run before the extraction cell.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
import librosa
import argparse
import pandas as pd
import numpy as np
import pickle as pkl 
import torch
import torchaudio
import torchvision
from PIL import Image
from pathlib import Path

'''
parser = argparse.ArgumentParser()
parser.add_argument("--csv_file", type=str)
parser.add_argument("--data_dir", type=str)
parser.add_argument("--store_dir", type=str)
'''

class arguments():
    """Configuration holder that stands in for the disabled argparse namespace.

    Calling ``arguments()`` with no parameters yields the same three paths the
    notebook has always used; each one can be overridden by keyword.

    Attributes:
        csv_file: Path of the UrbanSound8K metadata CSV.
        data_dir: Directory holding the ``fold<N>/`` audio sub-folders. It is
            concatenated with file names, so it must end with a slash.
        store_dir: Prefix under which the pickled features are written. It is
            concatenated with file names, so it must end with a slash.
    """

    def __init__(self,
                 csv_file='/content/UrbanSound8K/metadata/UrbanSound8K.csv',
                 data_dir='/content/UrbanSound8K/audio/',
                 store_dir='/content/gdrive/MyDrive/arg-colab/perceptions/sound-localization/'):
        self.csv_file = csv_file
        self.data_dir = data_dir
        # Local alternative to the Drive folder: store_dir='/content/spectrograms/'
        self.store_dir = store_dir

def extract_spectrogram(values, clip, entries, sr,
                        window_sizes=(25, 50, 100), hop_sizes=(10, 25, 50),
                        n_fft=2205, n_mels=128, out_size=(128, 250), eps=1e-6):
    """Append one multi-resolution log-mel feature record per metadata entry.

    A log-mel spectrogram is computed for every (window, hop) pair, each one is
    resized to ``out_size`` and the results are stacked along a new leading
    axis. The stack depends only on ``clip``, so it is computed once and then
    attached to every entry (the original recomputed it per entry).

    Args:
        values: List that receives the new records; modified in place.
        clip: 1-D waveform samples (array-like); converted to float32.
        entries: Metadata records (dicts) for this clip; each must provide
            ``"classID"``.
        sr: Sampling rate of ``clip`` in Hz.
        window_sizes: STFT window lengths in milliseconds, one per channel.
        hop_sizes: STFT hop lengths in milliseconds, one per channel.
        n_fft: FFT size handed to ``MelSpectrogram``; also the padding
            threshold for short clips.
        n_mels: Number of mel bands.
        out_size: Size passed to ``torchvision.transforms.Resize``.
        eps: Offset added before the logarithm to avoid ``log(0)``.

    Returns:
        None. Each appended record has the keys ``"audio"`` (the possibly
        zero-padded float32 waveform), ``"values"`` (the stacked spectrograms)
        and ``"target"`` (the entry's ``classID``). Records created by one call
        share the same ``"audio"`` and ``"values"`` arrays.

    Raises:
        ValueError: If ``window_sizes`` and ``hop_sizes`` differ in length.
    """
    if len(window_sizes) != len(hop_sizes):
        raise ValueError("window_sizes and hop_sizes must have the same length")
    if not entries:
        # Nothing to label, so skip the spectrogram work entirely.
        return

    # Private float32 copy: keeps the caller's buffer untouched and prevents
    # the zero padding below from promoting the samples to float64.
    audio = np.array(clip, dtype=np.float32)

    # Zero-pad clips that are not longer than n_fft up to n_fft + 1 samples.
    if len(audio) <= n_fft:
        audio = np.pad(audio, (0, n_fft - len(audio) + 1))

    waveform = torch.from_numpy(audio)
    resize = torchvision.transforms.Resize(tuple(out_size))

    specs = []
    for window_ms, hop_ms in zip(window_sizes, hop_sizes):
        # Milliseconds -> samples at the clip's sampling rate.
        window_length = int(round(window_ms*sr/1000))
        hop_length = int(round(hop_ms*sr/1000))

        mel = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_fft=n_fft, win_length=window_length, hop_length=hop_length, n_mels=n_mels)(waveform)
        log_mel = np.log(mel.numpy() + eps)
        # The resize goes through a PIL image, which brings every resolution
        # to a common size so the channels can be stacked.
        specs.append(np.asarray(resize(Image.fromarray(log_mel))))
    stacked = np.array(specs)

    for data in entries:
        values.append({
            "audio": audio,
            "values": stacked,
            "target": data["classID"],
        })

def extract_features(audios, data_dir=None):
    """Load every audio file listed in ``audios`` and compute its features.

    Args:
        audios: DataFrame of metadata rows; must contain the columns
            ``slice_file_name`` and ``fold`` (plus whatever
            ``extract_spectrogram`` reads from each record).
        data_dir: Directory prefix containing the ``fold<N>/`` sub-folders. It
            is concatenated with the file name, so it must end with a slash.
            Defaults to the notebook-level ``args.data_dir``.

    Returns:
        list: The feature records appended by ``extract_spectrogram``, ordered
        by first appearance of each file name in ``audios``.
    """
    if data_dir is None:
        # Preserve the historical behaviour of reading the global config.
        data_dir = args.data_dir

    values = []
    # A single groupby pass replaces the per-file boolean scan of the whole
    # frame (quadratic in the number of rows). sort=False keeps the groups in
    # order of first appearance, matching the previous unique()-based loop.
    for audio, rows in audios.groupby("slice_file_name", sort=False):
        entries = rows.to_dict(orient="records")
        clip, sr = librosa.load("{}fold{}/{}".format(data_dir, entries[0]["fold"], audio)) # librosa resamples to 22050 Hz by default
        extract_spectrogram(values, clip, entries, sr)
        print("Finished audio {}".format(audio))
    return values

if __name__=="__main__":
    # args = parser.parse_args()
    args = arguments()
    audios = pd.read_csv(args.csv_file, skipinitialspace=True)
    num_folds = 10

    # Iterate over every fold. Instead of hand-editing the range to resume an
    # interrupted run, outputs that already exist are skipped, so the cell can
    # simply be re-run after a disconnect.
    # NOTE: every clip is still processed once per fold it belongs to
    # (num_folds - 1 training sets plus one validation set).
    for i in range(1, num_folds+1):
        is_held_out = audios["fold"]==i
        splits = (
            ("training", ~is_held_out),
            ("validation", is_held_out),
        )
        for split_name, mask in splits:
            # store_dir is used as a string prefix, exactly as before.
            target = Path("{}{}128mel{}.pkl".format(args.store_dir, split_name, i))
            if target.exists():
                print("Skipping {} (already exists)".format(target))
                continue

            split_values = extract_features(audios.loc[mask])

            # Write to a temporary name and rename afterwards, so that an
            # interrupted dump never leaves a truncated file that the
            # existence check above would mistake for a finished one.
            target.parent.mkdir(parents=True, exist_ok=True)
            partial = target.with_name(target.name + ".tmp")
            with open(partial, "wb") as handler:
                pkl.dump(split_values, handler, protocol=pkl.HIGHEST_PROTOCOL)
            partial.replace(target)

            # Release this split before building the next one; holding both
            # in memory at once is unnecessary.
            del split_values


[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
Finished audio 179039-9-0-4.wav
Finished audio 179039-9-0-40.wav
Finished audio 179096-3-0-1.wav
Finished audio 179096-3-0-10.wav
Finished audio 179096-3-0-5.wav
Finished audio 179096-3-0-9.wav
Finished audio 17913-4-0-0.wav
Finished audio 17913-4-0-1.wav
Finished audio 17913-4-0-2.wav
Finished audio 17913-4-1-0.wav
Finished audio 17913-4-2-0.wav
Finished audio 179212-4-0-0.wav
Finished audio 179386-3-0-0.wav
Finished audio 179386-3-0-1.wav
Finished audio 179386-3-0-2.wav
Finished audio 179386-3-0-3.wav
Finished audio 179725-4-0-0.wav
Finished audio 17973-2-0-17.wav
Finished audio 17973-2-0-21.wav
Finished audio 17973-2-0-22.wav
Finished audio 17973-2-0-29.wav
Finished audio 17973-2-0-31.wav
Finished audio 17973-2-0-32.wav
Finished audio 17973-2-0-8.wav
Finished audio 17973-2-0-9.wav
Finished audio 179858-1-0-0.wav
Finished audio 179860-1-0-0.wav
Finished audio 179861-1-0-0.wav
Finished audio 179862-1-0-0.wav
Finished audio 179863-1-0-0.wav
Finished a

In [12]:
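# Optional: only needed when store_dir points at the local /content/spectrograms/
# folder instead of Drive. Uncomment to zip the results and copy the archive to Drive.
#!zip -r /content/spectrograms.zip spectrograms
#!cp /content/spectrograms.zip /content/gdrive/MyDrive/arg-colab/perceptions/sound-localization/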

  adding: spectrograms/ (stored 0%)
  adding: spectrograms/validation128mel1.pkl (deflated 9%)
  adding: spectrograms/training128mel1.pkl (deflated 9%)
  adding: spectrograms/.ipynb_checkpoints/ (stored 0%)
