In [16]:
from io import BytesIO
from google.cloud import storage
import librosa as li
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

In [10]:
# Name of the Google Cloud Storage bucket this notebook works on.
BUCKET_NAME = "speech-emotion-bucket"

In [11]:
# Object-name prefix (the "folder" inside the bucket) this notebook works on.
# It is passed as the `prefix` filter when the bucket's blobs are listed.
PREFIX = "Raw/"

In [20]:
# Create the client object used for every interaction with Google Cloud Storage.
# No project or credentials are passed explicitly, so the library's defaults apply
# (presumably the credentials configured in the runtime environment - TODO confirm).
client = storage.Client()

In [21]:
# List every blob (file) in the bucket whose name starts with PREFIX.
# list_blobs() returns a lazy, single-pass iterator; materialise it into a list
# so that cells looping over `blobs` can be re-run without re-running this cell.
blobs = list(client.list_blobs(BUCKET_NAME, prefix=PREFIX))

In [22]:
# Convert every .wav blob into a 64x64 mel-spectrogram JPEG and upload it to the
# same bucket under OUTPUT_PREFIX, mirroring the original folder layout.

# Destination "folder" (object-name prefix) for the generated images.
OUTPUT_PREFIX = "Spectrograms_64_p/"
# Final image size in pixels (width, height).
IMAGE_SIZE = (64, 64)

# The destination bucket is the same for every file, so look it up only once.
bucket = client.bucket(BUCKET_NAME)

# Iterate through each file found in the bucket.
for blob in blobs:
    # Only audio files are processed; anything else is ignored.
    if not blob.name.endswith(".wav"):
        continue

    # Download the audio file into memory and wrap it in a file-like object.
    # NOTE(review): raw_download=True is kept from the original; per the client
    # library it skips transport-level decoding - confirm the blobs are not
    # stored with a content encoding such as gzip.
    bytes_data = blob.download_as_bytes(raw_download=True)
    binary_data = BytesIO(bytes_data)

    # Load the audio with librosa.
    # y is a NumPy array with the waveform data.
    # sr is the file's native sample rate because sr=None disables resampling
    # (presumably 16 kHz for this dataset - TODO confirm).
    y, sr = li.load(binary_data, sr=None)

    # Scale the waveform so its maximum absolute value is 1.
    # An empty or completely silent file has no usable peak: dividing by it would
    # raise or fill the signal with NaNs, so such files are skipped instead.
    peak = np.abs(y).max() if y.size else 0.0
    if not (0 < peak < np.inf):
        print(f"Skipping {blob.name}: empty or silent audio")
        continue
    y = y / peak

    # Remove leading and trailing silence.
    # Silence is defined by a decibel threshold (top_db); the librosa default
    # (60 dB below the peak) is used here.
    y, _ = li.effects.trim(y)

    # Compute the mel spectrogram.
    # It is a 2D array (rows = frequency; columns = time; values = intensity).
    S = li.feature.melspectrogram(y=y, sr=sr)

    # Convert the spectrogram from raw power to the decibel (log) scale,
    # measured relative to its own maximum.
    S_dB = li.power_to_db(S, ref=np.max)

    # Render the spectrogram to an in-memory image.
    # The figure is 4x4 inches at 100 dpi (400x400 px before tight cropping);
    # the reduction to IMAGE_SIZE happens afterwards with PIL.
    fig, ax = plt.subplots(figsize=(4, 4), dpi=100)
    try:
        # origin="lower" puts low frequencies at the bottom of the image.
        ax.imshow(S_dB, cmap="magma", origin="lower", aspect="auto")
        # Remove axes, ticks and labels so only the spectrogram is saved.
        ax.axis("off")
        # In-memory buffer that receives the rendered image.
        buf = BytesIO()
        # Save the image with no padding at 100 dpi.
        fig.savefig(buf, format="jpg", bbox_inches="tight", pad_inches=0, dpi=100)
    finally:
        # Always close this specific figure, even if rendering fails, so figures
        # do not accumulate in memory across iterations.
        plt.close(fig)
    # savefig leaves the stream position at the end of the data; rewind so that
    # PIL reads the image from the beginning.
    buf.seek(0)

    # Resize the image to IMAGE_SIZE in RGB using the LANCZOS filter.
    # NOTE(review): the image is JPEG-encoded twice (here and above), which is
    # lossy both times; a lossless intermediate format could be considered.
    image = Image.open(buf).convert("RGB")
    image = image.resize(IMAGE_SIZE, resample=Image.Resampling.LANCZOS)

    # Save the resized image into a new buffer for the cloud upload.
    resized_buf = BytesIO()
    image.save(resized_buf, format="JPEG")  # Quality tweakable
    resized_buf.seek(0)

    # Build the destination object name: swap only the leading PREFIX for
    # OUTPUT_PREFIX and only the trailing ".wav" for ".jpg", so that identical
    # substrings elsewhere in the name are left untouched.
    relative_name = blob.name
    if relative_name.startswith(PREFIX):
        relative_name = relative_name[len(PREFIX):]
    output_path = OUTPUT_PREFIX + relative_name[: -len(".wav")] + ".jpg"

    # Create a new blob (a new file) for the image and upload it, declaring the
    # content type so it is served as an image.
    image_blob = bucket.blob(output_path)
    image_blob.upload_from_file(resized_buf, content_type="image/jpeg")