

---

#### Note: Throughout the thesis code/notebooks, code cells are edited, the desired parameters entered, and the cells re-run in order to reproduce the different results and methods. Code is commented in and out when different variables etc. are needed; this avoids having lots of repeated code clogging up the notebooks. Output from cells is not always maintained.


---





---
# Imports & Installations
---




In [3]:
# Install the third-party packages that the import cell of this notebook relies on.
!pip install tensorflow-io
!pip install pydub
!pip install wget 
!pip install audiomentations
!pip install mutagen

Collecting tensorflow-io
  Downloading tensorflow_io-0.25.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.4 MB)
[K     |████████████████████████████████| 23.4 MB 1.4 MB/s 
[?25hCollecting tensorflow-io-gcs-filesystem==0.25.0
  Downloading tensorflow_io_gcs_filesystem-0.25.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 81.0 MB/s 
[?25hInstalling collected packages: tensorflow-io-gcs-filesystem, tensorflow-io
  Attempting uninstall: tensorflow-io-gcs-filesystem
    Found existing installation: tensorflow-io-gcs-filesystem 0.24.0
    Uninstalling tensorflow-io-gcs-filesystem-0.24.0:
      Successfully uninstalled tensorflow-io-gcs-filesystem-0.24.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.8.0 requires tf-estimator-nightly==2.8.0.dev2021122109, which 

In [4]:
from google.colab import drive 
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
import tensorflow_hub as hub
import pydub
import json
import tensorflow_io as tfio
import wget
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
import soundfile as sf
from mutagen.mp3 import MP3

In [None]:
# Mount Google Drive under /content/gdrive; the data paths used later in this
# notebook all start with gdrive/MyDrive/thesis.
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# List the working directory to check that the gdrive/ mount point is present.
!ls

gdrive	sample_data




---

# Functions


---



In [2]:
def extract_snippet(f, normalized=False, frame_rate=16000, snippet_seconds=150,
                    out_path='new_file.wav'):
    """
      Cut a two-part snippet out of a podcast episode and write it as a wav.

      The episode is read with pydub, resampled to `frame_rate` and set to
      mono. The snippet is the leading `snippet_seconds` seconds followed by
      `snippet_seconds` seconds starting at the midpoint of the episode.
      No tensor is produced here; use convert_to_tensor on the exported file.

      Input:
        f: path to the podcast audio episode (mp3)
        normalized: unused; kept so existing calls keep working
        frame_rate: sample rate in Hz of the exported snippet
        snippet_seconds: length in seconds of each of the two slices
        out_path: path the wav snippet is written to

      Returns: `out_path` on success, None if the episode could not be
      processed (the error is printed rather than raised)
    """
    try:
        # Preprocess with pydub
        episode = pydub.AudioSegment.from_mp3(f)
        episode = episode.set_frame_rate(frame_rate).set_channels(1)

        # pydub segments are sliced in milliseconds
        span_ms = int(snippet_seconds * 1000)
        midpoint = len(episode) // 2
        leading = episode[:span_ms]
        middle = episode[midpoint:midpoint + span_ms]

        # Export audio sample for preprocessing. export() hands back the open
        # output file, so close it explicitly instead of leaving it to the GC.
        (leading + middle).export(out_path, format='wav').close()
        return out_path

    except Exception as e:
        # Best effort: one unreadable episode must not stop a long batch run
        print(e)
        return None

def convert_to_tensor(file, normalized=True):
    """
      Load an audio file as a mono, 16 kHz, float32 tensor.

      Input:
        file: path to an audio snippet readable by pydub
        normalized: if True, scale the integer PCM samples by the full-scale
          value of the file's sample width (2**15 for 16-bit audio) so they
          lie in [-1, 1); if False, keep the raw sample values

      Returns: 1-D tf.float32 tensor of audio samples
    """

    segment = pydub.AudioSegment.from_file(file)
    segment = segment.set_frame_rate(16000)
    segment = segment.set_channels(1)

    samples = np.array(segment.get_array_of_samples())

    if normalized:
        # sample_width is in bytes; signed PCM full scale is 2**(bits - 1).
        # Deriving it from the file keeps non-16-bit inputs inside [-1, 1).
        full_scale = 2 ** (8 * segment.sample_width - 1)
        return tf.convert_to_tensor(np.float32(samples) / full_scale, dtype=tf.float32)

    return tf.convert_to_tensor(samples, dtype=tf.float32)


def create_episode_map(file):
  """
    Create a map from episode IDs -> (url, title).

    The popularity train/test sets don't have the URLs, so this map is built
    from the full podcast_episodes_sb dataset and used to look up the URL of a
    given episode.

    Input: path to a JSON-lines file; each record must have the keys
    'id', 'url' and 'title'

    Returns: dict mapping every id in `file` to its (url, title) tuple
  """

  episode_map = {}

  # The context manager closes the file even if a line fails to parse
  with open(file, 'r') as fp:
    for line in fp:
      # Tolerate blank lines (e.g. a trailing newline at the end of the file)
      if not line.strip():
        continue
      record = json.loads(line)
      episode_map[record['id']] = (record['url'], record['title'])

  return episode_map

def augment_sample(file, augment_method):
  """
    Augment an audio file with audiomentations.

    Input:
      file: path to the audio file to augment
      augment_method: callable invoked as augment_method(samples,
        sample_rate=...) that returns the augmented samples

    Returns: None. The augmented version is written to 'augmented.wav' at the
    sample rate of the input file, for future use.
  """

  # sr=None keeps the file's native sample rate; by default librosa resamples
  # to 22050 Hz, which would not match a hard-coded 16 kHz augmentation rate.
  signal, sr = librosa.load(file, sr=None)

  # Use the true rate of `signal` for both the augmentation and the writer
  augmented_signal = augment_method(signal, sample_rate=sr)
  sf.write('augmented.wav', augmented_signal, sr)


def mutagen_length(path):
  """
    Input: path to mp3 podcast episode
    Returns: length of mp3 in seconds, or None if the file could not be read
  """

  try:
      return MP3(path).info.length
  except Exception:
      # Best effort: an unreadable file yields None. Catching Exception rather
      # than everything lets KeyboardInterrupt / SystemExit stop a long run.
      return None



---

# Load pretrained TRILLsson model from TensorFlow Hub


---



In [None]:
# Load the TRILLsson model from TensorFlow Hub; the extraction loop later in
# this notebook calls it as module(audio)['embedding'].
module = hub.load('https://tfhub.dev/google/trillsson1/1')



---

# Iteratively push data through pretrained model

1. Extract TRILLsson (distilled CAP12) embeddings
2. Export the features for future use

---




In [None]:
import json
import wget
import os
import socket
import shutil
# import os.path
from os import path

# Drive folders, relative to the working directory that contains gdrive/
PODCAST_DATA_DIR = 'gdrive/MyDrive/thesis/podcast_data'
TRAIN_AUDIO_DIR = 'gdrive/MyDrive/thesis/pop_train_files'

# Scratch files written by extract_snippet / augment_sample
SNIPPET_WAV = 'new_file.wav'
AUGMENTED_WAV = 'augmented.wav'

# Memory is too low to embed a whole snippet at once, so it is split in chunks
N_CHUNKS = 3

# Read in map of IDs -> urls (links as urls are not in train/test sets)
data_map = create_episode_map(f'{PODCAST_DATA_DIR}/podcast_episodes_sb.json')

# Read in train/test (JSON-lines files); `with` closes the handles
with open(f'{PODCAST_DATA_DIR}/popularity_train.json', 'r') as fp:
  popularity_train = [json.loads(line) for line in fp]
with open(f'{PODCAST_DATA_DIR}/popularity_test.json', 'r') as fp:
  popularity_test = [json.loads(line) for line in fp]

# Empty dataset to extend: one row per (episode, chunk)
dataset = {
    "id": [],
    "offset": [],
    "label": [],
    "trill_embedding": []
    # "length": []
}

# Pure audio augmentation pipeline; only applied when `augment` is True
augment_pipeline = Compose([
  AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
  TimeStretch(min_rate=0.8, max_rate=2.00, p=0.5),
  PitchShift(min_semitones=-4, max_semitones=4, p=0.5)
])

# Toggle: set to True to embed the augmented version of every snippet
augment = False
socket.setdefaulttimeout(120)

for index, episode in enumerate(popularity_train):
  episode_id = episode['id']

  # if the episode exists in the sampled subset of annotated samples
  if episode_id in data_map:
    url = data_map[episode_id][0]
    downloaded = None
    try:
      label = episode['label']

      # If saved on Drive use that copy, otherwise download and cache it
      cached = f"{TRAIN_AUDIO_DIR}/{episode_id}"
      if path.exists(cached):
        audio_file = cached
      else:
        downloaded = wget.download(url)
        shutil.copyfile(downloaded, cached)
        audio_file = downloaded

      # length feature
      # length = mutagen_length(cached)

      extract_snippet(audio_file)

      # NOTE: Audio should be floats in [-1, 1], sampled at 16kHz. Model input is of
      # the shape [batch size, time].
      if augment:
        augment_sample(SNIPPET_WAV, augment_pipeline)
        audio_tensor = convert_to_tensor(AUGMENTED_WAV)
      else:
        audio_tensor = convert_to_tensor(SNIPPET_WAV)

      # tf.split needs a length divisible by N_CHUNKS, so drop the (at most
      # N_CHUNKS - 1) trailing samples that do not fit.
      usable = (audio_tensor.shape[0] // N_CHUNKS) * N_CHUNKS
      if usable == 0:
        raise ValueError('snippet contains no audio samples')
      chunks = tf.split(audio_tensor[:usable], num_or_size_splits=N_CHUNKS)

      # Models internally aggregate over time. For a time-series of embeddings, the
      # user can frame audio however they want.
      # Offset keeps track of which chunk an embedding came from.
      embeddings = []
      for chunk in chunks:
        # Add the batch dimension: [time] -> [1, time]
        embedding = module(tf.expand_dims(chunk, axis=0))['embedding']
        embedding.shape.assert_is_compatible_with([None, 1024])
        embeddings.append(embedding.numpy().T.tolist())

      # Append only after every chunk succeeded, so a failure half-way through
      # never leaves a partial episode in the dataset.
      for offset, embedding_values in enumerate(embeddings):
        dataset['id'].append(episode_id)
        dataset['offset'].append(offset)
        # dataset['length'].append(length)
        dataset['label'].append(label)
        dataset['trill_embedding'].append(embedding_values)

    except Exception as e:
      print(index, e)

    finally:
      # Always remove scratch files, also after a failure; otherwise a stale
      # snippet could be picked up for the next episode.
      for leftover in (downloaded, SNIPPET_WAV, AUGMENTED_WAV):
        if leftover and path.exists(leftover):
          os.remove(leftover)

  print(index + 1)

# with open('gdrive/MyDrive/thesis/podcast_data/pop_train_lengths.json', 'w') as fp:
#   json.dump(dataset, fp)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
local
622b50e5-1bad-4c84-80e9-b7ca01d54c30
duration sec: 2000.0233
duration min: 33:20
2404
2404 <urlopen error [Errno -5] No address associated with hostname>
2405
2405 HTTP Error 404: Not Found
2406
2406 HTTP Error 404: Not Found
2407
2407 HTTP Error 410: Gone
2408
2408 HTTP Error 403: Forbidden
2409
local
265a01ec-3eba-4024-8862-2bf7e6b63af1
duration sec: 1024.1780625
duration min: 17:4
2410
2410 HTTP Error 404: Not Found
2411
local
5b99cea4-69d9-49f3-a8ee-690476f4b230
duration sec: 6849.044875
duration min: 114:9
2412
2412 HTTP Error 504: Gateway Timeout
2413
local
c7e4cf81-0a3d-438a-8a90-1d4880be3c75
duration sec: 807.399
duration min: 13:27
2414
local
791e2023-fe23-42c1-ad26-8f037b356e25
duration sec: 1521.7894
duration min: 25:21
2415
local
34fc3113-e0be-4883-ba34-31b798643939
duration sec: 1008.4966666666667
duration min: 16:48
2416
2416 HTTP Error 410: Gone
2417
2417 HTTP Error 404: Not Found
2418
2418 HTTP Error