

---

#### Note: Throughout the thesis code/notebooks, to reproduce different results and methods, code cells are edited, the desired parameters are entered, and the cells are re-run. Code is commented out and back in at times when we want to use different variables etc.; this saves having lots of repeated code clogging up the notebooks. Output from cells is not always maintained.


---





---

# Installations & Imports


---



In [None]:
!pip install tensorflow-io
!pip install pydub
!pip install wget

Collecting tensorflow-io
  Downloading tensorflow_io-0.24.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.4 MB)
[K     |████████████████████████████████| 23.4 MB 1.2 MB/s 
Installing collected packages: tensorflow-io
Successfully installed tensorflow-io-0.24.0
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=d9ebeae6a887935b6a90b70e968daafa873e4317dc392526b97b7b53034d7c74
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
from google.colab import drive
import numpy as np
import pandas as pd
import librosa
import json
import os 
import wget
import shutil

from pydub import AudioSegment
from pydub.playback import play

In [None]:
# Mount Google Drive at /content/gdrive so the thesis data is reachable.
# NOTE(review): the data paths used further down are relative
# ('gdrive/MyDrive/...'), so they only resolve to this mount when the working
# directory is /content - confirm before running outside Colab.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive




---

# Functions


---



In [None]:
def create_audio_sample(file, snippet_size=120, out_path='new_file.wav'):
  """
    Build a short mono sample from a full podcast episode and export it as wav.

    The sample is the leading `snippet_size` seconds of the episode followed
    by `snippet_size` seconds starting at the episode's midpoint.

    Input:
      file: path to the full podcast episode (mp3)
      snippet_size: length, in seconds, of each of the two snippets
      out_path: destination of the wav sample. Defaults to 'new_file.wav' in
        the working directory, so existing callers keep their behaviour.
    Output: None - the stitched sample is written to `out_path`

    Note: when the episode is shorter than 2 * snippet_size seconds the
    midpoint falls inside the leading snippet, so the two snippets overlap
    and part of the audio appears twice in the sample.
  """

  audio = AudioSegment.from_mp3(file)
  audio = audio.set_channels(1)

  # pydub does things in milliseconds
  snippet_ms = snippet_size * 1000

  # Take the leading snippet and a snippet from the middle of the podcast
  start = audio[:snippet_ms]
  midpoint = len(audio) // 2
  middle = audio[midpoint : midpoint + snippet_ms]

  # Export audio sample for preprocessing
  sample = start + middle
  sample.export(out_path, format="wav")


def extract_features(file):
  """
    Load an audio snippet and compute the baseline audio features via librosa.

    Input: path to a wav audio snippet
    Output: tuple (mfcc, zcr, spec_centroid) of numpy arrays, all computed
      with librosa's default parameters (the recorded run of this notebook
      shows an mfcc of shape (20, n_frames)).

    The audio and sample rate are passed by keyword (y=..., sr=...):
    recent librosa releases reject them as positional arguments, while the
    keyword form is accepted by old and new releases alike.
  """

  signal, sr = librosa.load(file)

  # Baseline audio features - MFCC, zero crossing rate, spectral centroid
  mfcc = librosa.feature.mfcc(y=signal, sr=sr)
  zcr = librosa.feature.zero_crossing_rate(y=signal)
  spec_centroid = librosa.feature.spectral_centroid(y=signal, sr=sr)

  return mfcc, zcr, spec_centroid


def create_episode_map(file):
  """
    Create a map from episode IDs -> (url, title).

    The popularity train/test sets don't have the URLs. We use the map to
    get the url (and title) for a given episode from the full set in the
    podcast_episodes_sb dataset.

    Input: path to a JSON-lines file, one JSON object per line, each with
      the keys 'id', 'url' and 'title'. Blank lines are skipped.
    Returns: a dict {id: (url, title)} with one entry for every ID found in
      `file`. If an ID occurs more than once, the last occurrence wins.
    Raises: KeyError if a record lacks one of the three keys, and
      json.JSONDecodeError if a non-blank line is not valid JSON.
  """

  episode_map = {}

  # Stream the file line by line and make sure the handle is closed again
  with open(file, 'r', encoding='utf-8') as fp:
    for line in fp:
      if not line.strip():
        continue
      episode = json.loads(line)
      episode_map[episode['id']] = (episode['url'], episode['title'])

  return episode_map





---

# Feature Extraction: 
1. Iteratively process podcast episodes
2. Extract baseline traditional features
3. Output dataset

The first run is time consuming, since every episode has to be downloaded and its sample extracted. Files are saved to Google Drive and the features are dumped into JSON files for future use.

### This is executed in batches, by slicing the episode list in the for loop, because of Colab's time limits and other constraints.
---



In [None]:


def _load_json_lines(path):
  """Read a JSON-lines file into a list of dicts, skipping blank lines."""
  with open(path, 'r', encoding='utf-8') as fp:
    return [json.loads(line) for line in fp if line.strip()]


# Map to store ID -> metadata relationships
data_map = create_episode_map('gdrive/MyDrive/thesis/podcast_data/podcast_episodes_sb.json')

# Labelled train/test subset of podcast episodes from Yang et al
# We will be making our own train/test sets once all features are extracted
popularity_train = _load_json_lines('gdrive/MyDrive/thesis/podcast_data/popularity_train.json')
popularity_test = _load_json_lines('gdrive/MyDrive/thesis/podcast_data/popularity_test.json')

# A dict to curate the output dataset with extracted features.
# Every list gets exactly one entry per successfully processed episode, so
# the columns always stay aligned.
dataset = {
    "id" : [],
    "title": [],
    "mfcc": [],
    "zcr": [],
    "spec_centroid": [],
    "label": []
}

# Debug count: position of the current episode inside the batch
x = 0

# Process one batch at a time by editing the slice (colab time limits)
for entry in popularity_test[20:21]:
  episode_id = entry['id']

  # Only episodes that exist in the full metadata set have a URL to download;
  # skip the others instead of aborting the whole batch with a KeyError.
  if episode_id not in data_map:
    print(x, 'episode id not found in data_map', episode_id)
    x += 1
    continue

  url, title = data_map[episode_id]
  file = None
  try:
    print(url)
    file = wget.download(url)
    lab = entry['label']

    # Convert file to wav and create a sample for extracting features
    # Literature takes leading minutes
    create_audio_sample(file, snippet_size = 150)

    # Extract features from created audio sample
    mfcc, zcr, spec_centroid = extract_features('new_file.wav')

    print(mfcc.shape)

    # Convert everything first so a failure here cannot leave the dataset
    # with columns of different lengths.
    # NOTE(review): unlike mfcc and zcr, spec_centroid is stored without
    # transposing; kept as-is because previously dumped batches use this
    # layout - confirm it is intended.
    mfcc_rows = mfcc.T.tolist()
    zcr_rows = zcr.T.tolist()
    spec_centroid_rows = spec_centroid.tolist()

    # Add to dataset
    dataset['id'].append(episode_id)
    dataset['title'].append(title)
    dataset['label'].append(lab)
    dataset['mfcc'].append(mfcc_rows)
    dataset['zcr'].append(zcr_rows)
    dataset['spec_centroid'].append(spec_centroid_rows)
    # print(x)

  except Exception as e:
    # Best effort: report the failing episode and carry on with the batch
    print(x, e, url)

  finally:
    # Clean up, also after a failure, so leftovers don't pile up on disk.
    # os.remove is used instead of a shell 'rm' because the downloaded
    # filename comes from the URL and may contain spaces or shell characters.
    for leftover in (file, 'new_file.wav'):
      if leftover and os.path.exists(leftover):
        os.remove(leftover)
    x += 1


# # Dump dataset into json file
# with open('gdrive/MyDrive/thesis/podcast_data/pop_test/popularity_test_features_1400_2400.json', 'w') as fp:
#     json.dump(dataset, fp)

https://media.acast.com/ctrlaltdelete/-87matthaig-twitter-timeandmentalhealth/media.mp3
(20, 12920)
