

---

#### Note: Throughout the thesis code/notebooks, code cells are edited, the desired parameters entered, and the cells re-run in order to reproduce different results and methods. Code is commented in and out when different variables etc. are needed; this avoids having lots of repeated code clogging up the notebooks. Output from cells is not always maintained.


---

# This is a helper script for reading in chunks of extracted features and aggregating them into a final, clean dataset

# Installations & Imports

In [None]:
# Install the extra third-party packages into the notebook runtime.
# NOTE(review): none of these packages is imported in the cells shown in this
# notebook - confirm they are still needed here.
!pip install tensorflow-io
!pip install pydub
!pip install wget
!pip install imbalanced-learn

Collecting tensorflow-io
  Downloading tensorflow_io-0.24.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.4 MB)
[K     |████████████████████████████████| 23.4 MB 1.2 MB/s 
Installing collected packages: tensorflow-io
Successfully installed tensorflow-io-0.24.0
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=99f075ba121f59a2ff19f55e285096752e5f585e6ec145856c90e5f771ecb23a
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
from google.colab import drive
import numpy as np
import pandas as pd
import librosa
import json
import os 


In [None]:
# Mount Google Drive under /content/gdrive so the feature files can be read.
# NOTE(review): the cells below use relative 'gdrive/MyDrive/...' paths, which
# presumably rely on the working directory being /content - confirm.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive




---

# Functions


---


In [None]:
def format_data(df):
  """
    Reduce the raw per-row features of a chunk to compact summary features

    The dataframe is modified in place and also returned:
      - 'mfcc' becomes the mean along axis 1 of its transposed array
      - 'zcr' becomes a flattened 1-D array
      - 'zcr_min', 'zcr_mean' and 'zcr_max' are added, computed from the
        flattened 'zcr' values
  """

  def _mfcc_mean(raw):
    # Transpose first, then average along axis 1 of the transposed array
    transposed = np.array(raw).T
    return np.mean(transposed, axis=1)

  def _flatten(raw):
    return np.array(raw).flatten()

  df['mfcc'] = df['mfcc'].apply(_mfcc_mean)
  df['zcr'] = df['zcr'].apply(_flatten)

  # Summary statistics are taken from the already-flattened zcr values
  flat_zcr = df['zcr']
  df['zcr_min'] = flat_zcr.apply(lambda values: min(values))
  df['zcr_mean'] = flat_zcr.apply(lambda values: np.mean(values))
  df['zcr_max'] = flat_zcr.apply(lambda values: max(values))

  return df
  
def read_baseline_data(train_filename, test_filename, full=True):

  """
    Read the baseline feature files in chunks and combine them into one dataframe

    Input:
      train_filename, test_filename: paths to line-delimited JSON feature files
      full: only True is supported

    Returns: dataframe holding the formatted train chunks followed by the
      formatted test chunks (row indices are kept as read, they are not reset)

    Raises: NotImplementedError if full is False
  """

  if not full:
    # No partial mode exists; without this guard the function would reach
    # its return statement with final_df never assigned
    raise NotImplementedError("read_baseline_data only supports full=True")

  # Read in test and train sets of baseline features
  # Note chunk size reads in x amount of rows from the json files to deal with low RAM
  chunk_size = 100
  readers = [
    pd.read_json(train_filename, lines=True, chunksize=chunk_size),
    pd.read_json(test_filename, lines=True, chunksize=chunk_size),
  ]
  # Optional third source (un-comment to include the re-processed missing IDs)
  # readers.append(pd.read_json('gdrive/MyDrive/thesis/podcast_data/baseline_test_missing_ids_rec.json', lines=True, chunksize=chunk_size))

  # Process each chunk on its own and keep only the reduced version, then
  # concatenate once at the end instead of re-copying a growing dataframe
  # for every chunk
  frames = []
  total_rows = 0
  seen_columns = {}
  for reader in readers:
    for chunk in reader:
      chunk = format_data(chunk)
      frames.append(chunk)

      # Progress output: the shape the combined dataframe has so far
      total_rows += len(chunk)
      seen_columns.update(dict.fromkeys(chunk.columns))
      print((total_rows, len(seen_columns)))

  # pd.concat rejects an empty list, so fall back to an empty dataframe
  final_df = pd.concat(frames) if frames else pd.DataFrame()

  # Export full clean dataset
  # result = final_df.to_json(orient="records")

  # with open('gdrive/MyDrive/thesis/podcast_data/pop_test/baseline_clean_dataset.json', 'w') as fp:
  #   json.dump(result, fp)

  return final_df

def mean_lst(sr):
    """
      Position-wise mean of a Series whose values are list-like

      Every value is expanded into one row of a dataframe (one column per
      position) and the per-column means are returned as a Series
    """

    expanded = sr.apply(pd.Series)
    column_means = expanded.mean()

    return column_means

def merge_rows(df, method='concat'):
  """
    Merges rows with same ID, to form the master embedding across several snippets

    Input:
      df: dataframe with 'id', 'trill_embedding' and 'label' columns
          (the dataframe passed in is not modified)
      method: 'concat' joins the embeddings of an ID via a sum over the
          group followed by a flatten; any other value flattens each
          embedding and averages them position-wise with mean_lst

    Returns: dataframe grouped by id with columns 'id', 'trill_embedding' and
      'label', where 'id' and 'label' are taken from the first row of each group
  """

  if method == 'concat':
    # Group by ID and merge lists via sum operation
    # (assumes the embeddings are Python lists, so that summing them
    # concatenates - TODO confirm against the feature files)
    aggregation_functions = {'id': 'first', 'trill_embedding': 'sum', 'label': 'first'}
    grouped_df = df.groupby(df['id']).agg(aggregation_functions)
    grouped_df['trill_embedding'] = grouped_df['trill_embedding'].apply(lambda x: np.array(x).flatten())
  else:
    # Flatten into a new dataframe so that the caller's 'trill_embedding'
    # column is left exactly as it was passed in
    flat_df = df.assign(trill_embedding=df['trill_embedding'].apply(lambda x: np.array(x).flatten()))
    aggregation_functions = {'id': 'first', 'trill_embedding': mean_lst, 'label': 'first'}
    grouped_df = flat_df.groupby(flat_df['id']).agg(aggregation_functions)

  return grouped_df

def read_concat_pretrained_data(train, test):
  """
    Input: pretrained features in train/test sets (paths to line-delimited JSON files)

    Returns: cleaned dataset, concat features from same episode ID (due to gpu memory).
      Train rows come first, then test rows, under a fresh 0..n-1 index
  """

  frames = []
  for path in (train, test):
    # One 'concat' pass already leaves a single row per ID; running the same
    # aggregation again would only re-group single-row groups
    frames.append(merge_rows(pd.read_json(path, lines=True), method='concat'))

  df = pd.concat(frames)

  # merge_rows returns dataframes indexed by id; replace that with a plain index
  df.reset_index(drop=True, inplace=True)

  return df

def read_mean_pretrained_data(train, test):
  """
    Input: pretrained features in train/test sets (paths to line-delimited JSON files)

    Returns: cleaned dataset, mean of features from same episode ID (due to gpu memory).
      Train rows come first, then test rows, under a fresh 0..n-1 index
  """

  frames = []
  for path in (train, test):
    # Average straight from the raw rows. Collapsing each ID with the default
    # 'concat' method first leaves one row per ID, and the mean over a single
    # row is just that row - the averaging would then have no effect
    frames.append(merge_rows(pd.read_json(path, lines=True), method='merge'))

  df = pd.concat(frames)

  # merge_rows returns dataframes indexed by id; replace that with a plain index
  df.reset_index(drop=True, inplace=True)

  return df



---

# Read in Extracted Baseline and Pretrained feature datasets
1. Preprocess in batches (a one-off step) because of limited RAM; the whole feature dataset can't be read in one go
2. Export a cleaned, smaller version for future use in training/evaluation


---





---



---



---



In [None]:
# Build base_df from the baseline popularity train/test feature files
# (read and formatted chunk by chunk by read_baseline_data)
train_filename = 'gdrive/MyDrive/thesis/podcast_data/baseline_popularity_train_features_rec.json'
test_filename = 'gdrive/MyDrive/thesis/podcast_data/baseline_popularity_test_features_rec.json'
base_df = read_baseline_data(train_filename, test_filename)

In [None]:
# Build end_base_df from the "end_" baseline feature files
# NOTE(review): presumably features taken from the end of each episode - confirm
end_train_filename = 'gdrive/MyDrive/thesis/podcast_data/end_baseline_pop_train_features_rec.json'
end_test_filename = 'gdrive/MyDrive/thesis/podcast_data/end_baseline_pop_test_features_rec.json'
end_base_df = read_baseline_data(end_train_filename, end_test_filename)

(100, 9)
(200, 9)
(300, 9)
(400, 9)
(500, 9)
(600, 9)
(700, 9)
(800, 9)
(900, 9)
(1000, 9)
(1100, 9)
(1200, 9)
(1300, 9)
(1400, 9)
(1500, 9)
(1600, 9)
(1700, 9)
(1800, 9)
(1900, 9)
(2000, 9)
(2100, 9)
(2164, 9)
(2264, 9)
(2364, 9)
(2464, 9)
(2564, 9)
(2664, 9)
(2764, 9)
(2864, 9)
(2964, 9)
(3064, 9)
(3164, 9)
(3264, 9)
(3364, 9)
(3464, 9)
(3564, 9)
(3630, 9)


In [None]:
# Build lead_base_df from the "lead_" baseline feature files
# NOTE(review): presumably features taken from the start of each episode - confirm
lead_train_filename = 'gdrive/MyDrive/thesis/podcast_data/lead_baseline_pop_train_features_rec.json'
lead_test_filename = 'gdrive/MyDrive/thesis/podcast_data/lead_baseline_pop_test_features_rec.json'
lead_base_df = read_baseline_data(lead_train_filename, lead_test_filename)

(100, 9)
(200, 9)
(300, 9)
(400, 9)
(500, 9)
(600, 9)
(700, 9)
(800, 9)
(900, 9)
(1000, 9)
(1100, 9)
(1200, 9)
(1300, 9)
(1400, 9)
(1500, 9)
(1600, 9)
(1700, 9)
(1800, 9)
(1900, 9)
(2000, 9)
(2100, 9)
(2171, 9)
(2271, 9)
(2371, 9)
(2471, 9)
(2571, 9)
(2671, 9)
(2771, 9)
(2871, 9)
(2971, 9)
(3071, 9)
(3171, 9)
(3271, 9)
(3371, 9)
(3471, 9)
(3571, 9)
(3638, 9)


In [None]:
# Build trill_df: pretrained embeddings with the rows of each episode ID
# combined by read_concat_pretrained_data
pretrained_train = 'gdrive/MyDrive/thesis/podcast_data/merged_data/trillsson_popularity_train_features_rec.json'
pretrained_test = 'gdrive/MyDrive/thesis/podcast_data/merged_data/trillsson_popularity_test_features_rec.json'
trill_df = read_concat_pretrained_data(pretrained_train, pretrained_test)

# Number of rows/columns in the combined dataset
print(trill_df.shape)

(3629, 3)


In [None]:
# Row/column count of end_base_df, for comparison with trill_df.shape
end_base_df.shape

(3630, 9)

In [None]:
# Build trill_mean_df from the same pretrained feature files, this time
# through read_mean_pretrained_data
pretrained_train = 'gdrive/MyDrive/thesis/podcast_data/merged_data/trillsson_popularity_train_features_rec.json'
pretrained_test = 'gdrive/MyDrive/thesis/podcast_data/merged_data/trillsson_popularity_test_features_rec.json'
trill_mean_df = read_mean_pretrained_data(pretrained_train, pretrained_test)

In [None]:
# Preview the first rows of end_base_df
end_base_df.head(5)

Unnamed: 0,index,id,mfcc,zcr,spec_centroid,label,zcr_min,zcr_mean,zcr_max
0,0,2b174447-e190-4f40-9685-cbd511a67f81,"[-83.41299793540873, 50.051582844179144, 19.39...","[0.01171875, 0.1049804688, 0.1772460938, 0.218...","[[768.6866243585, 3561.4714726621, 5018.869949...",1,0.004395,0.138678,0.571777
1,1,fa3fbe65-f4a4-4814-9f4d-3e9d0bb80909,"[-307.32266159146167, 85.16551326050208, -53.0...","[0.0732421875, 0.10791015620000001, 0.14355468...","[[1568.9712256543, 1552.9839013117, 1504.96427...",1,0.016602,0.152398,0.6875
2,2,1dd67a59-0c8e-41f5-8c1b-322b330528f2,"[-311.0815615208156, 106.2561491572576, -15.01...","[0.0190429688, 0.0693359375, 0.1274414062, 0.1...","[[1090.3646740156, 980.5521311021, 1067.618309...",1,0.001465,0.105882,0.679688
3,3,d4d13a4e-470d-42ad-be15-39424a1eec3a,"[-151.03507181564726, 103.06516134624295, -5.1...","[0.2446289062, 0.3671875, 0.4892578125, 0.3974...","[[5880.3050777409, 5933.6683141206, 5773.70050...",1,0.0,0.093549,0.679199
4,4,e368ca41-22c6-45a1-9010-be2d0f0fd6de,"[-222.93116546032454, 108.67984714101729, -10....","[0.1640625, 0.2373046875, 0.2783203125, 0.2080...","[[3859.0255597642, 4334.3690837603, 4229.53279...",1,0.008789,0.094387,0.728516


In [None]:
# Preview the first rows of trill_mean_df
trill_mean_df.head(5)

Unnamed: 0,id,trill_embedding,label
0,0005b4e2-adad-4fab-bcce-7fd65bd17d67,"[-0.5438773036, 0.0037935674, -0.4936779737, -...",0
1,00287f38-ff72-4adf-87f2-8325aa3651e3,"[-0.4827392399, -0.2244028151, -0.4869424701, ...",1
2,002c23d0-cd29-4378-b075-aa7a657b1b57,"[-0.44797149300000005, -0.23329487440000002, -...",1
3,0035c812-bf68-40a2-8c61-80b82a4cf236,"[-0.4465982914, 0.037524193500000004, -0.63198...",0
4,0043b17a-f15f-41e1-850b-bac7f5b99a61,"[-0.4514972866, -0.0652327836, -1.0197277069, ...",0


In [None]:
# Show only the rows of base_df whose label equals 1
base_df[ base_df['label'] == 1]

Unnamed: 0,index,id,title,mfcc,zcr,spec_centroid,label,zcr_min,zcr_mean,zcr_max
0,0.0,2b174447-e190-4f40-9685-cbd511a67f81,Release Yourself Radio Show 820,"[-98.01492554456388, 79.69141734200585, 12.468...","[0.2924804688, 0.4047851562, 0.4975585937, 0.4...","[[5021.6897556123, 5019.369706856, 5040.186264...",1,0.004883,0.118520,0.650879
1,1.0,fa3fbe65-f4a4-4814-9f4d-3e9d0bb80909,How to Achieve Radiant Skin & Better Health,"[-299.2231459681095, 84.06640659696733, -43.88...","[0.2875976562, 0.3286132812, 0.3461914062, 0.2...","[[5103.8965378717, 2455.2896611439, 2307.64886...",1,0.011719,0.145454,0.795410
2,2.0,1dd67a59-0c8e-41f5-8c1b-322b330528f2,LL 070917,"[-248.66206508753078, 89.30236727291066, 2.557...","[0.1025390625, 0.11572265620000001, 0.12988281...","[[1918.3117881537, 1847.0899555161, 1809.29486...",1,0.009766,0.123863,0.698730
3,3.0,6a916622-347e-4a7d-b93a-b4dd08c13cf7,190: Git: Keep Track Of Your Files As They Cha...,"[-266.22798362619744, 83.46188924110488, 9.539...","[0.2592773438, 0.38671875, 0.5073242188, 0.511...","[[5296.698250952, 5502.7276613255, 5568.188978...",1,0.008789,0.114545,0.770996
4,4.0,d4d13a4e-470d-42ad-be15-39424a1eec3a,Journal Jam 10 Part 2 Endovascular Therapy fo...,"[-227.02288494567736, 94.91036841596004, -10.9...","[0.048828125, 0.0595703125, 0.0693359375, 0.04...","[[644.3137041161, 988.2641210948, 1226.5068171...",1,0.000000,0.123583,0.725586
...,...,...,...,...,...,...,...,...,...,...
168,,fa8cf90b-6e79-46ef-ab22-92a9abdf0e59,TWiT 622: Running for Human,"[-201.60252456592133, 79.82332059204924, -4.90...","[0.2734375, 0.2998046875, 0.3178710938, 0.1518...","[[2301.1960474847, 2126.9182888554, 2283.75168...",1,0.004395,0.118477,0.656738
169,,0d07c803-277f-4d59-80d7-7b0f54876e99,"Donald Trump Jr.'s email chain, 'peaceful nuke...","[-236.02149740272998, 109.306577093912, -14.65...","[0.2954101562, 0.4233398438, 0.6381835938, 0.6...","[[5290.4922670663, 5344.3424039144, 5164.20514...",1,0.005371,0.107719,0.802246
170,,d617e34f-365d-498c-b563-d7667c4d1f42,#76 How Would You Introduce a New Transportati...,"[-212.1468616616567, 96.42584673771567, -10.51...","[0.0595703125, 0.09033203120000001, 0.10986328...","[[2544.0851039386, 3023.4392794223, 3104.42431...",1,0.000000,0.103869,0.638672
171,,ca4a2b55-d680-453b-a252-fb5f490b7dc5,"Episode #31: A Chat with Kara Goucher, Adam Go...","[-301.88381171477505, 95.9999947506132, -3.070...","[0.1962890625, 0.1987304688, 0.2094726562, 0.0...","[[299.0011272052, 333.8743897209, 574.04904376...",1,0.000488,0.129559,0.878418




---

# Remove minor discrepancies between the two datasets

1. The two datasets may vary slightly due to URL inconsistencies
2. We want exactly the same data for the different experiments


---



---



In [None]:
# Reload trill_df from the previously exported clean dataset
# (this replaces any trill_df built earlier in the notebook)
trill_df = pd.read_json("gdrive/MyDrive/thesis/podcast_data/clean_data/pretrained_final_dataset.json", orient='records', lines=True)

In [None]:
# Guard against duplicated episode IDs (not likely, but a processing
# discrepancy may have produced some): keep the first row of each ID
lead_base_df = lead_base_df.drop_duplicates(subset='id', keep="first")
trill_df = trill_df.drop_duplicates(subset='id', keep="first")

# Episodes present in both datasets
merged = lead_base_df.merge(trill_df, on=['id'], how='inner')
print("Both: ", merged.shape, lead_base_df.shape, trill_df.shape)

# Episodes in baseline dataset but not pretrained one
outer = lead_base_df.merge(trill_df, on=['id'], how='left', indicator=True)
left_only = outer.loc[outer['_merge'] == 'left_only']
print("Left only: ", left_only.shape)

# Episodes in pretrained dataset but not baseline one
outer = lead_base_df.merge(trill_df, on=['id'], how='right', indicator=True)
right_only = outer.loc[outer['_merge'] == 'right_only']
print("Right only: ", right_only.shape)

# Optional: collect the unmatched pretrained IDs
# missing_ids = right_only.id.tolist()

# Drop the unmatched records so that the two datasets hold the same episodes
lead_base_df2 = lead_base_df.loc[~lead_base_df['id'].isin(left_only['id'].tolist())]
trill_df = trill_df.loc[~trill_df['id'].isin(right_only['id'].tolist())]

# Verify: all three row counts should now agree
merged = lead_base_df2.merge(trill_df, on=['id'], how='inner')
print("Both: ", merged.shape, lead_base_df2.shape, trill_df.shape)

# Optional: the same right-only check against base_df
# outer = pd.merge(base_df, trill_df, on=['id'], how='right', indicator=True)
# right_only = outer[ outer['_merge'] == 'right_only' ]
# print("Right only: ", right_only.shape)

Both:  (3605, 11) (3638, 9) (3606, 3)
Left only:  (33, 12)
Right only:  (1, 12)
Both:  (3605, 11) (3605, 9) (3605, 3)


In [None]:
# Preview the first rows of the aligned lead baseline dataset
lead_base_df2.head(4)

Unnamed: 0,index,id,mfcc,zcr,spec_centroid,label,zcr_min,zcr_mean,zcr_max
0,0,2b174447-e190-4f40-9685-cbd511a67f81,"[-116.14805865530377, 96.76825211262086, 11.86...","[0.2924804688, 0.4047851562, 0.4975585938, 0.4...","[[5021.6897556123, 5019.369706856, 5040.186264...",1,0.004883,0.093827,0.576172
1,1,fa3fbe65-f4a4-4814-9f4d-3e9d0bb80909,"[-307.05680053396486, 86.79008118780918, -50.3...","[0.2875976562, 0.3286132812, 0.3461914062, 0.2...","[[5103.8965378717, 2455.2896611439, 2307.64886...",1,0.011719,0.151583,0.79541
2,2,1dd67a59-0c8e-41f5-8c1b-322b330528f2,"[-300.01835882646196, 81.83778398468816, 8.779...","[0.1025390625, 0.11572265620000001, 0.12988281...","[[1918.3117881537, 1847.0899555161, 1809.29486...",1,0.010742,0.120837,0.69873
3,3,d4d13a4e-470d-42ad-be15-39424a1eec3a,"[-227.45054433134482, 90.72449341556778, -8.52...","[0.048828125, 0.0595703125, 0.0693359375, 0.04...","[[644.3137041161, 988.2641210948, 1226.5068171...",1,0.0,0.128278,0.740234




---

# Export final datasets


---



In [None]:
# Write the final dataset(s) to Drive as line-delimited JSON records.
# Only the lead baseline export is active; un-comment a line to export
# one of the other datasets.
# NOTE(review): end_base_df2 is not created in any cell shown here - it
# presumably comes from re-running the alignment cell with end_base_df.
# base_df.to_json("gdrive/MyDrive/thesis/podcast_data/clean_data/baseline_final_dataset.json", orient='records', lines=True)
# trill_df.to_json("gdrive/MyDrive/thesis/podcast_data/clean_data/pretrained_final_dataset.json", orient='records', lines=True)
# end_base_df2.to_json("gdrive/MyDrive/thesis/podcast_data/clean_data/baseline_end_final_dataset.json", orient='records', lines=True)
lead_base_df2.to_json("gdrive/MyDrive/thesis/podcast_data/clean_data/baseline_lead_final_dataset.json", orient='records', lines=True)



---

# Augmented


---



In [None]:
# Build and export the augmented pretrained dataset
aug_pretrained_train = 'gdrive/MyDrive/thesis/podcast_data/merged_data/aug_trillsson_popularity_train_features_rec.json'
aug_pretrained_test = 'gdrive/MyDrive/thesis/podcast_data/merged_data/aug_trillsson_popularity_test_features_rec.json'
# read_pretrained_data is not defined in this notebook, so the call failed
# with a NameError on a fresh run. read_concat_pretrained_data is used because
# the non-augmented pretrained_final_dataset.json export is built with it.
aug_pretrained = read_concat_pretrained_data(aug_pretrained_train, aug_pretrained_test)

aug_pretrained.to_json("gdrive/MyDrive/thesis/podcast_data/clean_data/aug_pretrained_final_dataset.json", orient='records', lines=True)