<a href="https://colab.research.google.com/github/ffer200395/The-Joe-Rogan-Experience/blob/main/Download_podcasts_%26_scrap_metadata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

import requests
import urllib.request 
from bs4 import BeautifulSoup

import re
import pickle
from tqdm import tqdm

import os
import shlex
import subprocess

# Download The Joe Rogan Experience podcasts
- Source: https://archive.org/, a non-profit library of millions of free books, movies, software, music, websites, and more.

In [None]:
def download_podcasts(path, start=1, end=1550):
    """Download all JRE podcasts between the start and end episodes.

    For every episode number the archive.org details page is requested, the
    first link whose ``href`` matches ``https://archive.org/download`` is
    extracted and the file behind it is saved as ``jre<episode>.mp3`` inside
    ``path``. An episode that cannot be fetched, has no download link or fails
    to download is reported on stdout and skipped, so a single bad episode
    does not abort the whole run.

    Parameters
    ----------
    path : str
        Directory where the downloaded podcasts are stored. It is created if
        it does not exist; a trailing separator is optional.
    start : int
        First episode to be downloaded
    end : int
        Last episode to be downloaded (inclusive)

    Returns
    -------
    None
        All podcasts are downloaded in path
    """
    # Create the target directory up front so urlretrieve has somewhere to write.
    if path:
        os.makedirs(path, exist_ok=True)
    download_href = re.compile("https://archive.org/download")

    for i in range(start, end + 1):
        # Make a request for a specific podcast
        url = f'https://archive.org/details/joe-rogan-podcast-audio-{i}'
        try:
            # A timeout keeps one stalled connection from hanging the loop.
            req = requests.get(url, timeout=30)
        except requests.RequestException as e:
            print(i, e)
            continue
        if req.status_code != 200:
            print(i, req.status_code)
            continue
        print(i)

        # Parse html and find the link to download
        soup = BeautifulSoup(req.content, 'html.parser')
        anchor = soup.find(attrs={'href': download_href})
        if anchor is None:
            # The page exists but exposes no download link: skip this episode.
            print(i, 'no download link found')
            continue

        try:
            # If found retrieve mp3 file
            urllib.request.urlretrieve(anchor.get('href'), os.path.join(path, f'jre{i}.mp3'))
        except Exception as e:
            print(i, e)

In [None]:
# Call the function defined above to download episodes 1 through 1550
# into data/podcasts/ (it loops over every episode number in that range).
download_podcasts(path='data/podcasts/', start=1, end=1550)

# Scrape podcasts' metadata
- Sources: https://www.jrepodcast.com/ and https://jrelibrary.com/episode-list/

In [None]:
# Load the semicolon-separated category table: one row per category with
# its listing page (`link`) and its name (`category`), then preview 5 rows.
df_cat = pd.read_csv('data/jre_podcast_categories.csv',sep=';')
df_cat.head(5)

Unnamed: 0,link,category
0,https://www.jrepodcast.com/episodes/activists/,Activists
1,https://www.jrepodcast.com/episodes/actors/,Actors
2,https://www.jrepodcast.com/episodes/artists/,Artists
3,https://www.jrepodcast.com/episodes/athletes-f...,"Athletes, Fighters, Martial Arts"
4,https://www.jrepodcast.com/episodes/authors/,Authors


In [None]:
def get_number_of_pages_cat(df):
    """Find number of pages for each category

    Parameters
    ----------
    df : DataFrame
        Contains a column for links and a column for categories

    Returns
    -------
    DataFrame
        A copy of ``df`` with an extra ``n_pages`` column; ``df`` itself is
        left untouched. ``n_pages`` is the highest page number found in the
        category's pagination links plus one (an exclusive upper bound that
        fits ``range(1, n_pages)``), or 0 when the category has no pagination
        links or its page could not be fetched.
    """
    # Exactly one value per row, so the new column always matches the frame length.
    n_pages = [_count_category_pages(link) for link in tqdm(df['link'])]
    return df.assign(n_pages=n_pages)


def _count_category_pages(link):
    """Return the ``n_pages`` value for one category link (0 when unknown)."""
    try:
        # Access to category link
        req = requests.get(link, timeout=30)
    except requests.RequestException as e:
        print(link, e)
        return 0
    if req.status_code != 200:
        print(link, req.status_code)
        return 0

    # Find number of pages available
    anchors = BeautifulSoup(req.content, 'html.parser').find_all("a", {"class": "page-numbers"})
    # Only numeric labels count; non-numeric links (e.g. a "next" arrow) are ignored.
    labels = (anchor.text.strip().replace(',', '') for anchor in anchors)
    numbers = [int(label) for label in labels if label.isdigit()]
    return max(numbers) + 1 if numbers else 0

In [None]:
# Add the page count of every category to the category table and preview it
df_pages = get_number_of_pages_cat(df_cat)
df_pages.head(5)

22it [00:17,  1.27it/s]


Unnamed: 0,link,category,n_pages
0,https://www.jrepodcast.com/episodes/activists/,Activists,5
1,https://www.jrepodcast.com/episodes/actors/,Actors,40
2,https://www.jrepodcast.com/episodes/artists/,Artists,0
3,https://www.jrepodcast.com/episodes/athletes-f...,"Athletes, Fighters, Martial Arts",35
4,https://www.jrepodcast.com/episodes/authors/,Authors,23


In [None]:
def get_metadata(link):
    """Given a link it captures all relevant metadata for podcasts

    Parameters
    ----------
    link : str
        Url from https://www.jrepodcast.com/

    Returns
    -------
    List
        A list of six lists of strings, one per feature, in this order:
        names, dates, views, likes, dislikes, ratios. Each list is collected
        independently from the page, so they have equal length only if the
        page carries one of each element per episode. Six empty lists are
        returned when the page cannot be fetched.
    """
    # (tag name, attribute filter) for each feature, in output order.
    selectors = [
        ("span", {"itemprop": "name"}),
        ("time", {}),
        ("span", {"class": "post-meta-span post-meta-span-views"}),
        ("span", {"class": "post-meta-span post-meta-span-likes"}),
        ("span", {"class": "post-meta-span post-meta-span-dislikes"}),
        ("span", {"class": "post-meta-span post-meta-span-ratio"}),
    ]
    try:
        req = requests.get(link, timeout=30)
    except requests.RequestException as e:
        print(link, e)
        return [[] for _ in selectors]
    if req.status_code != 200:
        # Report the failure but still hand back a well-formed (empty) result.
        print(req.status_code)
        return [[] for _ in selectors]

    soup = BeautifulSoup(req.content, 'html.parser')
    return [[tag.text for tag in soup.find_all(name, attrs)] for name, attrs in selectors]

def get_metadata_by_cat(df):
    """For every category obtains all metadata

    Parameters
    ----------
    df : DataFrame
        Columns for links, categories and number of pages for each category
        (``link``, ``category``, ``n_pages``). Any index is accepted.

    Returns
    -------
    DataFrame
        One row per scraped episode entry with the columns Name, Date, Views,
        Likes, Dislikes, Ratio and Category. An episode listed under several
        categories appears once per category.
    """
    columns = ['Name', 'Date', 'Views', 'Likes', 'Dislikes', 'Ratio']
    # Placeholder for metadata: one accumulator list per feature
    data = [[] for _ in columns]
    ls_cats = []

    # Iterate over the row values directly so the result does not depend on
    # the frame having a default 0..n-1 index.
    rows = zip(df['link'], df['category'], df['n_pages'])
    for url_base, category, pages in tqdm(rows, total=len(df)):
        if pages == 0:
            # Just one page so url_base is used
            urls = [url_base]
        else:
            # n_pages is an exclusive upper bound: pages 1 .. n_pages-1 exist
            urls = [url_base + f'page/{page}/' for page in range(1, pages)]

        for url in urls:
            aux = get_metadata(url)
            for accumulator, values in zip(data, aux):
                accumulator.extend(values)
            # Tag every episode found on this page with the current category
            ls_cats.extend([category] * len(aux[0]))

    df_res = pd.DataFrame({**dict(zip(columns, data)), 'Category': ls_cats})
    return df_res

In [None]:
# Scrape the metadata of every page of every category and preview the result
raw_metadata = get_metadata_by_cat(df_pages)
raw_metadata.head(5)

Unnamed: 0,Name,Date,Views,Likes,Dislikes,Ratio,Category
0,Joe Rogan Experience #1512 – Ben Shapiro,"July 22, 2020","8,603,088 views","206,159 likes","21,102 dislikes",9.77 ratio,Activists
1,Joe Rogan Experience #1427 – Melissa Chen,"February 14, 2020","3,558,623 views","51,294 likes","3,679 dislikes",13.94 ratio,Activists
2,Joe Rogan Experience #1419 – Daryl Davis,"January 30, 2020","3,623,181 views","107,595 likes","3,045 dislikes",35.33 ratio,Activists
3,Joe Rogan Experience #1402 – Boyan Slat,"December 17, 2019","1,325,399 views","21,551 likes",841 dislikes,25.63 ratio,Activists
4,Joe Rogan Experience #1373 – Kyle Kulinski,"October 30, 2019","2,891,481 views","50,991 likes","6,986 dislikes",7.30 ratio,Activists


In [None]:
# One podcast can belong to several categories: compare the number of
# scraped rows with the number of distinct episode names.
print(len(raw_metadata['Name']))
print(len(set(raw_metadata['Name'])))

5143
2517


In [None]:
# For each episode get all categories it belongs to and store them in a
# dictionary {episode name: [category, ...]}. A single groupby pass replaces
# one full-table scan per unique name.
d_categories = (
    raw_metadata
    .groupby('Name', sort=False)['Category']
    .apply(list)
    .to_dict()
)

In [None]:
# We then create a new df in which each row corresponds to a unique episode:
# the per-row category is replaced by the string form of the full category
# list, after which rows scraped under different categories become exact
# duplicates and are dropped. This is built in one vectorised step because
# DataFrame.append was removed in pandas 2.0 and grew the frame row by row.
df_meta = (
    raw_metadata
    .assign(Category=raw_metadata['Name'].map(lambda name: str(d_categories[name])))
    .drop_duplicates()
)
df_meta.head(3)

5143it [00:44, 115.05it/s]


Unnamed: 0,Name,Date,Views,Likes,Dislikes,Ratio,Category
0,Joe Rogan Experience #1512 – Ben Shapiro,"July 22, 2020","8,603,088 views","206,159 likes","21,102 dislikes",9.77 ratio,"['Activists', 'Miscellaneous']"
1,Joe Rogan Experience #1427 – Melissa Chen,"February 14, 2020","3,558,623 views","51,294 likes","3,679 dislikes",13.94 ratio,['Activists']
2,Joe Rogan Experience #1419 – Daryl Davis,"January 30, 2020","3,623,181 views","107,595 likes","3,045 dislikes",35.33 ratio,"['Activists', 'Authors', 'Musicians']"


In [None]:
# Extra per-episode information obtained from jrelibrary.com/episode-list/
# is read from the semicolon-separated file data/jre_library_meta.csv
df_lib = pd.read_csv('data/jre_library_meta.csv',sep=';')
df_lib.head(3)

Unnamed: 0,Episode,Date,Duration,n_guests,Guests,Category
0,1530,2020-08-31,05:31,1,['Duncan Trussell'],['Comedians']
1,1529,2020-08-21,03:18,2,"['Whitney Cummings', 'Annie Lederman']",['Comedians']
2,1528,2020-08-20,02:42,1,['Nikki Glaser'],['Comedians']


In [None]:
# Add Episode, Duration, n_guests and Guests to the scraped metadata

# Obtain JRE episode number from name, e.g.
# 'Joe Rogan Experience #1512 – Ben Shapiro' -> 1512. Titles that do not
# start with 'Joe Rogan Experience #<digits>' yield NaN and are filtered out.
episode_numbers = df_meta['Name'].str.extract(r'^Joe Rogan Experience #(\d+)', expand=False)
is_jre = episode_numbers.notna()

# Filter data so just JRE programs are stored
df_meta_filt = df_meta[is_jre].copy()
# New column added
df_meta_filt['Episode'] = episode_numbers[is_jre].astype('int64').to_numpy()

# Plain-list views of the kept rows, retained under their original names
names_jre = df_meta_filt['Name'].tolist()
eps_jre = df_meta_filt['Episode'].tolist()

# Merge both df, sort rows by episode number and reset index. The default
# inner join keeps only episodes present in both tables.
df_all = (
    pd.merge(df_meta_filt, df_lib[['Episode','Duration','n_guests','Guests']], on='Episode')
    .sort_values(by='Episode')
    .reset_index(drop=True)
)

In [None]:
# Size (rows, columns) of the merged table and a preview of its first rows
print(df_all.shape)
df_all.head(3)

(1516, 11)


Unnamed: 0,Name,Date,Views,Likes,Dislikes,Ratio,Category,Episode,Duration,n_guests,Guests
0,Joe Rogan Experience #1 – Brian Redban,"January 17, 2013","696,642 views","6,855 likes",185 dislikes,37.05 ratio,['Comedians'],1,02:02,1,['Brian Redban']
1,Joe Rogan Experience #2 – Brian Redban,"January 17, 2013","137,946 views","1,036 likes",71 dislikes,14.59 ratio,['Comedians'],2,02:32,1,['Brian Redban']
2,"Joe Rogan Experience #3 – Ari Shaffir, Brian R...","January 17, 2013","135,861 views","1,027 likes",67 dislikes,15.33 ratio,['Comedians'],3,02:15,2,"['Ari Shaffir', ' Brian Redban']"


In [None]:
# Save the merged metadata as a semicolon-separated CSV
# (to_csv writes the index as the first column by default)
df_all.to_csv('data/df_metadata.csv',sep=';')

# Format conversion (from MP3 to WAV files)
WAV is a lossless audio format (a high-quality, uncompressed file), so we can obtain the audio time series directly from the WAV files.

In [None]:
mp3_path = 'data/podcasts/'
wav_path = 'data/podcasts_wav/'

# Make sure the output directory exists before sox writes into it.
os.makedirs(wav_path, exist_ok=True)

# Only mp3 files are converted; sorting makes the processing order reproducible.
files = sorted(f for f in os.listdir(mp3_path) if f.lower().endswith('.mp3'))
for file in tqdm(files):
    # splitext strips only the final extension, so dotted names stay distinct.
    wav_name = os.path.splitext(file)[0] + '.wav'
    # The argument list is passed directly (no shell, no string re-splitting),
    # so paths containing spaces are handled correctly.
    # -v 0.98 scales the input volume, -b 16 writes 16-bit samples,
    # 'channels 1' downmixes to mono and 'rate 16000' resamples to 16 kHz.
    args = [
        'sox', '-v', '0.98', os.path.join(mp3_path, file),
        '-b', '16', os.path.join(wav_path, wav_name),
        'channels', '1', 'rate', '16000',
    ]
    result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        # Report failed conversions instead of silently discarding them.
        print(file, result.stderr.decode(errors='replace').strip())

100%|██████████| 3/3 [03:14<00:00, 64.69s/it]
