# Load libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from IPython.core.display import clear_output
from warnings import warn
import os

In [2]:
import requests
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Data Directory

In [3]:
# Name of the CSV file to read; it is combined with the raw-data folder path
# when `data_directory` is built.
file = 'comedy_link_list.csv'

In [4]:
# Path of the input CSV, built purely from path components so that no
# separator character is hard-coded.
data_directory = os.path.join('..', 'data', 'raw_data', file)
# Folder that receives the saved output. The empty final component makes
# os.path.join end the path with a separator, which is required because the
# save cell appends the output file name by plain string concatenation.
data_directory_saves = os.path.join('..', 'data', 'raw_data', '')

### Use same functions

In [5]:
def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """Return a requests session whose HTTP and HTTPS adapters retry.

    Args:
        retries: Value used for the total, read and connect retry limits.
        backoff_factor: Backoff factor handed to the retry policy.
        status_forcelist: HTTP status codes handed to the retry policy as
            codes that should trigger a retry.
        session: Existing session to configure; a new ``requests.Session``
            is created when this is falsy.

    Returns:
        The configured session (the one passed in, or the new one).
    """
    if not session:
        session = requests.Session()

    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    # The same adapter instance serves both URL schemes.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)
    return session

In [6]:
def get_response(url):
    """Fetch ``url`` with a retrying session and print how the attempt went.

    Prints a success message with the status code, or a failure message with
    the exception class name, and always prints the elapsed time.

    Args:
        url: Address to request with HTTP GET (10 second timeout).

    Returns:
        The response object when the request completed, otherwise ``None``.
        Exceptions raised by the request are caught and reported, not
        propagated.
    """
    t0 = time.time()
    try:
        # A fresh session is created for every call; the context manager
        # closes it afterwards so its pooled connections are released. The
        # request is not streamed, so the body is already read by then.
        with requests_retry_session() as session:
            response = session.get(
                url,
                timeout=10
            )
    except Exception as x:
        print('It failed :(', x.__class__.__name__)
        return None
    else:
        # Returning from here (rather than from inside ``try``) is what lets
        # this branch run at all: a ``return`` in the ``try`` body skips
        # ``else`` entirely.
        print('It eventually worked', response.status_code)
        return response
    finally:
        t1 = time.time()
        print('Took', t1 - t0, 'seconds')

## extract transcript data 

In [7]:
def bs_parser(response):
    """Extract the paragraph text of the page's ``post-content`` div.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.

    Returns:
        The text of every ``<p>`` inside the first
        ``<div class="post-content">``, each followed by a newline, joined
        into one string. An empty string when no such div is found.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    post_body = soup.find('div', class_ = 'post-content')
    if not post_body:
        return ''
    return ''.join('{}\n'.format(paragraph.text) for paragraph in post_body.find_all('p'))

## load dataframe

In [8]:
# Load the link list CSV; the scraping cells below read its `link` column.
df = pd.read_csv(data_directory)

In [9]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,name,raw_title,summary,link
0,Nikki Glaser,Nikki Glaser: Bangin’ (2019) – Full Transcript,A NETFLIX ORIGINAL COMEDY SPECIAL Thank you! T...,https://scrapsfromtheloft.com/2019/10/04/nikki...
1,Ryan Hamilton,Ryan Hamilton Stand-Up – The Tonight Show Star...,Ryan Hamilton makes his Tonight Show debut wit...,https://scrapsfromtheloft.com/2019/09/25/ryan-...
2,Mark Normandthe Tonight Show Starring Jimmy Fa...,Mark Normand Stand-Up – The Tonight Show Starr...,Mark Normand is back on The Tonight Show with ...,https://scrapsfromtheloft.com/2019/09/21/mark-...
3,George Carlin,George Carlin: Dumb Americans (2006) – Full Tr...,"Full transcript of George Carlin's ""Dumb Ameri...",https://scrapsfromtheloft.com/2019/09/12/georg...
4,Bill Burr,Bill Burr: Paper Tiger (2019) – Full Transcript,"Only a few weeks after Dave Chappelle's ""Stick...",https://scrapsfromtheloft.com/2019/09/10/bill-...


# test

In [10]:
# Ten links (positions 10-19 of the `link` column) used as a small sample
# for a trial run of the scraping loop.
test_list = df.link[10:20]
test_list

10    https://scrapsfromtheloft.com/2019/08/08/kevin...
11    https://scrapsfromtheloft.com/2019/08/01/whitn...
12    https://scrapsfromtheloft.com/2019/07/07/ralph...
13    https://scrapsfromtheloft.com/2019/06/27/mike-...
14    https://scrapsfromtheloft.com/2019/06/22/adam-...
15    https://scrapsfromtheloft.com/2019/06/16/jo-ko...
16    https://scrapsfromtheloft.com/2019/06/15/jo-ko...
17    https://scrapsfromtheloft.com/2019/06/02/the-s...
18    https://scrapsfromtheloft.com/2019/05/22/wanda...
19    https://scrapsfromtheloft.com/2019/05/18/doug-...
Name: link, dtype: object

In [11]:
%%time
transcripts = []
links = []

rqst = 0
for link_page in test_list:
    links.append(link_page)
    response = get_response(link_page)
    rqst+=1
    print('request:{}/{}'.format(rqst,len(test_list)))
    clear_output(wait = True)
    # Parse the content of the request with BeautifulSoup
    if response:
        transcript = bs_parser(response)
        transcripts.append(transcript)
    else:
        transcripts.append('error')

CPU times: user 453 ms, sys: 31.2 ms, total: 484 ms
Wall time: 25.5 s


In [12]:
# Build a DataFrame pairing each transcript from the trial run with the link
# it was scraped from, then display it.

test_df = pd.DataFrame({'transcript': transcripts,'link': links})
test_df

Unnamed: 0,transcript,link
0,Kevin Hart acts out fantasy scenarios of drawi...,https://scrapsfromtheloft.com/2019/08/08/kevin...
1,Ladies and gentlemen… Whitney Cummings!\nThis ...,https://scrapsfromtheloft.com/2019/08/01/whitn...
2,Ralphie May hosts his legendary Filthy Animals...,https://scrapsfromtheloft.com/2019/07/07/ralph...
3,♪ Washington D.C. are you ready? ♪\n♪ It’s abo...,https://scrapsfromtheloft.com/2019/06/27/mike-...
4,[rock music playing]\n[indistinct chatter]\nHe...,https://scrapsfromtheloft.com/2019/06/22/adam-...
5,Yo\nFinally here\nYou know I’m finally here\nF...,https://scrapsfromtheloft.com/2019/06/16/jo-ko...
6,[announcer] Hawaii.\nAre you ready?\n[cheering...,https://scrapsfromtheloft.com/2019/06/15/jo-ko...
7,"[announcer] And now, coming to the stage, Gina...",https://scrapsfromtheloft.com/2019/06/02/the-s...
8,Ladies and gentlemen… Wanda Sykes!\nYes. Yes. ...,https://scrapsfromtheloft.com/2019/05/22/wanda...
9,From “Dead Beat Hero” (2004)\nImmigration. The...,https://scrapsfromtheloft.com/2019/05/18/doug-...


## Create a list of links

In [13]:
# Every link in the loaded frame: the input for the full scraping run.
link_list = df.link

In [15]:
# Number of pages the full scraping run will request.
len(link_list)

284

> **future note**
> Change the lists `transcripts` & `links` to a single dictionary.

In [16]:
%%time
transcripts = []
links = []

rqst = 0
for link_page in link_list:
    links.append(link_page)
    response = get_response(link_page)
    rqst+=1
    print('request:{}/{}'.format(rqst,len(link_list)))
    clear_output(wait = True)
    # Parse the content of the request with BeautifulSoup
    if response:
        transcript = bs_parser(response)
        transcripts.append(transcript)
    else:
        transcripts.append('error')

CPU times: user 17.7 s, sys: 1.7 s, total: 19.4 s
Wall time: 15min


In [17]:
# Build a DataFrame pairing each transcript from the full run with the link
# it was scraped from, then display it.

data_df = pd.DataFrame({'transcript': transcripts,'link': links})
data_df

Unnamed: 0,transcript,link
0,A NETFLIX ORIGINAL COMEDY SPECIAL\nThank you! ...,https://scrapsfromtheloft.com/2019/10/04/nikki...
1,"Published on Sep 11, 2019\nThe Tonight Show St...",https://scrapsfromtheloft.com/2019/09/25/ryan-...
2,"Aired on September 19, 2019\nHey, hey. All rig...",https://scrapsfromtheloft.com/2019/09/21/mark-...
3,From Life Is Worth Losing\nRecorded on Novembe...,https://scrapsfromtheloft.com/2019/09/12/georg...
4,"Recorded Live at the Royal Albert Hall, London...",https://scrapsfromtheloft.com/2019/09/10/bill-...
5,Tacked onto the end of the Dave Chappelle’s St...,https://scrapsfromtheloft.com/2019/08/29/dave-...
6,"On this episode of Patriot Act, Hasan breaks d...",https://scrapsfromtheloft.com/2019/08/26/brazi...
7,Sticks & Stones is Dave Chappelle’s fifth Netf...,https://scrapsfromtheloft.com/2019/08/26/dave-...
8,"Emily Heller compares Donald Trump to Air Bud,...",https://scrapsfromtheloft.com/2019/08/25/emily...
9,This is the full transcript of David Cross’ la...,https://scrapsfromtheloft.com/2019/08/13/david...


# check for errors

In [18]:
# Boolean mask of rows holding the 'error' placeholder, which the scraping
# loop appends whenever get_response gave no usable response.
error_filter = data_df.transcript == 'error'

In [19]:
# Show the rows flagged as errors; an empty frame means none were recorded.
data_df[error_filter]

Unnamed: 0,transcript,link


# save

In [20]:
# Write the scraped transcripts and their links to a CSV in the raw-data folder.
output_path = os.path.join(data_directory_saves, 'data_transcripts_OCT_19.csv')
data_df.to_csv(output_path)