In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from IPython.core.display import clear_output
from warnings import warn
import os

In [3]:
import requests
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# data path

In [8]:
# Name of the CSV file that lists the transcript page links to scrape.
file = 'comedy_link_list.csv'

In [9]:
# Path of the input CSV. Every component is passed to os.path.join separately
# so that no path separator is hard-coded.
data_directory = os.path.join('..', 'data', 'raw_data', file)
# Directory where results are saved. The empty last component keeps a trailing
# separator so that a file name can be appended to this string directly.
data_directory_saves = os.path.join('..', 'data', 'raw_data', '')

In [4]:
def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """Return a requests session whose HTTP and HTTPS adapters retry requests.

    Parameters
    ----------
    retries : int
        Value used for the ``total``, ``read`` and ``connect`` limits of the
        retry policy.
    backoff_factor : float
        Passed through to the retry policy as ``backoff_factor``.
    status_forcelist : tuple of int
        Passed through to the retry policy as ``status_forcelist``.
    session : optional
        Existing session to configure; a new ``requests.Session`` is created
        when this is falsy.

    Returns
    -------
    The configured session (the same object that was passed in, if any).
    """
    if not session:
        session = requests.Session()

    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    # One shared adapter instance serves both URL schemes.
    for prefix in ('http://', 'https://'):
        session.mount(prefix, retrying_adapter)
    return session

In [5]:
def get_response(url):
    """Fetch ``url`` through a retrying session and report the outcome.

    Parameters
    ----------
    url : str
        Address to request with a 10 second timeout.

    Returns
    -------
    The response object on success, or ``None`` if any exception was raised
    while making the request. Exceptions are reported on stdout, not re-raised,
    so a caller iterating over many URLs can keep going.
    """
    t0 = time.time()
    try:
        # The context manager closes the session (and its connections) once
        # the request has completed, instead of leaking one session per call.
        with requests_retry_session() as session:
            response = session.get(
                url,
                timeout=10
            )
    except Exception as x:
        print('It failed :(', x.__class__.__name__)
        return None
    else:
        # Returning from here (rather than from inside ``try``) is what makes
        # this success message reachable.
        print('It eventually worked', response.status_code)
        return response
    finally:
        # Runs on both paths, before the function actually returns.
        t1 = time.time()
        print('Took', t1 - t0, 'seconds')

In [6]:
def bs_parser(response):
    """Extract the paragraph text of the page's 'post-content' div.

    Parameters
    ----------
    response : object
        Anything with a ``text`` attribute holding the page HTML.

    Returns
    -------
    str
        The text of every ``<p>`` found inside the ``<div class="post-content">``
        located by ``find``, each followed by a newline. An empty string is
        returned when no such div is found.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='post-content')
    if not content_div:
        return ''
    return ''.join(
        '{}\n'.format(paragraph.text)
        for paragraph in content_div.find_all('p')
    )

# get link data

In [10]:
# Load the link-list CSV into a DataFrame.
df = pd.read_csv(data_directory)

In [12]:
# Preview the first rows of the loaded link table.
df.head()

Unnamed: 0,name,raw_title,summary,link
0,Enissa Amani,Enissa Amani: Ehrenwort (2018) Full Transcript,"Live from Hamburg, Iranian-German comedian Eni...",https://scrapsfromtheloft.com/2019/02/19/eniss...
1,George Carlin,Politically Correct Language – by George Carlin,George Carlin's critical thinking on pc langua...,https://scrapsfromtheloft.com/2019/02/18/polit...
2,Ken Jeong,"Ken Jeong: You Complete Me, Ho (2019) – Full T...",Filmed at the Ice House Comedy Club in Pasaden...,https://scrapsfromtheloft.com/2019/02/17/ken-j...
3,Ray Romano,"Ray Romano: Right Here, Around the Corner (201...",Ray Romano cut his stand-up teeth at the Comed...,https://scrapsfromtheloft.com/2019/02/12/ray-r...
4,Gabriel Iglesias,Gabriel “Fluffy” Iglesias: One Show Fits All (...,"In a new special from Houston, Gabriel ""Fluffy...",https://scrapsfromtheloft.com/2019/01/30/gabri...


# test on a sample of 10 links

In [13]:
# Slice [10:20] of the link column: a small sample for a trial run of the scraper.
test_list = df.link[10:20]
test_list

10    https://scrapsfromtheloft.com/2018/12/19/pete-...
11    https://scrapsfromtheloft.com/2018/12/15/vir-d...
12    https://scrapsfromtheloft.com/2018/11/27/volke...
13    https://scrapsfromtheloft.com/2018/11/21/trevo...
14    https://scrapsfromtheloft.com/2018/10/28/jeff-...
15    https://scrapsfromtheloft.com/2018/10/26/adam-...
16    https://scrapsfromtheloft.com/2018/10/23/jeff-...
17    https://scrapsfromtheloft.com/2018/10/19/ron-w...
18    https://scrapsfromtheloft.com/2018/10/13/comed...
19    https://scrapsfromtheloft.com/2018/10/11/mo-am...
Name: link, dtype: object

In [14]:
%%time
# Parallel lists: transcripts[i] is the text scraped from links[i].
transcripts, links = [], []

rqst = 0
for rqst, link_page in enumerate(test_list, start=1):
    links.append(link_page)
    response = get_response(link_page)
    print('request:{}/{}'.format(rqst, len(test_list)))
    clear_output(wait = True)
    # A falsy response (e.g. None after a failed request) is recorded with
    # the 'error' sentinel so both lists stay the same length.
    if not response:
        transcripts.append('error')
        continue
    # Pull the transcript text out of the fetched page.
    transcript = bs_parser(response)
    transcripts.append(transcript)

CPU times: user 656 ms, sys: 141 ms, total: 797 ms
Wall time: 56 s


In [15]:
# Build a DataFrame pairing each scraped transcript with the link it came from.

test_df = pd.DataFrame({'transcript': transcripts,'link': links})
test_df

Unnamed: 0,transcript,link
0,(crowd murmurs) Announcer: Ladies and gentleme...,https://scrapsfromtheloft.com/2018/12/19/pete-...
1,I lost 80% of my mind. It’s very freeing. You ...,https://scrapsfromtheloft.com/2018/12/15/vir-d...
2,Last part of Volker Pispers’ program “Bis neul...,https://scrapsfromtheloft.com/2018/11/27/volke...
3,A NETFLIX ORIGINAL COMEDY SPECIAL [distant tra...,https://scrapsfromtheloft.com/2018/11/21/trevo...
4,When you start doing what Jeff and I do this i...,https://scrapsfromtheloft.com/2018/10/28/jeff-...
5,"[man] Okay, ready, and… Take your own cue, Ada...",https://scrapsfromtheloft.com/2018/10/26/adam-...
6,"Ladies and gentlemen, please welcome Jeff Foxw...",https://scrapsfromtheloft.com/2018/10/23/jeff-...
7,[Ron White] You ever take a crap so big your p...,https://scrapsfromtheloft.com/2018/10/19/ron-w...
8,Yes. I agree. I totally agree. Totally. It’s g...,https://scrapsfromtheloft.com/2018/10/13/comed...
9,A NETFLIX ORIGINAL COMEDY SPECIAL\n[audience c...,https://scrapsfromtheloft.com/2018/10/11/mo-am...


## scrape all links

In [16]:
# The complete link column of the loaded table, used for the full scraping run.
link_list = df.link

In [17]:
%%time
# Parallel lists: transcripts[i] is the text scraped from links[i].
transcripts, links = [], []

rqst = 0
for rqst, link_page in enumerate(link_list, start=1):
    links.append(link_page)
    response = get_response(link_page)
    print('request:{}/{}'.format(rqst, len(link_list)))
    clear_output(wait = True)
    # A falsy response (e.g. None after a failed request) is recorded with
    # the 'error' sentinel so both lists stay the same length.
    if not response:
        transcripts.append('error')
        continue
    # Pull the transcript text out of the fetched page.
    transcript = bs_parser(response)
    transcripts.append(transcript)

CPU times: user 13.6 s, sys: 2.11 s, total: 15.7 s
Wall time: 19min 55s


In [18]:
# Build a DataFrame pairing each scraped transcript with the link it came from.

data_df = pd.DataFrame({'transcript': transcripts,'link': links})
data_df

Unnamed: 0,transcript,link
0,"A NETFLIX ORIGINAL COMEDY SPECIAL\nHey, what’s...",https://scrapsfromtheloft.com/2019/02/19/eniss...
1,"I know I’m a little late with this, but I’d li...",https://scrapsfromtheloft.com/2019/02/18/polit...
2,The Hangover… Community… Dr. Ken… Crazy Rich A...,https://scrapsfromtheloft.com/2019/02/17/ken-j...
3,It’s been 23 years since I did a comedy specia...,https://scrapsfromtheloft.com/2019/02/12/ray-r...
4,[crowd chanting] Fluffy! Fluffy! [Fluffy shout...,https://scrapsfromtheloft.com/2019/01/30/gabri...
5,[instrumental music plays] ♪ All right ♪ [scre...,https://scrapsfromtheloft.com/2019/01/29/sebas...
6,There are people out there trying to rewrite o...,https://scrapsfromtheloft.com/2019/01/27/latin...
7,"Good, good. It feels good to be home in Chicag...",https://scrapsfromtheloft.com/2019/01/24/sebas...
8,"Please welcome Sebastian Maniscalco.\nSo good,...",https://scrapsfromtheloft.com/2019/01/24/sebas...
9,A NETFLIX ORIGINAL COMEDY SPECIAL\n[crowd chee...,https://scrapsfromtheloft.com/2018/12/22/ellen...


# check for errors

In [19]:
# Boolean mask of rows whose transcript is the 'error' sentinel, i.e. links
# for which the scraping loop got a falsy response.
error_filter = data_df.transcript == 'error'

In [20]:
# Show the rows flagged by the mask; an empty table means no link failed.
data_df[error_filter]

Unnamed: 0,transcript,link


# save

In [21]:
# Write the scraped transcripts to the raw-data directory. os.path.join gives a
# correct path whether or not data_directory_saves ends with a separator.
data_df.to_csv(os.path.join(data_directory_saves, 'data_transcripts.csv'))