In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from IPython.core.display import clear_output
from warnings import warn
import os

import requests
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

## Request functions

In [4]:
def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """Return a requests session whose HTTP and HTTPS adapters retry requests.

    Parameters
    ----------
    retries : int
        Used as the ``total``, ``read`` and ``connect`` budget of the
        ``Retry`` policy.
    backoff_factor : float
        Forwarded to ``Retry`` as ``backoff_factor``.
    status_forcelist : iterable of int
        Forwarded to ``Retry`` as ``status_forcelist``.
    session : requests.Session, optional
        Session to configure. A new ``requests.Session`` is created when
        this argument is falsy.

    Returns
    -------
    requests.Session
        The configured session (the one passed in, when one was given).
    """
    http = session if session else requests.Session()
    policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=policy)
    # The same adapter instance serves both URL schemes.
    for scheme in ('http://', 'https://'):
        http.mount(scheme, retrying_adapter)
    return http

def get_response(url):
    """Fetch ``url`` through a retrying session and print how long it took.

    Parameters
    ----------
    url : str
        Address to request with a 10 second timeout.

    Returns
    -------
    requests.Response or None
        The response when the request completed, ``None`` when it raised.
        Callers are expected to check the result before parsing it.
    """
    t0 = time.perf_counter()
    try:
        # Use the session as a context manager so its connection pool is
        # released after the call instead of leaking one pool per request.
        with requests_retry_session() as session:
            response = session.get(url, timeout=10)
    except Exception as x:
        # Best effort by design: report the failure and hand back None so a
        # long scraping loop keeps going.
        print('It failed :(', x.__class__.__name__)
        return None
    else:
        # Returning from here (not from the try body) keeps this branch
        # reachable, so the success message is actually printed.
        print('It eventually worked', response.status_code)
        return response
    finally:
        print('Took', time.perf_counter() - t0, 'seconds')
        
        
def bs_parser(response):
    """Collect the paragraph text found inside the page's ``post-content`` div.

    Parameters
    ----------
    response : object
        Anything exposing the page HTML through a ``text`` attribute.

    Returns
    -------
    str
        The text of every ``<p>`` inside the first ``div`` with class
        ``post-content``, each followed by a newline. An empty string is
        returned when no such div is found.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='post-content')
    if not content_div:
        return ''
    return ''.join(
        '{}\n'.format(paragraph.text)
        for paragraph in content_div.find_all('p')
    )

## Load dataframe

In [5]:
file = 'raw_comedy_02.pkl'
# Build the paths from separate components so no '/' is hard-coded. The
# trailing '' makes the directory end with a path separator, so appending a
# file name to it with `+` still yields a valid path.
data_directory_saves = os.path.join('..', 'data', 'raw_data', '')
# Despite its name, this is the path of the input pickle file.
data_directory = os.path.join(data_directory_saves, file)
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
df = pd.read_pickle(data_directory)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320 entries, 0 to 319
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       320 non-null    object
 1   raw_title  320 non-null    object
 2   summary    320 non-null    object
 3   link       320 non-null    object
dtypes: object(4)
memory usage: 10.1+ KB


In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,name,raw_title,summary,link
0,Louis C.K.,SINCERELY LOUIS CK (2020),Great comedy is finally back. Louis C.K. is no...,https://scrapsfromtheloft.com/2020/05/02/since...
1,Jim Norton,JIM NORTON: AMERICAN DEGENERATE (2013) – FULL ...,"For his second EPIX comedy special, Jim Norton...",https://scrapsfromtheloft.com/2020/05/02/jim-n...
2,Jim Norton,JIM NORTON: MONSTER RAIN (2007) – FULL TRANSCRIPT,"His special ""Monster Rain,"" Jim Norton perform...",https://scrapsfromtheloft.com/2020/05/02/jim-n...
3,Daniel Sloss,DANIEL SLOSS: X (2019) – FULL TRANSCRIPT,Taking the stage before a sold-out audience at...,https://scrapsfromtheloft.com/2020/05/01/danie...
4,Maria Bamford,MARIA BAMFORD: WEAKNESS IS THE BRAND (2020) – ...,"In this standup special ""Weakness Is the Brand...",https://scrapsfromtheloft.com/2020/05/01/maria...


## Test on a small sample of links

In [7]:
# Rows at positions 10-19 of the link column serve as a small trial batch.
test_list = df['link'].iloc[10:20]
test_list

10    https://scrapsfromtheloft.com/2020/03/21/bert-...
11    https://scrapsfromtheloft.com/2020/03/21/bert-...
12    https://scrapsfromtheloft.com/2020/03/12/marc-...
13    https://scrapsfromtheloft.com/2020/03/04/pete-...
14    https://scrapsfromtheloft.com/2020/03/01/pete-...
15    https://scrapsfromtheloft.com/2020/02/27/amand...
16    https://scrapsfromtheloft.com/2020/02/12/stewa...
17    https://scrapsfromtheloft.com/2020/02/10/stewa...
18    https://scrapsfromtheloft.com/2020/02/02/sara-...
19    https://scrapsfromtheloft.com/2020/01/23/dan-s...
Name: link, dtype: object

In [8]:
%%time
transcripts = []
links = []

rqst = 0
for link_page in test_list:
    links.append(link_page)
    response = get_response(link_page)
    rqst+=1
    print('request:{}/{}'.format(rqst,len(test_list)))
    clear_output(wait = True)
    # Parse the content of the request with BeautifulSoup
    if response:
        transcript = bs_parser(response)
        transcripts.append(transcript)
    else:
        transcripts.append('error')

CPU times: user 516 ms, sys: 93.8 ms, total: 609 ms
Wall time: 1min 34s


In [9]:
# Pair every trial transcript with the link it was scraped from.
test_df = pd.DataFrame(data=dict(transcript=transcripts, link=links))
test_df

Unnamed: 0,transcript,link
0,[electronic music playing]\n[male announcer] L...,https://scrapsfromtheloft.com/2020/03/21/bert-...
1,"The machine, Mr. Bert Kreischer, everybody. Le...",https://scrapsfromtheloft.com/2020/03/21/bert-...
2,[audience chattering indistinctly]\n[man] Ladi...,https://scrapsfromtheloft.com/2020/03/12/marc-...
3,"[vocal music]\n♪ Uh-uh, yeah ♪ Uh, uh-uh-uh, y...",https://scrapsfromtheloft.com/2020/03/04/pete-...
4,"So, Louis C.K. tried to get me fired from SNL ...",https://scrapsfromtheloft.com/2020/03/01/pete-...
5,"Now, y’all keep asking me, “Amanda, who is thi...",https://scrapsfromtheloft.com/2020/02/27/amand...
6,(’70s GERMAN ROCK MUSIC PLAYING)\nANNOUNCER: L...,https://scrapsfromtheloft.com/2020/02/12/stewa...
7,This programme contains very strong language a...,https://scrapsfromtheloft.com/2020/02/10/stewa...
8,CHEERING AND APPLAUSE\nThank you so much! Than...,https://scrapsfromtheloft.com/2020/02/02/sara-...
9,"Announcer: Ladies and gentlemen, Dan Soder!\n(...",https://scrapsfromtheloft.com/2020/01/23/dan-s...


## Create a list of links

In [10]:
# Every link in the frame, and how many there are to fetch.
link_list = df['link']
len(link_list)

320

## Loop over the list to get the transcripts
Expect this cell to take about 1 hour (the recorded run below took roughly 55 minutes).

In [11]:
%%time
transcripts = []
links = []

rqst = 0
for link_page in link_list:
    links.append(link_page)
    response = get_response(link_page)
    rqst+=1
    print('request:{}/{}'.format(rqst,len(link_list)))
    clear_output(wait = True)
    # Parse the content of the request with BeautifulSoup
    if response:
        transcript = bs_parser(response)
        transcripts.append(transcript)
    else:
        transcripts.append('error')

CPU times: user 17.2 s, sys: 1.84 s, total: 19 s
Wall time: 55min 24s


## Create DataFrame

In [12]:
# Create the dataframe pairing each scraped transcript with its link, then
# summarise it. (A bare `data_df` expression in the middle of a cell displays
# nothing, so it is omitted.)
data_df = pd.DataFrame({'transcript': transcripts, 'link': links})
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320 entries, 0 to 319
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   transcript  320 non-null    object
 1   link        320 non-null    object
dtypes: object(2)
memory usage: 5.1+ KB


In [13]:
# Preview the first scraped transcripts next to their links.
data_df.head()

Unnamed: 0,transcript,link
0,Great comedy is finally back. Louis C.K. is no...,https://scrapsfromtheloft.com/2020/05/02/since...
1,This year I wanted the opening of my new speci...,https://scrapsfromtheloft.com/2020/05/02/jim-n...
2,"Thank you! Aw, thank you very much. Thank you,...",https://scrapsfromtheloft.com/2020/05/02/jim-n...
3,A man offered us a billion dollars. – Bolt the...,https://scrapsfromtheloft.com/2020/05/01/danie...
4,[microphone droans]\n[music playing]\n[crowd c...,https://scrapsfromtheloft.com/2020/05/01/maria...


## Check for errors

In [14]:
# Flag the rows that carry the 'error' placeholder and display them.
error_filter = data_df['transcript'].eq('error')
data_df.loc[error_filter]

Unnamed: 0,transcript,link
35,error,https://scrapsfromtheloft.com/2019/09/21/mark-...
165,error,https://scrapsfromtheloft.com/2018/01/14/chris...
167,error,https://scrapsfromtheloft.com/2018/01/14/kevin...
198,error,https://scrapsfromtheloft.com/2017/12/07/aziz-...
281,error,https://scrapsfromtheloft.com/2017/06/12/jim-g...
283,error,https://scrapsfromtheloft.com/2017/06/01/louis...


In [16]:
# Look up the link stored at row position 33.
value = data_df['link'].iloc[33]
value

'https://scrapsfromtheloft.com/2019/10/04/nikki-glaser-bangin-transcript/'

## Drop from df

In [None]:
# Disabled on purpose: uncomment to drop the rows whose transcript is the 'error' placeholder.
#data_df = data_df[~error_filter]

In [None]:
# Disabled check: uncomment to display the rows flagged by error_filter.
#data_df[error_filter]

## Save data

In [20]:
# Write the scraped transcripts into the raw-data directory.
# NOTE: rows holding the 'error' placeholder are saved as well while the drop
# cell stays commented out.
data_df.to_pickle(
    os.path.join(data_directory_saves, 'raw_data_transcript_01.pkl')
)

In [22]:
# Show the first rows of the frame that was just saved.
data_df.head()

Unnamed: 0,transcript,link
0,Great comedy is finally back. Louis C.K. is no...,https://scrapsfromtheloft.com/2020/05/02/since...
1,This year I wanted the opening of my new speci...,https://scrapsfromtheloft.com/2020/05/02/jim-n...
2,"Thank you! Aw, thank you very much. Thank you,...",https://scrapsfromtheloft.com/2020/05/02/jim-n...
3,A man offered us a billion dollars. – Bolt the...,https://scrapsfromtheloft.com/2020/05/01/danie...
4,[microphone droans]\n[music playing]\n[crowd c...,https://scrapsfromtheloft.com/2020/05/01/maria...
