In [26]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from IPython.core.display import clear_output
from warnings import warn
import os

In [3]:
import requests
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

## Create functions

In [4]:
def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """Return a requests session whose HTTP and HTTPS adapters retry on failure.

    Parameters
    ----------
    retries : int
        Used as the total, read and connect retry budget of the Retry policy.
    backoff_factor : float
        Passed through to Retry as its ``backoff_factor``.
    status_forcelist : tuple of int
        Passed through to Retry as its ``status_forcelist``.
    session : requests.Session, optional
        Existing session to configure; a new one is created when omitted
        (or when a falsy value is given).

    Returns
    -------
    requests.Session
        The same session object, with the retrying adapter mounted for both
        the ``http://`` and ``https://`` prefixes.
    """
    if not session:
        session = requests.Session()

    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    # One adapter instance is shared by both URL schemes.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)
    return session

In [5]:
def get_response(url):
    """GET ``url`` through a retrying session and report how it went.

    Parameters
    ----------
    url : str
        Address to fetch. The request uses a 10 second timeout.

    Returns
    -------
    requests.Response or None
        The response when the request completed, ``None`` when any exception
        was raised while fetching (the exception class name is printed and the
        error is not re-raised). Callers must check for ``None`` before using
        the result.

    Notes
    -----
    The elapsed wall-clock time is always printed, on success and on failure.
    """
    t0 = time.time()
    try:
        response = requests_retry_session().get(
            url,
            timeout=10
        )
    except Exception as x:
        # Best-effort fetch: report the failure and hand back None explicitly.
        print('It failed :(', x.__class__.__name__)
        return None
    else:
        # Returning from here (not from inside `try`) keeps this branch
        # reachable, so the status code is actually reported.
        print('It eventually worked', response.status_code)
        return response
    finally:
        t1 = time.time()
        print('Took', t1 - t0, 'seconds')

In [11]:
def bs_parser_post(response):
    """Pull title, summary, tag text and link out of every post on a listing page.

    Parameters
    ----------
    response : object with a ``text`` attribute
        The HTML of the listing page is read from ``response.text``.

    Returns
    -------
    tuple of four lists
        ``(titles, summaries, tags, links)``; the lists are parallel, one
        entry per ``div`` with class ``fusion-post-content post-content``,
        in document order.
    """
    titles, summaries, links, tags = [], [], [], []

    soup = BeautifulSoup(response.text, 'html.parser')
    post_blocks = soup.find_all('div', class_ = 'fusion-post-content post-content')

    for post in post_blocks:
        titles.append(post.h2.text)
        links.append(post.a['href'])

        # The tag text is taken from the seventh <span> of the post, with its
        # first five characters dropped (presumably a "Tags:" label -- confirm
        # against the page markup).
        tag_span = post.select('span')[6]
        tags.append(tag_span.get_text(strip=True)[5:])

        summary_box = post.find('div', class_ = 'fusion-post-content-container')
        summaries.append(summary_box.p.text)

    return titles, summaries, tags, links

In [12]:
def bs_parser(response):
    """Return the paragraph text of a post page as one newline-separated string.

    Parameters
    ----------
    response : object with a ``text`` attribute
        The HTML of the page is read from ``response.text``.

    Returns
    -------
    str
        The text of every ``<p>`` inside the first ``div`` with class
        ``post-content``, each followed by a newline. An empty string when
        no such ``div`` is found.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_ = 'post-content')

    # Nothing to extract when the page has no post body.
    if not content_div:
        return ''

    return ''.join('{}\n'.format(par.text) for par in content_div.find_all('p'))

In [13]:
# Listing page for the "stand-up-transcripts" tag; this is the page that gets scraped below.
url = 'https://scrapsfromtheloft.com/tag/stand-up-transcripts/'

In [14]:
%%time
# Download the listing page and extract the per-post fields from its HTML.
# NOTE: get_response returns None when the request fails, in which case
# bs_parser_post raises on `response.text`.
response = get_response(url)
titles,summaries,tags,links = bs_parser_post(response)

# One row per post; the four lists are parallel, so they line up column-wise.
df = pd.DataFrame({
                   'raw_title': titles,
                   'summary': summaries,
                   'tags':tags,
                   'link': links})

Took 1.3802299499511719 seconds
CPU times: user 375 ms, sys: 46.9 ms, total: 422 ms
Wall time: 1.76 s


In [16]:
# Check the row count, column names and non-null counts of the scraped frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 4 columns):
raw_title    244 non-null object
summary      244 non-null object
tags         244 non-null object
link         244 non-null object
dtypes: object(4)
memory usage: 7.7+ KB


In [17]:
# Preview the first rows of the scraped listing.
df.head()

Unnamed: 0,raw_title,summary,tags,link
0,Enissa Amani: Ehrenwort (2018) Full Transcript,"Live from Hamburg, Iranian-German comedian Eni...","Enissa Amani,Stand-up transcripts",https://scrapsfromtheloft.com/2019/02/19/eniss...
1,Politically Correct Language – by George Carlin,George Carlin's critical thinking on pc langua...,"George Carlin,Stand-up transcripts",https://scrapsfromtheloft.com/2019/02/18/polit...
2,"Ken Jeong: You Complete Me, Ho (2019) – Full T...",Filmed at the Ice House Comedy Club in Pasaden...,"Ken Jeong,Stand-up transcripts",https://scrapsfromtheloft.com/2019/02/17/ken-j...
3,"Ray Romano: Right Here, Around the Corner (201...",Ray Romano cut his stand-up teeth at the Comed...,"Ray Romano,Stand-up transcripts",https://scrapsfromtheloft.com/2019/02/12/ray-r...
4,Gabriel “Fluffy” Iglesias: One Show Fits All (...,"In a new special from Houston, Gabriel ""Fluffy...","Gabriel Iglesias,Stand-up transcripts",https://scrapsfromtheloft.com/2019/01/30/gabri...


## Cleanup tags

In [20]:
# Strip the generic/topic tags and the separating commas so that only the
# remaining tag text (the comedian's name in the rows previewed above) is left.
# regex=False forces literal matching, so the result does not depend on the
# pandas version's default for `regex`.
cleaned_tags = df['tags']
for unwanted in (
    'Stand-up transcripts',
    'Gun control',
    'SATURDAY NIGHT LIVE',
    'Abortion',
    'Religion',
    ',',
):
    cleaned_tags = cleaned_tags.str.replace(unwanted, '', regex=False)

# Normalise capitalisation to Title Case.
df['tags'] = cleaned_tags.str.title()

In [22]:
# Rename the cleaned tag column to "name". Reassigning the result (instead of
# inplace=True) leaves the previously displayed frame object untouched.
df = df.rename(columns={"tags": "name"})

In [24]:
# order columns: name first, then title, summary and link
df = df[['name','raw_title','summary','link']]

In [28]:
# Preview the first rows after renaming and reordering the columns.
df.head()

Unnamed: 0,name,raw_title,summary,link
0,Enissa Amani,Enissa Amani: Ehrenwort (2018) Full Transcript,"Live from Hamburg, Iranian-German comedian Eni...",https://scrapsfromtheloft.com/2019/02/19/eniss...
1,George Carlin,Politically Correct Language – by George Carlin,George Carlin's critical thinking on pc langua...,https://scrapsfromtheloft.com/2019/02/18/polit...
2,Ken Jeong,"Ken Jeong: You Complete Me, Ho (2019) – Full T...",Filmed at the Ice House Comedy Club in Pasaden...,https://scrapsfromtheloft.com/2019/02/17/ken-j...
3,Ray Romano,"Ray Romano: Right Here, Around the Corner (201...",Ray Romano cut his stand-up teeth at the Comed...,https://scrapsfromtheloft.com/2019/02/12/ray-r...
4,Gabriel Iglesias,Gabriel “Fluffy” Iglesias: One Show Fits All (...,"In a new special from Houston, Gabriel ""Fluffy...",https://scrapsfromtheloft.com/2019/01/30/gabri...


## Save to the data directory

In [30]:
# Output directory for raw data, relative to the notebook's working directory.
# The empty last component makes os.path.join end the path with the platform's
# own separator (instead of a hard-coded '/'), so the value can still be
# prefixed onto a file name by plain string concatenation.
data_directory_saves = os.path.join('..', 'data', 'raw_data', '')

In [32]:
# Create the target directory if it is missing so the export also works on a
# fresh checkout; exist_ok=True makes this a no-op when it is already there.
os.makedirs(data_directory_saves, exist_ok=True)

# Write the link list without the integer index column.
df.to_csv(os.path.join(data_directory_saves, 'comedy_link_list.csv'), index=False)