# Load libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from IPython.core.display import clear_output
from warnings import warn
import os

In [2]:
import requests
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

## Creat functions
- source :https://www.peterbe.com/plog/best-practice-with-retries-with-requests


In [3]:
def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

In [4]:
def get_response(url):
    t0 = time.time()
    try:
        response = requests_retry_session().get(
            url,
            timeout=10
        )
        return response
    except Exception as x:
        print('It failed :(', x.__class__.__name__)
    else:
        print('It eventually worked', response.status_code)
    finally:
        t1 = time.time()
        print('Took', t1 - t0, 'seconds')

In [5]:
def bs_parser_post(response):
    '''extract title, link, tags, & summary
    from request data'''
    titles = []
    summaries = []
    links = []
    tags = []
    page_html = BeautifulSoup(response.text, 'html.parser')
    containers = page_html.find_all('div', class_ = 'fusion-post-content post-content')
    for i in containers:
        titles.append(i.h2.text)
        links.append(i.a['href'])
        tags.append(i.select('span')[6].get_text(strip=True)[5:])
        summaries.append(i.find('div', class_ = 'fusion-post-content-container').p.text)
    return titles,summaries,tags,links

## Create dataframe from scrapped data

In [6]:
url = 'https://scrapsfromtheloft.com/tag/stand-up-transcripts/'

In [7]:
%%time
response = get_response(url)
titles,summaries,tags,links = bs_parser_post(response)

df = pd.DataFrame({
                   'raw_title': titles,
                   'summary': summaries,
                   'tags':tags,
                   'link': links})

Took 1.754042625427246 seconds
CPU times: user 578 ms, sys: 46.9 ms, total: 625 ms
Wall time: 2.31 s


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284 entries, 0 to 283
Data columns (total 4 columns):
raw_title    284 non-null object
summary      284 non-null object
tags         284 non-null object
link         284 non-null object
dtypes: object(4)
memory usage: 9.0+ KB


In [9]:
df.head()

Unnamed: 0,raw_title,summary,tags,link
0,Nikki Glaser: Bangin’ (2019) – Full Transcript,A NETFLIX ORIGINAL COMEDY SPECIAL Thank you! T...,"Nikki Glaser,Stand-up transcripts",https://scrapsfromtheloft.com/2019/10/04/nikki...
1,Ryan Hamilton Stand-Up – The Tonight Show Star...,Ryan Hamilton makes his Tonight Show debut wit...,"Ryan Hamilton,Stand-up transcripts",https://scrapsfromtheloft.com/2019/09/25/ryan-...
2,Mark Normand Stand-Up – The Tonight Show Starr...,Mark Normand is back on The Tonight Show with ...,"Mark Normand,Stand-up transcripts,The Tonight ...",https://scrapsfromtheloft.com/2019/09/21/mark-...
3,George Carlin: Dumb Americans (2006) – Full Tr...,"Full transcript of George Carlin's ""Dumb Ameri...","George Carlin,Stand-up transcripts",https://scrapsfromtheloft.com/2019/09/12/georg...
4,Bill Burr: Paper Tiger (2019) – Full Transcript,"Only a few weeks after Dave Chappelle's ""Stick...","Bill Burr,Stand-up transcripts",https://scrapsfromtheloft.com/2019/09/10/bill-...


In [11]:
df.tags.value_counts()[:10]

George Carlin,Stand-up transcripts      20
Dave Chappelle,Stand-up transcripts     11
Louis C.K.,Stand-up transcripts         10
Ricky Gervais,Stand-up transcripts       7
Bill Burr,Stand-up transcripts           7
Jim Jefferies,Stand-up transcripts       7
Kevin Hart,Stand-up transcripts          5
Stand-up transcripts,Trevor Noah         5
Chris Rock,Stand-up transcripts          4
SARAH SILVERMAN,Stand-up transcripts     4
Name: tags, dtype: int64

## Minor cleanup
## Clean tags column

In [12]:
df.tags = df.tags.str.replace('Stand-up transcripts','')
df.tags = df.tags.str.replace('Gun control','')
df.tags = df.tags.str.replace('SATURDAY NIGHT LIVE','')
df.tags = df.tags.str.replace('Abortion','')
df.tags = df.tags.str.replace('Religion','')
df.tags = df.tags.str.replace(',','')

# Firstname Lastname
df.tags = df.tags.str.title()

In [13]:
# rename column
df.rename(columns={"tags": "name"},inplace=True)

In [14]:
# order columns
df = df[['name','raw_title','summary','link']]

In [15]:
df.head()

Unnamed: 0,name,raw_title,summary,link
0,Nikki Glaser,Nikki Glaser: Bangin’ (2019) – Full Transcript,A NETFLIX ORIGINAL COMEDY SPECIAL Thank you! T...,https://scrapsfromtheloft.com/2019/10/04/nikki...
1,Ryan Hamilton,Ryan Hamilton Stand-Up – The Tonight Show Star...,Ryan Hamilton makes his Tonight Show debut wit...,https://scrapsfromtheloft.com/2019/09/25/ryan-...
2,Mark Normandthe Tonight Show Starring Jimmy Fa...,Mark Normand Stand-Up – The Tonight Show Starr...,Mark Normand is back on The Tonight Show with ...,https://scrapsfromtheloft.com/2019/09/21/mark-...
3,George Carlin,George Carlin: Dumb Americans (2006) – Full Tr...,"Full transcript of George Carlin's ""Dumb Ameri...",https://scrapsfromtheloft.com/2019/09/12/georg...
4,Bill Burr,Bill Burr: Paper Tiger (2019) – Full Transcript,"Only a few weeks after Dave Chappelle's ""Stick...",https://scrapsfromtheloft.com/2019/09/10/bill-...


## Save

In [16]:
data_directory_saves = os.path.join( '..','data','raw_data/')

In [18]:
df.to_csv(data_directory_saves+'comedy_link_list_OCT_19.csv',index=False)