## Load libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from IPython.core.display import clear_output
from warnings import warn
import os
#from tqdm import tqdm
from tqdm.notebook import trange, tqdm


import requests
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

## Create functions

In [2]:
def requests_retry_session(
    retries=6,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """Return a requests session that retries failed HTTP and HTTPS requests.

    Parameters
    ----------
    retries : int
        Used as the total, read and connect retry budget of the ``Retry`` policy.
    backoff_factor : float
        Forwarded to ``Retry`` as its ``backoff_factor``.
    status_forcelist : iterable of int
        Forwarded to ``Retry`` as its ``status_forcelist``.
    session : optional
        Existing session to configure; a new ``requests.Session`` is created
        when this is omitted (or falsy).

    Returns
    -------
    The session, with one shared retrying adapter mounted for both schemes.
    """
    if not session:
        session = requests.Session()

    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    # The same adapter instance serves both URL schemes.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)
    return session

In [3]:
def get_response(url):
    """GET ``url`` through a retrying session and print how long it took.

    A session is obtained from ``requests_retry_session()`` for this call and
    closed again once the request has finished.

    Parameters
    ----------
    url
        Address handed to the session's ``get`` (with ``timeout=10``).

    Returns
    -------
    The response object on success, or ``None`` when the request raised.
    Failures are printed (exception class name only) instead of re-raised,
    so callers must be prepared to receive ``None``.
    """
    t0 = time.time()
    response = None
    try:
        # Use the session as a context manager so it is closed after the
        # request instead of leaking one open session per call.
        with requests_retry_session() as session:
            response = session.get(
                url,
                timeout=10
            )
    except Exception as x:
        # Deliberately best-effort: report and fall through to return None
        # rather than aborting the caller.
        print('It failed :(', x.__class__.__name__)
    else:
        # Reachable now that the return happens after the try statement;
        # returning from inside `try` used to skip this branch entirely.
        print('It eventually worked', response.status_code)
    finally:
        t1 = time.time()
        print('Took', t1 - t0, 'seconds')
    return response

In [4]:
def bs_parser_post(response):
    '''Extract title, summary, tags and link for every post on the fetched pages.

    Parameters
    ----------
    response : iterable
        Fetched page responses; each usable entry must expose the page HTML
        as ``.text``. Entries that are ``None`` (what ``get_response`` returns
        for a failed request) are skipped with a warning instead of raising
        ``AttributeError``.

    Returns
    -------
    tuple of four lists
        ``(titles, summaries, tags, links)``; the lists are index-aligned,
        one element per post container found.
    '''
    titles = []
    summaries = []
    links = []
    tags = []
    for position, page in enumerate(tqdm(response)):
        if page is None:
            warn('bs_parser_post: skipping entry {} (no response)'.format(position))
            continue
        page_html = BeautifulSoup(page.text, 'html.parser')
        containers = page_html.find_all('div', class_='fusion-post-content post-content')
        for post in containers:
            # Read all four fields before appending anything, so the lists
            # stay index-aligned even if one of the lookups raises.
            title = post.h2.text
            link = post.a['href']
            # Tag text comes from the 7th <span>; [5:] drops its first five
            # characters - presumably a "Tags:" label, TODO confirm against
            # the page markup.
            tag = post.select('span')[6].get_text(strip=True)[5:]
            summary = post.find('div', class_='fusion-post-content-container').p.text
            titles.append(title)
            links.append(link)
            tags.append(tag)
            summaries.append(summary)
    return titles, summaries, tags, links

## Create list of responses

In [5]:
# Request listing pages 1 through 32 and keep the responses in page order.
listing_url = 'https://scrapsfromtheloft.com/tag/stand-up-transcripts/page/{}'
rlist = [get_response(listing_url.format(page)) for page in tqdm(range(1, 33))]

HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Took 9.939382791519165 seconds
Took 7.959158420562744 seconds
Took 7.605671167373657 seconds
Took 8.832727909088135 seconds
Took 8.18500018119812 seconds
Took 18.871034145355225 seconds
Took 18.044118881225586 seconds
Took 10.832559823989868 seconds
Took 39.50902509689331 seconds
Took 8.183029413223267 seconds
Took 8.40703535079956 seconds
Took 7.720157861709595 seconds
Took 7.674432277679443 seconds
Took 7.0679919719696045 seconds
Took 8.047590970993042 seconds
Took 13.96164345741272 seconds
Took 7.424077987670898 seconds
Took 8.039018630981445 seconds
Took 9.720714569091797 seconds
Took 7.673600196838379 seconds
Took 7.478991746902466 seconds
Took 7.253503322601318 seconds
Took 17.983152151107788 seconds
Took 6.83796763420105 seconds
Took 8.128286123275757 seconds
Took 6.817831039428711 seconds
Took 7.973235607147217 seconds
Took 8.67385196685791 seconds
Took 7.248182535171509 seconds
Took 7.725471258163452 seconds
Took 7.009956121444702 seconds
Took 7.387735366821289 seconds



In [6]:
# Sanity check: the fetch loop appends once per page, so 32 entries are expected.
len(rlist)

32

## Create Dataframe with data

In [7]:
# Parse every fetched page, then assemble one row per post.
titles, summaries, tags, links = bs_parser_post(rlist)

df = pd.DataFrame(
    dict(raw_title=titles, summary=summaries, tags=tags, link=links)
)
# Report column dtypes and non-null counts.
df.info()

HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320 entries, 0 to 319
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   raw_title  320 non-null    object
 1   summary    320 non-null    object
 2   tags       320 non-null    object
 3   link       320 non-null    object
dtypes: object(4)
memory usage: 10.1+ KB


In [8]:
# Preview the first rows to eyeball the parsed fields.
df.head()

Unnamed: 0,raw_title,summary,tags,link
0,SINCERELY LOUIS CK (2020),Great comedy is finally back. Louis C.K. is no...,"Louis C.K.,Stand-up transcripts",https://scrapsfromtheloft.com/2020/05/02/since...
1,JIM NORTON: AMERICAN DEGENERATE (2013) – FULL ...,"For his second EPIX comedy special, Jim Norton...","Jim Norton,Stand-up transcripts",https://scrapsfromtheloft.com/2020/05/02/jim-n...
2,JIM NORTON: MONSTER RAIN (2007) – FULL TRANSCRIPT,"His special ""Monster Rain,"" Jim Norton perform...","Jim Norton,Stand-up transcripts",https://scrapsfromtheloft.com/2020/05/02/jim-n...
3,DANIEL SLOSS: X (2019) – FULL TRANSCRIPT,Taking the stage before a sold-out audience at...,"Daniel Sloss,Stand-up transcripts",https://scrapsfromtheloft.com/2020/05/01/danie...
4,MARIA BAMFORD: WEAKNESS IS THE BRAND (2020) – ...,"In this standup special ""Weakness Is the Brand...","Maria Bamford,Stand-up transcripts",https://scrapsfromtheloft.com/2020/05/01/maria...


## Save dataframe

In [9]:
# Directory (relative to the notebook's working directory) for raw outputs;
# the trailing slash lets later code append a file name directly.
data_directory_saves = os.path.join(os.pardir, 'data', 'raw_data/')

In [11]:
# Create the output directory first: writing the pickle fails if it does not
# exist, e.g. on a fresh checkout.
os.makedirs(data_directory_saves, exist_ok=True)
# Persist the scraped table for later notebooks.
df.to_pickle(os.path.join(data_directory_saves, 'raw_comedy.pkl'))