# Load libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from IPython.core.display import clear_output
from warnings import warn
import os

In [2]:
import requests
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Data Directory

In [3]:
file = 'comedy_link_list.csv'

In [4]:
data_directory = os.path.join('..','data','raw_data/{}'.format(file))
data_directory_saves = os.path.join( '..','data','raw_data/')

### Use same functions

In [5]:
def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

In [6]:
def get_response(url):
    t0 = time.time()
    try:
        response = requests_retry_session().get(
            url,
            timeout=10
        )
        return response
    except Exception as x:
        print('It failed :(', x.__class__.__name__)
    else:
        print('It eventually worked', response.status_code)
    finally:
        t1 = time.time()
        print('Took', t1 - t0, 'seconds')

## extract transcript data 

In [7]:
def bs_parser(response):
    text_str = ''
    page_html = BeautifulSoup(response.text, 'html.parser')
    #title = page_html.title.text
    text_containers = page_html.find('div', class_ = 'post-content')
    if text_containers:
        for h in text_containers.find_all('p'):
            text_str += '{}\n'.format(h.text)
    else:
        return text_str
    return text_str

## load dataframe

In [8]:
df = pd.read_csv(data_directory)

In [9]:
df.head()

Unnamed: 0,name,raw_title,summary,link
0,Enissa Amani,Enissa Amani: Ehrenwort (2018) Full Transcript,"Live from Hamburg, Iranian-German comedian Eni...",https://scrapsfromtheloft.com/2019/02/19/eniss...
1,George Carlin,Politically Correct Language – by George Carlin,George Carlin's critical thinking on pc langua...,https://scrapsfromtheloft.com/2019/02/18/polit...
2,Ken Jeong,"Ken Jeong: You Complete Me, Ho (2019) – Full T...",Filmed at the Ice House Comedy Club in Pasaden...,https://scrapsfromtheloft.com/2019/02/17/ken-j...
3,Ray Romano,"Ray Romano: Right Here, Around the Corner (201...",Ray Romano cut his stand-up teeth at the Comed...,https://scrapsfromtheloft.com/2019/02/12/ray-r...
4,Gabriel Iglesias,Gabriel “Fluffy” Iglesias: One Show Fits All (...,"In a new special from Houston, Gabriel ""Fluffy...",https://scrapsfromtheloft.com/2019/01/30/gabri...


# test

In [None]:
test_list = df.link[10:20]
test_list

In [None]:
%%time
transcripts = []
links = []

rqst = 0
for link_page in test_list:
    links.append(link_page)
    response = get_response(link_page)
    rqst+=1
    print('request:{}/{}'.format(rqst,len(test_list)))
    clear_output(wait = True)
    # Parse the content of the request with BeautifulSoup
    if response:
        transcript = bs_parser(response)
        transcripts.append(transcript)
    else:
        transcripts.append('error')

In [None]:
# create dataframe

test_df = pd.DataFrame({'transcript': transcripts,'link': links})
test_df

## Create a list of links

In [10]:
link_list = df.link

> **future note**
> change list `transcripts` & `links`to a dictionary

In [11]:
%%time
transcripts = []
links = []

rqst = 0
for link_page in link_list:
    links.append(link_page)
    response = get_response(link_page)
    rqst+=1
    print('request:{}/{}'.format(rqst,len(link_list)))
    clear_output(wait = True)
    # Parse the content of the request with BeautifulSoup
    if response:
        transcript = bs_parser(response)
        transcripts.append(transcript)
    else:
        transcripts.append('error')

CPU times: user 13.3 s, sys: 1.91 s, total: 15.2 s
Wall time: 47min 24s


In [None]:
# create dataframe

data_df = pd.DataFrame({'transcript': transcripts,'link': links})
data_df

# check for errors

In [None]:
error_filter = data_df.transcript == 'error'

In [None]:
data_df[error_filter]

# save

In [None]:
data_df.to_csv(data_directory_saves+'data_transcripts.csv')