## Loading the data from Web pages for NLP Applications

In [3]:
import requests
from bs4 import BeautifulSoup
import pickle

In [3]:
def url_to_transcript(url):
# read transcript data from scrapsfromtheloft.com
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    text = [p.text for p in soup.find(class_="post-content").find_all('p')]
    print(url)
    return text

In [1]:
# URLs 
urls = ['http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/']

# Comedian names
comedians = ['louis', 'dave', 'ricky']

In [5]:
transcripts = [url_to_transcript(u) for u in urls]

http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/
http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/
http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/


In [6]:
# Saving the data as pickle
for i, c in enumerate(comedians):
    with open(c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file)

In [4]:
# Loading the conetent for processing
data = {}
for i, c in enumerate(comedians):
    with open(c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [5]:
# Merge all the samples to one large file
def combine_text(list_of_text):
    combined_text = ' '.join(list_of_text)
    return combined_text

In [6]:
merged_data = {key: [combine_text(value)] for (key, value) in data.items()}

In [7]:
#stroing is as a data frame
import pandas as pd
pd.set_option('max_colwidth',200)

text_df = pd.DataFrame.from_dict(merged_data).transpose()
text_df.columns = ['transcript']
text_df = text_df.sort_index()
text_df

Unnamed: 0,transcript
dave,"This is Dave. He tells dirty jokes for a living. That stare is where most of his hard work happens. It signifies a profound train of thought, the alchemist’s fire that transforms fear and tragedy ..."
louis,"Intro\nFade the music out. Let’s roll. Hold there. Lights. Do the lights. Thank you. Thank you very much. I appreciate that. I don’t necessarily agree with you, but I appreciate very much. Well, t..."
ricky,"Hello. Hello! How you doing? Great. Thank you. Wow. Calm down. Shut the fuck up. Thank you. What a lovely welcome. I’m gonna try my hardest tonight. You’re thinking, “Relax, we’ve had our money’s ..."


In [15]:
# RE text cleaning
import re
import string

def clean_text_round1(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [16]:
data_clean = pd.DataFrame(text_df.transcript.apply(round1))
data_clean

Unnamed: 0,transcript
dave,this is dave he tells dirty jokes for a living that stare is where most of his hard work happens it signifies a profound train of thought the alchemists fire that transforms fear and tragedy into ...
louis,introfade the music out lets roll hold there lights do the lights thank you thank you very much i appreciate that i dont necessarily agree with you but i appreciate very much well this is a nice p...
ricky,hello hello how you doing great thank you wow calm down shut the fuck up thank you what a lovely welcome im gonna try my hardest tonight youre thinking relax weve had our moneys worth just seeing ...


In [21]:
# Creating Term Document matrix
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,aaah,aah,ability,abject,able,abortions,accept,access,accident,accurate,...,youngest,youre,youth,youthis,youtube,youve,yulin,yummy,zero,zoo
dave,1,0,0,0,0,1,0,0,0,0,...,0,15,0,1,0,5,0,0,0,0
louis,0,3,0,0,1,0,1,2,1,1,...,0,50,1,0,1,0,0,0,2,0
ricky,0,0,1,1,2,0,0,0,0,1,...,1,41,0,0,1,10,1,1,0,1
