In [None]:
# Project 5 - Web Scraper + Regular Expression

#1: Get the text into Python.
#2: Clean out the data using python or packages.
#3: Count how often each word is used via pandas.
#4: Export the counts to a CSV file (which can be opened in Excel) and ensure that the data is clean.

In [None]:
# Overall goal: Finding out the words that MLK used the most during his famous "I Have a Dream" speech. Used for hospitals and personal projects

In [None]:
# Step 1: import packages

In [1]:
from bs4 import BeautifulSoup
import requests

In [None]:
# Step 2: Creating the url var, page var (ensure it works), html var

In [5]:
# Page that hosts the transcript of the speech.
url = r'http://analytictech.com/mb021/mlk.htm'

# The timeout keeps this cell from hanging indefinitely if the server never
# answers; raise_for_status() stops the notebook on an HTTP error (404, 500, ...)
# instead of silently parsing an error page as if it were the speech.
page = requests.get(url, timeout=30)
page.raise_for_status()

# NOTE(review): 'html' lets BeautifulSoup choose among the HTML parsers that are
# installed, so the parsed tree may depend on the environment - consider pinning
# a specific parser once the output has been verified with it.
soup = BeautifulSoup(page.text, 'html')

In [7]:
# Dump the parsed document as text to confirm that the request and parse worked.
print(str(soup))

<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="Microsoft FrontPage 4.0" name="GENERATOR"/>
<title>Martin Luther King Jr.'s 1962 Speech</title>
</head>
<body alink="#FF0000" bgcolor="#FFFFFF" link="#0000FF" text="#000000" vlink="#551A8B">
<h1><font size="5">Transcript of speech by </font><br/>
Dr. Martin Luther King Jr. <br/>
August 28, 1963. Lincoln Memorial in Washington D.C. </h1>
<hr color="#008080" noshade="" size="5"/>
<p>I am happy to join with you today in what will go down in
history as the greatest demonstration for freedom in the history
of our nation. </p>
<p>Five score years ago a great American in whose symbolic shadow
we stand today signed the Emancipation Proclamation. This
momentous decree came as a great beckoning light of hope to
millions of Negro slaves who had been seared in the flames of
withering injustice. It came as a joyous daybreak to end the long
night of their c

In [13]:
# Step 3: Finding where the speech text starts and where it ends by using the <p> tags in the html code. After verifying the result,
# save it as a variable so it can be referred to by name rather than retyping the full syntax

In [15]:
# Collect every <p> element of the parsed page, in document order.
mlk_speech = soup.find_all(name='p')

In [19]:
# Step 4: Breaking up all of the <p> tags. Using a list comprehension

In [21]:
# Keep only the text of each paragraph tag, dropping the surrounding markup.
speech_combined = [paragraph.get_text() for paragraph in mlk_speech]

print(speech_combined)

['I am happy to join with you today in what will go down in\r\nhistory as the greatest demonstration for freedom in the history\r\nof our nation. ', 'Five score years ago a great American in whose symbolic shadow\r\nwe stand today signed the Emancipation Proclamation. This\r\nmomentous decree came as a great beckoning light of hope to\r\nmillions of Negro slaves who had been seared in the flames of\r\nwithering injustice. It came as a joyous daybreak to end the long\r\nnight of their captivity. ', 'But one hundred years later the Negro is still not free. One\r\nhundred years later the life of the Negro is still sadly crippled\r\nby the manacles of segregation and the chains of discrimination. ', 'One hundred years later the Negro lives on a lonely island of\r\npoverty in the midst of a vast ocean of material prosperity. ', 'One hundred years later the Negro is still languishing in the\r\ncomers of American society and finds himself in exile in his own\r\nland. ', "We all have come to t

In [23]:
# Step 5: Joining the paragraphs into one speech instead of it being broken up. The ', ' in the previous output was the separator
# between the list items, which was breaking up the speech. We only need text here.

In [25]:
# Preview: glue the paragraphs together with a single space between them.
str.join(' ', speech_combined)

'I am happy to join with you today in what will go down in\r\nhistory as the greatest demonstration for freedom in the history\r\nof our nation.  Five score years ago a great American in whose symbolic shadow\r\nwe stand today signed the Emancipation Proclamation. This\r\nmomentous decree came as a great beckoning light of hope to\r\nmillions of Negro slaves who had been seared in the flames of\r\nwithering injustice. It came as a joyous daybreak to end the long\r\nnight of their captivity.  But one hundred years later the Negro is still not free. One\r\nhundred years later the life of the Negro is still sadly crippled\r\nby the manacles of segregation and the chains of discrimination.  One hundred years later the Negro lives on a lonely island of\r\npoverty in the midst of a vast ocean of material prosperity.  One hundred years later the Negro is still languishing in the\r\ncomers of American society and finds himself in exile in his own\r\nland.  We all have come to this hallowed spo

In [27]:
# Step 6: Creating it as a variable in order to refer to it quickly rather than the entire syntax

In [29]:
# Store the whole speech as one string (paragraphs separated by one space).
string_speech = str.join(' ', speech_combined)

In [31]:
# Step 7: Cleaning out the \r\n line breaks. After checking and verifying that they are removed, you'll need to save the result
# as a variable to ensure that it's easy to refer to.

In [37]:
# Swap every Windows line break for a space: cut the text at each '\r\n'
# and stitch the pieces back together with ' '.
string_speech_cleaned = ' '.join(string_speech.split('\r\n'))

In [44]:
# Step 8: Getting rid of all the punctuation 

In [54]:
import re

# Delete every character that is neither a word character (\w) nor
# whitespace (\s); what is left is the speech without punctuation.
speech_no_punct = re.compile(r'[^\w\s]').sub('', string_speech_cleaned)

print(speech_no_punct)

I am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation  Five score years ago a great American in whose symbolic shadow we stand today signed the Emancipation Proclamation This momentous decree came as a great beckoning light of hope to millions of Negro slaves who had been seared in the flames of withering injustice It came as a joyous daybreak to end the long night of their captivity  But one hundred years later the Negro is still not free One hundred years later the life of the Negro is still sadly crippled by the manacles of segregation and the chains of discrimination  One hundred years later the Negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity  One hundred years later the Negro is still languishing in the comers of American society and finds himself in exile in his own land  We all have come to this hallowed spot to remind America of the fierce urgency of 

In [56]:
# Step 9: Making it lowercase, since otherwise the same word with different capitalization (e.g. 'The' and 'the') would be counted separately

In [60]:
# Lowercase the whole text so capitalization does not split the word counts.
speech_no_punct_lower = str.lower(speech_no_punct)

In [68]:
# Step 10: Each word needs to be its own value, so split the text on whitespace.
# Also name the result as a variable so that it is easy to refer to

In [70]:
# Break the speech into a list of words. str.split() with no arguments splits
# on any run of whitespace AND ignores leading/trailing whitespace, so no
# empty-string "word" ends up in the list. (re.split(r'\s+', ...) returns ''
# as an extra token whenever the text starts or ends with whitespace, and that
# empty token would then be counted as a word.)
speech_broken_out = speech_no_punct_lower.split()

In [78]:
# Step 11: Finding the count of each word in the speech

In [80]:
import pandas as pd

In [116]:
# Count how many times each word occurs; value_counts() sorts the result from
# the most to the least frequent word. The word list is one-dimensional, so it
# is counted as a Series: the result is then indexed directly by the word
# rather than by a one-level MultiIndex of tuples.
# NOTE: despite its name, `df` is a pandas Series (word -> count). The name is
# kept because the export step refers to it.
df = pd.Series(speech_broken_out).value_counts()

In [118]:
# Step 12: Exporting it to a CSV file (which can be opened in Excel)

In [120]:
# Write the word counts to a CSV file with the columns "Word" and "Counts".
# `header` must be a LIST of column aliases: a plain string is only treated as
# "write a header row", and the column would keep its pandas-generated name
# instead of being labelled "Counts".
# NOTE(review): the destination is an absolute path on one user's machine;
# presumably it has to be adjusted before running this anywhere else.
df.to_csv(r'C:\Users\mejia\OneDrive\Documents\Analyst Builder Notes\Python Files\Project 5\MLK Speech Count.csv', header = ['Counts'],
          index_label = 'Word')