In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import webbrowser

In [3]:
# A bit more-advanced tidying...
# Let's also set Jupyter up so that all results from each cell are printed 
from IPython.core.interactiveshell import InteractiveShell # see here https://archive.ph/QJFOK
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# We begin with a response, which just records the result of an HTTP ("HypterText Transfer Protocol") request. 
response = requests.get("http://static.decontextualize.com/kittens.html")
print("This is a good response:", response) # "200" is good
# A bad response looks like this
bad_response = requests.get("http://static.decontextualize.com/kissens.html") 
print("This is a bad response:", bad_response) # "404" is, more notoriously, bad

This is a good response: <Response [200]>
This is a bad response: <Response [404]>


In [5]:
# Now we pull the HypterText Markup Language (HTML) information from our website above by adding ".text"
html_string = response.text
print(html_string) # this is a bunch of text from a sample website about cats.
webbrowser.open("http://static.decontextualize.com/kittens.html") # this is that website

<!doctype html>
<html>
	<head>
		<title>Kittens!</title>
		<style type="text/css">
			span.lastcheckup { font-family: "Courier", fixed; font-size: 11px; }
		</style>
	</head>
	<body>
		<h1>Kittens and the TV Shows They Love</h1>
		<div class="kitten">
			<h2>Fluffy</h2>
			<div><img src="http://placekitten.com/120/120"></div>
			<ul class="tvshows">
				<li>
					<a href="http://www.imdb.com/title/tt0106145/">Deep Space Nine</a>
				</li>
				<li>
					<a href="http://www.imdb.com/title/tt0088576/">Mr. Belvedere</a>
				</li>
			</ul>
			Last check-up: <span class="lastcheckup">2014-01-17</span>
		</div>
		<div class="kitten">
			<h2>Monsieur Whiskeurs</h2>
			<div><img src="http://placekitten.com/110/110"></div>
			<ul class="tvshows">
				<li>
					<a href="http://www.imdb.com/title/tt0106179/">The X-Files</a>
				</li>
				<li>
					<a href="http://www.imdb.com/title/tt0098800/">Fresh Prince</a>
				</li>
			</ul>
			Last check-up: <span class="lastcheckup">2013-11-02</span>
		</div

True

In [6]:
# We can also parse the text a little more cleanly ... 
document = BeautifulSoup(html_string, "html.parser") # the second option is the parser type, which is usually this one
print(document)

<!DOCTYPE html>

<html>
<head>
<title>Kittens!</title>
<style type="text/css">
			span.lastcheckup { font-family: "Courier", fixed; font-size: 11px; }
		</style>
</head>
<body>
<h1>Kittens and the TV Shows They Love</h1>
<div class="kitten">
<h2>Fluffy</h2>
<div><img src="http://placekitten.com/120/120"/></div>
<ul class="tvshows">
<li>
<a href="http://www.imdb.com/title/tt0106145/">Deep Space Nine</a>
</li>
<li>
<a href="http://www.imdb.com/title/tt0088576/">Mr. Belvedere</a>
</li>
</ul>
			Last check-up: <span class="lastcheckup">2014-01-17</span>
</div>
<div class="kitten">
<h2>Monsieur Whiskeurs</h2>
<div><img src="http://placekitten.com/110/110"/></div>
<ul class="tvshows">
<li>
<a href="http://www.imdb.com/title/tt0106179/">The X-Files</a>
</li>
<li>
<a href="http://www.imdb.com/title/tt0098800/">Fresh Prince</a>
</li>
</ul>
			Last check-up: <span class="lastcheckup">2013-11-02</span>
</div>
</body>
</html>



In [7]:
# we can extract desired elements...
document.find("h1")
# and pull just the text
document.find("h1").text
# We can look for types of things, such as images 
document.find("img")
document.find_all("img") # This finds all such instances
# We can also look for other HTML elements, such as "div" (divisions). 
document.find_all("div", attrs={"class": "kitten"})
# 
all_h2_headers = document.find_all("h2")

<h1>Kittens and the TV Shows They Love</h1>

'Kittens and the TV Shows They Love'

<img src="http://placekitten.com/120/120"/>

[<img src="http://placekitten.com/120/120"/>,
 <img src="http://placekitten.com/110/110"/>]

[<div class="kitten">
 <h2>Fluffy</h2>
 <div><img src="http://placekitten.com/120/120"/></div>
 <ul class="tvshows">
 <li>
 <a href="http://www.imdb.com/title/tt0106145/">Deep Space Nine</a>
 </li>
 <li>
 <a href="http://www.imdb.com/title/tt0088576/">Mr. Belvedere</a>
 </li>
 </ul>
 			Last check-up: <span class="lastcheckup">2014-01-17</span>
 </div>,
 <div class="kitten">
 <h2>Monsieur Whiskeurs</h2>
 <div><img src="http://placekitten.com/110/110"/></div>
 <ul class="tvshows">
 <li>
 <a href="http://www.imdb.com/title/tt0106179/">The X-Files</a>
 </li>
 <li>
 <a href="http://www.imdb.com/title/tt0098800/">Fresh Prince</a>
 </li>
 </ul>
 			Last check-up: <span class="lastcheckup">2013-11-02</span>
 </div>]

In [9]:
h2_headers = []
for header in all_h2_headers:
    header_contents = header.text
    h2_headers.append(header_contents)
h2_headers

['Fluffy', 'Monsieur Whiskeurs']

In [16]:
response = requests.get("https://en.wikipedia.org/wiki/Windham_Hill_Records")
html_str = response.text

document = BeautifulSoup(html_str, "html.parser")
document

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Windham Hill Records - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"77701543-8b6e-48b2-a817-2f1a7a685937","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Windham_Hill_Records","wgTitle":"Windham Hill Records","wgCurRevisionId":1089496738,"wgRevisionId":1089496738,"wgArticleId":30871569,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Use mdy dates from May 2020","Articles needing ad

In [26]:
# First, Walsh's method
article_text = document.find("p").text
print(article_text)





In [39]:
# OK, that needs some work...
# source: https://archive.ph/Crh6A
text = '' # First we make an empty container...
# So, let's first show all paragraphs here. 
document.find_all('p')
# Now we write a quick loop. For all "paragraphs" in the list defined by the function shown above...
# this loop uses an augmented assignment (https://archive.ph/BBeF3) to cumulatively add each 
# paragraph to the document. Then we show what it looks like. 
for paragraph in document.find_all('p'):
    paragraph += paragraph.text
print(text)

[<p class="mw-empty-elt">
 </p>,
 <p><b>Windham Hill Records</b> was an independent record label that specialized in instrumental <a href="/wiki/Acoustic_music" title="Acoustic music">acoustic music</a>. It was founded by guitarist <a href="/wiki/William_Ackerman" title="William Ackerman">William Ackerman</a> and Anne Robinson (née McGilvray)<sup class="reference" id="cite_ref-Mellow_2-0"><a href="#cite_note-Mellow-2">[2]</a></sup> in 1976 and was popular in the 1980s and 1990s.
 </p>,
 <p>The label was purchased by <a href="/wiki/Bertelsmann_Music_Group" title="Bertelsmann Music Group">BMG</a> through a series of buyouts from 1992 through 1996 and is currently a subsidiary of <a class="mw-redirect" href="/wiki/Sony_Music_Entertainment" title="Sony Music Entertainment">Sony Music Entertainment</a> after BMG's subsequent merger in 2008. <a href="/wiki/Private_Music" title="Private Music">Private Music</a>, also a subsidiary of BMG, has issued some back-catalog releases under the Windham

TypeError: unsupported operand type(s) for +=: 'Tag' and 'str'

In [25]:
article_title = document.find("h1").text
print(article_title)

Windham Hill Records


In [54]:
# experiment
response2 = requests.get("https://www.marxists.org/archive/marx/works/1867-c1/ch04.htm")
html_str2 = response2.text
document2 = BeautifulSoup(html_str2, "html.parser")
ch4 = '' # First we make an empty container...
# Now we write a quick loop. For all "paragraphs" in the list defined by the function shown above...
# this loop uses an augmented assignment (https://archive.ph/BBeF3) to cumulatively add each 
# paragraph to the document. Then we show what it looks like. 
for chunk in document2.find_all('p'):
    ch4 += chunk.text
print(ch4)

Karl Marx. Capital Volume One The circulation of commodities is the starting-point
of capital. The production of commodities, their circulation, and that
more developed form of their circulation called commerce, these form the
historical ground-work from which it rises. The modern history of capital
dates from the creation in the 16th century of a world-embracing commerce
and a world-embracing market.
If we abstract from the material substance of the circulation
of commodities, that is, from the exchange of the various use-values, and
consider only the economic forms produced by this process of circulation,
we find its final result to be money: this final product of the circulation
of commodities is the first form in which capital appears.

 As a matter of history, capital, as opposed to landed property,
invariably takes the form at first of money; it appears as moneyed wealth,
as the capital of the merchant and of the usurer. [1]
But we have no need to refer to the orig

In [58]:
missy_lyrics = document2.find("p").text
print(missy_lyrics)

Karl Marx. Capital Volume One
