# Using urllib

First, we need to import urllib. urllib is pretty straight forward. It goes out and downloads the document from a given website/URL. 

In [2]:
#Used to make requests
import urllib

f = urllib.request.urlopen("https://prototype.visualization.vpr.psu.edu/open/workshop/")
f.read()

b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">\r\n<html xmlns="http://www.w3.org/1999/xhtml">\r\n\t<head>\r\n\t\t<link href="//fonts.googleapis.com/css?family=Lato" rel="stylesheet">\r\n\t\t<style>\r\n\t\t\ttable {\r\n\t\t\t\tfont-family: \'Lato\', sans-serif;\r\n\r\n\t\t\t}\r\n\t\t</style>\r\n\t</head>\r\n\t<body>\r\n\t\t<table>\r\n\t\t<tr>\r\n\t\t\t<th>data1</th><th>data2</th><th>data3</th>\r\n\t\t</tr>\r\n\t\t<tr>\r\n\t\t\t<td>1</td><td>10.2</td><td>PA</td>\r\n\t\t</tr>\r\n\t\t<tr>\r\n\t\t\t<td>2</td><td>12.2</td><td>OH</td>\r\n\t\t</tr>\r\n\t\t<tr>\r\n\t\t\t<td>3</td><td>22.3</td><td>NY</td>\r\n\t\t</tr>\r\n\t\t</table>\r\n\t</body>\r\n</html>'

Let's take a look at the HTML:

In [3]:
%%html
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
    <head>
        <link href="//fonts.googleapis.com/css?family=Lato" rel="stylesheet">
        <style>
            table {
                font-family: 'Lato', sans-serif;

            }
        </style>
    </head>
    <body>
        <table>
        <tr>
            <th>data1</th><th>data2</th><th>data3</th>
        </tr>
        <tr>
            <td>1</td><td>10.2</td><td>PA</td>
        </tr>
        <tr>
            <td>2</td><td>12.2</td><td>OH</td>
        </tr>
        <tr>
            <td>3</td><td>22.3</td><td>NY</td>
        </tr>
        </table>
    </body>
</html>

data1,data2,data3
1,10.2,PA
2,12.2,OH
3,22.3,NY


Let's clean this up a bit and add some line numbers as well.

In [12]:
f = urllib.request.urlopen("https://prototype.visualization.vpr.psu.edu/open/workshop/")
counter = 0;
text = str(f.read())
for line in text.split("\\r\\n"):
    print (counter, "\t", line)
    counter = counter + 1


0 	 b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1 	 <html xmlns="http://www.w3.org/1999/xhtml">
2 	 \t<head>
3 	 \t\t<link href="//fonts.googleapis.com/css?family=Lato" rel="stylesheet">
4 	 \t\t<style>
5 	 \t\t\ttable {
6 	 \t\t\t\tfont-family: \'Lato\', sans-serif;
7 	 
8 	 \t\t\t}
9 	 \t\t</style>
10 	 \t</head>
11 	 \t<body>
12 	 \t\t<table>
13 	 \t\t<tr>
14 	 \t\t\t<th>data1</th><th>data2</th><th>data3</th>
15 	 \t\t</tr>
16 	 \t\t<tr>
17 	 \t\t\t<td>1</td><td>10.2</td><td>PA</td>
18 	 \t\t</tr>
19 	 \t\t<tr>
20 	 \t\t\t<td>2</td><td>12.2</td><td>OH</td>
21 	 \t\t</tr>
22 	 \t\t<tr>
23 	 \t\t\t<td>3</td><td>22.3</td><td>NY</td>
24 	 \t\t</tr>
25 	 \t\t</table>
26 	 \t</body>
27 	 </html>'


So knowing the structure, this should be pretty easy to parse. 

Let's start by getting the headers and the data. 

In [6]:
f = urllib.urlopen("https://prototype.visualization.vpr.psu.edu/open/workshop/")
counter = 0;
dataFromURL = []
for line in f.read().split("\r\n"):
    cleanLine = line.replace("\t","")
    if (cleanLine.startswith("<th>")) or (cleanLine.startswith("<td>")):
        dataFromURL.append(cleanLine)
print dataFromURL

['<th>data1</th><th>data2</th><th>data3</th>', '<td>1</td><td>10.2</td><td>PA</td>', '<td>2</td><td>12.2</td><td>OH</td>', '<td>3</td><td>22.3</td><td>NY</td>']


Now let's spilt the data and combine it for a single data set. 

In [23]:
header = dataFromURL[0].split("</th><th>")
print "Header Stuff"
print header

header[0] = header[0].replace("<th>","")
header[len(header)-1] = header[len(header)-1].replace("</th>","")
print "Header - Check!",header

print "Data Stuff"
print dataFromURL
dataFromTable = []
for line in dataFromURL[1:]:
    dataFromTable.append(line.split("</td><td>"))

print dataFromTable    
for row in range(len(dataFromTable)):
    dataFromTable[row][0] = dataFromTable[row][0].replace("<td>","")
    dataFromTable[row][len(dataFromTable[row])-1] = dataFromTable[row][len(dataFromTable[row])-1].replace("</td>","")
print "Data - Check!", dataFromTable 
    
# Combine the header and the data
print "Combining Stuff"
data = {}
for colLabels in range(len(header)):
    data[header[colLabels]] = []
    for rowData in range(len(dataFromTable)):
        data[header[colLabels]].append(dataFromTable[rowData][colLabels])
print data


Header Stuff
['<th>data1', 'data2', 'data3</th>']
Header - Check! ['data1', 'data2', 'data3']
Data Stuff
['<th>data1</th><th>data2</th><th>data3</th>', '<td>1</td><td>10.2</td><td>PA</td>', '<td>2</td><td>12.2</td><td>OH</td>', '<td>3</td><td>22.3</td><td>NY</td>']
[['<td>1', '10.2', 'PA</td>'], ['<td>2', '12.2', 'OH</td>'], ['<td>3', '22.3', 'NY</td>']]
Data - Check! [['1', '10.2', 'PA'], ['2', '12.2', 'OH'], ['3', '22.3', 'NY']]
Combining Stuff
{'data1': ['1', '2', '3'], 'data3': ['PA', 'OH', 'NY'], 'data2': ['10.2', '12.2', '22.3']}


### Regular Expressions

Let's learn about this first.
Link to [Learn about Regular Expressions](2.RegularExpressions.ipynb)

Now let's try this with regular expressions. Much easier!

In [8]:
import re

header = re.findall(r"<th>(.+?)</th>", dataFromURL[0])
dataset = {}
for m in header:
    dataset[m] = []

for line in dataFromURL[1:]:
    data = re.findall(r"<td>(.+?)</td>", line)
    for col in dataset:
        dataset[col] = data
print dataset

{'data1': ['3', '22.3', 'NY'], 'data3': ['3', '22.3', 'NY'], 'data2': ['3', '22.3', 'NY']}


### But what if things weren't so.. neat?

In [9]:
f = urllib.urlopen("https://prototype.visualization.vpr.psu.edu/open/workshop/anotherSite.html")
counter = 0;
for line in f.read().split("\r\n"):
    print counter, "\t", line
    counter = counter + 1

0 	<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
1 	<html xmlns="http://www.w3.org/1999/xhtml">
2 		<head>
3 			<link href="//fonts.googleapis.com/css?family=Lato" rel="stylesheet">
4 			<style>
5 				table {
6 					font-family: 'Lato', sans-serif;
7 	
8 				}
9 			</style>
10 		</header>
11 		<body>
12 			<table>
13 			<tr>
14 				<th>data1</th><th>data2</TH><th>data3</th>
15 			</tr>
16 			<tr>
17 				<TD>1</td><td>10.2</TD><td>PA</td>
18 			</tr>
19 			<tr>
20 				<td>2</td><td>12.2</td><td>OH</td>
21 			</tr>
22 			<tr>
23 				<td>3</td><td>22.3</td><td>NY</td>
24 			</tr>
25 			</tabler>
26 		<body>
27 	</html>


### Introducing.... Beautiful Soup!

In [10]:
from bs4 import BeautifulSoup
f = urllib.urlopen("https://prototype.visualization.vpr.psu.edu/open/workshop/anotherSite.html")
soup = BeautifulSoup(f, 'html.parser')
print soup
print soup.findAll("th")
print soup.findAll("td")

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<link href="//fonts.googleapis.com/css?family=Lato" rel="stylesheet"/>
<style>
			table {
				font-family: 'Lato', sans-serif;

			}
		</style>
</head></html>
<body>
<table>
<tr>
<th>data1</th><th>data2</th><th>data3</th>
</tr>
<tr>
<td>1</td><td>10.2</td><td>PA</td>
</tr>
<tr>
<td>2</td><td>12.2</td><td>OH</td>
</tr>
<tr>
<td>3</td><td>22.3</td><td>NY</td>
</tr>
</table></body>
<body>
</body>
[<th>data1</th>, <th>data2</th>, <th>data3</th>]
[<td>1</td>, <td>10.2</td>, <td>PA</td>, <td>2</td>, <td>12.2</td>, <td>OH</td>, <td>3</td>, <td>22.3</td>, <td>NY</td>]


BeautifulSoup is the way to go!

In [11]:
from bs4 import BeautifulSoup
f = urllib.urlopen("https://prototype.visualization.vpr.psu.edu/open/workshop/anotherSite.html")
soup = BeautifulSoup(f, 'html.parser')

dataset = {}

for header in soup.findAll("th"):
    dataset[header.text] = []
    for tr in soup.findAll('tr')[1:]:
        temp  = []
        for td in tr.findAll("td"):
            temp.append(td.text)
    dataset[header.text] = temp
print dataset

{u'data1': [u'3', u'22.3', u'NY'], u'data3': [u'3', u'22.3', u'NY'], u'data2': [u'3', u'22.3', u'NY']}


### Used to demo layout of an html page

I added this section to showcase the layout of most HTML files, which took way too many extra lines of code. 

In [13]:
from anytree import AnyNode
from anytree.exporter import JsonExporter

HTML = AnyNode(name="HTML")
header = AnyNode(name="header", parent=HTML)
AnyNode(name="title", parent=header)
AnyNode(name="sytle", parent=header)
AnyNode(name="script", parent=header)
body = AnyNode(name="body", parent=HTML)
table = AnyNode(name="table", parent=body)
tr = AnyNode(name="tr", parent=table)
AnyNode(name="th", parent=tr)
AnyNode(name="td", parent=tr)

exporter = JsonExporter(indent=2, sort_keys=False)

from IPython.display import Javascript
Javascript("""window.data={};""".format(exporter.export(HTML)))

<IPython.core.display.Javascript object>

In [14]:
%%javascript
element.append('<div id="graph" style="min-width: 310px; height: 300px; margin: 0 auto"></div>');
require.config({paths: {d3: "//d3js.org/d3.v5.min",}}); // Home directory of D3.JS: https://d3js.org/
// Original Example: https://bl.ocks.org/mbostock/4060606
require(["d3"], function(d3) {

// set the dimensions and margins of the diagram
var margin = {top: 20, right: 90, bottom: 30, left: 90},
    width = 660 - margin.left - margin.right,
    height = 300 - margin.top - margin.bottom;

// declares a tree layout and assigns the size
var treemap = d3.tree()
    .size([height, width]);

//  assigns the data to a hierarchy using parent-child relationships
var nodes = d3.hierarchy(data, function(d) { return d.children; });

// maps the node data to the tree layout
nodes = treemap(nodes);

var svg = d3.select("div#graph").append("svg")
    .attr("width", width + margin.left + margin.right)
    .attr("height", height + margin.top + margin.bottom)
var g = svg.append("g")
    .attr("transform", "translate(" + margin.left + "," + margin.top + ")");

// adds the links between the nodes
var link = g.selectAll(".link").data( nodes.descendants().slice(1))
  .enter().append("path")
    .attr("class", "link")
    .style("stroke", "lightgrey")
     .style("fill", "none")
    .attr("d", function(d) {
       return "M" + d.y + "," + d.x
         + "C" + (d.y + d.parent.y) / 2 + "," + d.x
         + " " + (d.y + d.parent.y) / 2 + "," + d.parent.x
         + " " + d.parent.y + "," + d.parent.x;
       });

// adds each node as a group
var node = g.selectAll(".node").data(nodes.descendants())
    .enter().append("g")
    .attr("class", function(d) { return "node" + (d.children ? " node--internal" : " node--leaf"); })
    .attr("transform", function(d) { return "translate(" + d.y + "," + d.x + ")"; });


// adds the text to the node
node.append("text")
  .attr("dy", ".35em")
  .style("text-anchor", function(d) { return d.children ? "end" : "start"; })
  .text(function(d) { return d.data.name; });

})

<IPython.core.display.Javascript object>