# Reading an HTML File and Extracting Its Contents Using Beautiful Soup

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlretrieve

In [2]:
# Remote location of the sample HTML page used throughout this notebook.
url = (
    'https://raw.githubusercontent.com/PacktWorkshops/'
    'The-Data-Wrangling-Workshop/master/Chapter05/datasets/test.html'
)

# Save the page next to the notebook as 'test.html'. As the last expression
# in the cell, the (local filename, response headers) tuple is displayed.
urlretrieve(url, filename='test.html')

('test.html', <http.client.HTTPMessage at 0x7fc220879d90>)

In [3]:
# Parse the downloaded page into a BeautifulSoup tree.
# The parser is named explicitly: without it, bs4 picks whichever parser
# happens to be installed (lxml, html5lib, or the built-in one), which can
# change the resulting tree between environments and triggers
# GuessedAtParserWarning. 'html.parser' ships with Python itself.
with open('test.html', 'r') as f:
  soup = BeautifulSoup(f, 'html.parser')

# The tree is fully built by the constructor, so it is still usable once the
# file has been closed.
print(type(soup))

<class 'bs4.BeautifulSoup'>


### Print the contents of the file in a readable, indented form

In [None]:
# prettify() returns the parsed tree as a string with each tag and text node
# on its own line, indented by nesting depth (see the output below).
print(soup.prettify())

<html>
 <body>
  <h1>
   Lorem ipsum dolor sit amet consectetuer adipiscing 
elit
  </h1>
  <p>
   Lorem ipsum dolor sit amet, consectetuer adipiscing 
elit. Aenean commodo ligula eget dolor. Aenean massa
   <strong>
    strong
   </strong>
   . Cum sociis natoque penatibus 
et magnis dis parturient montes, nascetur ridiculus 
mus. Donec quam felis, ultricies nec, pellentesque 
eu, pretium quis, sem. Nulla consequat massa quis 
enim. Donec pede justo, fringilla vel, aliquet nec, 
vulputate eget, arcu. In enim justo, rhoncus ut, 
imperdiet a, venenatis vitae, justo. Nullam dictum 
felis eu pede
   <a class="external ext" href="#">
    link
   </a>
   mollis pretium. Integer tincidunt. Cras dapibus. 
Vivamus elementum semper nisi. Aenean vulputate 
eleifend tellus. Aenean leo ligula, porttitor eu, 
consequat vitae, eleifend ac, enim. Aliquam lorem ante, 
dapibus in, viverra quis, feugiat a, tellus. Phasellus 
viverra nulla ut metus varius laoreet. Quisque rutrum. 
Aenean imperdiet. Etiam

### Get the first paragraph

In [None]:
# Re-parse the page with an explicitly named parser so the result does not
# depend on which optional parsers are installed (and so bs4 does not emit
# GuessedAtParserWarning).
with open('test.html', 'r') as f:
  soup = BeautifulSoup(f, 'html.parser')

# Attribute access by tag name returns the first matching tag in the
# document; here that is the first <p> element.
print(soup.p)

<p>Lorem ipsum dolor sit amet, consectetuer adipiscing 
elit. Aenean commodo ligula eget dolor. Aenean massa 
<strong>strong</strong>. Cum sociis natoque penatibus 
et magnis dis parturient montes, nascetur ridiculus 
mus. Donec quam felis, ultricies nec, pellentesque 
eu, pretium quis, sem. Nulla consequat massa quis 
enim. Donec pede justo, fringilla vel, aliquet nec, 
vulputate eget, arcu. In enim justo, rhoncus ut, 
imperdiet a, venenatis vitae, justo. Nullam dictum 
felis eu pede <a class="external ext" href="#">link</a> 
mollis pretium. Integer tincidunt. Cras dapibus. 
Vivamus elementum semper nisi. Aenean vulputate 
eleifend tellus. Aenean leo ligula, porttitor eu, 
consequat vitae, eleifend ac, enim. Aliquam lorem ante, 
dapibus in, viverra quis, feugiat a, tellus. Phasellus 
viverra nulla ut metus varius laoreet. Quisque rutrum. 
Aenean imperdiet. Etiam ultricies nisi vel augue. 
Curabitur ullamcorper ultricies nisi.</p>


### Use the `find_all` method to extract every occurrence of a tag

In [None]:
# Re-parse the page with an explicitly named parser so the result does not
# depend on which optional parsers are installed (and so bs4 does not emit
# GuessedAtParserWarning).
with open('test.html', 'r') as f:
  soup = BeautifulSoup(f, 'html.parser')

# find_all('p') collects every <p> tag in the document, not just the first.
all_ps = soup.find_all('p')
print(f'Total number of <p> --- {len(all_ps)}')

Total number of <p> --- 6


### How to get the contents of a particular HTML tag

In [None]:
# Re-parse the page with an explicitly named parser so the result does not
# depend on which optional parsers are installed (and so bs4 does not emit
# GuessedAtParserWarning).
with open('test.html', 'r') as f:
  soup = BeautifulSoup(f, 'html.parser')

# soup.table is the first <table> tag; .contents is the list of its direct
# children, which includes the newline strings between the rows.
table = soup.table
print(table.contents)

['\n', <tr>
<th>Entry Header 1</th>
<th>Entry Header 2</th>
<th>Entry Header 3</th>
<th>Entry Header 4</th>
</tr>, '\n', <tr>
<td>Entry First Line 1</td>
<td>Entry First Line 2</td>
<td>Entry First Line 3</td>
<td>Entry First Line 4</td>
</tr>, '\n', <tr>
<td>Entry Line 1</td>
<td>Entry Line 2</td>
<td>Entry Line 3</td>
<td>Entry Line 4</td>
</tr>, '\n', <tr>
<td>Entry Last Line 1</td>
<td>Entry Last Line 2</td>
<td>Entry Last Line 3</td>
<td>Entry Last Line 4</td>
</tr>, '\n']


### Traverse the children of a particular node

- 1st method: iterate over `.children`

In [None]:
# Re-parse the page with an explicitly named parser so the result does not
# depend on which optional parsers are installed (and so bs4 does not emit
# GuessedAtParserWarning).
with open('test.html', 'r') as f:
  soup = BeautifulSoup(f, 'html.parser')

table = soup.table
# Print each direct child of the table followed by a separator line. The
# children include the newline strings between rows, which is why some
# entries in the output are blank.
for c in table.children:
  print(c)
  print('*'*5)



*****
<tr>
<th>Entry Header 1</th>
<th>Entry Header 2</th>
<th>Entry Header 3</th>
<th>Entry Header 4</th>
</tr>
*****


*****
<tr>
<td>Entry First Line 1</td>
<td>Entry First Line 2</td>
<td>Entry First Line 3</td>
<td>Entry First Line 4</td>
</tr>
*****


*****
<tr>
<td>Entry Line 1</td>
<td>Entry Line 2</td>
<td>Entry Line 3</td>
<td>Entry Line 4</td>
</tr>
*****


*****
<tr>
<td>Entry Last Line 1</td>
<td>Entry Last Line 2</td>
<td>Entry Last Line 3</td>
<td>Entry Last Line 4</td>
</tr>
*****


*****


- 2nd method: compare `.children` with `.descendants`

In [None]:
# Re-parse the page with an explicitly named parser so the result does not
# depend on which optional parsers are installed (and so bs4 does not emit
# GuessedAtParserWarning).
with open("test.html", "r") as f:
  soup = BeautifulSoup(f, "html.parser")

table = soup.table
# .children covers only the table's direct child nodes, whereas .descendants
# walks the entire subtree beneath it. Each is wrapped in list() so that it
# can be counted with len().
children = table.children
des = table.descendants
print(len(list(children)), len(list(des)))

9 61


## DataFrames and BeautifulSoup

In [4]:
# Re-parse the page with an explicitly named parser so the result does not
# depend on which optional parsers are installed (and so bs4 does not emit
# GuessedAtParserWarning).
with open('test.html') as f:
  soup = BeautifulSoup(f, 'html.parser')

# Collect every table row. find_all is the current spelling; findAll is the
# legacy BeautifulSoup 3 alias and is deprecated.
tr = soup.find_all('tr')
print(f'Data is a {type(tr)} and {len(tr)} items long')

Data is a <class 'bs4.element.ResultSet'> and 4 items long


In [5]:
# Split the rows collected above: the first <tr> holds the <th> header
# cells, and every row after it holds <td> data cells.
header = tr[0]
data = tr[1:]

In [6]:
# Show the header row; as the cell's last expression it is displayed as the
# cell output.
header

<tr>
<th>Entry Header 1</th>
<th>Entry Header 2</th>
<th>Entry Header 3</th>
<th>Entry Header 4</th>
</tr>