# Exploring BeautifulSoup

In [69]:
# importing libraries
from bs4 import BeautifulSoup
from io import StringIO
import pandas as pd

In [38]:
# Sample HTML document used by the navigation examples that follow.
# NOTE(review): the closing tag after "Stephen Curry" is written "</h3 <p>"
# (the ">" is missing), which is why the parsed tree shown further down has no
# <p> element around that salary. Left untouched so the recorded outputs stay
# valid -- confirm whether the malformed tag is intentional.
html = (
    "<!DOCTYPE html><html> <head><title>Page Title</title> </head>"
    "<body> <h3><b id='boldest'>Lebron James</b></h3>"
    "<p> Salary: $92,000,000 </p>"
    "<h3>Stephen Curry</h3 <p> Salary: $85,000, 000 </p>"
    "<h3> Kevin Durant </h3><p> Salary: $73,200, 000</p>"
    "</body></html>"
)
html

"<!DOCTYPE html><html> <head><title>Page Title</title> </head><body> <h3><b id='boldest'>Lebron James</b></h3><p> Salary: $92,000,000 </p><h3>Stephen Curry</h3 <p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body></html>"

In [39]:
# Parse the markup into `soup`, which represents the document as a nested data structure.
# The parser is named explicitly: the generic 'html' feature lets Beautiful Soup
# pick whichever HTML parser happens to be installed (lxml is ranked first), and
# different parsers can build different trees from the same invalid markup.
# NOTE(review): this requires the lxml package; use 'html.parser' for a
# standard-library-only run.
soup = BeautifulSoup(html, 'lxml')
soup

<!DOCTYPE html>
<html> <head><title>Page Title</title> </head><body> <h3><b id="boldest">Lebron James</b></h3><p> Salary: $92,000,000 </p><h3>Stephen Curry</h3> Salary: $85,000, 000 <h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body></html>

In [40]:
# Attribute-style navigation: `soup.title` gives the <title> tag of the document.
tag_object = soup.title
tag_object

<title>Page Title</title>

In [41]:
# `soup.h3` gives only the first <h3>, even though the document contains three.
# This rebinds `tag_object`; the following cells rely on it being this <h3>.
tag_object = soup.h3
tag_object

<h3><b id="boldest">Lebron James</b></h3>

In [47]:
# Navigate one level down: the <b> tag nested inside the <h3>.
tag_child = tag_object.b
tag_child

<b id="boldest">Lebron James</b>

In [50]:
# `.attrs` holds the tag's attributes as a dict; `.string` is the text inside the tag.
# print() is used for the first value because only a cell's last expression is echoed.
print(tag_child.attrs)
tag_child.string

{'id': 'boldest'}


'Lebron James'

In [43]:
# `.parent` goes back up the tree: from the <b> tag to the <h3> that contains it.
parent_tag = tag_child.parent
parent_tag

<h3><b id="boldest">Lebron James</b></h3>

In [44]:
# `.next_sibling` is the node that directly follows this <h3> at the same level.
# The markup has no whitespace between "</h3>" and "<p>", so the result is the
# <p> tag itself rather than a whitespace text node.
sibling1 = tag_object.next_sibling
sibling1

<p> Salary: $92,000,000 </p>

In [45]:
# One more step along the same level: the <h3> that follows the salary <p>.
sibling2 = sibling1.next_sibling
sibling2

<h3>Stephen Curry</h3>

## <br> Let's see how the 'find_all' method works <br>
#### `find_all` works as a filter: you can filter on a tag's name, its attributes, the text of a string, or some combination of these.


In [115]:
# A second document: an HTML table with a header row and two data rows.
# NOTE: this rebinds `html`, replacing the player document defined earlier.
# Every row, including the last one, carries its closing </tr>.
html = (
    "<table>"
    "<tr><td>Pizza Place</td><td>Orders</td><td>Slices</td></tr>"
    "<tr><td>Domino'sPizza</td><td>10</td><td>100</td></tr>"
    "<tr><td>Little Caesars</td><td>12</td><td>144</td></tr>"
    "</table>"
)
# The parser is named explicitly so the tree does not depend on which parsers
# happen to be installed. lxml wraps a bare fragment in <html><body>, which is
# what the output below shows.
# NOTE(review): this requires the lxml package.
table = BeautifulSoup(html, 'lxml')
table

<html><body><table><tr><td>Pizza Place</td><td>Orders</td><td>Slices</td></tr><tr><td>Domino'sPizza</td><td>10</td><td>100</td></tr><tr><td>Little Caesars</td><td>12</td><td>144</td></tr></table></body></html>

In [133]:
# Load the same table into a DataFrame so the data can be viewed as a grid.
# header=0 promotes the first row ("Pizza Place", "Orders", "Slices") to column
# labels while parsing. Orders and Slices are then inferred as numbers instead of
# being kept as strings next to the header text, and the columns axis does not
# pick up a stray name from a manually promoted row.
df = pd.read_html(StringIO(html), header=0)[0]
df

Unnamed: 0,Pizza Place,Orders,Slices
0,Domino'sPizza,10,100
1,Little Caesars,12,144


In [135]:
# Collect every <tr> tag in the parsed table, in document order.
table_rows = table.find_all('tr')
table_rows

[<tr><td>Pizza Place</td><td>Orders</td><td>Slices</td></tr>,
 <tr><td>Domino'sPizza</td><td>10</td><td>100</td></tr>,
 <tr><td>Little Caesars</td><td>12</td><td>144</td></tr>]

In [136]:
# The result of find_all can be indexed like a list; element 0 is the header row.
first_row = table_rows[0]
first_row

<tr><td>Pizza Place</td><td>Orders</td><td>Slices</td></tr>

In [147]:
# Walk the table row by row, then cell by cell, printing each <td> tag
# together with its row and column position.
for row_idx, row in enumerate(table_rows):
    print('Row', row_idx)
    for col_idx, cell in enumerate(row.find_all('td')):
        print(f'Column {col_idx} Cell: {cell}')

Row 0
Column 0 Cell: <td>Pizza Place</td>
Column 1 Cell: <td>Orders</td>
Column 2 Cell: <td>Slices</td>
Row 1
Column 0 Cell: <td>Domino'sPizza</td>
Column 1 Cell: <td>10</td>
Column 2 Cell: <td>100</td>
Row 2
Column 0 Cell: <td>Little Caesars</td>
Column 1 Cell: <td>12</td>
Column 2 Cell: <td>144</td>
