# Beautiful Soup

In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [6]:
# Download the ScrapingBee blog index and parse it with Python's built-in HTML parser.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were the blog.
response = requests.get("https://www.scrapingbee.com/blog/", timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

## HTML element by class

In [7]:
# Grab the first element whose CSS class is "mb-21" and print its `.string`.
h1 = soup.find(class_="mb-21")
print(h1.string)

The ScrapingBee Blog


## By Attributes

In [9]:
# Collect every <button> whose data-microtip-size attribute equals "medium",
# then print the aria-label attribute of each match.
tooltips = soup.find_all("button", {"data-microtip-size": "medium"})
for button in tooltips:
    print(button.get("aria-label"))

## Multiple Tags


In [15]:
# Passing a list of tag names to find_all() matches any of them; here that is
# every <h1> and every <li>. (The variable name mentions "bold text", but no
# bold tag is part of the search.)
headers_and_bold_text = soup.find_all(["h1", "li"])

# Preview only the first five matches. Slicing replaces the manual counter and
# break, and simply yields fewer items when there are fewer than five matches.
for element in headers_and_bold_text[:5]:
    print(element)

<li class="px-15 lg:px-21"><a class="block hover:underline" href="https://app.scrapingbee.com/account/login">Login</a></li>
<li class="px-15 lg:px-21"><a class="btn btn-black-o text-16 px-21 h-40 md:h-48 border-white md:border-black-100 text-white md:text-black-100 hover:bg-white md:hover:bg-black-100 hover:text-black-100 md:hover:text-white transition-all" href="https://app.scrapingbee.com/account/register">Sign Up</a></li>
<li class="relative md:px-15 lg:px-21 mb-20 md:mb-0"><a class="block hover:underline" href="/#pricing">Pricing</a></li>
<li class="relative md:px-15 lg:px-21 mb-20 md:mb-0"><a class="block hover:underline" href="/#faq">FAQ</a></li>
<li class="relative md:px-15 lg:px-21 mb-20 md:mb-0"><a class="block hover:underline" href="/blog/">Blog</a></li>


## Find all links

In [16]:
links = soup.find_all("a")  # every <a> element in the parsed page

# Preview the first five links. Slicing replaces the manual counter and break.
# `.string` may be None, as it is for the first link in the recorded output.
for link in links[:5]:
    print("Link:", link.get("href"), "Text:", link.string)

Link: / Text: None
Link: https://app.scrapingbee.com/account/login Text: Login
Link: https://app.scrapingbee.com/account/register Text: Sign Up
Link: /#pricing Text: Pricing
Link: /#faq Text: FAQ


## Tables

In [17]:
# Demo page fetched by the table-scraping cell that follows.
url2 = "https://demo.scrapingbee.com/table_content.html"

In [21]:
# Fetch the demo page. The timeout avoids hanging on a stalled connection and
# raise_for_status() prevents an HTTP error page from being parsed as the table.
response = requests.get(url2, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Fail with a clear message instead of an AttributeError on None further down.
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {url2}")

# Rows are taken from the whole table (not only <tbody>), so any header row is
# included in `data` as well.
rows = table.find_all('tr')

# One list of stripped cell texts per row. Empty cells are dropped, so a row
# containing a blank cell ends up shorter than the others.
data = []
for row in rows:
    cell_texts = (cell.text.strip() for cell in row.find_all(['td', 'th']))
    data.append([text for text in cell_texts if text])

In [46]:
# Build the frame from the scraped rows, then discard the first row by its
# index label (presumably the table's own header row; confirm against the page).
# The remaining rows keep their original labels, so the index starts at 1.
df = (
    pd.DataFrame(data, columns=["Symbol", "Name", "Price", "Change", "%change"])
    .pipe(lambda frame: frame.drop(index=frame.index[0]))
)
df.head()

Unnamed: 0,Symbol,Name,Price,Change,%change
1,AMD,Advanced Micro Devices Inc,89.48,-5.34,-5.63
2,ADBE,Adobe Inc.,378.07,-15.76,-4.0
3,ABNB,Airbnb Inc,99.91,-9.01,-8.27
4,ALGN,Align Technology Inc,247.75,-9.3,-3.62
5,AMZN,Amazon.com Inc,103.87,-5.78,-5.27


In [47]:
# Show each column's dtype and non-null count before any type conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 1 to 102
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Symbol   102 non-null    object
 1   Name     102 non-null    object
 2   Price    102 non-null    object
 3   Change   102 non-null    object
 4   %change  102 non-null    object
dtypes: object(5)
memory usage: 4.1+ KB


In [50]:
# Convert the scraped text columns to floats.
# In the recorded run Price fell from 102 to 99 non-null values at this step,
# i.e. errors='coerce' silently turned three strings into NaN. Thousands
# separators such as "1,234.56" are a common cause (TODO: confirm against the
# page), so commas are stripped before parsing. astype(str) keeps the cell safe
# to re-run once the columns are already numeric.
numeric_cols = ['Price', 'Change', '%change']
df[numeric_cols] = df[numeric_cols].apply(
    lambda col: pd.to_numeric(
        col.astype(str).str.replace(',', '', regex=False),
        errors='coerce',
    )
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 1 to 102
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Symbol   102 non-null    object 
 1   Name     102 non-null    object 
 2   Price    99 non-null     float64
 3   Change   102 non-null    float64
 4   %change  102 non-null    float64
dtypes: float64(3), object(2)
memory usage: 4.1+ KB


In [49]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns (count / unique / top / freq).
df.describe(include='all')

Unnamed: 0,Symbol,Name,Price,Change,%change
count,102,102,99.0,102.0,102.0
unique,102,102,,,
top,AMD,Advanced Micro Devices Inc,,,
freq,1,1,,,
mean,,,172.455758,-9.782745,-4.022255
std,,,147.193845,18.281,2.39799
min,,,6.03,-151.96,-10.86
25%,,,72.455,-9.3975,-5.39
50%,,,130.02,-5.37,-3.65
75%,,,202.305,-2.4575,-2.27


## Elements between two nodes

In [53]:
# Sample markup: three <p> elements sitting between two <h1> headers.
html_content = '''
<h1>Starting Header</h1><p>Element 1</p><p>Element 2</p><p>Element 3</p><h1>Ending Header</h1>
'''

In [54]:
# Parse the sample markup and gather everything that follows the first <h1>,
# stopping as soon as the next <h1> is reached.
soup = BeautifulSoup(html_content, 'html.parser')
start_header = soup.find("h1")

elements = []
for sibling in start_header.next_siblings:
    if sibling.name == "h1":
        break
    elements.append(sibling)

print(elements)

[<p>Element 1</p>, <p>Element 2</p>, <p>Element 3</p>]


In [58]:
# Print the text content of each collected node, one per line.
for node in elements:
    print(node.get_text())

Element 1
Element 2
Element 3
