# Requests Module

install requests 

In [113]:
import requests
from requests.auth import HTTPBasicAuth
from bs4 import BeautifulSoup
import csv
import pandas as pd
import json

In [114]:
# GET request. This is a simple get request method where we are hitting to the URI, to get the resource 
# resource can be anything like html, image, text, json etc. (JSON in below)

reponse = requests.get("https://dummyjson.com/products/1")

# HTML page will be response

googleRes= requests.get("https://www.google.com/")


In [124]:
# Get request method, here we are giving the parameter passed to the URL as wel. So, the data will be send within the URL
# the below URL can be used as "https://www.google.com/search?q=hello" in browser

payload= {"q":"hello"}
gSearRes = requests.get("https://www.google.com/search", params=payload)

In [152]:
# there are many method that we are use on reponse object. 

reponse.text  #give the body in form of text
reponse.status_code  #gives the status code 
reponse.json()  #if the resource is json, this will parse the json to dict. will return a dict
reponse.headers  #gives the info of the header of response. this will return python dict, so we can access elements
reponse.content #this will be good in case of image, we will get the data in bytes
reponse.request.headers #to see the request header that was sent. this will return python dict
reponse.request.body  #to see the request body that was sent
reponse.ok  #if status code<400 return True, else False
reponse.encoding  #gives the encoding info

# check online for more attributes and methods, or run the dir() function on the reponse obj
# dir(reponse)

bytes

In [2]:
# below code can be used to download the image from the resource

imgGet = requests.get("https://requests.readthedocs.io/en/latest/_static/requests-sidebar.png")
imgBytes= imgGet.content
with open("logo.png", "wb") as file:
    file.write(imgBytes)

In [43]:
# Authentication using Python Requests
# We do a get request on the page which is having Auth on it.
# Provide the creds to auth argument in the get request. like below

authRes = requests.get("https://httpbin.org/basic-auth/user/pass", auth=HTTPBasicAuth("user", "pass"))
authRes.text

'{\n  "authenticated": true, \n  "user": "user"\n}\n'

In [6]:
# for ssl verfication, use the verify argument in the get request. and provide the path to ssl certificate

# response = requests.get('https://github.com', verify ='/path/to/certfile')d

In [133]:
# Post Request. Here the data to the server will not go within the URl, it will go inside the request body
# here, we can provide the data to the server via "data" or "json" argument.

postRes= requests.post("https://reqres.in/api/users", data={
    "name": "morpheus11",
    "job": "leader"
})

postRes.json()

{'name': 'morpheus11',
 'job': 'leader',
 'id': '132',
 'createdAt': '2023-03-11T04:07:59.241Z'}

In [135]:
url_post='http://httpbin.org/post'
payload={"name":"Joseph","ID":"123"}
r_post=requests.post(url_post,data=payload)
r_post.json() #here in reponse, we have a form key which is having the key-values pairs that we passed

{'args': {},
 'data': '',
 'files': {},
 'form': {'ID': '123', 'name': 'Joseph'},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Content-Length': '18',
  'Content-Type': 'application/x-www-form-urlencoded',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.28.2',
  'X-Amzn-Trace-Id': 'Root=1-640c0036-2e4d7fff60f443f91abf08c1'},
 'json': None,
 'origin': '103.146.217.131',
 'url': 'http://httpbin.org/post'}

In [47]:
# the timeout argument is passed. so the get request will wait for 10s only for the response.
# if we dont provide that, then it will wait for response until the site will respond.
# if site takes more than 5s to respond, then it will throw an error

delayRes= requests.get("https://httpbin.org/delay/3", timeout=5)

In [48]:
# A session object keeps the tcp connection alive to the server
# Session object allows one to persist certain parameters across requests like auth, headers, cookies, connection etc

s= requests.Session()  #creating session object before calling the pages multiple times bcoz 
# after first time, it will use the session parameters to make calls
for i in range(1,21):
    s.get(f"https://www.scrapethissite.com/pages/forms/?page_num={i}")  #calling the site multiple times
s.close()  #closing the session

In [112]:
# get the data from the API, which we return a json. then use it to convert it to pandas Df

data= requests.get("https://www.fishwatch.gov/api/species")
jsonData=json.loads(data.text)
pd.DataFrame(jsonData)

Unnamed: 0,Fishery Management,Habitat,Habitat Impacts,Image Gallery,Location,Management,NOAA Fisheries Region,Population,Population Status,Scientific Name,Species Aliases,Species Illustration Photo,Species Name,Animal Health,Availability,Biology,Bycatch,Calories,Carbohydrate,Cholesterol,Color,Disease Treatment and Prevention,Diseases in Salmon,Displayed Seafood Profile Illustration,Ecosystem Services,Environmental Considerations,Environmental Effects,Farming Methods,Farming Methods_,"Fat, Total",Feeds_,Feeds,"Fiber, Total Dietary",Fishing Rate,Harvest,Harvest Type,Health Benefits,Human_Health_,Human Health,Physical Description,Production,Protein,Quote,Quote Background Color,Research,"Saturated Fatty Acids, Total",Selenium,Serving Weight,Servings,Sodium,Source,"Sugars, Total",Taste,Texture,Path,last_update
0,"<ul>\n<li><a href=""https://www.fisheries.noaa....",,"Bottomfish fishing operations, typically using...",[{'src': 'https://origin-east-01-drupal-fishwa...,<ul>\n<li>Crimson jobfish are distributed thro...,,Pacific Islands,"The population level is unknown, but the stock...",<ul>\n<li>There are five stocks of crimson job...,Pristipomoides filamentosus,"<a href=""/species-aliases/opakapaka"" typeof=""s...",{'src': 'https://origin-east-01-drupal-fishwat...,Crimson Jobfish,,<p>Year-round</p>\n,<ul>\n<li>Crimson jobfish reach sexual maturit...,Regulations are in place to minimize bycatch.,100,0 g,37 mg,"<p>Crimson jobfish has a clear, light pink fle...",,,,,,,,,1.34 g,,,0 g,Reduced to end overfishing of the bottomfish c...,"<ul>\n<li>Fishery:\n<ul>\n<li>Commercial, recr...",Wild,"<p>Crimson jobfish is a great source of lean, ...",,,<ul>\n<li>Crimson jobfish have skin that is sl...,,20.5,U.S. wild-caught crimson jobfish is a smart se...,NB,,0.285 g,38.2 mcg,100 g (raw),1,64 mg,<p>U.S -wild caught around Hawaii and Pacific ...,0 g,<p>Crimson jobfish has a delicate flavor.</p>\n,<p>Crimson jobfish has&nbsp;a firm texture and...,/profiles/crimson-jobfish,06/21/2022 - 12:00
1,"<ul>\n<li><a href=""https://www.fisheries.noaa....","<ul>\n<li>White hake are groundfish, meaning t...",Area closures and gear restrictions protect ha...,[{'src': 'https://origin-east-01-drupal-fishwa...,<ul>\n<li>White hake are found in the northwes...,,Greater Atlantic,The stock is not overfished.,<ul>\n<li>According to the 2022&nbsp;stock ass...,Urophycis tenuis,"<a href=""/species-aliases/mud-hake"" typeof=""sk...",{'src': 'https://origin-east-01-drupal-fishwat...,White Hake,,<p>Year-round.</p>\n,<ul>\n<li>White hake can grow up to 53 inches ...,Regulations are in place to minimize bycatch.,90,0 g,67 mg,,,,,,,,,,1.31 g,,,0 g,Not subject to overfishing.,<ul>\n<li>Commercial fishery:\n<ul>\n<li>In 20...,Wild,<p>White hake is a lean source of protein.</p>\n,,,<ul>\n<li>White hake have a large mouth that e...,,18.31 g,U.S. wild-caught white hake is a smart seafood...,#555555,,0.247 g,32.1 mcg,100 g (raw),1,72 mg,<p>White hake is wild-caught from Maine to sou...,0 g,"<p>White hake has a mild, almost sweet&nbsp;ta...","<p>White hake has a soft, delicate texture tha...",/profiles/white-hake,01/11/2023 - 17:11
2,"<ul>\n<li>\n<p><a href=""https://archive.fisher...",<ul>\n<li>Atlantic chub mackerel are a schooli...,Area closures and gear restrictions protect ha...,[{'src': 'https://origin-east-01-drupal-fishwa...,<ul>\n<li>Atlantic chub mackerel in the wester...,,Greater Atlantic,"The population level is unknown, but manageme...",<p>Atlantic chub mackerel has never been asses...,Scomber colias,"<a href=""/species-aliases/mackerel"" typeof=""sk...",{'src': 'https://origin-east-01-drupal-fishwat...,Atlantic Chub Mackerel,,<p>Year-round.</p>\n,<ul>\n<li>Atlantic chub mackerel grow up to 22...,Regulations are in place to minimize bycatch.,205,0 g,70 mg,,,,,,,,,,13.89 g,,,0 g,"Overfishing status is unknown, but catch is at...",<ul>\n<li>Commercial Fishery\n<ul>\n<li>In 202...,Wild,<p>Chub mackerel is high in omega-3 fatty acid...,,,<ul>\n<li>Atlantic chub mackerel are elongated...,,18.6 g,U.S. wild-caught Atlantic chub mackerel is a s...,#555555,,3.257 g,44.1 mcg,100 g (raw),1,90 mg,<p>U.S. wild caught from Southern New England ...,0 g,"<p>Rich, pronounced flavor.</p>\n","<p>Soft, flaky, and moist.</p>\n",/profiles/atlantic-chub-mackerel,01/25/2023 - 09:57
3,"<ul>\n<li><a href=""https://www.fisheries.noaa....",<ul>\n<li>Shortfin squid live in deep and shal...,Fishing gears used to harvest shortfin squid h...,,<ul>\n<li>Shortfin squid inhabits the continen...,,Greater Atlantic,"The population level is unknown, but managemen...",<ul>\n<li>According to the latest assessment (...,Illex illecebrosus,"<a href=""/species-aliases/illex-squid"" typeof=...",{'src': 'https://origin-east-01-drupal-fishwat...,Shortfin Squid,,<p>Summer and fall.</p>\n,<ul>\n<li>Shortfin squid live for less than on...,Regulations are in place to minimize bycatch.,92,3.08 g,233 mg,<p>Raw squid is ivory colored with orange spec...,,,,,,,,,1.38 g,,,0 g,Not subject to overfishing.,<ul>\n<li>Commercial fishery\n<ul>\n<li>In 202...,Wild,"<p>Squid are an excellent source of selenium, ...",,,<ul>\n<li>Female shortfin squid range from 7 t...,,15.58 g,U.S. wild-caught shortfin squid is a smart sea...,cc9966,,0.358 g,44.8 mcg,100 g,1,44 mg,<p>U.S. wild-caught from Maine to North Caroli...,0 g,"<p>Mild, and subtly sweet.</p>\n<p>&nbsp;</p>\n",<p>Firm and meaty.</p>\n,/profiles/shortfin-squid,01/24/2023 - 12:42
4,"<ul>\n<li>The states and <a href=""https://www....",<ul>\n<li>American lobsters live on the ocean ...,Fishing gears used to harvest American lobster...,[{'src': 'https://origin-east-01-drupal-fishwa...,<ul>\n<li>American lobsters are found in the n...,,Greater Atlantic,The Gulf of Maine/Georges Bank stock is not ov...,"<ul>\n<li>According to the 2020&nbsp;<a href=""...",Homarus americanus,"<a href=""/species-aliases/lobster"" typeof=""sko...",{'src': 'https://origin-east-01-drupal-fishwat...,American Lobster,,"<p>Year-round. In New England, where most lobs...",<ul>\n<li>American lobsters have a long life s...,Regulations are in place to minimize bycatch.,90,0.5 g,95 mg,<p>The meat is white with red tinges.</p>\n,,,,,,,,,0.9 g,,,0 g,Not subject to overfishing.,<ul>\n<li>Commercial fishery:\n<ul>\n<li>In 20...,Wild,<p>Lobster is low in saturated fat and is a ve...,,,<ul>\n<li>American lobster is a crustacean wit...,,18.80 g,U.S. wild-caught American lobster is a smart s...,#996633,"<ul>\n<li>State scientists, in cooperation wit...",0.18 g,41.4 mcg,100 g (raw),1,296 mg,<p>U.S. wild-caught from Maine to North Caroli...,0 g,<p>Mild and sweet.</p>\n,<p>The meat is firm and somewhat fibrous. The ...,/profiles/american-lobster,01/26/2023 - 15:22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,"<ul>\n<li><a href=""https://www.fisheries.noaa....",<ul>\n<li>Younger pollock live in the mid-wate...,The Alaska pollock fishery uses midwater trawl...,[{'src': 'https://origin-east-01-drupal-fishwa...,<ul>\n<li>Alaska pollock are found throughout ...,,Alaska,"The Aleutian Islands, Eastern Bering Sea, and ...",<ul>\n<li>There are five stocks of walleye pol...,Gadus chalcogrammus,"<a href=""/species-aliases/pollock"" typeof=""sko...",{'src': 'https://origin-east-01-drupal-fishwat...,Alaska Pollock,,<p>Fresh from January to April and June to Oct...,<ul>\n<li>Alaska pollock grow fast and have a ...,The Alaska pollock fishery is one of the clean...,81,0 g,71 mg,,,,,,,,,,0.8 g,,,0 g,Not subject to overfishing.,<ul>\n<li>Commercial fishery:\n<ul>\n<li>The A...,Wild,<p>Alaska pollock is a good source of omega-3 ...,,,<ul>\n<li>Pollock is a member of the cod famil...,,17.18 g,U.S. wild-caught Alaska pollock is a smart sea...,#746867,"<ul>\n<li><a href=""https://www.afsc.noaa.gov/N...",0.164 g,21.9 mcg,100 g (raw),1,99 mg,"<p>U.S. wild-caught, mainly in the Bering Sea ...",0 g,<p>Pollock has mild-tasting flesh and is simil...,<p>Pollock has a relatively fine texture and i...,/profiles/alaska-pollock,01/20/2023 - 15:00
112,"<ul>\n<li><a href=""https://www.greateratlantic...","<ul>\n<li>Monkfish live on the ocean floor, ty...",Area closures and gear restrictions protect ha...,[{'src': 'https://origin-east-01-drupal-fishwa...,<ul>\n<li>Monkfish are found in the Northwest ...,,"Greater Atlantic, Southeast",The population level is unknown and management...,<ul>\n<li>There are two stocks of monkfish: Gu...,Lophius americanus,"<a href=""/species-aliases/goosefish"" typeof=""s...",{'src': 'https://origin-east-01-drupal-fishwat...,Monkfish,,"<p>Year-round, with peaks in the late fall and...",<ul>\n<li>Female monkfish grow larger and live...,Regulations limit possession of bycatch specie...,76,0 g,25 mg,,,,,,,,,,1.52 g,,,0 g,"Overfishing status is unknown, but catch is at...",<ul>\n<li>Commercial fishery:\n<ul>\n<li>In 20...,Wild,"<p>Low in sodium; a good source of niacin, vit...",,,<ul>\n<li>Monkfish have mottled dark brown to ...,,14.48 g,U.S. wild-caught monkfish is a smart seafood c...,#684627,"<ul>\n<li>NOAA’s <a href=""https://www.fisherie...",0.34 g,36.5 mcg,100 g (raw),1,18 mg,<p>U.S. wild-caught from Maine to North Caroli...,0 g,<p>Mild.</p>\n,"<p>The tail meat is firm, dense, and relativel...",/profiles/monkfish,01/11/2023 - 15:47
113,"<ul>\n<li><a href=""http://sero.nmfs.noaa.gov/""...",<ul>\n<li>Mahi mahi live near the surface in t...,Fishing gear used to catch mahi mahi rarely co...,[{'src': 'https://origin-east-01-drupal-fishwa...,"<ul>\n<li>Mahi mahi are found in the Atlantic,...",,"Greater Atlantic, Southeast",The South Atlantic stock is not overfished.,<p>There are two stocks of dolphinfish: a Sout...,Coryphaena hippurus,"<a href=""/species-aliases/mahimahi"" typeof=""sk...",{'src': 'https://origin-east-01-drupal-fishwat...,Atlantic Mahi Mahi,,<p>Year-round.</p>\n,<ul>\n<li>Atlantic mahi mahi grow up to almost...,Regulations are in place to minimize bycatch.,85,0 g,73 mg,"<p>The raw flesh is pinkish to grayish-white, ...",,,,,,,,,0.7 g,,,0 g,Not subject to overfishing.,<ul>\n<li>Commercial fishery:\n<ul>\n<li>In 20...,Wild,<p>Low in saturated fat and a good source of v...,,,<ul>\n<li>Brightly colored back is an electric...,,18.5 g,U.S. wild-caught mahi mahi is a smart seafood ...,#83a54b,"<ul>\n<li>An ongoing&nbsp;<a href=""http://www....",0.188 g,36.5 mcg,100 g (raw),1,88 mg,<p>Wild-caught from Massachusetts to Texas.</p>\n,0 g,"<p>Mahi mahi has a sweet, mild flavor. For a m...",<p>Mahi mahi is lean and fairly firm with larg...,/profiles/atlantic-mahi-mahi,01/26/2023 - 14:51
114,"<ul>\n<li><a href=""https://sero.nmfs.noaa.gov/...",<ul>\n<li>Cobia are found near structures in t...,Fishing gear used to catch cobia rarely contac...,[{'src': 'https://origin-east-01-drupal-fishwa...,"<ul>\n<li>In U.S. waters, cobia are most abund...",,"Greater Atlantic, Southeast",The stock is not overfished.,<ul>\n<li>\n<p>According to the 2020 stock ass...,Rachycentron canadum,"<a href=""/species-aliases/crabeater"" typeof=""s...",{'src': 'https://origin-east-01-drupal-fishwat...,Cobia,,<p>Year-round.</p>\n,<ul>\n<li>Cobia are the only member of the fam...,"Minimal, as commercial fishermen rarely target...",87,0 g,40 mg,<p>Raw cobia meat is light tan and turns snowy...,,,,,,,,,0.64 g,,,0 g,Subject to overfishing.,<ul>\n<li>Commercial fishery:\n<ul>\n<li>\n<p>...,Wild,<p>Cobia is a good source of low-fat protein. ...,,,<ul>\n<li>Cobia are dark brown with a single d...,,18.99 g,U.S. wild-caught cobia is a smart seafood choi...,#452f21,<ul>\n<li>Most of the cobia you will find in t...,0.12 g,36.5 mcg,100 g (raw),1,135 mg,<p>Wild-caught from Virginia to Texas. Cobia i...,0 g,"<p>Cobia has a sweet, rich flavor.</p>\n","<p>Cobia is lean, moist, and firm with a nice ...",/profiles/cobia,01/18/2023 - 12:23


# Beautiful Soup

install bs4, lxml, html5lib

In [136]:
# get the data from the website, it will reture html file
# we convert it to text, with .text attribute

data = requests.get("https://www.scrapethissite.com/pages/forms/?page_num=1").text

In [137]:
# converting the html string data to soup object
# soup object will allow us to navigate the HTML as a tree and/or filter out what we are looking for and get info
# users use lxml for speed and it is recommended to use lxml or html5lib parser

soup=BeautifulSoup(data, "lxml")

type(soup)

bs4.BeautifulSoup

In [None]:
#  we can also pass the document(if available offline) through open filehandle

# with open("example.html") as fp:
#    soup = BeautifulSoup(fp)

In [155]:
# printing the soup object in good format

print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping
  </title>
  <link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components." name="description"/>
  <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
  <link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
  <meta con

In [139]:
# Grabbing the data from the soup object

# we can go like that, considering tags as attributes and chaining them to get to the desired tag 
pageTitle=soup.html.head.title

# get the text using the .text attribute
pageTitle.text

# but we dont have to be too specific while chaining, as if there is only one tag of title is present in dom, 
# and soup.title will return the first title tag matched within the dom, we can do like
soup.title.text

'Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping'

In [141]:
# we can use .parent to navigate to the parent of the current tagObject
link = soup.a

link.parent #it will give the tagObject of its object

<li id="nav-homepage">
<a class="nav-link hidden-sm hidden-xs" href="/">
<img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                Scrape This Site
                            </a>
</li>

In [150]:
#  get the sibiling of the current tagObject by .next_sibiling or previous_sibiling
lists=soup.li

lists.next_sibling #as there is a \n here, so it will consider this as a sibling also. so it will return \n


'\n'

In [15]:
# this will find the first div inside the dom, it will be descendant finding. So, anywhere in the dom.

soup.find("div") #it is same as soup.div

<div class="container">
<div class="col-md-12">
<ul class="nav nav-tabs">
<li id="nav-homepage">
<a class="nav-link hidden-sm hidden-xs" href="/">
<img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                Scrape This Site
                            </a>
</li>
<li id="nav-sandbox">
<a class="nav-link" href="/pages/">
<i class="glyphicon glyphicon-console hidden-sm hidden-xs"></i>
                                Sandbox
                            </a>
</li>
<li id="nav-lessons">
<a class="nav-link" href="/lessons/">
<i class="glyphicon glyphicon-education hidden-sm hidden-xs"></i>
                                Lessons
                            </a>
</li>
<li id="nav-faq">
<a class="nav-link" href="/faq/">
<i class="glyphicon glyphicon-flag hidden-sm hidden-xs"></i>
                                FAQ
                            </a>
</li>
<li class="pull-right" id="nav-login">
<a class="nav-link" href="/login/">
                                Login

In [16]:
# we can provide attribute to find method to find specific tag with given attributes
# it will find first tag with class "row", here for class attr use class_, as python has class keyword reserved

soup.find("div", class_="row")

<div class="row">
<div class="col-md-12">
<h1>
                            Hockey Teams: Forms, Searching and Pagination
                            <small>25 items</small>
</h1>
<hr/>
</div>
</div>

In [6]:
# we can use the attr attribute within the find method to find tag with specific attributes like aria-lable, href etc

tag1=soup.find("a", attrs={"href":"http://www.opensourcesports.com/hockey/"})
tag1

<a class="data-attribution" href="http://www.opensourcesports.com/hockey/" target="_blank">http://www.opensourcesports.com/hockey/</a>

In [86]:
# this will get the table from page
tableData= soup.table

# now we can get the futher data from this tableData object, no need of using soup object

headerRow=tableData.tr
print(headerRow)

firstHeader=tableData.tr.th.text
print("first header is ",firstHeader)

<tr>
<th>
                            Team Name
                        </th>
<th>
                            Year
                        </th>
<th>
                            Wins
                        </th>
<th>
                            Losses
                        </th>
<th>
                            OT Losses
                        </th>
<th>
                            Win %
                        </th>
<th>
                            Goals For (GF)
                        </th>
<th>
                            Goals Against (GA)
                        </th>
<th>
                            + / -
                        </th>
</tr>
first header is  
                            Team Name
                        


['\n',
 <tr>
 <th>
                             Team Name
                         </th>
 <th>
                             Year
                         </th>
 <th>
                             Wins
                         </th>
 <th>
                             Losses
                         </th>
 <th>
                             OT Losses
                         </th>
 <th>
                             Win %
                         </th>
 <th>
                             Goals For (GF)
                         </th>
 <th>
                             Goals Against (GA)
                         </th>
 <th>
                             + / -
                         </th>
 </tr>,
 '\n',
 <tr class="team">
 <td class="name">
                             Boston Bruins
                         </td>
 <td class="year">
                             1990
                         </td>
 <td class="wins">
                             44
                         </td>
 <td class="losse

In [104]:
tableData.contents #to get the content (tags and string) inside this tagObject.

<tr class="team">
<td class="name">
                            Boston Bruins
                        </td>
<td class="year">
                            1990
                        </td>
<td class="wins">
                            44
                        </td>
<td class="losses">
                            24
                        </td>
<td class="ot-losses">
</td>
<td class="pct text-success">
                            0.55
                        </td>
<td class="gf">
                            299
                        </td>
<td class="ga">
                            264
                        </td>
<td class="diff text-success">
                            35
                        </td>
</tr>

In [25]:
# We can access the attributes of tags like a dict with tagObject["attr"] 

link=soup.find("a", class_="data-attribution")
link["href"]

# or

link.get("href")

'http://www.opensourcesports.com/hockey/'

In [157]:
# find_all method will return list of tagObject that matched from the soup
# It looks through a tag’s descendants and retrieves all descendants that match your filters

allTr=soup.find_all("tr")
allTr

# If we use a list we can match against any item in that list. it will find all tr and td separately
list_input=soup.find_all(name=["tr", "td"])
list_input

# we can use the attributes of tag to find them using find_all
# list_input=table_bs.find_all(href="https://en.wikipedia.org/wiki/Florida")

# the below code will find all tags with any id. Prvoided it must have id attr
ids=soup.find_all(id=True)
ids

# With string, you can search for strings instead of tags where we find all the elments with "Florida"
table_bs.find_all(string="Florida")




[<nav id="site-nav">
 <div class="container">
 <div class="col-md-12">
 <ul class="nav nav-tabs">
 <li id="nav-homepage">
 <a class="nav-link hidden-sm hidden-xs" href="/">
 <img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                 Scrape This Site
                             </a>
 </li>
 <li id="nav-sandbox">
 <a class="nav-link" href="/pages/">
 <i class="glyphicon glyphicon-console hidden-sm hidden-xs"></i>
                                 Sandbox
                             </a>
 </li>
 <li id="nav-lessons">
 <a class="nav-link" href="/lessons/">
 <i class="glyphicon glyphicon-education hidden-sm hidden-xs"></i>
                                 Lessons
                             </a>
 </li>
 <li id="nav-faq">
 <a class="nav-link" href="/faq/">
 <i class="glyphicon glyphicon-flag hidden-sm hidden-xs"></i>
                                 FAQ
                             </a>
 </li>
 <li class="pull-right" id="nav-login">
 <a class="nav-link" hre

In [63]:
# printing the table to console
# formating logic

allTr=soup.find_all("tr")
allTr
# we can use the allTr list iterable to get the data
# we have to write the logic to parse or get the data in proper format
headerTr= allTr[0].find_all("th")
for i in headerTr:
    print(i.text.split("\n")[1].strip(), end=",")
print()
for i in allTr[1:]:
    for k in i.find_all("td"):
        if len(k.text)==1:
            print(None, end=",")
        else:
            print(k.text.split("\n")[1].strip(), end=",")

Team Name,Year,Wins,Losses,OT Losses,Win %,Goals For (GF),Goals Against (GA),+ / -,
Boston Bruins,1990,44,24,None,0.55,299,264,35,Buffalo Sabres,1990,31,30,None,0.388,292,278,14,Calgary Flames,1990,46,26,None,0.575,344,263,81,Chicago Blackhawks,1990,49,23,None,0.613,284,211,73,Detroit Red Wings,1990,34,38,None,0.425,273,298,-25,Edmonton Oilers,1990,37,37,None,0.463,272,272,0,Hartford Whalers,1990,31,38,None,0.388,238,276,-38,Los Angeles Kings,1990,46,24,None,0.575,340,254,86,Minnesota North Stars,1990,27,39,None,0.338,256,266,-10,Montreal Canadiens,1990,39,30,None,0.487,273,249,24,New Jersey Devils,1990,32,33,None,0.4,272,264,8,New York Islanders,1990,25,45,None,0.312,223,290,-67,New York Rangers,1990,36,31,None,0.45,297,265,32,Philadelphia Flyers,1990,33,37,None,0.412,252,267,-15,Pittsburgh Penguins,1990,41,33,None,0.512,342,305,37,Quebec Nordiques,1990,16,50,None,0.2,236,354,-118,St. Louis Blues,1990,47,22,None,0.588,310,250,60,Toronto Maple Leafs,1990,23,46,None,0.287,241,318,-77,Va

In [68]:
# Writing to csv, with formating logic

# open or create the csv file
csvFile =open("data.csv", "w")

# create a writer object with csv.writer
writer=csv.writer(csvFile)

allTr=soup.find_all("tr")
allTr
# we can use the allTr list iterable to get the data
# we have to write the logic to parse or get the data in proper format

headerTr= allTr[0].find_all("th")

# writing single row to the csv. Actually im writing headers here
writer.writerow([i.text.split("\n")[1].strip() for i in headerTr])

# writing rows data
for i in allTr[1:]:
    writer.writerow([k.text.split("\n")[1].strip() if len(k.text)>1 else None for k in i.find_all("td")])
csvFile.close()


Your logic will work only if all the element all present and will not behave differently, else we might get an error
to resolve this, use try expect block to handle the errors

In [8]:
# here if we see, the html is not proper. But the lxml has parsed the html properly

html = '''<b>tutorialspoint</b>, <i>&web scraping &data science;</i>'''
soup = bs4.BeautifulSoup(html, 'lxml')
print(soup)

<html><body><b>tutorialspoint</b>, <i>&amp;web scraping &amp;data science;</i></body></html>


In [14]:
# We can also use bs4 to parse xml files

with open ("data.xml", "r") as file:
    xmlSoup=BeautifulSoup(file, "xml")  #here the parser is xml. for html, the parser is lxml or html5lib
    
xmlSoup

<?xml version="1.0" encoding="utf-8"?>
<CATALOG>
<CD>
<TITLE>Empire Burlesque</TITLE>
<ARTIST>Bob Dylan</ARTIST>
<COUNTRY>USA</COUNTRY>
<COMPANY>Columbia</COMPANY>
<PRICE>10.90</PRICE>
<YEAR>1985</YEAR>
</CD>
<CD>
<TITLE>Hide your heart</TITLE>
<ARTIST>Bonnie Tyler</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>CBS Records</COMPANY>
<PRICE>9.90</PRICE>
<YEAR>1988</YEAR>
</CD>
<CD>
<TITLE>Greatest Hits</TITLE>
<ARTIST>Dolly Parton</ARTIST>
<COUNTRY>USA</COUNTRY>
<COMPANY>RCA</COMPANY>
<PRICE>9.90</PRICE>
<YEAR>1982</YEAR>
</CD>
<CD>
<TITLE>Still got the blues</TITLE>
<ARTIST>Gary Moore</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>Virgin records</COMPANY>
<PRICE>10.20</PRICE>
<YEAR>1990</YEAR>
</CD>
<CD>
<TITLE>Eros</TITLE>
<ARTIST>Eros Ramazzotti</ARTIST>
<COUNTRY>EU</COUNTRY>
<COMPANY>BMG</COMPANY>
<PRICE>9.90</PRICE>
<YEAR>1997</YEAR>
</CD>
<CD>
<TITLE>One night only</TITLE>
<ARTIST>Bee Gees</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>Polydor</COMPANY>
<PRICE>10.90</PRICE>
<YEAR>1998</YEAR>
</CD>
<CD

In [83]:
# When we passed a html document or string to a beautifulsoup constructor, 
# beautifulsoup basically converts a complex html page into different python objects.

# 1) Tag Objects - A tag object in BeautifulSoup corresponds to an HTML or XML tag in the actual page or document.

soup = BeautifulSoup('<b class="boldest">TutorialsPoint</b>')
bTag= soup.b #bTag is tag object

bTag.name  #to get the name of tag
bTag.name="p"  #we can change the tag name like this. It will change the tag in soup object as well
bTag.attrs  #to get the tag's element defined in the html. we can also access like tagobject["attr"] example bTag["class"]

#we can modify the attributes of the tag using the tagObject
# Adding attrs
bTag["class"]=["ClassA", "ClassB"]
bTag["style"]="some_style_using_css"
bTag    #<p class="ClassA ClassB" style="some_style_using_css">TutorialsPoint</p>
soup   #<html><body><p class="ClassA ClassB" style="some_style_using_css">TutorialsPoint</p></body></html>

# Deleting attrs
del bTag["style"]
bTag     #<p class="ClassA ClassB">TutorialsPoint</p>
soup   #<html><body><p class="ClassA ClassB">TutorialsPoint</p></body></html>. this will affect the soup as well,remove from it
# accessing attrs after deleting will cause keyerror

# some tags have multi valued attrs
css_soup = BeautifulSoup('<p class="body bold"></p>')
css_soup.p.get("class")  #['body', 'bold']


<html><body><p class="ClassA ClassB">TutorialsPoint</p></body></html>

In [72]:
#  2) NavigableString object
# The navigablestring objects are used to represent text within tags, rather than the tags themselves.
# It is used to represent the contents of a tag. To access the contents, use “.string” with tag.

bTag.string  #'TutorialsPoint'

bTag.string.replace_with(" Home point Tuts")

bTag.string  #' Home point Tuts' soup will also get updated to this new string

' Home point Tuts'

In [74]:
# 3) BeautifulSoup object- BeautifulSoup is the object created when we try to scrape a web resource. 
# So, it is the complete document which we are trying to scrape

soup.name

'[document]'

In [79]:
# 4) Comments object -The comment object illustrates the comment part of the web document. 
# It is just a special type of NavigableString.

soup = BeautifulSoup('<p><!-- Everything inside it is COMMENTS --></p>')
comment = soup.p.string
type(comment)    #bs4.element.Comment
soup.p.prettify()

'<p>\n <!-- Everything inside it is COMMENTS -->\n</p>'

Rest API’s function by sending a request, the request is communicated via HTTP message. The HTTP message usually contains a JSON file. This contains instructions for what operation we would like the service or resource to perform. In a similar manner, API returns a response, via an HTTP message, this response is usually contained within a JSON

In [162]:
# convert HTML table to pandas HTML

url = "https://en.wikipedia.org/wiki/World_population"

data  = requests.get(url).text

soup = BeautifulSoup(data,"html.parser")

tables = soup.find_all('table')

for index,table in enumerate(tables):
    if ("10 most densely populated countries" in str(table)):
        table_index = index
        
population_data = pd.DataFrame(columns=["Rank", "Country", "Population", "Area", "Density"])

for row in tables[table_index].tbody.find_all("tr"):
    col = row.find_all("td")
    if (col != []):
        rank = col[0].text
        country = col[1].text
        population = col[2].text.strip()
        area = col[3].text.strip()
        density = col[4].text.strip()
#         population_data = population_data.append({"Rank":rank, "Country":country, "Population":population, "Area":area, "Density":density}, ignore_index=True)
        population_data = pd.concat([population_data, pd.DataFrame([{"Rank":rank, "Country":country, "Population":population, "Area":area, "Density":density}])], ignore_index=True)

population_data

Unnamed: 0,Rank,Country,Population,Area,Density
0,1,Singapore,5921231,719,8235
1,2,Bangladesh,165650475,148460,1116
2,3,\n Palestine[103]\n\n,5223000,6025,867
3,4,Taiwan,23580712,35980,655
4,5,South Korea,51844834,99720,520
5,6,Lebanon,5296814,10400,509
6,7,Rwanda,13173730,26338,500
7,8,Burundi,12696478,27830,456
8,9,India,1389637446,3287263,423
9,10,Netherlands,17400824,41543,419


In [170]:
# We can now use the pandas function read_html and give it the string version of the table to create the df
# provide the flavor which is the parsing engine bs4.

htmlTab= pd.read_html(str(tables[table_index]), flavor="bs4")

# The function read_html always returns a list of DataFrames so we must pick the one we want out of the list.
htmlTab[0]

Unnamed: 0,Rank,Country,Population,Area (km2),Density (pop/km2)
0,1,Singapore,5921231,719,8235
1,2,Bangladesh,165650475,148460,1116
2,3,Palestine[103],5223000,6025,867
3,4,Taiwan,23580712,35980,655
4,5,South Korea,51844834,99720,520
5,6,Lebanon,5296814,10400,509
6,7,Rwanda,13173730,26338,500
7,8,Burundi,12696478,27830,456
8,9,India,1389637446,3287263,423
9,10,Netherlands,17400824,41543,419


In [174]:
# We can also use the read_html function to directly get DataFrames from a url.
# the dataframe_list will contain all the tables of url html page

dataframe_list = pd.read_html(url, flavor='bs4')

dataframe_list[5]

Unnamed: 0,Rank,Country,Population,Area (km2),Density (pop/km2),Population trend[citation needed]
0,1,India,1389637446,3287263,423,Growing
1,2,Pakistan,242923845,796095,305,Rapidly growing
2,3,Bangladesh,165650475,148460,1116,Growing
3,4,Japan,124214766,377915,329,Declining[104]
4,5,Philippines,114597229,300000,382,Growing
5,6,Vietnam,103808319,331210,313,Growing
6,7,United Kingdom,67791400,243610,278,Growing
7,8,South Korea,51844834,99720,520,Steady
8,9,Taiwan,23580712,35980,655,Steady
9,10,Sri Lanka,23187516,65610,353,Growing


In [175]:
# We can also use the `match` parameter to select the specific table we want. 
# If the table contains a string matching the text it will be read.

pd.read_html(url, match="10 most densely populated countries", flavor='bs4')[0]

Unnamed: 0,Rank,Country,Population,Area (km2),Density (pop/km2)
0,1,Singapore,5921231,719,8235
1,2,Bangladesh,165650475,148460,1116
2,3,Palestine[103],5223000,6025,867
3,4,Taiwan,23580712,35980,655
4,5,South Korea,51844834,99720,520
5,6,Lebanon,5296814,10400,509
6,7,Rwanda,13173730,26338,500
7,8,Burundi,12696478,27830,456
8,9,India,1389637446,3287263,423
9,10,Netherlands,17400824,41543,419
