In [None]:
### Notebook for tutorial and assignment to understand web scraping
### Sections will cover:
### 1. Basics of HTML
### 2. Web scraping using Python
### 3. Parsing HTML using BeautifulSoup
### 4. Web Scraping Guidelines
### 5. Scraping a webpage
### 6. Scraping multiple pages
### 7. Scraping multiple items
### 8. Scraping data from tables
### 9. Scraping data from forms
### 10. Scraping data from APIs

# Web Scraping using Python

## Requests library

In [None]:
# This cell demonstrates the requests library.

import requests
import pprint

url = 'https://www.google.com'
# Pass an explicit timeout: without one, requests.get() can wait forever
# if the server never answers, which would hang the notebook.
response = requests.get(url, timeout=10)
print(f'Response status code: {response.status_code}')
print("---------------------------------")
print(f'Response headers: {response.headers}')
print("---------------------------------")
print(f'Response text: {response.text}')
print("---------------------------------")
# pprint.pprint() prints its argument and returns None, so embedding it in an
# f-string would show "Pretty printed response text: None". pprint.pformat()
# returns the formatted string, which is what the f-string needs.
print(f'Pretty printed response text: {pprint.pformat(response.text)}')  # search for logo to verify

# Note: the requests.get() method sends a GET request to the specified URL and
# returns a response object containing the server's response to the request.
# It returns a status code, headers and text. These are explained below:
# status code: a 3-digit code that indicates the status of the response. Common status codes are 200 (OK), 404 (Not Found), and 500 (Internal Server Error)
# headers: a dictionary containing the response headers
# text: the content of the response in Unicode

Response status code: 200
---------------------------------
Response headers: {'Date': 'Thu, 02 Jan 2025 09:06:13 GMT', 'Expires': '-1', 'Cache-Control': 'private, max-age=0', 'Content-Type': 'text/html; charset=ISO-8859-1', 'Content-Security-Policy-Report-Only': "object-src 'none';base-uri 'self';script-src 'nonce-nKbj4cxG19loV2BPZLub1Q' 'strict-dynamic' 'report-sample' 'unsafe-eval' 'unsafe-inline' https: http:;report-uri https://csp.withgoogle.com/csp/gws/other-hp", 'Accept-CH': 'Sec-CH-Prefers-Color-Scheme', 'P3P': 'CP="This is not a P3P policy! See g.co/p3phelp for more info."', 'Content-Encoding': 'gzip', 'Server': 'gws', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN', 'Set-Cookie': 'AEC=AZ6Zc-VKeuvKutY7MGZwPwcJy8J8mTkBLysJuHt4iFtHJX1SLG1BTUlUH6k; expires=Tue, 01-Jul-2025 09:06:13 GMT; path=/; domain=.google.com; Secure; HttpOnly; SameSite=lax, NID=520=bKMtaEyxQTajO6c9X7TjxMcnqfz0ID_Uj2a_A3mMOIs7s4MSCVVxwrHTb2v6_1Oml3zkSw8OZAUiX8qPh6EhM_MYwnuSPTwmpJIyUkZN9i32nCwNfKEqVt2

In [5]:
# More with requests

# Sending a GET request with parameters: the `params` dict is passed to
# requests.get(), which adds it to the URL as the query string.
url = 'https://www.google.com/search'
params = {'q': 'python'}
# Explicit timeout so the cell cannot hang indefinitely on a stalled connection.
response = requests.get(url, params=params, timeout=10)  # search the output for href to see links
# Pretty-print the response text
pprint.pprint(response.text)

('<!doctype html><html lang="en-IN"><head><meta charset="UTF-8"><meta '
 'content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" '
 'itemprop="image"><title>python - Google Search</title><script '
 'nonce="ONMBllgaQkDOWB1Po66dzw">(function(){\n'
 'document.documentElement.addEventListener("submit",function(b){var '
 'a;if(a=b.target){var '
 'c=a.getAttribute("data-submitfalse");a=c==="1"||c==="q"&&!a.elements.q.value?!0:!1}else '
 'a=!1;a&&(b.preventDefault(),b.stopPropagation())},!0);document.documentElement.addEventListener("click",function(b){var '
 'a;a:{for(a=b.target;a&&a!==document.documentElement;a=a.parentElement)if(a.tagName==="A"){a=a.getAttribute("data-nohref")==="1";break '
 'a}a=!1}a&&b.preventDefault()},!0);}).call(this);(function(){window.google=window.google||{};var '
 'a=window.performance&&window.performance.timing&&"navigationStart"in '
 'window.performance.timing,b=google.stvsc&&google.stvsc.ns,c=a?b||window.performance.timing.navigationStart:void '

In [None]:
# Sending a POST request
# POST requests are used to submit data to the server. The data is sent in the
# request body instead of the URL.

url = 'https://www.google.com'
data = {'key1': 'value1', 'key2': 'value2'}
# Explicit timeout so the cell cannot hang indefinitely if the server never responds.
response = requests.post(url, data=data, timeout=10)