In [None]:
import http.server as SimpleHTTPServer
import socketserver as SocketServer
from threading import Thread
from time import sleep
import requests
import ipywidgets as widgets
from IPython.display import HTML, display
import logging

In [None]:
out = widgets.Output(layout={'border': '1px solid black'})

In [None]:
out.clear_output()

In [None]:
with out:
    display(HTML("<h1>Request Headers</h1>"))

In [None]:
out

In [None]:
PORT = 8000

class GetHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
    """Static-file request handler that logs the headers of each GET request.

    The headers are emitted with ``logging.error`` and the request is then
    handed to the stock ``SimpleHTTPRequestHandler.do_GET`` implementation.
    """

    def do_GET(self):
        """Log the incoming request headers, then serve the GET as usual."""
        # ERROR-level records pass logging's default WARNING threshold, so
        # they are shown even when logging has not been configured.
        logging.error(self.headers)
        super().do_GET()


class ReusableTCPServer(SocketServer.TCPServer):
    """TCPServer variant that allows its address to be re-bound quickly.

    With ``allow_reuse_address`` enabled, creating the server again shortly
    after a previous one was closed (e.g. after a kernel restart) does not
    fail with "Address already in use" while the port is in TIME_WAIT.
    """

    allow_reuse_address = True


Handler = GetHandler
# Bind to every interface ("") on PORT; requests are dispatched to Handler.
httpd = ReusableTCPServer(("", PORT), Handler)

In [None]:
# Thread whose target is the server's serve_forever loop, so that the loop
# runs in the background instead of in the cell that starts it.
t = Thread(target=httpd.serve_forever)

In [None]:
t.start()

In [None]:
# Build the URL from PORT so it always matches the port the server is bound
# to; the timeout keeps this cell from hanging if the server is not running.
r = requests.get("http://localhost:{}".format(PORT), timeout=10)
r.text

In [None]:
# requests is not a browser, but it still sends headers with its HTTP requests.
# Headers contain metadata about the request, including what kind of data formats (content) the user "agent" will accept.

In [None]:
# serve_forever() only returns once shutdown() has been called, so stop the
# server before joining; a bare join() would block this cell indefinitely.
if t.is_alive():
    httpd.shutdown()
httpd.server_close()  # close the listening socket so the port is released
t.join()

### Scraping the GW Schedule of Classes

In [None]:
from bs4 import BeautifulSoup
import requests
import re

In [None]:
# Page that lists the subjects (departments) in the schedule of classes.
depts_url = 'https://my.gwu.edu/mod/pws/subjects.cfm'

# Query-string parameters sent along with the request for that page.
params = dict(
    campus_id='1',  # Main Campus
    term_id='202303',
)

In [None]:
# Getting a single page
depts_page = requests.get(depts_url, params=params)

In [None]:
depts_page

In [None]:
depts_page.text
# Introduce browser inspector at this point

In [None]:
soup = BeautifulSoup(depts_page.text, features="html.parser")


In [None]:
soup

In [None]:
# Explain how to derive this
soup.find("div", class_="subjectsMain").find_all("a")

In [None]:
# Relative URL of every department link inside the "subjectsMain" container.
# href=True restricts the search to anchors that actually carry an href
# attribute, so indexing anchor['href'] cannot raise KeyError.
links = [
    anchor['href']
    for anchor in soup.find("div", class_="subjectsMain").find_all("a", href=True)
]

In [None]:
links

In [None]:
# we can construct the URL from one of these strings and the base URL
course_page = requests.get('https://my.gwu.edu/mod/pws/' + links[0])
course_page

In [None]:
soup = BeautifulSoup(course_page.text, features="html.parser")
soup

In [None]:
# Let's say we're interested in extracting the times each course meets, along with its dept, course number, course title, and section number.
# We'll need the 3rd, 4th, 5th, and 9th cells (<td>) from the first row (<tr>) of each <table> with the class "courseListing".

In [None]:
tables = soup.find_all('table', class_='courseListing')
tables

In [None]:
# Add splitting as iterative steps
# Build one record per course table from the cells of the table's first row.
courses = []
for listing in tables:
    first_row = listing.find('tr')
    tds = first_row.find_all('td')
    record = {}
    record['course_code'] = tds[2].text.split()    # 3rd cell, split on whitespace
    record['section'] = tds[3].text                # 4th cell
    record['title'] = tds[4].text                  # 5th cell
    record['times'] = tds[8].text.split('AND')     # 9th cell, one entry per meeting
    courses.append(record)

In [None]:
courses[0]

In [None]:
courses[-1]

In [None]:
courses[2]

In [None]:
# Refactoring to retrieve all course info
def scrape_course_info(page):
    """Extract course records from one course-listing page.

    Parameters
    ----------
    page : response-like object
        Must expose the page's HTML as ``page.text`` (for example the
        object returned by ``requests.get``).

    Returns
    -------
    list of dict
        One dict per usable ``<table class="courseListing">``, with keys
        ``course_code`` (cell text split on whitespace), ``section``,
        ``title`` and ``times`` (cell text split on the literal ``'AND'``).
        Tables whose first row has fewer than nine cells are skipped.
    """
    soup = BeautifulSoup(page.text, features="html.parser")
    courses = []
    for table in soup.find_all('table', class_='courseListing'):
        first_row = table.find('tr')
        cells = first_row.find_all('td') if first_row is not None else []
        # Cells 2, 3, 4 and 8 are read below, so a usable row needs at least
        # nine cells; skip malformed tables instead of raising IndexError.
        if len(cells) < 9:
            continue
        courses.append({
            'course_code': cells[2].text.split(),
            'section': cells[3].text,
            'title': cells[4].text,
            'times': cells[8].text.split('AND'),
        })
    return courses

# Fetch every department page and pool the scraped course records.
# A Session reuses the underlying connection across requests to the same
# host, and the timeout stops one stalled response from hanging the crawl.
all_courses = []
with requests.Session() as session:
    for link in links:
        course_page = session.get('https://my.gwu.edu/mod/pws/' + link, timeout=30)
        if not course_page.ok:
            # Do not try to parse an error page; report it and move on.
            logging.warning('Skipping %s (HTTP %s)', link, course_page.status_code)
            continue
        courses = scrape_course_info(course_page)
        all_courses.extend(courses)

In [None]:
len(all_courses)

In [None]:
all_courses[-1]

In [None]:
# To export to JSON
import json
# Serialize the collected course records first, then write the resulting
# JSON text to a file in the current working directory.
serialized_courses = json.dumps(all_courses)
with open('./courses-fall-2023-times-main-campus.json', 'w') as out_file:
    out_file.write(serialized_courses)

In [None]:
# How about getting the textbooks associated with the courses?
books = requests.get('https://www.bkstr.com/webApp/discoverView?bookstore_id-1=122&term_id-1=202303&div-1=&dept-1=ACA&course-1=6201&section-1=10')

In [None]:
books

In [None]:
books.text

In [None]:
# Trying with a sandbox site
r = requests.get('https://www.scrapethissite.com/pages/advanced/?gotcha=headers')

In [None]:
r

In [None]:
r.text

In [None]:
# Retry the same sandbox URL, this time sending a browser-style User-Agent
# and an Accept header that asks for HTML.
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'),
    ('Accept', 'text/html'),
])
r = requests.get(
    'https://www.scrapethissite.com/pages/advanced/?gotcha=headers',
    headers=headers,
)

In [None]:
r

In [None]:
# Finding the hidden API in the Nintendo E-shop
# 1. Construct the search
# 2. Identify the data XHR
# 3. Extract the request info
# 4. Show how to perform pagination

In [None]:
# Raw request headers, one "Name: value" pair per line.
# NOTE(review): presumably pasted from the browser's network inspector for
# the search XHR — confirm before relying on individual values.
nintendo_headers_raw = '''
Accept: */*
Accept-Encoding: gzip, deflate, br
Accept-Language: en-US,en;q=0.9
Cache-Control: no-cache
Connection: keep-alive
Content-Length: 345
Host: u3b6gr4ua3-dsn.algolia.net
Origin: https://www.nintendo.com
Pragma: no-cache
Referer: https://www.nintendo.com/
Sec-Fetch-Dest: empty
Sec-Fetch-Mode: cors
Sec-Fetch-Site: cross-site
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36
content-type: application/x-www-form-urlencoded
sec-ch-ua: "Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"
sec-ch-ua-mobile: ?0
sec-ch-ua-platform: "macOS"
x-algolia-api-key: a29c6927638bfd8cee23993e51e721c9
x-algolia-application-id: U3B6GR4UA3
'''

# Convert the text block into a {name: value} mapping. Each line is split at
# its first ': ' only, so values that themselves contain ': ' stay intact.
nintendo_headers = {}
for header_line in nintendo_headers_raw.splitlines():
    if not header_line:
        continue  # the literal starts with a newline, giving one empty line
    field_name, _sep, field_value = header_line.partition(': ')
    nintendo_headers[field_name] = field_value

In [None]:
# Endpoint the search request is POSTed to (path /1/indexes/*/queries on the
# Algolia host). The query string carries a URL-encoded x-algolia-agent value.
nintendo_url = 'https://u3b6gr4ua3-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(4.19.0)%3B%20Browser%3B%20JS%20Helper%20(3.13.5)%3B%20react%20(17.0.2)%3B%20react-instantsearch%20(6.40.3)'
# Request body: a single query against the "store_game_en_us" index. The
# "params" value is itself a URL-encoded query string; among other settings
# it contains facetFilters=["playerCount:2+"], hitsPerPage=100 and page=2.
# NOTE(review): changing page=... is presumably how further result pages are
# requested — confirm against the Algolia API documentation.
nintendo_json = {"requests":[{"indexName":"store_game_en_us","params":"analytics=true&attributesToHighlight=%5B%22description%22%5D&clickAnalytics=true&facetFilters=%5B%22playerCount%3A2%2B%22%5D&facetingAfterDistinct=true&facets=%5B%22*%22%5D&filters=&highlightPostTag=%5E*&highlightPreTag=%5E*%5E%5E&hitsPerPage=100&maxValuesPerFacet=100&page=2&tagFilters="}]}

In [None]:
r = requests.post(nintendo_url, headers=nintendo_headers, json=nintendo_json)

In [None]:
results = r.json()
len(results['results'][0]['hits'])

In [None]:
results