In [2]:
import json
import os
import re
import urllib
import urllib.error
import urllib.parse
import urllib.request
from datetime import timedelta, date
from time import sleep

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By

#### Basic html parsing
The below illustrates the use of Beautiful Soup for parsing and cleaning the raw content of a webpage. We will use a faculty-favorite, all-too-familiar use case: trying to get a plain-text list of the students in one's class, using Banner. It turns out it is easier to do this by writing a web scraper than by using the Banner interface itself. #UIGoals

In [6]:
# Download the class-list page. To parse a saved local copy instead, use:
#html_doc = open("../code/Summary_Class_List.html")
req = urllib.request.Request("https://cs.brown.edu/people/epavlick/Summary_Class_List.html")
# The context manager closes the HTTP connection once the body has been read.
with urllib.request.urlopen(req) as response:
    html_doc = response.read()
html_dump = BeautifulSoup(html_doc, 'html.parser')
#print("The raw html:", html_dump.title)
#print("The name of the html tag:", html_dump.title.name)
#print("The content between the tags:", html_dump.title.string)
#print("The content between the tags:", html_dump.title.text)

# list of all the things with the label "table"
html_tables = html_dump.find_all('table')
#print(len(html_tables))
# Select the table by its caption; if several tables share it, the last one wins.
class_list_html = None
for table in html_tables:
    if table.caption and table.caption.string == "Summary Class List":
        class_list_html = table
#print(class_list_html)
if class_list_html is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError('No table captioned "Summary Class List" was found on the page')

rows = class_list_html.find_all('tr')
header = rows[0]
col_names = [c.string for c in header.find_all('th')]
#print(col_names)
# Position of the name column; raises ValueError if the header has no such column.
idx = col_names.index("Student Name")
#print(idx)
for row in rows[1:]: # first one is header
    cols = row.find_all('td')
    # Skip rows that lack the name cell or the link inside it, rather than crashing on them.
    if len(cols) <= idx or cols[idx].a is None:
        continue
    student_name_col = cols[idx]
    print(student_name_col.a.string)

Shmo, Joe
Shmane, Jane
Mouse, Mickey
Schmikey, Mickey
Man, The
Guy, That
Duck, Donald
Disney, Walt
Shmo, Joe


#### Basic web scraping
The below illustrates a basic script for crawling a webpage and recursively following links. For this, we will use the Alexa top sites page to get a list of the most popular domains in a given category.

In [None]:
def bare_bones_request(url):
    """Fetch `url` with no extra headers and print the raw response body.

    Args:
        url: Address to request; anything `urllib.request.Request` accepts.

    Returns:
        None. The body is printed as a bytes object.
    """
    req = urllib.request.Request(url)
    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        print(response.read())
    
def main(TOP_DOMAIN="Top"):
    """Crawl the Alexa top-sites category pages and print the sites listed on them.

    Starting from the category index, every link whose path begins with
    "/topsites/category/<TOP_DOMAIN>" is followed depth-first. Each "/siteinfo/"
    link found on a page is printed as a tab-separated pair: the link text and
    the URL of the page it was found on, with the "Top/" category prefix removed.

    Args:
        TOP_DOMAIN: Category path prefix to stay within, e.g. "Top" or
            "Top/Society".
    """
    base_url = 'https://www.alexa.com/topsites/category/'

    # Sometimes you need to give more info in the request, in which case you can use a header
    # E.g. this says "pretend I am coming from firefox"
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}

    # Basic depth first search
    stack = [base_url]
    # URLs already fetched. Without this, any page that links back to a category
    # that was already crawled would keep the loop running forever.
    visited = set()
    while stack:
        url = stack.pop() # Python "pops" from the end of a list,
                          # so append/pop gives stack behavior
                          # (idk about you, but this clashes a bit with my mental model,
                          # so wanted to comment on it)
        if url in visited:
            continue
        visited.add(url)

        try:
            req = urllib.request.Request(url, headers=hdr)
            # The context manager closes the connection once the body is read.
            with urllib.request.urlopen(req) as response:
                txt = response.read()
        except(UnicodeEncodeError, urllib.error.HTTPError, urllib.error.URLError):
            print("Error on request: %s"%url)
            continue

        doc = BeautifulSoup(txt, 'html.parser')

        for item in doc.find_all('a'):
            child = item.get('href')
            #print(child)
            if not child:
                continue
            if child.startswith("/topsites/category/%s"%(TOP_DOMAIN)):
                # Sub-category link: queue its absolute URL for a later visit.
                child = child.replace("/topsites/category/", "")
                stack.append(base_url+child)
            elif child.startswith("/siteinfo/"):
                # Site link: report it with the category page it appeared on.
                print('%s\t%s'%(item.string,
                    url.replace("https://www.alexa.com/topsites/category/Top/", "")))
                    
# Uncomment to print the raw HTML of the category index instead of crawling:
#bare_bones_request('https://www.alexa.com/topsites/category/')
# Crawl with the default TOP_DOMAIN ("Top"); call main(TOP_DOMAIN="Top/Society")
# instead to follow only links under that category.
main()

#### Basic API calls
The below illustrates the construction of a basic API call, using the NY Times API as an example. It shows how to both 1) construct the API call yourself and 2) use a Python library to make it a bit cleaner/more readable.

In [None]:
# Read the API key from the environment instead of committing it to the notebook.
KEY = os.environ.get("NYT_API_KEY")
if not KEY:
    raise RuntimeError("Set the NYT_API_KEY environment variable to your NY Times API key")

# No trailing slash here: every endpoint path below starts with its own "/",
# so a trailing slash would produce ".../svc//search/...".
base_url = "https://api.nytimes.com/svc"
# usual format of API calls: {endpoint}?key1=value1&key2=value2&...keyn=valuen

# search endpoint
# search/v2/articlesearch.json?q={query}&fq={filter}

# urlencode joins the pairs with "&" and escapes any characters that are not
# URL-safe, so a query containing spaces or "&" still yields a valid URL.
query_string = urllib.parse.urlencode({'q': 'providence',
                                       'begin_date': '20180101',
                                       'end_date': '20181201',
                                       'api-key': KEY})
call = base_url + "/search/v2/articlesearch.json?" + query_string

# Other fun calls to try
# share endpoint
# mostpopular/v2/shared/{period}/{share_type}.json
# most shared on facebook for past 1 day
# call = "https://api.nytimes.com/svc/mostpopular/v2/shared/1/facebook.json?api-key=%s"%KEY

# most emailed endpoint
# mostpopular/v2/emailed/{period}.json
# most emailed for last 7 days
# call = "%s/mostpopular/v2/emailed/7.json?api-key=%s"%(base_url, KEY)

#print(call)  # careful: the printed URL contains the API key
req = urllib.request.Request(call)
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(req) as response:
    blob = response.read()
data = json.loads(blob)
#print(data['status'])
for article in data['response']['docs']:
    print('%s\n%s\n'%(article['web_url'], article['snippet']))

In [None]:
# Read the API key from the environment instead of committing it to the notebook.
KEY = os.environ.get("NYT_API_KEY")
if not KEY:
    raise RuntimeError("Set the NYT_API_KEY environment variable to your NY Times API key")

base_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
query = "providence"
start = "20180101"
end = "20181201"

# requests URL-encodes these and appends them to base_url as the query string.
params = { 'api-key': KEY,
           'q': query,
           'begin_date': start,
           'end_date': end}

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(base_url, params=params, timeout=30)
response.raise_for_status() # will throw an error if status is not OK
data = response.json()

#print(data)  # uncomment to inspect the full raw payload

for article in data['response']['docs']:
    print('%s\n%s\n'%(article['web_url'], article['snippet']))

#### Bot?! Who are you calling a bot?
The below illustrates another sometimes-useful library, which actually launches your browser so that it is less obvious that you are not a person. I used this successfully for a while until they started introducing captchas. ::eyeroll:: Surely there are better/more advanced libraries for this now, but this may serve as a useful starting point...

In [None]:
def daterange(start_date, end_date):
    """Yield consecutive days from `start_date` up to, but excluding, `end_date`.

    The number of days yielded is the whole-day difference between the two
    arguments; nothing is yielded when that difference is zero or negative.
    """
    span_days = (end_date - start_date).days
    offset = 0
    while offset < span_days:
        yield start_date + timedelta(days=offset)
        offset += 1

# Terms to search for; each one is looked up separately.
search_terms = ["data science"]

# Search window, written as "YYYY-MM-DD" strings.
sdate, edate = "2019-01-01", "2019-01-08"

# Split each string into its year / month / day pieces, then build date objects.
sy, sm, sd = sdate.split('-')
ey, em, ed = edate.split('-')
start_date = date(*map(int, (sy, sm, sd)))
end_date = date(*map(int, (ey, em, ed)))

print("Search from %s to %s for terms %s\n"%(start_date, end_date, search_terms))

def _news_search_url(term, day):
    """Build the Google News search URL for `term`, restricted to the single date `day`."""
    day_str = day.strftime("%m/%d/%Y")
    values = {'q' : term,
              'hl' : 'en',
              'gl' : 'us',
              'authuser' : '0',
              'source' : 'lnt',
              # custom date range whose first and last day are the same
              'tbs' : 'cdr:1,cd_min:' + day_str + ",cd_max:" + day_str,
              'tbm' : 'nws',
              'start' : '0' }
    return 'https://www.google.com/search?' + urllib.parse.urlencode(values)


def _print_new_links(driver, seen):
    """Print every link on the driver's current page that is new and not a Google link.

    Every non-empty href encountered is added to `seen`, printed or not.
    """
    for a in driver.find_elements(By.TAG_NAME, 'a'):
        try:
            link = a.get_attribute('href')
            # Captcha interstitial: re-read the href every 10 s for as long as it
            # points at Google's "sorry" page.
            # NOTE(review): presumably the wait ends when the page changes and the
            # element goes stale (handled below) — confirm.
            while (link is not None) and link.startswith(
                    ('https://ipv4.google.com/sorry/', 'https://ipv6.google.com/sorry/')):
                print("Blocked on: %s\n"%(str(link)))
                sleep(10)
                link = a.get_attribute('href')
            if (link is not None) and (link not in seen):
                #remove some obvious ads and junk <a> elements
                if (("google.com" not in link) and
                    ("webcache.googleusercontent" not in link)):
                    print(link)
                seen.add(link)
        except StaleElementReferenceException:
            print("Stale element: %s\n"%str(a))


def main():
    """Search Google News for every term on every day of the window and print new links.

    Reads the module-level `start_date`, `end_date` and `search_terms`, and drives
    a Chrome browser through Selenium. The chromedriver location is taken from the
    CHROMEDRIVER_PATH environment variable when set, otherwise from the original
    hard-coded path.
    """
    chromedriver = os.environ.get("CHROMEDRIVER_PATH", '/Users/ellie/Downloads/chromedriver')
    os.environ["webdriver.chrome.driver"] = chromedriver
    # NOTE(review): newer Selenium releases expect a Service object rather than a
    # positional driver path — confirm against the installed version.
    driver = webdriver.Chrome(chromedriver)

    seen = set()
    try:
        # Iterate through dates
        for single_date in daterange(start_date, end_date):
            print(single_date)
            for term in search_terms:
                #req = urllib.request.Request(_news_search_url(term, single_date))
                #response = urllib.request.urlopen(req) # Throws HTTP Error 403: Forbidden
                try:
                    # Fetch inside the term loop so every term is searched,
                    # not only the last one in the list.
                    driver.get(_news_search_url(term, single_date))
                except Exception as err:
                    # Best effort: report the failure and move on to the next term.
                    print("Error on request for %r: %s" % (term, err))
                    continue
                _print_new_links(driver, seen)
    finally:
        # Always end the browser session, even if the crawl fails part-way.
        driver.quit()

# Run the Selenium-driven search defined above; this starts a Chrome browser via chromedriver.
main()