In [1]:
# Web Scraping Basics w/Requests and Beautiful Soup

# The basic mechanisms to `scrape` the web. 

# Requests and BeautifulSoup are very popular libraries for web scraping

import requests
from bs4 import BeautifulSoup
import os

In [2]:
# Scraper configuration -- edit these values to point the notebook elsewhere.

# Name of the HTML element to collect from the page.
html_target = "a"

# Name of the attribute read from each collected element.
tag = "href"

# Filename extension a link must end with to be downloaded.
f_ext = ".pdf"

# Local directory the downloaded files are written to.
dir_name = "Ghodsi_Ali"

# Base URL of the page to scrape; file links are appended to it.
url = 'https://www.cs.berkeley.edu/~alig/papers'

In [3]:
# We call `requests.get`, which sends an HTTP GET request to `url`.
# `r` is the `HTTP 1.1` response object it returns.

# From the response we have:
# *    status (`r.status_code`)
# *    encoding (`r.encoding`)
# *    text of the body (`r.text`) --- should type check this
# *    content of the body (`r.content`) --- type binary

In [4]:
# Fetch the page to scrape. requests applies no timeout by default, so one is
# given explicitly to keep this cell from hanging if the server never answers.
r = requests.get(url, timeout=30)
r

<Response [200]>

In [5]:
# Keep the response's status code for the summary printed later; the bare
# name on the last line displays it as the cell output.
status = r.status_code
status

200

In [6]:
# Encoding that requests reports for this response (displayed as the cell output).
# NOTE(review): the type of this value has not been checked -- presumably a
# str naming the codec used to decode `r.text`; confirm against the requests docs.
encoding = r.encoding
encoding

'ISO-8859-1'

In [7]:
# Body of the response as text; this is what gets handed to the HTML parser.
html_doc = r.text

In [8]:
# Parse the page text into a BeautifulSoup document using the 'html.parser' backend.
soup = BeautifulSoup(html_doc, 'html.parser')

In [9]:
# Collect every `html_target` element in the document (calling the soup
# object directly is shorthand for this explicit `find_all`).
anchor = soup.find_all(html_target)

In [10]:
def make_dir(directory):
    """
    Create `directory`, including any missing parent directories.

    directory: path of the directory to create.
    return: None
    raises: FileExistsError if `directory` exists but is not a directory.

    Calling this for a directory that already exists does nothing.
    """
    # `exist_ok=True` lets os.makedirs handle the "already there" case itself.
    # A separate os.path.exists() check beforehand can race with another
    # process creating the directory between the check and the call.
    os.makedirs(directory, exist_ok=True)

In [11]:
# This function makes a new request each time it's called. 
# It writes the binary content to file.
# This could be two functions. 
#     One to get the new request object/content. 
#     The other actually writes that content to file. 
# This modular design can be implemented by the reader if the reader is so inclined.

def download_url(url, endpoint, directory=None, timeout=30):
    """
    Download `{url}/{endpoint}` and write the response body to a local file.

    url: base URL the endpoint is appended to.
    endpoint: link relative to `url`; its final path component becomes the
        local filename.
    directory: existing directory to write into. Defaults to the global
        `dir_name`, which must then be defined before this is called.
    timeout: seconds to wait for the server before giving up.
    return: None
    raises: ValueError if `endpoint` has no filename component.
            requests.HTTPError if the server answers with a 4xx/5xx status.
            Other requests exceptions on connection failure or timeout.
    """
    if directory is None:
        directory = dir_name

    # The endpoint comes from scraped HTML, so only its final component is
    # used locally: a value such as "../x.pdf" or "sub/x.pdf" must not write
    # outside `directory` or into a folder that does not exist.
    file_name = os.path.basename(endpoint)
    if not file_name:
        raise ValueError("endpoint has no filename component: {0!r}".format(endpoint))

    # Strip a trailing slash from the base so the joined URL never contains "//".
    url_addr = "{url}/{endpoint}".format(url=url.rstrip("/"), endpoint=endpoint)
    file_path = "{directory}/{endpoint}".format(directory=directory, endpoint=file_name)

    r = requests.get(url_addr, timeout=timeout)
    # Fail before touching the disk so an error page is never saved as the file.
    r.raise_for_status()
    content_file = r.content

    with open(file_path, 'wb') as f:
        print("Downloading From: {url}\nWriting to: {file_path}".format(
            url=url_addr,
            file_path=file_path,
        ))
        f.write(content_file)
        
# This is the script in action. Isolated like this, it looks very meager. 
# It will be reconfigured as a series of method calls in the next iteration.

In [12]:
# Recap of the response metadata collected in the earlier cells.
print(
    "Status: {status}\nEncoding: {encoding}".format(
        status=status,
        encoding=encoding,
    )
)

Status: 200
Encoding: ISO-8859-1


In [13]:
# Progress marker shown before the download loop runs.
print ("Begin downloading")

Begin downloading


In [14]:
# Download every collected link whose target ends with `f_ext`.
make_dir(dir_name)
for a in anchor:
    # `.get` yields None for elements lacking the attribute (e.g. a bare
    # `<a name="...">`); indexing with `a[tag]` would raise KeyError there
    # and abort the whole run.
    endpoint = a.get(tag)
    # `endswith` matches an extension of any length, unlike a fixed
    # four-character slice.
    if endpoint and endpoint.endswith(f_ext):
        download_url(url, endpoint)
        print ("Finished Download -- {tag}".format(tag=endpoint))

Downloading From: https://www.cs.berkeley.edu/~alig/papers/architecting-for-innovation.pdf
Writing to: Ghodsi_Ali/architecting-for-innovation.pdf
Finished Download -- architecting-for-innovation.pdf
Downloading From: https://www.cs.berkeley.edu/~alig/papers/bolt-on-causal-consistency.pdf
Writing to: Ghodsi_Ali/bolt-on-causal-consistency.pdf
Finished Download -- bolt-on-causal-consistency.pdf
Downloading From: https://www.cs.berkeley.edu/~alig/papers/cap-for-networks.pdf
Writing to: Ghodsi_Ali/cap-for-networks.pdf
Finished Download -- cap-for-networks.pdf
Downloading From: https://www.cs.berkeley.edu/~alig/papers/choosy.pdf
Writing to: Ghodsi_Ali/choosy.pdf
Finished Download -- choosy.pdf
Downloading From: https://www.cs.berkeley.edu/~alig/papers/content-oriented-naming.pdf
Writing to: Ghodsi_Ali/content-oriented-naming.pdf
Finished Download -- content-oriented-naming.pdf
Downloading From: https://www.cs.berkeley.edu/~alig/papers/dangers-causal-consistency.pdf
Writing to: Ghodsi_Ali/dan

In [15]:
# Progress marker shown once the download cell above has been run.
print ("Finished Downloading")

Finished Downloading
