# Web Scraping of NeurIPS 2019 Proceedings Using Python and BeautifulSoup Version 1
### David Lowe
### January 10, 2020

SUMMARY: The purpose of this project is to practice web scraping by extracting specific pieces of information from a website. The web scraping Python code leverages the BeautifulSoup module.

INTRODUCTION: The Conference on Neural Information Processing Systems covers a wide range of topics in neural information processing systems and research for the biological, technological, mathematical, and theoretical applications. Neural information processing is a field which benefits from a combined view of biological, physical, mathematical, and computational sciences. This web scraping script will automatically traverse through the entire web page and collect all links to the PDF and PPTX documents. The script will also download the documents as part of the scraping process. The Python script ran in the Google Colaboratory environment and can be adapted to run in any Python environment without the Colab-specific configuration.

Starting URL: https://papers.nips.cc/book/advances-in-neural-information-processing-systems-32-2019

## Section 0. Prepare Environment

In [None]:
# Colab-Specific Setup - Refresh Linux package repositories and set up additional Linux and Python tools
# !apt-get update
# !apt install chromium-chromedriver
# !pip install -q pymysql selenium

In [None]:
import numpy as np
import pandas as pd
import os
import shutil
import smtplib
import sys
from email.message import EmailMessage
from datetime import datetime
import requests
from requests.exceptions import HTTPError
from requests.exceptions import ConnectionError
from bs4 import BeautifulSoup
from random import randint
from time import sleep
from selenium import webdriver
import pymysql

In [None]:
# Begin the timer for the script processing (the elapsed time is printed at the end of the script)
startTimeScript = datetime.now()

# Set up the verbose and debug flags to print detailed messages for debugging (setting True will activate!)
# verbose prints every scraped list item; debug prints the full text of every page response
verbose = True
debug = False

# Set up the flag to send status emails (setting to True will send the status emails!)
# The emails are sent through email_notify(), which needs the mail settings in environment variables
notifyStatus = False

# Set up the mountStorage flag to mount G Drive for storing files (setting True will mount the drive!)
# When True, downloads go through download_to_gdrive() instead of download_to_local()
mountStorage = False

# Set up the executeDownload flag to download files (setting True will download!)
# When False, the pages are still visited but no document is saved
executeDownload = False

In [None]:
# Colab-Specific Setup - attach Google Drive at /content/gdrive so downloaded files can be stored there
if mountStorage:
    # Imported only on demand because google.colab exists only inside the Colab runtime
    from google.colab import drive
    drive.mount('/content/gdrive')

In [None]:
# Set up the email notification function
def email_notify(msg_text):
    """Send a status notification email with ``msg_text`` as its body.

    The mail settings are read from the environment variables MAIL_SENDER,
    MAIL_RECEIVER, SMTP_GATEWAY, SMTP_USERNAME, and SMTP_PASSWORD. The message
    is delivered through the gateway on port 587 after upgrading the
    connection with STARTTLS.

    Args:
        msg_text: Plain-text body of the notification email.

    Raises:
        SystemExit: If any of the required environment variables is not set.
    """
    sender = os.environ.get('MAIL_SENDER')
    receiver = os.environ.get('MAIL_RECEIVER')
    gateway = os.environ.get('SMTP_GATEWAY')
    smtpuser = os.environ.get('SMTP_USERNAME')
    password = os.environ.get('SMTP_PASSWORD')
    settings = (sender, receiver, gateway, smtpuser, password)
    if any(value is None for value in settings):
        sys.exit("Incomplete email setup info. Script Processing Aborted!!!")
    msg = EmailMessage()
    msg.set_content(msg_text)
    msg['Subject'] = 'Notification from Python Web Scraping Script'
    msg['From'] = sender
    msg['To'] = receiver
    # The context manager closes the connection even when starttls/login/send fails
    with smtplib.SMTP(gateway, 587) as server:
        server.starttls()
        server.login(smtpuser, password)
        server.send_message(msg)

In [None]:
def download_to_local(doc_path):
    """Download the document at ``doc_path`` into the current working directory.

    The local file name is the last '/'-separated segment of the URL. When the
    server does not answer with HTTP 200, nothing is written and a message is
    printed instead.

    Args:
        doc_path: Full URL of the document to download.
    """
    local_file = doc_path.split('/')[-1]
    with requests.get(doc_path, stream=True) as r:
        # Do not save an error page under the document's file name
        if r.status_code != requests.codes.ok:
            print('Could not download ' + doc_path + ' (HTTP status ' + str(r.status_code) + ')')
            return
        # The raw stream is not decoded by default; without this a gzip/deflate
        # encoded reply would be written to disk still compressed
        r.raw.decode_content = True
        with open(local_file, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    print('Downladed file: ' + local_file)

In [None]:
def download_to_gdrive(doc_path):
    """Download the document at ``doc_path`` into the mounted Google Drive folder.

    The file is stored under '/content/gdrive/My Drive/Colab_Downloads/' using
    the last '/'-separated segment of the URL as its name. When the server does
    not answer with HTTP 200, nothing is written and a message is printed
    instead.

    Args:
        doc_path: Full URL of the document to download.
    """
    local_file = doc_path.split('/')[-1]
    gdrivePrefix = '/content/gdrive/My Drive/Colab_Downloads/'
    dest_file = gdrivePrefix + local_file
    with requests.get(doc_path, stream=True) as r:
        # Do not save an error page under the document's file name
        if r.status_code != requests.codes.ok:
            print('Could not download ' + doc_path + ' (HTTP status ' + str(r.status_code) + ')')
            return
        # The raw stream is not decoded by default; without this a gzip/deflate
        # encoded reply would be written to disk still compressed
        r.raw.decode_content = True
        with open(dest_file, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    print('Downladed file: ' + dest_file)

In [None]:
# Report the end of the environment preparation by email when notifications are enabled
if notifyStatus:
    email_notify(
        "Phase 0 Prepare Environment completed! "
        + datetime.now().strftime('%a %B %d, %Y %I:%M:%S %p')
    )

## Section 1. Perform the Scraping and Processing

In [None]:
# Report the start of the scraping phase by email when notifications are enabled
if notifyStatus:
    email_notify(
        "Phase 1 Perform the Scraping and Processing has begun! "
        + datetime.now().strftime('%a %B %d, %Y %I:%M:%S %p')
    )

In [None]:
# Specifying the URL of the desired web page to be scraped
# website_url is the prefix prepended to the relative links found on the pages
starting_url = "https://papers.nips.cc/book/advances-in-neural-information-processing-systems-32-2019"
website_url = "https://papers.nips.cc"

# Setting up the User-Agent header that is sent with each page request
uastring = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0"
headers={'User-Agent': uastring}

In [None]:
# Access and test the starting URL
# The script stops here when the page cannot be retrieved, because every later
# step depends on web_page being available
try:
    s = requests.Session()
    resp = s.get(starting_url, headers=headers)
    # requests raises HTTPError only on request, so turn 4xx/5xx replies into
    # exceptions here; otherwise the HTTPError handler below can never run
    resp.raise_for_status()
    if (debug): print(resp.text)
except HTTPError as e:
    print('The server could not serve up the web page!')
    sys.exit("Script processing cannot continue!!!")
except ConnectionError as e:
    print('The server could not be reached due to connection issues!')
    sys.exit("Script processing cannot continue!!!")

if resp.status_code == requests.codes.ok:
    print('Successfully accessed the web page: ' + starting_url)
    web_page = BeautifulSoup(resp.text, 'lxml')
else:
    # Any other non-error status leaves nothing to parse; stop instead of
    # failing later with an undefined web_page
    print('Unexpected HTTP status ' + str(resp.status_code) + ' for the web page: ' + starting_url)
    sys.exit("Script processing cannot continue!!!")

In [None]:
# # Gather all document links from the starting URL (One Level)
# collection = web_page.find_all("a")
# i = 0

# for item in collection:
#     if (verbose): print(item)
#     doc_path = item['href']
#     if doc_path.lower().endswith(".pdf") | doc_path.lower().endswith(".pptx") | doc_path.lower().endswith(".zip"):
#         i = i + 1
#         doc_path = website_url + doc_path
#         # Adding random wait time so we do not hammer the website needlessly
#         print("Waiting " + str(waitTime) + " seconds to retrieve " + doc_path)
#         waitTime = randint(2,5)
#         sleep(waitTime)
#         if (executeDownload):
#             if (mountStorage):
#                 download_to_gdrive(doc_path)
#             else:
#                 download_to_local(doc_path)

# print('Finished finding all available documents on the web page!')
# print('Number of documents processed:', i)

In [None]:
# Gather all document links from the starting URL (Two Levels)
# Level one: every <li> on the starting page links to a document page.
# Level two: each document page carries the "[PDF]" link and the abstract.
collection = web_page.find_all('li')
i = 0

# Drop the first li element as it is not a regular list item we need
# (slicing keeps an empty result from raising, which pop(0) would do)
collection = collection[1:]

# One session is shared by all document page requests so connections are reused
s = requests.Session()

for item in collection:
    if (verbose): print(item)
    # Skip list items that carry no link to a document page
    if item.a is None or not item.a.get('href'):
        continue
    doc_title = item.a.string
    author_group = item.find_all('a', {'class':'author'})
    # get_text() always returns a string, so the join cannot fail on a missing value
    author_list = [each_author.get_text() for each_author in author_group]
    # Separate the names; joining on '' ran all author names together
    authors = ', '.join(author_list)
    doc_link = website_url + item.a['href']

    # Adding random wait time so we do not hammer the website needlessly
    waitTime = randint(2,5)
    sleep(waitTime)
    print("Waited " + str(waitTime) + " seconds to retrieve the next URL.")
    try:
        resp = s.get(doc_link, headers=headers)
        if (debug): print(resp.text)
    except HTTPError as e:
        print('The server could not serve up the web page!')
        sys.exit("Script processing cannot continue!!!")
    except ConnectionError as e:
        print('The server could not be reached due to connection issues!')
        sys.exit("Script processing cannot continue!!!")

    if resp.status_code != requests.codes.ok:
        # Move on to the next document; continuing here would process the
        # doc_page left over from the previous iteration a second time
        print('Could not access the document page: ' + doc_link + ' (HTTP status ' + str(resp.status_code) + ')')
        continue
    print('Successfully accessed the document page: ' + doc_link)
    doc_page = BeautifulSoup(resp.text, 'lxml')

    # A page without the expected main section simply yields no artifacts
    main_section = doc_page.find('div', class_="main wrapper clearfix")
    artifact_list = main_section.find_all('a') if main_section is not None else []
    for artifact_item in artifact_list:
        if artifact_item.string == "[PDF]":
            doc_path = website_url + artifact_item['href']
            # Adding random wait time so we do not hammer the website needlessly
            # (drawn before the message so the printed value is the actual wait)
            waitTime = randint(2,5)
            print("Waiting " + str(waitTime) + " seconds to retrieve " + doc_path)
            sleep(waitTime)
            if (executeDownload):
                if (mountStorage):
                    download_to_gdrive(doc_path)
                else:
                    download_to_local(doc_path)

    # The abstract stays None when the page has no abstract paragraph
    abstract_tag = doc_page.find('p', class_="abstract")
    abstract = abstract_tag.string if abstract_tag is not None else None
    i = i + 1

print('Finished finding all available documents on the web pages!')
print('Number of documents processed:', i)

In [None]:
# Report the end of the scraping phase by email when notifications are enabled
if notifyStatus:
    email_notify(
        "Phase 1 Perform the Scraping and Processing completed! "
        + datetime.now().strftime('%a %B %d, %Y %I:%M:%S %p')
    )

In [None]:
# Show how long the whole script took, measured from startTimeScript
print('Total time for the script:', datetime.now() - startTimeScript)