In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

import json
from pymongo import MongoClient

In [2]:
def start_selenium_driver(chromedriver="/Applications/chromedriver"):
    
    '''
    Starts selenium driver in Google Chrome.
    
    Input Arguments:
    'chromedriver' - Path to the chromedriver executable.  Defaults to the location
    that used to be hard-coded, so existing calls without arguments behave as before.
    
    Returns the selenium Chrome driver.
    '''
    
    # Kept from the original implementation: the path is also recorded under this environment key.
    os.environ["webdriver.chrome.driver"] = chromedriver
    driver = webdriver.Chrome(chromedriver)
    return driver

In [3]:
def login_adhesives(driver, username=None, password=None):
    
    '''
    Opens Adhesives & Sealants Magazine web site log in page and submits login information.  Requires running
    start_selenium_driver() function first to provide driver.
    
    Input Arguments:
    'driver' - Selenium driver used to load the login page and fill in the form
    'username' - Login e-mail.  When omitted, read from the ADHESIVES_USERNAME environment variable
    'password' - Login password.  When omitted, read from the ADHESIVES_PASSWORD environment variable
    
    Raises ValueError when either credential is missing; nothing is loaded in the browser in that case.
    '''
    
    # Credentials must not live in the notebook: take them from the caller or the environment.
    # NOTE(review): credentials that were previously committed in this notebook should be rotated.
    if username is None:
        username = os.environ.get("ADHESIVES_USERNAME")
    if password is None:
        password = os.environ.get("ADHESIVES_PASSWORD")
    if not username or not password:
        raise ValueError(
            "Login credentials missing: pass username/password or set the "
            "ADHESIVES_USERNAME and ADHESIVES_PASSWORD environment variables"
        )
    
    driver.get('https://www.sub-forms.com/dragon/init.do?site=BNP7351_ADHESI528login&pk=RSIGN&returnurl=http%253A%252F%252Fwww.adhesivesmag.com%252Fuser%252Fpostlogin%253Fredirect%253Dhttps%253A%252F%252Fwww.adhesivesmag.com%252F')
    # "id13" and "id16" are the ids of the two form fields that receive the e-mail and the password.
    driver.find_element_by_id("id13").send_keys(username)
    driver.find_element_by_id("id16").send_keys(password)
    driver.find_element_by_class_name('submit').click()

In [4]:
def get_current_month_article_urls(current_month):
    
    '''
    Opens the issue of 'Adhesives & Sealants Industry' magazine whose link text is
    'current_month' and gathers the article urls found on that issue page.
    
    Uses the module-level selenium 'driver', so a driver must already exist
    (see start_selenium_driver()).
    
    Input Arguments: Current month date in format ("month_name year")
    
    Returns a list of unique article urls with 'comments-container' urls removed.
    The list is built from a set, so its order is not meaningful.
    '''
    
    # Scroll to the bottom of the page, then follow the link labelled with the issue date
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    driver.implicitly_wait(15)
    issue_link = driver.find_element_by_link_text(current_month)
    issue_link.click()
    
    # Parse the issue page and keep every anchor whose href matches the article pattern
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    article_pattern = re.compile("/articles/[1-9]")
    hrefs = [anchor['href'] for anchor in soup.findAll("a", href=article_pattern)]
    
    # Drop the "comments-container" links, then collapse duplicates through a set
    wanted = [href for href in hrefs if 'comments-container' not in href]
    return list(set(wanted))

In [5]:
def get_articles(urls):
    
    '''
    Navigates to each article url via the Selenium driver, parses the page with BeautifulSoup and stores
    the article 'title', 'date', 'author' (if present, otherwise fills in "staff") and 'text_body' in a dictionary.
    Inserts the dictionary as a document into the 'articles' collection of the mongo 'adhesives' database
    and prints the stored title.
    
    A 'title', 'date' or 'text_body' that cannot be found is stored as -9 and reported with a printed message.
    
    Requires running start_selenium_driver() and login_adhesives() first; the module-level 'driver' is used.
    
    Requires opening mongo database client using 'adhesives' database and "articles" collection;
    the module-level 'articles_collection' is used.

        client= MongoClient()
        db = client.adhesives
        articles_collection = db.articles
    
    Input Arguments:  A list of urls.  Can be output of get_current_month_article_urls or
    get_archived_article_urls functions
    '''
    
    for url in urls:
        # Navigate to url via Selenium (after login) and open page in BeautifulSoup
        driver.get(url)
        bsObj = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Retrieve title, date, author and text body from article and insert into dictionary.
        # find() returns None when nothing matches, so a missing element shows up as AttributeError.
        article = {}
        try:
            article['title'] = bsObj.find('h1').text
        except AttributeError:
            article['title'] = -9
            print("No title found for:", url)
        try:
            article['date'] = bsObj.find(class_='date').text
        except AttributeError:
            article['date'] = -9
            print("No date found for:", url)
        try:
            article['author'] = bsObj.find(class_='author').find('a').text
        except AttributeError:
            article['author'] = 'staff'
        try:
            paragraphs = bsObj.find(class_='body gsd-paywall').find_all('p')
            # Each paragraph is followed by a single space, including the last one.
            article['text_body'] = ''.join(paragraph.text + ' ' for paragraph in paragraphs)
        except AttributeError:
            # Only the "element not found" case is handled here; a bare except would also
            # swallow KeyboardInterrupt and hide unrelated failures during a long scrape.
            article['text_body'] = -9
            print("No text body found for:", url)
    
        # Insert dictionary as a record into articles table in the adhesives Mongo database.
        articles_collection.insert_one(article)
        
        print(article['title'])

In [6]:
# Launch Chrome and bind the session to the module-level name `driver`,
# which the url-collection and article-scraping functions read as a global.
driver = start_selenium_driver()

In [8]:
# Load the login page in the browser session held by `driver` and submit the login form.
login_adhesives(driver)

In [9]:
# Collect the article urls of the issue whose link text on the page is 'August 2018'.
current_urls = get_current_month_article_urls('August 2018')

In [10]:
# Open the Mongo client with PyMongo's default connection settings (no arguments given),
# then select the 'adhesives' database and its 'articles' collection.
# get_articles() reads `articles_collection` as a module-level global.
client= MongoClient()
db = client.adhesives
articles_collection = db.articles

In [11]:
# WARNING: the empty filter matches every document, so this wipes the whole 'articles'
# collection before the current issue is scraped and stored.
articles_collection.delete_many({})
# Scrape each url and insert one document per article; each stored title is printed.
get_articles(current_urls)

Applying Rapid-Cure Adhesives with a Meter/Mix/Dispense System
Industry Leaders to Gather at ASC Executive Leadership Conference
Evaluating Adhesive Options in Packaging Applications
3M Sales Rise 7% in Second Quarter 2018
Sika Automotive Receives Product Leadership Award
Evonik Reorganizes Distribution Structure
Ranking the Leading Adhesives and Sealants Manufacturers in the ASI Top 25
Liquid Polysulfide Polymers for Chemical- and Solvent-Resistant Sealants
2018 ASI Top 25:  Leading Global Manufacturers of Adhesives and Sealants
Afera First Global Adhesive Tape Summit  a Success
Pilot Chemical Opens Innovation Center


In [12]:
def get_archived_article_urls(current_month, oldest_issue):
    
    '''
    Opens the issue of Adhesives & Sealants Magazine whose link text is 'current_month'
    (after login via login_adhesives()), then steps through the issue drop down list and
    gathers the article urls shown for each selected entry.  Duplicate urls and
    'comments-container' urls are removed from the result.
    
    Uses the module-level selenium 'driver'.
    
    Input Arguments: 
    'current_month' - Current month date in format ("month name year")
    'oldest_issue' - Exclusive upper bound on the drop down position: entries
    1 through oldest_issue - 1 are visited
    
    Returns list of article urls.  The list is built from a set, so its order is not meaningful.
    ''' 
    
    # Scroll to the bottom of the page and follow the link labelled with the issue date
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    driver.implicitly_wait(30)
    driver.find_element_by_link_text(current_month).click()
    driver.implicitly_wait(10)
    
    dropdown_xpath = '//div[@class="selectize-control single"]'
    option_template = '//div[@class="selectize-dropdown-content"]/div[{}]'
    article_pattern = re.compile("/articles/[1-9]")
    
    # For each drop down position: open the list, pick the entry, harvest the article links
    collected = []
    for position in range(1, oldest_issue):
        dropdown = driver.find_element_by_xpath(dropdown_xpath)
        dropdown.click()
        driver.implicitly_wait(10)
        option = driver.find_element_by_xpath(option_template.format(position))
        option.click()
    
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        collected.extend(anchor['href'] for anchor in soup.findAll("a", href=article_pattern))
    
    # Drop the "comments-container" links, then collapse duplicates through a set
    wanted = [href for href in collected if 'comments-container' not in href]
    return list(set(wanted))

In [13]:
# Start a new Chrome session and rebind the module-level `driver` to it.
# NOTE(review): no quit() of the earlier session is visible in this notebook - confirm it is closed elsewhere.
driver = start_selenium_driver()

In [14]:
# Submit the login form again, this time in the newly started browser session.
login_adhesives(driver)

In [16]:
# Collect article urls from the archive drop down; 61 is an exclusive bound,
# so drop down entries 1 through 60 are visited.
archived_urls = get_archived_article_urls('August 2018', 61)

In [17]:
# Scrape and store the archived articles. The collection is not cleared first, so these
# documents are added to the ones already inserted.
# NOTE(review): a url present in both url lists would be stored twice - confirm whether that is intended.
get_articles(archived_urls)

3M Sales Rise 7% in Second Quarter 2018
UV-Curable Ink Applications Increase
Advancing Adhesives: A New Wave of Waterborne Dispersions for Adhesives and Coatings
A Natural Choice for Adhesives and Sealants
Mergers and Acquisitions in Adhesives and Sealants: 2015-2016 Review and Forecast
Strategic Solutions: Championing Executive Leadership in Internet Marketing
Vienna to Host 2016 FEICA Conference
Nashville Hosts the Adhesive and Sealant Council Spring 2015 Convention and Expo
North America Dominates Oilfield Biocides Market
Ask Dr. Dave: Should we consider silicones or polyurethanes for our construction applications?
Case Study: Managing Energy Use in Manufacturing
FEICA 2017 Will Focus on Innovation
Ask Dr. Dave
Editor’s Memo: And the Winners Are…
Planning Ahead for Adhesive and Sealant Success
Ask Dr. Dave: October 2013
2015 Adhesives and Sealants Raw Materials and Chemicals Roundtable
Boosting ROI with Powerful PR and Social Media
Converting Equipment for Pressure-Sensitive Adhesiv

Advancing Adhesives: Bio-Based Succinic Acid Polyester Polyols
Editor's Memo: Taking Things Easy?
Las Vegas Hosts PSTC Tape Summit
Solid Reasons for Solid Systems
Improving Operations with ERP
Understanding the Value of Your Business
Should I use a cyanoacrylate superglue for household repairs and assembly projects?
Market Trends: "Smart" Packaging on the Rise
Rotary Batch Mixer Blends Wood Powders for Plywood Adhesives
Adhesive and Sealant Council Announces Fall Convention Keynote Speakers
Adhesives at Work: U.S. Coast Guard Turns to Electrocoat Primer System
Regulatory Review: Into the West
Top 30 ASI Feature Articles of 2015
Auto, Packaging, Construction Lead Adhesives and Sealants Market Growth
Automation in the Metering, Mixing and Dispensing of Adhesives and Sealants
Transparency through an Agency Website? Maybe Not.
2016 Raw Materials and Chemicals Roundtable
We’ve been trying to get the metal button of a rear-view mirror to bond to a new windshield.
Sustainable Safety
Failure-F

Ask Dr. Dave: What is the best way to prepare aluminum surfaces for bonding with epoxy or acrylic adhesives?
Focus On: Henkel is Connecting to Success
Strategic Solutions: Construction, Crude Oil, and Natural Gas Outlook
Nasal Strip Adhesion
Focus On: One Supplier to the Adhesives and Sealants Industry is Finding Diamonds in the Rough
Advancements in UV LED Curing Technology for Adhesives
Distributor Roundtable: Exploring the Relationship Between Distributors and Adhesives and Sealants Manufacturers
Ask Dr. Dave: What are the best products for filling the seams after assembling acrylic countertops?
Environmentally Sound Pressure-Sensitive Adhesive Tape Manufacturing
Marketing and the Myth of “A New Idea”
Surface Treatment Effects
Assembly in Action
Market Trends: Worldwide Composites Markets Growing
Taking the Lead of the Adhesives and Sealants Industry
Clean and Green PSAs 
Hot Melts: A Dynamic, Growing Market
A Fast Guide to Fast-Curing Floors 
Increasing Production Marketing, Uptime

Ask Dr. Dave: What is the best way to judge the effectiveness of an adhesive bond?
Flexible Formulation of Waterborne Caulks and Sealants
2018 ASC Convention and Expo Attendees Can Benefit from Multiple Networking Opportunities
New Orleans Hosts Adhesive and Sealant Council Spring 2016 Convention and Expo
2015 ASI Top 25: Leading Worldwide Manufacturers of Adhesives and Sealants
ASI Top 25 Rankling of Adhesives and Sealants Manufacturers
Q&A About Polyurethanes: Coatings Raw Materials
Delivering Value with Adhesives and Sealants
Solving Engineering Challenges with Functional Coatings
Is there a PSA that can be removed and reapplied a number of times without losing adhesion?
Editor's Memo: Distributor Directory
Case Study: A Speedy Solution for Glue Extender Production
Staying Ahead of the Curve
Construction Adhesives and Sealants Manufacturer Celebrates a Milestone
Thinking Green: Sustainable Adhesive and Sealant Packaging Options
Mixing Two-Component Fluids in Exacting Proportions
Ain

In [18]:
# Total number of documents stored in the 'articles' collection.
# count_documents({}) replaces Collection.count(), which is deprecated and absent from PyMongo 4.
articles_collection.count_documents({})

  """Entry point for launching an IPython kernel.


597