### extract-pages-from-mongo
SanjayKAroraPhD@gmail.com <br>
November 2018

## Description
This notebook extracts groups of pages from MongoDB by firm_name to create firm-centric page output files that can later be topic modeled. In doing so, it removes repetitive content (e.g., repeated menu items) and garbage content (e.g., improperly parsed HTML code). 

## Change log
v3 adds the Python boilerpipe API for web page cleaning.

## TODO:
* Need to make better use of all pages in the site, e.g., to improve data quality and use additional paragraph data found on non-homepages 
* Standardize firm name cleaning code utility into a universally accessible module
* Train a more effective boilerplate model to capture meaningful text from firm web pages

In [2]:
# import data processing and other libraries
import csv
import sys
import requests
import os
import re
import pprint
import pymongo
import traceback
from time import sleep
import requests
import pandas as pd
import io
from IPython.display import display
import time
import numpy as np
from bs4 import BeautifulSoup
import string

In [3]:
from boilerpipe.extract import Extractor

In [4]:
# MongoDB connection settings and output location.
# Every value can be overridden through an environment variable so that
# credentials and machine-specific paths do not have to be edited in (or
# shared with) the notebook. The defaults are the original hard-coded values,
# so the notebook behaves exactly as before when no variable is set.
MONGODB_DB = os.environ.get("EAGER_MONGODB_DB", "FirmDB_20181203")
MONGODB_COLLECTION = os.environ.get("EAGER_MONGODB_COLLECTION", "pages_depth0")
CONNECTION_STRING = os.environ.get("EAGER_MONGODB_URI", "mongodb://localhost")
username = os.environ.get("EAGER_MONGODB_USERNAME", "scrapy")
# NOTE(review): the default password is kept only for backward compatibility;
# prefer supplying it through EAGER_MONGODB_PASSWORD.
password = os.environ.get("EAGER_MONGODB_PASSWORD", "eager")
authSource = os.environ.get("EAGER_MONGODB_AUTH_SOURCE", "FirmDB")
authMechanism = 'SCRAM-SHA-1'

client = pymongo.MongoClient(
    CONNECTION_STRING,
    username=username,
    password=password,
    authSource=authSource,
    authMechanism=authMechanism,
)
db = client[MONGODB_DB]
col = db[MONGODB_COLLECTION]

# Directory the per-firm text files are written to. os.path.join(..., '')
# guarantees a trailing separator, which code that builds paths with
# ``DATA_DIR + name`` relies on.
DATA_DIR = os.path.join(
    os.environ.get("EAGER_DATA_DIR", '/Users/sarora/dev/EAGER/data/orgs/depth0_boilerpipe/'),
    '',
)

In [5]:
# gather unique firm_names from mongodb

def get_firm_aggregates():
    """Count the stored pages per firm.

    Runs a ``$group`` aggregation on the module-level collection ``col`` that
    groups documents by their ``firm_name`` field and counts the documents in
    each group.

    Returns:
        dict: maps the first element of each non-empty group id to that
        group's document count. Groups whose id is empty/missing are skipped.
        (Presumably ``firm_name`` is stored as a one-element list, which is
        why the first element is used as the key -- confirm against the
        scraper's item schema.)
    """
    pipeline = [ { "$group": {"_id":"$firm_name" , "number":{"$sum":1}} } ]

    counts_by_firm = {}
    for group in col.aggregate(pipeline):
        group_id = group['_id']
        if not group_id:
            # no firm name recorded for these pages
            continue
        counts_by_firm[group_id[0]] = group['number']

    return counts_by_firm

# Collect the per-firm page counts, then report how many distinct firms were
# found and list their names.
results_dict = get_firm_aggregates()
firm_names = results_dict.keys()
print(len(firm_names))
pp = pprint.PrettyPrinter()
pp.pprint(firm_names)

1152
[u'Little Kids',
 u'Daylight Solutions',
 u'Biocon Limited',
 u'Sony Corporation',
 u'Magna International Inc.',
 u'Supremex Inc.',
 u'Honeywell International Inc.',
 u'Aurrion',
 u'The Jackson Laboratory',
 u'Hewlett Packard Enterprise Development LP',
 u'CUMMINS FILTRATION IP',
 u'Renewable Power Conversion',
 u'Fuji Electric Co.',
 u'Canon Kabushiki Kaisha',
 u'Sola U.S.A. Inc.',
 u'Chromalox',
 u'Empire Technology Development LLC',
 u'Cabot Corporation',
 u'aTyr Pharma',
 u'Alcon Research',
 u'Wyatt Technology Corporation',
 u'Southwest Research Institute',
 u'Skidmore',
 u'Fairchild Semiconductor Corporation',
 u'Saint-Gobain Performance Plastics Corporation',
 u'Ideal Power Inc.',
 u'NuOrtho Surgical',
 u'Suncore Photovoltaics',
 u'L-3 Communications Cincinnati Electronics Corporation',
 u'Medical Diagnostic Laboratories',
 u'Solarmer Energy',
 u'Braun Intertec Geothermal',
 u'ProNAi Therapeutics',
 u'Amicus Therapeutics',
 u'Raytheon Company',
 u'Weatherford Canada Partners

 u'POET Research',
 u'Sumitomo Rubber Industries',
 u'MANUFACTURING RESOURCES INTERNATIONAL',
 u'Greatbatch Ltd.',
 u'IDEALAB',
 u'Nthdegree Technologies Worldwide Inc.',
 u'Fina Technology',
 u'Corporation for National Research Initiatives',
 u'Ablexis',
 u'AbbVie Inc.',
 u'FastCAP Systems Corporation',
 u'Professional Compounding Centers of America',
 u'Sensor-Kinesis Corporation',
 u'Molecular Rebar Design',
 u'GLIKNIK INC.',
 u'Luna Innovations Incorporated',
 u'Opel Solar',
 u'VINYLAST',
 u'Bostik',
 u'OSI Optoelectronics',
 u'The Procter & Gamble Company',
 u'Cornell Research Foundation',
 u'CNH Industrial America LLC',
 u'Microsemi SoC Corporation',
 u'NOK Corporation',
 u'Bi-Modal Corporation',
 u'E Ink Corporation',
 u'Wenger Corporation',
 u'Tufts Medical Center',
 u'Cedar Ridge Research',
 u'Global Eagle Entertainment Inc.',
 u'JNC Corporation',
 u'Synaptic Research',
 u'KT Corporation',
 u'Intuitive Surgical Operations',
 u'FutureWei Technologies',
 u'GLYCON LLC',
 u'S. C. 

In [6]:
# remove html content
def is_javascript (x):
    """Heuristically decide whether a text fragment looks like JavaScript/code residue.

    The fragment is scanned for code-like patterns: CDATA markers,
    ``return true`` / ``return false``, ``getelementbyid``, the word
    ``function``, function calls ending in ``);``, word sequences joined by
    ``.``, ``:``, ``|`` or a backslash (e.g. paths or domains), quoted
    ``'key': 'value`` pairs, backslashes, braces and CR/LF characters.

    Args:
        x: the text fragment to test.

    Returns:
        bool: True when the number of pattern hits exceeds 10% of the number
        of whitespace-separated tokens in ``x``; False otherwise, including
        for empty or whitespace-only input.
    """
    # NOTE(review): the last alternative matches the three characters //'
    # (with a trailing quote); possibly a bare // was intended -- confirm.
    match_string = r"(CDATA|return\s+true|return\s+false|getelementbyid|function|\w+\(.*?\);|\w{2,}[\\.|:]+\w{2,}|'\w+':\s+'\w+|\\|{|}|\r|\n|\/\/')"

    tokens = x.split()
    if not tokens:
        # Nothing to classify; this also avoids dividing by zero below.
        return False

    hits = re.findall(match_string, x)
    # flag the fragment when more than 10% of its tokens look like code
    return len(hits) / float(len(tokens)) > .10

def clean_page_content (text_list):
    """Drop text fragments that look like HTML or JavaScript.

    Args:
        text_list: iterable of text fragments (strings) taken from one page.

    Returns:
        list: the fragments, in their original order, for which
        BeautifulSoup's ``html.parser`` finds no tag and ``is_javascript``
        returns False. A real list is returned on both Python 2 and
        Python 3, so the result can be iterated more than once.
    """
    cleaned = []
    for fragment in text_list:
        # remove whatever we think is html
        if BeautifulSoup(fragment, "html.parser").find():
            continue
        # remove content that looks like javascript
        if is_javascript(fragment):
            continue
        # add other checks here as needed
        cleaned.append(fragment)

    return cleaned
    

def process_firm (firm_name):
    """Fetch the stored home page for one firm and choose its best text.

    Looks up the first document in the module-level collection ``col`` whose
    ``firm_name`` matches ``firm_name`` exactly (case-insensitively). If that
    document has a URL and a depth <= 0, two candidate texts are built:

    * ``clnd_text``  -- the document's ``full_text`` fragments passed through
      ``clean_page_content`` and joined with newlines;
    * ``boilerpipe`` -- when the document has a ``body``, the text extracted
      from it by boilerpipe's ``DefaultExtractor`` with blank lines removed.

    The boilerpipe text is preferred when it is non-empty and longer than
    half the length of ``clnd_text``; otherwise ``clnd_text`` is used.

    Args:
        firm_name: the firm name to look up.

    Returns:
        tuple: ``(firm_pages_dict, depth0_page_text)`` where
        ``firm_pages_dict`` maps the page URL to a dict with the keys
        ``depth``, ``domain``, ``firm_name``, ``clnd_text`` and (optionally)
        ``boilerpipe``, and ``depth0_page_text`` is the chosen text as a
        string. When no matching document exists, or the document is not a
        depth-0 page, the result is ``({}, [])``.
    """
    pattern = re.compile('^' + re.escape(firm_name) + '$', re.IGNORECASE)
    firm_pages_dict = {}
    depth0_page_text = [] # home page

    # find_one returns None instead of raising when nothing matches
    result = col.find_one({"firm_name": pattern})
    if result is None:
        return firm_pages_dict, depth0_page_text

    # presumably scalar fields are stored as one-element lists by the
    # scraper, hence the [0] lookups -- confirm against the item schema
    key = result['url'][0]
    depth = result['depth'][0]
    if not (key and depth <= 0):
        return firm_pages_dict, depth0_page_text

    page_dict = {}
    page_dict['depth'] = depth
    page_dict['domain'] = result['domain'][0]
    page_dict['firm_name'] = firm_name
    # tolerate documents that were stored without any extracted text
    clnd_text = clean_page_content(result.get('full_text') or [])
    page_dict['clnd_text'] = '\n'.join(clnd_text)

    if 'body' in result:
        extractor = Extractor(extractor='DefaultExtractor', html = result['body'][0])
        # normalise non-breaking spaces, then drop blank lines
        lines = extractor.getText().replace(u'\xa0', u' ').split('\n')
        non_blank = [line for line in lines if not re.match(r'^\s*$', line)]
        page_dict['boilerpipe'] = '\n'.join(non_blank)

    firm_pages_dict[key] = page_dict

    boilerpipe_text = page_dict.get('boilerpipe')
    if boilerpipe_text and len(boilerpipe_text) > .5 * len(page_dict['clnd_text']):
        print ('\tUsing boilerplate')
        depth0_page_text = boilerpipe_text
    else:
        print ('\tUsing clnd_text')
        depth0_page_text = page_dict['clnd_text']

    return firm_pages_dict, depth0_page_text

In [7]:
# Sanity check of the JavaScript-detection pattern against a sample string
# that exercises each alternative (the tail of the sample covers word
# sequences separated by punctuation, a lone backslash and braces).
regex = re.compile(
    r"(CDATA|return\s+true|return\s+false|getelementbyid|function|\w+\(.*?\);|\w{2,}[\\.|:]+\w{2,}|'\w+':\s+'\w+|\\|{|}|\r|\n|\/\/')"
).findall(
    "CDATA function contact-us getelementbyid javascript.function linker:autoLink www.littlekidsinc.com fxnCall(param.param); email@dextr.us 'type': 'image' return true return false rev7bynlh\\u00252bvcgrjg\\ {height}"
)
print(regex)

['CDATA', 'function', 'getelementbyid', 'javascript.function', 'linker:autoLink', 'www.littlekidsinc', 'fxnCall(param.param);', 'dextr.us', "'type': 'image", 'return true', 'return false', 'rev7bynlh\\u00252bvcgrjg', '\\', '{', '}']


In [8]:
# Try the extraction on a single firm and show the text that was selected.
firm_pages_dict, depth0_page_text = process_firm("ASCENT SOLAR TECHNOLOGIES")
print(depth0_page_text)

	Using clnd_text
Search Our Site
Enter Search
Search
Company Overview 
SEC Filings
Company Press Releases
Stock Information
Events and Presentations
Corporate Governance
Investor Relations Contact
Description
Mission Vision
Awards
Intellectual Property
Leadership
FAQ
User Manuals & Documentation
Warranties, Returns & Exchanges
Contact
Solar solutions from bare modules to finished goods.
LEARN MORE
LEARN MORE
LEARN MORE
LEARN MORE
LEARN MORE
LEARN MORE
LEARN MORE
TECHNOLOGY
Superlight CIGS Technology
MARKETS
Government & Public Sector 
Consumer & OEM
PRODUCTS
XD48 Solar Charger 
WS50 Solar Blanket 
Bare Modules
CUSTOM SOLUTIONS
Range of Capabilities
INVESTOR RELATIONS
Company Overview 
SEC Filings
Company Press Releases
Stock Information
Events and Presentations
Corporate Governance
Investor Relations Contact
ABOUT
Description 
Awards
Intellectual Property 
Facilities
Digital Press Kit 
ProDeal
SUPPORT
FAQs
CONTACT
12300 N. Grant St. 
+1 720-872-5000 
Copyright © 2018 Ascent Solar Techn

In [11]:
# standard firm cleaning regex
def clean_firm_name (firm):
    """Strip punctuation and common legal-form words from a firm name.

    Removes every period and comma, and every occurrence (case-insensitive)
    of the whole words "corporation", "incorporated", "llc", "inc",
    "international", "gmbh" and "ltd" together with the single space in
    front of them, then strips trailing whitespace.

    The suffix words are only removed when they end at a word boundary, so a
    name such as "Acme Incubator Labs" is no longer mangled by the " inc"
    rule.

    Args:
        firm: the raw firm name.

    Returns:
        str: the cleaned firm name.
    """
    # NOTE(review): other copies of this cleaning regex elsewhere in the
    # project (if any) should adopt the same word-boundary rule so cleaned
    # names keep matching -- confirm.
    suffix_pattern = r'(?:\.|,| (?:corporation|incorporated|llc|inc|international|gmbh|ltd)\b)'
    firm_clnd = re.sub(suffix_pattern, '', firm, flags=re.IGNORECASE).rstrip()
    return firm_clnd

In [16]:
# Run process_firm for every firm and write the selected home-page text to
# DATA_DIR/<firm name>.txt (UTF-8). Firms for which no text was selected are
# not written; their empty result is printed instead.
pp = pprint.PrettyPrinter()
for firm_name in firm_names:
    print ("Working on " + firm_name)
    firm_pages_dict, depth0_page_text = process_firm (firm_name)

    if not depth0_page_text:
        pp.pprint(depth0_page_text)
        continue

    # NOTE(review): output files are named after the raw firm name, not the
    # clean_firm_name() form -- confirm which one downstream steps expect.
    # '/' cannot appear in a file name, so it is replaced by '|'.
    out_name = firm_name.replace('/', '|') + '.txt'
    with io.open(os.path.join(DATA_DIR, out_name), 'w', encoding='utf8') as f:
        f.write (depth0_page_text)

Working on Little Kids
	Using boilerplate
Working on Daylight Solutions
	Using clnd_text
Working on Biocon Limited
	Using boilerplate
Working on Sony Corporation
	Using clnd_text
Working on Magna International Inc.
	Using clnd_text
Working on Supremex Inc.
	Using boilerplate
Working on Honeywell International Inc.
	Using clnd_text
Working on Aurrion
	Using clnd_text
Working on The Jackson Laboratory
	Using clnd_text
Working on Hewlett Packard Enterprise Development LP
	Using clnd_text
Working on CUMMINS FILTRATION IP
	Using clnd_text
Working on Renewable Power Conversion
	Using clnd_text
Working on Fuji Electric Co.
	Using clnd_text
Working on Canon Kabushiki Kaisha
	Using clnd_text
Working on Sola U.S.A. Inc.
	Using clnd_text
Working on Chromalox
	Using boilerplate
Working on Empire Technology Development LLC
	Using boilerplate
Working on Cabot Corporation
	Using clnd_text
Working on aTyr Pharma
	Using clnd_text
Working on Alcon Research
	Using clnd_text
Working on Wyatt Technology Co

	Using clnd_text
Working on Abbott Point of Care Inc.
	Using boilerplate
Working on Antaya Technologies Corporation
	Using clnd_text
Working on Aurora Alage
	Using boilerplate
Working on Anelva Corporation
	Using clnd_text
Working on Claret Medical
	Using clnd_text
Working on Angstron Materials
	Using boilerplate
Working on Veracyte
	Using clnd_text
Working on Magnachip Semiconductor
	Using clnd_text
Working on Banpil Photonics
	Using clnd_text
Working on Medgenics Medical Israel Ltd.
	Using clnd_text
''
Working on Danisco US Inc.
	Using boilerplate
Working on Bristol-Myers Squibb Company
	Using boilerplate
Working on Polaris Products LLC
	Using clnd_text
Working on GE-Hitachi Nuclear Energy Americas LLC
	Using clnd_text
Working on Butamax(TM) Advanced Biofuels LLC
	Using boilerplate
Working on Semics Inc.
	Using boilerplate
Working on ECOSYNTHETIX LTD.
	Using clnd_text
Working on Waters Technologies Corporation
	Using clnd_text
Working on Takara Bio Inc.
	Using boilerplate
Working on 

	Using boilerplate
Working on Cleanvantage LLC
	Using boilerplate
Working on Novus Technology
	Using clnd_text
Working on NCC Nano
	Using boilerplate
Working on Enginuity Worldwide
	Using boilerplate
Working on Simbol Inc.
	Using clnd_text
Working on Agena Bioscience
	Using clnd_text
Working on Stratasys
	Using clnd_text
Working on U.S. NUTRACEUTICALS
	Using clnd_text
Working on Mission Product Holdings
	Using boilerplate
Working on GED Intergrated Solution
	Using clnd_text
Working on Linne Industries LLC
	Using boilerplate
Working on ElectraTherm
	Using clnd_text
Working on Intermolecular
	Using clnd_text
Working on Kerr Corporation
	Using boilerplate
Working on Turf Group LLC
	Using boilerplate
Working on Materia
	Using clnd_text
Working on Pfizer Inc.
	Using clnd_text
Working on Nanoco Technologies
	Using clnd_text
Working on Ube Industries
	Using clnd_text
Working on Alcatel Lucent
	Using clnd_text
Working on EPCOS AG
	Using clnd_text
Working on Sick AG
	Using clnd_text
Working on 

	Using clnd_text
Working on Reliance Controls Corporation
	Using clnd_text
Working on Auterra
	Using clnd_text
Working on Sumitomo Electric Device Innovations
	Using clnd_text
Working on NovaSolix
	Using clnd_text
Working on Pharmatrophix
	Using boilerplate
Working on Microchip Technology Incorporated
	Using boilerplate
Working on Sima Therapeutics
	Using boilerplate
Working on Genomic Health
	Using boilerplate
Working on Carestream Health
	Using clnd_text
Working on Valicor
	Using boilerplate
Working on Avtaec Limited
	Using clnd_text
Working on GTherm
	Using clnd_text
Working on Saint-Gobain Adfors Canada
	Using clnd_text
Working on Evernote Corporation
	Using clnd_text
Working on Cree
	Using clnd_text
Working on IDEX Health & Science LLC
	Using clnd_text
Working on Qiagen GmbH
	Using clnd_text
Working on Echogen Power Systems
	Using clnd_text
Working on Lion Copolymer Geismar
	Using clnd_text
Working on Nanoridge Materials
	Using clnd_text
Working on Rebellion Photonics
	Using clnd_

	Using clnd_text
Working on Dynamic Solutions Worldwide
	Using clnd_text
Working on Starlight Energy Holdings LLC
	Using boilerplate
Working on Grain Processing Corporation
	Using clnd_text
Working on nLIGHT
	Using boilerplate
Working on Fenwal
	Using clnd_text
Working on CLEARSIGN COMBUSTION CORPORATION
	Using boilerplate
Working on Incept LLC
	Using clnd_text
Working on ZON
	Using boilerplate
Working on Acorn Technologies
	Using clnd_text
Working on True-Safe Technologies
	Using boilerplate
Working on Polysar Corporation
	Using clnd_text
Working on SunLink Corporation
	Using boilerplate
Working on Sangamo BioSciences
	Using boilerplate
Working on Cima NanoTech Israel Ltd.
	Using boilerplate
Working on Tata Consultancy Services Limited
	Using clnd_text
Working on KOLON INDUSTRIES
	Using clnd_text
Working on Heraeus Precious Metals North America Conshohocken LLC
	Using clnd_text
Working on Instron Corporation 
	Using boilerplate
Working on Robert Bosch GmbH
	Using clnd_text
Working on 

	Using boilerplate
Working on Basell Polyolefine GmbH
	Using clnd_text
Working on VIASAT INC.
	Using clnd_text
Working on Coactive Drive Corporation
	Using boilerplate
Working on Starsource Scientific LLC
	Using clnd_text
Working on AVOGY
	Using boilerplate
Working on Schneider Electric USA
	Using clnd_text
Working on WAFERTECH
	Using clnd_text
Working on BASF
	Using boilerplate
Working on Dymax Corporation
	Using clnd_text
Working on InView Technology Corporation
	Using clnd_text
Working on Conoco Inc.
	Using boilerplate
Working on Atrium Medical Corporation
	Using clnd_text
Working on Akron Polymer Systems
	Using clnd_text
Working on Cardiva Medical
	Using clnd_text
Working on Columbia Insurance Company
	Using clnd_text
Working on Altivera
	Using clnd_text
Working on AROG PHARMACEUTICALS
	Using boilerplate
Working on Liquidia Technologies
	Using boilerplate
Working on GE Healthcare Limited
	Using clnd_text
Working on Abengoa Bioenergy New Technologies
	Using boilerplate
Working on Ko

	Using clnd_text
Working on Narsys
	Using boilerplate
Working on IMRA America
	Using boilerplate
Working on Sun Drilling Products Corporation
	Using clnd_text
Working on Ethicon Endo-Surgery
	Using boilerplate
Working on HTC Corporation
	Using clnd_text
Working on Sumitomo Electric Industries
	Using clnd_text
Working on Arkema Inc.
	Using clnd_text
Working on DISCOVERYBIOMED INC.
	Using clnd_text
Working on PLYCEM USA
	Using clnd_text
Working on Matrix Genetics
	Using clnd_text
Working on Siluria Technologies
	Using clnd_text
Working on Cellular Research
	Using boilerplate
Working on EchoStar Technologies LLC
	Using clnd_text
Working on Milliken & Company
	Using clnd_text
Working on Honda Motor Co.
	Using clnd_text
Working on Maxout Renewables
	Using boilerplate
Working on Singulex
	Using clnd_text
Working on Gemex Systems
	Using clnd_text
Working on Semprius
	Using boilerplate
Working on Copernicus Therapeutics
	Using boilerplate
Working on Unistrut International Corp.
	Using clnd_tex