### extract-pages-from-mongo
SanjayKAroraPhD@gmail.com <br>
November 2018

## Description
This notebook extracts groups of pages from mongodb by firm_name to create firm-centric page output files that can later be topic modeled.  In doing so, it removes repetitive content (e.g., repeated menu items) and garbage content (e.g., improperly parsed HTML code). 

## Change log
v3 adds the python boilerplate  api for web page cleaning.

## TODO:
* Need to make better use of all pages in the site, e.g., to improve data quality and use additional paragraph data found on non-homepages 

In [8]:
# import data processing and other libraries
import csv
import sys
import requests
import os
import re
import pprint
import pymongo
import traceback
from time import sleep
import requests
import pandas as pd
import io
from IPython.display import display
import time
import numpy as np
from bs4 import BeautifulSoup
import string

In [9]:
from boilerpipe.extract import Extractor

In [15]:
MONGODB_DB = "FirmDB_20181203"
MONGODB_COLLECTION = "pages_depth0"
CONNECTION_STRING = "mongodb://localhost"
username = "scrapy"
password = "eager"
authSource = "FirmDB"
authMechanism='SCRAM-SHA-1'

client = pymongo.MongoClient(CONNECTION_STRING, username=username, password=password, authSource=authSource, authMechanism=authMechanism)
db = client[MONGODB_DB]
col = db[MONGODB_COLLECTION]

DATA_DIR = '/Users/sarora/dev/EAGER/data/orgs/depth0_boilerpipe/'

In [176]:
# gather unique firm_names from mongodb

def get_firm_aggregates ():
    query = [ { "$group": {"_id":"$firm_name" , "number":{"$sum":1}} } ]
    results = col.aggregate(query)

    mongo_dict = {}
    for result in results:
        key = (result['_id'])
        if key:
            mongo_dict[key[0]] = result['number']
    
    return mongo_dict

results_dict = get_firm_aggregates()
firm_names = results_dict.keys()
print (len(firm_names))
pp = pprint.PrettyPrinter()
pp.pprint(firm_names)

1152
[u'Little Kids',
 u'Daylight Solutions',
 u'Biocon Limited',
 u'Sony Corporation',
 u'Magna International Inc.',
 u'Supremex Inc.',
 u'Honeywell International Inc.',
 u'Aurrion',
 u'The Jackson Laboratory',
 u'Hewlett Packard Enterprise Development LP',
 u'CUMMINS FILTRATION IP',
 u'Renewable Power Conversion',
 u'Fuji Electric Co.',
 u'Canon Kabushiki Kaisha',
 u'Sola U.S.A. Inc.',
 u'Chromalox',
 u'Empire Technology Development LLC',
 u'Cabot Corporation',
 u'aTyr Pharma',
 u'Alcon Research',
 u'Wyatt Technology Corporation',
 u'Southwest Research Institute',
 u'Skidmore',
 u'Fairchild Semiconductor Corporation',
 u'Saint-Gobain Performance Plastics Corporation',
 u'Ideal Power Inc.',
 u'NuOrtho Surgical',
 u'Suncore Photovoltaics',
 u'L-3 Communications Cincinnati Electronics Corporation',
 u'Medical Diagnostic Laboratories',
 u'Solarmer Energy',
 u'Braun Intertec Geothermal',
 u'ProNAi Therapeutics',
 u'Amicus Therapeutics',
 u'Raytheon Company',
 u'Weatherford Canada Partners

 u'Hitachi High-Technologies Corporation',
 u'ITN Energy Systems',
 u'Deep Science',
 u'Invensas Corporation',
 u'Verliant Energy',
 u'Johnson & Johnson Vision Care',
 u'KJ BIOSCIENCES LLC',
 u'Johnson & Johnson Consumer Companies',
 u'Thorlabs',
 u'Tessera',
 u'Osram Sylvania Inc.',
 u'BlackBerry Limited',
 u'Sun Chemical Corporation',
 u'Kajima Corporation',
 u'Mitsubishi Metal Corporation',
 u'TP Solar',
 u'Aerogen',
 u'OPTERRA ENERGY SERVICES',
 u'Wikipad',
 u'Global Solar Water Power Systems',
 u'Furukawa Electric Co.',
 u'Stablcor Technology',
 u'Sensor Electronic Technology',
 u'Sweetwater Energy',
 u'NanoTech Lubricants',
 u'The Samuel Roberts Noble Foundation',
 u'WiSys Technology Foundation',
 u'Seiko Instruments Inc.',
 u'Reliance Controls Corporation',
 u'Auterra',
 u'Sumitomo Electric Device Innovations',
 u'NovaSolix',
 u'Pharmatrophix',
 u'Microchip Technology Incorporated',
 u'Sima Therapeutics',
 u'Genomic Health',
 u'Carestream Health',
 u'Valicor',
 u'Avtaec Limited'

In [172]:
# remove html content
def is_javascript (x):
    match_string = r"(CDATA|return\s+true|return\s+false|getelementbyid|function|\w+\(.*?\);|\w{2,}[\\.|:]+\w{2,}|'\w+':\s+'\w+|\\|{|}|\r|\n|\/\/')"
    # capture CDATA; function declarations; function calls; word sequences separated by a period (e.g., denoting paths)
    regex = re.findall(match_string, x) 
    # check to see if the regex finds some percentage of the words look like javascript patterns
    if (len(regex) / float(len(x.split())) > .10):
        return True 
    else:
        return False

def clean_page_content (text_list):
    # remove whatever we think is html
    removed_html = filter(lambda x: not( bool(BeautifulSoup(x, "html.parser").find()) ), text_list)
    # remove content that looks like javascript 
    removed_js = filter(lambda x: not (is_javascript(x)), removed_html)
    # add other checks here as needed

    return removed_js
    

# iterate through each firm, get all pages associated with a firm, and produce data structure
# url --> depth
#     --> content (list)
# return data structure
def process_firm (firm_name): 
    regex = '^' + re.escape(firm_name) + '$'
    results = col.find( {"firm_name": re.compile(regex, re.IGNORECASE) } )
    firm_pages_dict = {}
    depth0_page_text = [] # home page
    result = results[0]
    key = result['url'][0]
    depth = result['depth'][0]

    if key and depth <= 0:
        page_dict = {}
        page_dict['depth'] = depth
        page_dict['domain'] = result['domain'][0]
        result['domain'][0]
        page_dict['firm_name'] = firm_name
        clnd_text = clean_page_content(result['full_text'])
        page_dict['clnd_text'] = '\n'.join(clnd_text)
        
        if 'body' in result:
            extractor = Extractor(extractor='DefaultExtractor', html = result['body'][0])
            lines = extractor.getText().replace(u'\xa0', u' ').split('\n')
            filtered = filter(lambda x: not re.match(r'^\s*$', x), lines)
            page_dict['boilerpipe'] = '\n'.join(filtered)

        firm_pages_dict[key] = page_dict

        if 'boilerpipe' in page_dict and page_dict['boilerpipe'] and (len(page_dict['boilerpipe']) > .5 * len (page_dict['clnd_text'])):
            print ('\tUsing boilerplate')
            depth0_page_text = page_dict['boilerpipe']
        else:
            print ('\tUsing clnd_text')
            depth0_page_text = page_dict['clnd_text']

    return firm_pages_dict, depth0_page_text

In [18]:
# regex test 
regex = re.findall(r"(CDATA|return\s+true|return\s+false|getelementbyid|function|\w+\(.*?\);|\w{2,}[\\.|:]+\w{2,}|'\w+':\s+'\w+|\\|{|}|\r|\n|\/\/')", 
                   "CDATA function contact-us getelementbyid javascript.function linker:autoLink www.littlekidsinc.com fxnCall(param.param); email@dextr.us 'type': 'image' return true return false rev7bynlh\\u00252bvcgrjg\\ {height}") # last part is words sequences separated by punct
print (regex)

['CDATA', 'function', 'getelementbyid', 'javascript.function', 'linker:autoLink', 'www.littlekidsinc', 'fxnCall(param.param);', 'dextr.us', "'type': 'image", 'return true', 'return false', 'rev7bynlh\\u00252bvcgrjg', '\\', '{', '}']


In [173]:
firm_pages_dict, depth0_page_text = process_firm ("NA")
print (depth0_page_text)

<class 'pymongo.cursor.Cursor'>


IndexError: no such item for Cursor instance

In [None]:
# run
pp = pprint.PrettyPrinter()
for firm_name in firm_names: 
    print ("Working on " + firm_name)
    firm_pages_dict, depth0_page_text = process_firm (firm_name)
    
    if depth0_page_text: 
        file = re.sub('\.|\/', '_', firm_name) + '.txt'
        with io.open(DATA_DIR + file,'w',encoding='utf8') as f:
            f.write (depth0_page_text)
    else:
        pp.pprint(depth0_page_text)

Working on Little Kids
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Daylight Solutions
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Biocon Limited
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Sony Corporation
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Magna International Inc.
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Supremex Inc.
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Honeywell International Inc.
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Aurrion
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on The Jackson Laboratory
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Hewlett Packard Enterprise Development LP
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on CUMMINS FILTRATION IP
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Renewable Power Conversion
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Fuji Elect

	Using boilerplate
Working on MonoSol
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Conexant Systems
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Samsung Electronics
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on ADMA Products
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Energysolutions
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Authenex
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on SOLAZYME
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on HGST NETHERLANDS B.V.
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on BioNano Genomics
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Revera
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on IDEA TREE
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Moxtek
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on DNA Twopointo
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on VMware
<cl

	Using clnd_text
Working on Ion Power Group
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on SEaB Energy Holdings Ltd.
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Chipmos Technologies Inc.
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Gas Technology Institute
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on St. Microelectronics Inc.
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Globus Medical
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on SABIC GLOBAL TECHNOLOGIES B.V.
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on The Paymaster Corporation
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Guardian Industries
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Tela Innovations
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on VEECO PRECISION SURFACE PROCESSING LLC
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on K.S. INTERNATIONAL CO.
<class 'pymongo.cu

	Using boilerplate
Working on Taiwan Semiconductor Manufacturing Company
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Sequenom
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Hunt Energy Enterprises LLC
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on UChicago Argonne
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Hunter Douglas Inc.
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on SRG Global
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on St. Jude Medical
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on SEIKO NPC Corporation
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Yissum
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on ADASA INC.
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Ostendo Technologies
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Senaya
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on RES USA
<class 'pymongo.cursor.

	Using boilerplate
Working on Red Hat
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on MirTech
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Momentive Performance Materials GmbH
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Pinnacle Technology
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Elwha LLC
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Inaeris Technologies
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Genomatica
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Rolls-Royce PLC
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Dentsply International
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Owens-Brockway Glass Container Inc.
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on SolarWorld Americas Inc.
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on GENERAL ELECTRIC COMPANY
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on k-Sp

	Using clnd_text
Working on Magnolia Optical Technologies
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on PolyPlus Battery Company
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Ford Global Technologies
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Bayer Cropscience AG
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on EMD Technologies Inc.
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Nederlandse Organisatie voor toegepast-natuurwetenschappelijk onderzoek TNO
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Nexcom Technology
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on AT&T Corporation
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Floadia Corporation
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Entech Solar
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Amgen Fremont Inc.
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on PortaFire
<class

<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Microsemi SoC Corporation
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on NOK Corporation
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Bi-Modal Corporation
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on E Ink Corporation
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Wenger Corporation
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Tufts Medical Center
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Cedar Ridge Research
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Global Eagle Entertainment Inc.
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on JNC Corporation
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on Synaptic Research
<class 'pymongo.cursor.Cursor'>
	Using boilerplate
Working on KT Corporation
<class 'pymongo.cursor.Cursor'>
	Using clnd_text
Working on Intuitive Surgical Operations
<class 'pymongo.curso