# extract-pages-from-mongo
SanjayKAroraPhD@gmail.com <br>
November 2018

## Description
This notebook extracts groups of pages from MongoDB by firm_name to create firm-centric page output files that can later be topic modeled.  In doing so, it removes garbage content (e.g., improperly parsed HTML code and JavaScript). Removing repetitive content (e.g., repeated menu items) is intended as well but is still a TODO in the code below.

## Change log
v2 fixes some bugs.  

## TODO:
* Need to make better use of all pages in the site, e.g., to improve data quality and use additional paragraph data found on non-homepages 

In [23]:
# import data processing and other libraries
import csv
import sys
import requests
import os
import re
import pprint
import pymongo
import traceback
from time import sleep
import requests
import pandas as pd
import io
from IPython.display import display
import time
import numpy as np
from bs4 import BeautifulSoup
import string

In [45]:
# Directory that receives the per-firm text files written by the run cell.
DATA_DIR = '/Users/sarora/dev/EAGER/data/orgs/depth0_pages/'

# MongoDB connection settings: server, database, and the collection of crawled pages.
CONNECTION_STRING = "mongodb://localhost"
MONGODB_DB = "FirmDB_20181203"
MONGODB_COLLECTION = "pages_depth0"

# Handles used by the query functions in the cells below.
client = pymongo.MongoClient(CONNECTION_STRING)
db = client[MONGODB_DB]
col = db[MONGODB_COLLECTION]

In [54]:
# gather unique firm_names from mongodb

def get_firm_aggregates ():
    """Count the pages stored for each firm.

    Groups the documents in ``col`` by their ``firm_name`` field and returns
    a dict mapping firm name --> number of pages.  The group id is indexed
    with ``[0]``, i.e. only its first element is used as the firm name.
    Groups whose id is missing or empty are counted under the key 'NA'.
    """
    query = [ { "$group": {"_id":"$firm_name" , "number":{"$sum":1}} } ]

    mongo_dict = {}
    for result in col.aggregate(query):
        key = result['_id']
        name = key[0] if key else 'NA'
        # Several group ids can collapse to the same name (e.g. None and []
        # both map to 'NA'), so add to any existing count instead of
        # overwriting it.
        mongo_dict[name] = mongo_dict.get(name, 0) + result['number']

    return mongo_dict

# Collect the firm names found in the collection, then report how many there are and list them.
results_dict = get_firm_aggregates()
firm_names = results_dict.keys()
pp = pprint.PrettyPrinter()
print (len(results_dict))
pp.pprint(firm_names)

1153
[u'Little Kids',
 u'Daylight Solutions',
 u'Biocon Limited',
 u'Sony Corporation',
 u'Magna International Inc.',
 u'Supremex Inc.',
 u'Honeywell International Inc.',
 u'Aurrion',
 u'The Jackson Laboratory',
 u'Hewlett Packard Enterprise Development LP',
 u'CUMMINS FILTRATION IP',
 u'Renewable Power Conversion',
 u'Fuji Electric Co.',
 u'Canon Kabushiki Kaisha',
 u'Sola U.S.A. Inc.',
 u'Chromalox',
 u'Empire Technology Development LLC',
 u'Cabot Corporation',
 u'aTyr Pharma',
 u'Alcon Research',
 u'Wyatt Technology Corporation',
 u'Southwest Research Institute',
 u'Skidmore',
 u'Fairchild Semiconductor Corporation',
 u'Saint-Gobain Performance Plastics Corporation',
 u'Ideal Power Inc.',
 u'NuOrtho Surgical',
 u'Suncore Photovoltaics',
 u'L-3 Communications Cincinnati Electronics Corporation',
 u'Medical Diagnostic Laboratories',
 u'Solarmer Energy',
 u'Braun Intertec Geothermal',
 u'ProNAi Therapeutics',
 u'Amicus Therapeutics',
 u'Raytheon Company',
 u'Weatherford Canada Partners

 u'Furukawa Electric Co.',
 u'Stablcor Technology',
 u'Sensor Electronic Technology',
 u'Sweetwater Energy',
 u'NanoTech Lubricants',
 u'The Samuel Roberts Noble Foundation',
 u'WiSys Technology Foundation',
 u'Seiko Instruments Inc.',
 u'Reliance Controls Corporation',
 u'Auterra',
 u'Sumitomo Electric Device Innovations',
 u'NovaSolix',
 u'Pharmatrophix',
 u'Microchip Technology Incorporated',
 u'Sima Therapeutics',
 u'Genomic Health',
 u'Carestream Health',
 u'Valicor',
 u'Avtaec Limited',
 u'GTherm',
 u'Saint-Gobain Adfors Canada',
 u'Evernote Corporation',
 u'Cree',
 u'IDEX Health & Science LLC',
 u'Qiagen GmbH',
 u'Echogen Power Systems',
 u'Lion Copolymer Geismar',
 u'Nanoridge Materials',
 u'Rebellion Photonics',
 u'Big Belly Solar',
 u'Nokomis',
 u'G.D.O. Inc',
 u'BTU International',
 u'VINDICO NANOBIO TECHNOLOGY INC.',
 u'Swagelok Company',
 u'Magnolia Optical Technologies',
 u'PolyPlus Battery Company',
 u'Ford Global Technologies',
 u'Bayer Cropscience AG',
 u'EMD Technolog

In [81]:
# remove html content
# Patterns that suggest a text fragment is javascript rather than prose: CDATA; return true/false;
# function declarations; function calls; word sequences joined by '.', '|' or ':' (e.g., denoting
# paths); quoted key/value pairs; braces; line breaks; and '//'.
# Written as a raw string so the regex escapes are not interpreted as Python string escapes.
_JS_PATTERN = re.compile(
    r"(CDATA|return\s+true|return\s+false|function|\w+\(.*?\);|\w{2,}[\.|:]+\w{2,}|'\w+':\s+'\w+')|{|}|\r|\n|\/\/"
)

def is_javascript (x):
    """Return True when the text fragment x looks like javascript.

    x is flagged when more than three pattern matches are found and the
    number of matches exceeds 10% of the number of whitespace-separated
    words in x.  A fragment with no words at all (empty or whitespace only)
    is never flagged.
    """
    words = x.split()
    if not words:
        # nothing to compute a ratio over; also avoids dividing by zero
        return False

    hits = len(_JS_PATTERN.findall(x))
    # float() keeps the ratio a true division under Python 2 as well
    return hits > 3 and hits / float(len(words)) > .10

def clean_page_content (text_list):
    """Return the entries of text_list that look like plain text, as a list.

    An entry is dropped when BeautifulSoup's html.parser finds a tag in it
    (we treat it as html) or when is_javascript() flags it.  The order of the
    remaining entries is preserved.

    A real list is returned rather than a filter object so that callers can
    test it for emptiness and iterate over it more than once on Python 3 too.
    """
    cleaned = []
    for text in text_list:
        # remove whatever we think is html
        if BeautifulSoup(text, "html.parser").find():
            continue
        # remove content that looks like javascript
        if is_javascript(text):
            continue
        # add other checks here as needed
        cleaned.append(text)

    return cleaned
    

# for a single firm, get all pages associated with that firm and produce a data structure
# url --> depth
#     --> domain
#     --> firm_name
#     --> clnd_text (list of cleaned text fragments)
# return the data structure along with the cleaned home-page text
def process_firm (firm_name): 
    """Collect the cleaned pages stored for one firm.

    Looks up every document in ``col`` whose ``firm_name`` equals firm_name
    (case-insensitively) and returns a tuple:

      firm_pages_dict  -- url --> {'depth', 'domain', 'firm_name', 'clnd_text'}
      depth0_page_text -- the cleaned text of the page stored with depth -1,
                          or [] when the firm has no such page

    Documents whose first url entry is empty are skipped.
    """
    # Match the whole name literally: escaping keeps characters such as '.',
    # '(' or '&' from being read as regex syntax, and the anchors keep a short
    # name from matching every firm that merely contains it.
    exact_name = re.compile('^' + re.escape(firm_name) + '$', re.IGNORECASE)
    results = col.find( {"firm_name": exact_name } )

    firm_pages_dict = {}
    depth0_page_text = [] # home page
    for result in results:
        key = result['url'][0]
        if not key:
            continue

        depth = result['depth'][0]
        clnd_text = clean_page_content(result['full_text'])
        firm_pages_dict[key] = {
            'depth': depth,
            'domain': result['domain'][0],
            'firm_name': firm_name,
            'clnd_text': clnd_text,
        }

        # NOTE(review): depth -1 is presumably how the crawler marks the home page -- confirm
        if depth == -1:
            depth0_page_text = clnd_text

    return firm_pages_dict, depth0_page_text
# TODO: identify which pieces of content are common across all sites, and remove those
# def clean_content(firm_dict): 

In [82]:
# regex test: run a candidate javascript-detection pattern against a sample string and show the matches
regex = re.compile(
    r"(CDATA|return\s+true|return\s+false|function|\w+\(.*?\);|\w{2,}[\\.|:]+\w{2,}|'\w+':\s+'\w+|\\')"
).findall(
    # last part is words sequences separated by punct
    "CDATA function contact-us javascript.function linker:autoLink www.littlekidsinc.com fxnCall(param.param); email@dextr.us 'type': 'image' return true return false rev7bynlh\\u00252bvcgrjg\\"
)
print (regex)

['CDATA', 'function', 'javascript.function', 'linker:autoLink', 'www.littlekidsinc', 'fxnCall(param.param);', 'dextr.us', "'type': 'image", 'return true', 'return false', 'rev7bynlh\\u00252bvcgrjg']


In [83]:
# try a single firm and show the cleaned home-page text that comes back
firm_pages_dict, depth0_page_text = process_firm(firm_name="Little Kids")
print (depth0_page_text)

[u' Phone Number: 800-545-5437', u'Subscribe', u'Register', u'Contact Us', u'Home', u'Products', u'Slick Tricks \u2122', u'Fubbles Bubbles\xae', u'Fubbles\xae No-Spill\xae', u'CANDYLICIOUS\xae', u'Peeps\xae', u'Jelly Belly\xae', u'Sesame Street\xae', u'PJ Masks \u2122', u'Paw Patrol \u2122', u'Shimmer & Shine \u2122', u'Nickelodeon \u2122', u'Junk Ball\xae', u'Magic Kidchen \u2122', u'About Us', u'PICS', u'FAQs', u'How To', u'Stores', u'Contact', u'Contact Us', u'Cart', u'0', u"TAKE A PEEP AT WHAT'S HOT FOR\xa0", u'FOURTH OF JULY!', u'Learn More', u'Fubbles\xae Bubbles', u"Tip it upside down, knock it over, it won't spill! Fubbles No-Spill bubbles have won numerous toy innovation awards.", u'Learn More', u'Junk Ball\xae', u'Junk Ball is the ultimate backyard baseball game! Just twist the dial on the ball to control your pitch: throw curves, sliders, risers, even knuckleballs!', u'Learn More', u'Junk Ball\xae', u'Junk Ball is the ultimate backyard baseball game! Just twist the dial on t

In [84]:
# run: write one text file per firm containing its cleaned home-page text
pp = pprint.PrettyPrinter()
for firm_name in firm_names:
    print ("Working on " + firm_name)
    firm_pages_dict, depth0_page_text = process_firm (firm_name)
    # pp.pprint(depth0_page_text)

    # Materialize before testing: a lazy iterator would always be truthy and
    # could only be consumed once.
    depth0_page_text = list(depth0_page_text)
    if not depth0_page_text:
        # no home-page text for this firm, so no file is written
        continue

    # '.' and '/' in a firm name are replaced so the name is usable as a file name
    out_name = re.sub(r'\.|\/', '_', firm_name) + '.txt'
    with io.open(os.path.join(DATA_DIR, out_name), 'w', encoding='utf8') as f:
        f.write ('\n'.join (depth0_page_text))

Working on Little Kids
Working on Daylight Solutions
Working on Biocon Limited
Working on Sony Corporation
Working on Magna International Inc.
Working on Supremex Inc.
Working on Honeywell International Inc.
Working on Aurrion
Working on The Jackson Laboratory
Working on Hewlett Packard Enterprise Development LP
Working on CUMMINS FILTRATION IP
Working on Renewable Power Conversion
Working on Fuji Electric Co.
Working on Canon Kabushiki Kaisha
Working on Sola U.S.A. Inc.
Working on Chromalox
Working on Empire Technology Development LLC
Working on Cabot Corporation
Working on aTyr Pharma
Working on Alcon Research
Working on Wyatt Technology Corporation
Working on Southwest Research Institute
Working on Skidmore
Working on Fairchild Semiconductor Corporation
Working on Saint-Gobain Performance Plastics Corporation
Working on Ideal Power Inc.
Working on NuOrtho Surgical
Working on Suncore Photovoltaics
Working on L-3 Communications Cincinnati Electronics Corporation
Working on Medical Dia

Working on Microchips Biotech
Working on Evoqua Water Technologies LLC
Working on HRL Laboratories
Working on Ansun Biopharma
Working on MicroContinuum
Working on ExxonMobil Upstream Research Company
Working on Sony Interactive Entertainment America LLC
Working on SII Semiconductor Corporation
Working on PNM
Working on AMPT
Working on FLOW CONTROL LLC.
Working on Great American Duck Races
Working on Biocare Medical
Working on Swift Engineering
Working on Renesas Electronics Corporation
Working on Star Technology and Research
Working on Praxair S.T. Technology
Working on TDK Corporation
Working on CAMBRIDGE ENTERPRISE LIMITED
Working on Bell Helicopter Textron Inc.
Working on Everspin Technologies
Working on Alcotek
Working on ARBOR THERAPEUTICS
Working on WOVN
Working on Bitrode Corporation
Working on Hyundai Motor Company
Working on BASF Plant Science GmbH
Working on Hitachi Metals
Working on BROADCOM CORPORATION
Working on Yageo Corporation
Working on Em-Tech LLC
Working on McAlister

Working on Magnolia Optical Technologies
Working on PolyPlus Battery Company
Working on Ford Global Technologies
Working on Bayer Cropscience AG
Working on EMD Technologies Inc.
Working on Nederlandse Organisatie voor toegepast-natuurwetenschappelijk onderzoek TNO
Working on Nexcom Technology
Working on AT&T Corporation
Working on Floadia Corporation
Working on Entech Solar
Working on Amgen Fremont Inc.
Working on PortaFire
Working on Ormat Technologies Inc.
Working on JAC Products Inc.
Working on AstenJohnson
Working on MCI
Working on Boston Scientific Scimed
Working on Xyleco
Working on Bigelow Aerospace
Working on Sienna Biopharmaceuticals
Working on Integrated Solar Technology
Working on FULL CIRCLE BIOCHAR
Working on bioTheranostics
Working on JFE STEEL CORPORATION
Working on Zygo Corporation
Working on Cetac Technologies Inc.
Working on Novozymes A/S
Working on fybr
Working on Da Yu Enterprises
Working on Adtran
Working on HOWARD INDUSTRIES
Working on NanoGram Corporation
Working

Working on ASM America
Working on Sinton Consulting
Working on Hysitron Incorporated
Working on PeterBrod Corp.
Working on Daikin Industries
Working on NeoPhotonics Corporation
Working on Litron Laboratories Limited
Working on ConocoPhillips Company
Working on AC International Inc.
Working on PolyOne Corporation
Working on Forest Concepts
Working on Nanospectra Biosciences
Working on Glucan Biorenewables LLC
Working on GOJO Industries
Working on New England Biolabs
Working on Genzyme Corporation
Working on ASTUTE MEDICAL
Working on UOP LLC
Working on Quantapore
Working on Marathon Equipment Company
Working on Johnson Controls Technology Company
Working on EMC Corporation
Working on PELLION TECHNOLOGIES
Working on Interface Performance Materials
Working on Paratek Pharmaceuticals
Working on NLT TECHNOLOGIES
Working on DiscoveRx Corporation
Working on Tyco Electronics Corporation
Working on MILLENIUM SYNTHFUELS CORPORATION
Working on Nutech Ventures
Working on Plasma-Therm
Working on Lon

Working on LifeNet Health
Working on IntriEnergy Inc.
Working on New Technology Ventures
Working on Altex Technologies Corporation
Working on Greyrock Energy
Working on Courtagen Life Sciences
Working on ATOMERA INCORPORATED
Working on FEI Company
Working on Alpha and Omega Semiconductor Incorporated
Working on Quest Diagnostics Investments Incorporated
Working on Ethox Chemicals
Working on Murata Manufacturing Co.
Working on O.B.I. Inc.
Working on Foret Plasma Labs
Working on Tigo Energy
Working on United Technologies Corporation
Working on Neumedicines
Working on OAS Design Group
Working on Sagacious Investment Group L.L.C.
Working on Sirnaomics
Working on Electrix
Working on Seetron Inc.
Working on Baxter Healthcare SA
Working on Senga Advisors
Working on Schneider Electric Solar Inverters USA
Working on Ceramatec
Working on NOVA Chemicals (International) S.A.
Working on The Charles Stark Draper Laboratory
Working on Pacific Light Technologies
Working on DePuy Synthes Products
Worki