# extract-pages-from-mongo
SanjayKAroraPhD@gmail.com <br>
November 2018

## Description
This notebook extracts groups of pages from mongodb by firm_name to create firm-centric page output files that can later be topic modeled.  In doing so, it removes repetitive content (e.g., repeated menu items) and garbage content (e.g., improperly parsed HTML code). 

In [43]:
# import data processing and other libraries
import csv
import sys
import requests
import os
import re
import pprint
import pymongo
import traceback
from time import sleep
import requests
import pandas as pd
from IPython.display import display
import time
import numpy as np
from bs4 import BeautifulSoup

In [44]:
MONGODB_DB = "FirmDB_20181116"
MONGODB_COLLECTION = "pages_COMBINED"
CONNECTION_STRING = "mongodb://localhost"

client = pymongo.MongoClient(CONNECTION_STRING)
db = client[MONGODB_DB]
col = db[MONGODB_COLLECTION]

DEPTH = 0

In [46]:
# gather unique firm_names from mongodb

def get_firm_aggregates ():
    query = [ { "$group": {"_id":"$firm_name" , "number":{"$sum":1}} } ]
    results = col.aggregate(query)

    mongo_dict = {}
    for result in results:
        key = (result['_id'])
        if key:
            mongo_dict[key[0]] = result['number']
        else:
            mongo_dict['NA'] = result['number']
    
    return mongo_dict

results_dict = get_firm_aggregates()
firm_names = results_dict.keys()
pp = pprint.PrettyPrinter()
pp.pprint(firm_names)

[u'Little Kids',
 u'Daylight Solutions',
 u'Biocon Limited',
 u'Sony Corporation',
 u'Magna International Inc.',
 u'ENI Technology',
 u'Honeywell International Inc.',
 u'Aurrion',
 u'The Jackson Laboratory',
 u'Hewlett Packard Enterprise Development LP',
 u'CUMMINS FILTRATION IP',
 u'Custom Electronics Inc.',
 u'H R D CORPORATION',
 u'GENERAL MOTORS LLC',
 u'FLIR Systems',
 u'Sola U.S.A. Inc.',
 u'Chromalox',
 u'Empire Technology Development LLC',
 u'Alcon Research',
 u'Wyatt Technology Corporation',
 u'Southwest Research Institute',
 u'Tokyo Ohka Kogyo Co.',
 u'Samsung Electronics',
 u'Fairchild Semiconductor Corporation',
 u'Supremex Inc.',
 u'Calient Technologies',
 u'Infineon Technologies Americas Corp.',
 u'Ideal Power Inc.',
 u'NuOrtho Surgical',
 u'Easel Biotechnologies',
 u'L-3 Communications Cincinnati Electronics Corporation',
 u'SunEdison Semiconductor Limited',
 u'IDEA TREE',
 u'Solarmer Energy',
 u'Braun Intertec Geothermal',
 u'Fuji Electric Co.',
 u'The Babcock & Wilcox 

 u'Sumitomo Electric Device Innovations',
 u'MicroContinuum',
 u'Microchip Technology Incorporated',
 u'DiscoveRx Corporation',
 u'Genomic Health',
 u'Carestream Health',
 u'TRI ALPHA ENERGY',
 u'Renmatix',
 u'Zygo Corporation',
 u'PACCAR Inc',
 u'Evernote Corporation',
 u'Entegris',
 u'IDEX Health & Science LLC',
 u'Echogen Power Systems',
 u'Lion Copolymer Geismar',
 u'Nanoridge Materials',
 u'Rebellion Photonics',
 u'Big Belly Solar',
 u'Nokomis',
 u'SUMITOMO WIRING SYSTEMS',
 u'Zoetis Services LLC',
 u'ExxonMobil Research and Engineering Company',
 u'VINDICO NANOBIO TECHNOLOGY INC.',
 u'HNO Greenfuels',
 u'Esolar',
 u'Bayer Cropscience AG',
 u'EMD Technologies Inc.',
 u'Newlans',
 u'HIQ SOLAR',
 u'Nederlandse Organisatie voor toegepast-natuurwetenschappelijk onderzoek TNO',
 u'U S MICROPOWER INC',
 u'Hewlett-Packard Development Company',
 u'Arkema Inc.',
 u'Floadia Corporation',
 u'Entech Solar',
 u'Global Solar Water Power Systems',
 u'PortaFire',
 u'Ormat Technologies Inc.',
 u'C

In [137]:
# remove html content
def is_javascript (x):
    print ('\t' + x)
    # capture CDATA; function declarations; function calls; word sequences separated by a period (e.g., denoting paths)
    regex = re.findall('(CDATA|function|\w+\(.*?\);|\w{2,}[\\.|:]+\w{2,})', x) 
    # check to see if the regex finds some percentage of the words look like javascript patterns
    print ('\tregex is' + str(len(regex)))
    print ('\tx split is' + str(len(x.split()))
    if ((len(regex) / len(x.split())) > .10) and len(regex) > 3:
        print ("\tin if, looks like javascript")
        return True 
    else:
        return False

def clean_page_content (text_list):
    # remove whatever we think is html
    removed_html = filter(lambda x: not( bool(BeautifulSoup(x, "html.parser").find()) ), text_list)
    # remove content that looks like javascript 
    removed_js = filter(lambda x: not (is_javascript(x)), removed_html)
    
    print ("full text is " + str(len(text_list)))
    print ("cleaned html text is " + str(len(removed_html)))
    print ("cleaned js text is " + str(len(removed_js)))
    return removed_js
    

# iterate through each firm, get all pages associated with a firm, and produce data structure
# url --> depth
#     --> content (list)
# return data structure
def process_firm (firm_name): 
    regex = '^' + re.escape(firm_name) + '$'
    results = col.find( {"firm_name": re.compile(regex, re.IGNORECASE) } )

    firm_pages_dict = {}
    for result in results:
        key = (result['url'])
        if key:
            page_dict = {}
            page_dict['depth'] = result['domain'][0]
            page_dict['firm_name'] = firm_name
            page_dict['depth'] = result['depth'][0]
            page_dict['clnd_text'] = clean_page_content(result['full_text'])
            firm_pages_dict[key[0]] = page_dict
        else:
            continue
            
    return firm_pages_dict
# TODO: identify which pieces of content are common across all sites, and remove those
# def clean_content(firm_dict): 

SyntaxError: invalid syntax (<ipython-input-137-01a6568eb51f>, line 9)

In [133]:
# regex test 
regex = re.findall('(CDATA|function|\w+\(.*?\);|\w{2,}[\\.|:]+\w{2,})', 'CDATA function contact-us javascript.function linker:autoLink www.littlekidsinc.com fxnCall(param.param); email@dextr.us') # last part is words sequences separated by punct
print (regex)

['CDATA', 'function', 'javascript.function', 'linker:autoLink', 'www.littlekidsinc', 'fxnCall(param.param);', 'dextr.us']


In [138]:
# run
pp = pprint.PrettyPrinter()
for firm_name in firm_names: 
    print ("Working on " + firm_name)
    pp.pprint(process_firm (firm_name))
    break 

Working on Little Kids
full text is 117
cleaned html text is 116
cleaned js text is 116
full text is 119
cleaned html text is 118
cleaned js text is 118
full text is 106
cleaned html text is 105
cleaned js text is 105
full text is 90
cleaned html text is 89
cleaned js text is 89
full text is 96
cleaned html text is 95
cleaned js text is 95
full text is 237
cleaned html text is 236
cleaned js text is 236
full text is 202
cleaned html text is 201
cleaned js text is 201
full text is 221
cleaned html text is 220
cleaned js text is 220
full text is 107
cleaned html text is 106
cleaned js text is 106
full text is 148
cleaned html text is 147
cleaned js text is 147
full text is 158
cleaned html text is 157
cleaned js text is 157
full text is 227
cleaned html text is 226
cleaned js text is 226
full text is 248
cleaned html text is 247
cleaned js text is 247
full text is 200
cleaned html text is 199
cleaned js text is 199
full text is 208
cleaned html text is 208
cleaned js text is 208
full tex

                                                            u'Sat: Closed\n',
                                                            u'Sun: Closed\n',
                                                            u'Closed All Holidays',
                                                            u'Contact Us',
                                                            u'Leave A Message',
                                                            u'*',
                                                            u' Required\r\n    ',
                                                            u'First Name ',
                                                            u'*',
                                                            u'Last Name ',
                                                            u'*',
                                                            u'Email Address ',
                                                            u'*',
                             

                                                                               u'T',
                                                                               u'800-545-5437',
                                                                               u'M',
                                                                               u'info@littlekidsinc.com',
                                                                               u'Opening Hours',
                                                                               u'Mon - Fri: 8:30am - 5:00pm EST\n',
                                                                               u'Sat: Closed\n',
                                                                               u'Sun: Closed\n',
                                                                               u'Closed All Holidays',
                                                                               u'About',
                             

                                                                        u'643',
                                                                        u'AGES 5 & UP',
                                                                        u'Click for Detail View',
                                                                        u'MAGIC KIDCHEN SLUSHY MAKER',
                                                                        u' Add to Favorites',
                                                                        u'5 and up',
                                                                        u'Magjic Kidchen\u2122 is a fun brand that lets kids make their own delicious frozen threats without much help from mom or dad! All our Magic Kidchen\u2122\n    items are reusable, top rack dishwater safe and BPA free. Each Magic Kidchen\u2122 item also uses real food and ingredients that most families have\n    at home- there are no processed food packets here!',
             

                                                                        u'Click for Detail View',
                                                                        u'SHIMMER AND SHINE\u2122 JEWEL WATER BACKPACK',
                                                                        u' Add to Favorites',
                                                                        u'5 and up',
                                                                        u'Go on magical adventure with Shimmer and Shine! Where you can beat the summer heat with the sparkly and stylish Shimmer and Shine\u2122 Jewel Water Backpack!',
                                                                        u'Fill the sparkly and stylish Shimmer and Shine\u2122 rescue pack with water, put it on and adjust the easy-slide straps.',
                                                                        u'Then take aim, and pump the nozzle to blast away!',
                                             

['CDATA', 'function', 'javascript.function', 'linker:autoLink', 'www.littlekidsinc', 'fxnCall(param.param);', 'dextr.us']
