# extract-pages-from-mongo
SanjayKAroraPhD@gmail.com <br>
December 2018

## Description
This version of the notebook extracts groups of pages from MongoDB by firm_name to create firm-centric <b>about</b> page output files that can later be used for topic modeling. In doing so, it removes repetitive content (e.g., repeated menu items) and garbage content (e.g., improperly parsed HTML code).

## Change log
v4 focuses on about pages

## TODO:
* Whole process: get data, topic model and see if it looks sufficiently interesting/different
* Enhance data collection, per the following: 
    * Train a model to see if home page (or any other page) looks like an about page — WAIT
        * http://forestconcepts.com/index.php?page=01005
        * https://www.itri.org.tw/eng/ 
        * http://paxscientific.com/history/ 
    * Select a region or country — WAIT 
        * http://www.ivoclarvivadent.com: Please select your region
        * https://www.enersys.com/: PLEASE SELECT A REGION
        * https://www.m-petfilm.com/: ENGLISH
    * Crawl from focal about page only following links that look like part of the about story, maintaining ordering.  Check to see if the other links identified above are also there? 
        * http://xtalsolar.com/investors_partners.html

In [1]:
# import data processing and other libraries
import csv
import sys
import requests
import os
import re
import pprint
import pymongo
import traceback
from time import sleep
import requests
import pandas as pd
import io
from IPython.display import display
import time
import numpy as np
from bs4 import BeautifulSoup
import string
import random
from urllib.parse import urlparse, urljoin
from collections import defaultdict

In [2]:
from boilerpipe.extract import Extractor

In [33]:
# import sklearn
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

In [None]:
# first figure out what is an about page. need to label training data
# identify features -- for right now, unigrams if just one word in the header, otherwise bi- or trigrams 
# predict about pages

In [5]:
MONGODB_DB = "FirmDB_20181226"
MONGODB_COLLECTION = "pages_ABOUT"
CONNECTION_STRING = "mongodb://localhost"

client = pymongo.MongoClient(CONNECTION_STRING)
db = client[MONGODB_DB]
col = db[MONGODB_COLLECTION]

DATA_DIR = '/Users/sarora/dev/EAGER/data/orgs/about/'
TRAINING_PERCENT = .10
pp = pprint.PrettyPrinter()

In [7]:
def get_domain (url):
    """Return the lower-cased network location of *url* without a leading ``www.``.

    Parameters:
        url: absolute URL string (e.g. ``'http://www.example.com/about'``).

    Returns:
        The ``netloc`` component of the lower-cased URL, with one leading
        ``'www.'`` removed when present.  Any port or user-info part of the
        netloc is left in place.
    """
    domain = urlparse(url.lower()).netloc
    # str.strip('www.') treats its argument as a *set of characters* and would
    # also eat leading/trailing 'w' and '.' (e.g. 'wiki.example.com' ->
    # 'iki.example.com'), so remove the literal prefix instead.
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    return domain

# output urls for labeling of training data
# Builds `df` (one row per stored page: firm_name, url, empty label) and
# `domain_count` (number of stored pages per domain).  The [0] indexing below
# relies on 'url' and 'firm_name' being stored as lists in each document.
results = col.find({}, {"url": 1, "firm_name": 1})
domain_count = defaultdict(int)
rows = []
# Iterate the cursor directly instead of results.count()/results.next(), and
# collect rows in a list: appending with df.loc[i] re-allocates the frame on
# every row.
for result in results:
    url = result['url'][0]
    domain_count[get_domain(url)] += 1
    firm_name = result['firm_name'][0] if 'firm_name' in result else ''
    rows.append((firm_name, url, ''))

df = pd.DataFrame(rows, columns=['firm_name', 'url', 'label'])

# gid is a per-firm group number; pages without a firm_name share the '' group
df['gid'] = df.groupby(['firm_name']).ngroup()

In [8]:
# Draw a random sample of firm groups and write their pages out for hand labeling.
N_FIRMS_TO_LABEL = 200

# NOTE(review): the range starts at 1, so gid 0 can never be sampled.  That may
# be deliberate (pages with no firm_name get the '' key) or an off-by-one --
# confirm before changing.
candidate_gids = range(1, df.gid.nunique())
# random.sample raises ValueError when asked for more items than exist, so cap
# the sample size at the number of candidate groups.
label_ids = random.sample(candidate_gids, min(N_FIRMS_TO_LABEL, len(candidate_gids)))
df_label = df[df['gid'].isin(label_ids)]
with open(DATA_DIR + 'about_pages_to_label.csv', mode='w') as to_label:
    df_label.to_csv(to_label, index=False)

In [9]:
# read back labeled data (note that about, management/team and partners are dichotomous)
# Blank cells in the hand-labeled file are read as 0.
df_about_labeled = pd.read_csv(DATA_DIR + 'about_pages_labeled.csv').fillna(0)

# count pages per domain
# Computed column-wise rather than with iterrows()/.loc cell writes; the two
# columns are kept as floats, which is what the cell-by-cell assignment produced.
pages_in_domain = df_about_labeled['url'].map(lambda u: domain_count[get_domain(u)])
df_about_labeled['pages_in_domain'] = pages_in_domain.astype('float64')
# 1 when the labeled page is the only stored page of its domain, else 0
df_about_labeled['is_sole_page'] = (pages_in_domain <= 1).astype('float64')

labeled_urls = list(df_about_labeled['url']) # for training models on labeled urls below
df_about_labeled = df_about_labeled.set_index('url')
print (df_about_labeled.columns.tolist())

# final test set is the rows of the original data frame without the urls in df_about_labeled 

['firm_name', 'about', 'mgmt', 'partners', 'gid', 'pages_in_domain', 'is_sole_page']


## Create features to predict about pages
Create features:
1. title and url path fragment unigrams (also tried n-grams, as well as content from headers, with worse results) 
2. whether the page is the home page and its domain has no other pages
3. other ideas here: https://towardsdatascience.com/understanding-feature-engineering-part-3-traditional-methods-for-text-data-f6f7d70acd41

In [None]:
# load page data and create features
firm_page_features = {}  # url -> text assembled from the URL path and the page title
# header_regex = re.compile(r'h[1-2]+')
stop_words = ['the','a'] + list(string.punctuation) # remove simple article words and punctuation (need to keep 'about')
# remove known company names for model training and evaluation in the labeled data
# (the pattern is anchored, so it only removes tokens that match in full; the
# 'martin' alternative previously carried a trailing space and could never
# match a whitespace-split token)
remove_regex = re.compile(r'^(3m|united|states|en_us|algeternal|s\d+|sarepta|skygen|nexgen|abbott|adlens|errorpage|\d{1,3}|\d{5,}|asten|johnson|baker|hughes|ge|bhge|biocon|egfr|gcsf|biocon|pegfilgrastim|bostik|canon|chevron|phillips|coloplast|cyberonics|microsoft|evoqua|ford|hitachi|glucanbio|hunter|douglas|kimberly|clark|lextar|fisher|lockheed|martin|lux|nec|nanocopoeia|cisco|schlumberger|weccamerica|inanobio|nanocomposix|zoetis|zygo)$')

def clean_string(in_string):
    """Lower-case *in_string* and drop stop words and known company/junk tokens.

    Parameters:
        in_string: text to clean; may be ``None`` or empty.

    Returns:
        ``in_string`` unchanged when it is falsy.  Otherwise a string made of a
        single leading space followed by the surviving whitespace-separated
        tokens joined by single spaces (so results can be concatenated
        directly).  Punctuation is only removed when it stands alone as a token.
    """
    if not in_string:
        return in_string
    kept_words = [
        word
        for word in in_string.lower().split()
        if word not in stop_words and not remove_regex.search(word)
    ]
    return ' ' + ' '.join(kept_words)

def get_page_path_text (url):
    """Turn the path of *url* into cleaned, space-separated word tokens.

    The lower-cased path is cut on '/', each segment is truncated at its first
    '.', and the remainder is split on hyphens and underscores.  The pieces are
    joined with spaces and passed through clean_string, whose result is returned.
    """
    url_path = urlparse(url.lower()).path
    tokens = []
    for segment in url_path.split('/'):
        stem = segment.split('.')[0]  # keep only the text before the first dot
        # hyphens and underscores are both word separators
        tokens.extend(stem.replace('_', '-').split('-'))
    return clean_string(' '.join(tokens))

# remove html content
# capture CDATA; function declarations; function calls; word sequences separated by a period (e.g., denoting paths)
_JS_PATTERN = re.compile(r"(CDATA|return\s+true|return\s+false|getelementbyid|function|\w+\(.*?\);|\w{2,}[\\.|:]+\w{2,}|'\w+':\s+'\w+|\\|{|}|\r|\n|\/\/')")

def is_javascript (x):
    """Heuristically decide whether the text fragment *x* looks like JavaScript.

    Parameters:
        x: a text fragment (string).

    Returns:
        True when the number of pattern hits exceeds 10% of the number of
        whitespace-separated words in *x*; False otherwise, including for
        empty or whitespace-only input (which has no words to score).
    """
    words = x.split()
    if not words:
        # nothing to score; also avoids dividing by zero below
        return False
    hits = _JS_PATTERN.findall(x)
    # check to see if some percentage of the words look like javascript patterns
    return len(hits) / float(len(words)) > .10

def clean_page_content (text_list):
    """Lazily filter *text_list*, dropping fragments that look like HTML or JavaScript.

    A fragment is dropped when BeautifulSoup's html.parser finds at least one
    tag in it, or when is_javascript flags it.  Returns a filter iterator over
    the remaining fragments (not a list).
    """
    def _is_clean(fragment):
        # anything the HTML parser can find a tag in is treated as markup
        if bool(BeautifulSoup(fragment, "html.parser").find()):
            return False
        # add other checks here as needed
        return not is_javascript(fragment)

    return filter(_is_clean, text_list)
# TODO with version 3 code 
# def get_words_on_page (url)
     
def process_firms (urls):
    """Build per-page title/path text and word counts for each URL in *urls*.

    For every URL the matching document is fetched from ``col``.  Two
    module-level objects are updated as side effects:

    * ``firm_page_features[url]`` receives the cleaned URL-path text followed
      by the cleaned page title (when the page has one).
    * ``df_about_labeled.loc[url, 'page_words']`` receives a word count, taken
      from the boilerpipe extraction when that text is longer than half of the
      cleaned 'full_text', and from the cleaned 'full_text' otherwise.

    The key used for both is the first entry of the stored 'url' field, which
    replaces the URL passed in.  URLs with no stored document are reported and
    skipped.  Returns None.
    """
    for labeled_url in urls:
        result = col.find_one({"url": labeled_url})
        if result is None:
            # find_one returns None when nothing matches; skip rather than
            # fail on the subscript below
            print ('no stored page found for %s; skipping' % labeled_url)
            continue

        url = result['url'][0]
        html = result['html'][0]
        soup = BeautifulSoup(html, 'lxml')

        running_text = ''
        path_text = get_page_path_text(url)
        if path_text:
            running_text += path_text
        if soup.title and soup.title.string:
            running_text += clean_string(soup.title.string)
        # (h1/h2 header text was tried as an additional feature and dropped)

        clnd_text = '\n'.join(clean_page_content(result['full_text']))

        # Reset for every page: without this, a document lacking 'body' either
        # raised on the first iteration or silently reused the previous page's
        # extraction.
        boilerpipe = ''
        if 'body' in result:
            extractor = Extractor(extractor='DefaultExtractor', html = result['body'][0])
            lines = extractor.getText().replace(u'\xa0', u' ').split('\n')
            non_blank = [line for line in lines if not re.match(r'^\s*$', line)]
            boilerpipe = '\n'.join(non_blank)

        # TODO: write to a data frame passed as an argument
        if boilerpipe and (len(boilerpipe) > (.5 * len (clnd_text))):
            df_about_labeled.loc[url,'page_words'] = len(boilerpipe.split())
        else:
            df_about_labeled.loc[url,'page_words'] = len(clnd_text.split())

        firm_page_features[url] = running_text

process_firms (labeled_urls)

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Sou

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [None]:
# Display the labeled frame, e.g. to inspect the page_words column that process_firms fills in.
df_about_labeled

In [None]:
# Quick manual checks of the text helpers.
print (get_page_path_text('http://www.google.com/path1-en/path2_to/page.html'))
# raw string: "\W" in a normal string literal is an invalid escape sequence
print (re.split(r"\W+|_", "Testing this_thing"))
print (clean_string('3m 01\t08\t100\t10m ford 235 1990 s129 188209 0913lk the ? about us'))

In [None]:
# Build a TF-IDF matrix over the per-page title/path text, one row per url.
# you can set the n-gram range to 1,2 to get unigrams as well as bigrams
# TODO: refactor into method
urls = firm_page_features.keys()
print (len(urls))
corpus = [firm_page_features[url] for url in urls]

# unigram (n-grams, e.g. ngram_range=(3,3), performed worse than just unigrams)
ubv = TfidfVectorizer(min_df=0., max_df=1.)
ubv_matrix = ubv.fit_transform(corpus).toarray()

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever this installation provides.
if hasattr(ubv, 'get_feature_names_out'):
    vocab = list(ubv.get_feature_names_out())
else:
    vocab = ubv.get_feature_names()

ubv_df = pd.DataFrame(ubv_matrix, columns=vocab)
ubv_df.index = urls
# print(ubv.vocabulary_)

In [None]:
# merge two datasets (features and labeled data), both indexed by url
for frame in (ubv_df, df_about_labeled):
    print(frame.shape)

# outer join keeps every url from either side; label columns whose names
# collide with a vocabulary term get the '_lbl' suffix
all_merged = ubv_df.join(
    df_about_labeled,
    how='outer',
    rsuffix='_lbl',
)
print(all_merged.shape)

In [None]:
# split labeled and predict datasets
# rows that carry a gid came from the labeled file; the rest are to be predicted
has_gid = all_merged['gid'].notnull()

labeled = all_merged[has_gid]
print(labeled.shape)

to_predict = all_merged[all_merged['gid'].isnull()].fillna(0)
print(to_predict.shape)

print (df_about_labeled.columns.tolist())

In [None]:
# labeled train/test split
# Features: the TF-IDF columns plus two domain-level indicators.
# NOTE(review): the slice starts at position 1, so the column at position 0
# (presumably the first TF-IDF term, since join() puts ubv_df's columns first)
# is left out of X -- confirm whether that is intended.
# .copy() makes X an independent frame so the column assignments below do not
# write into a slice of `labeled` (SettingWithCopyWarning).
X = labeled.iloc[:, 1:len(ubv_df.columns)].copy()
X['pages_in_domain'] = labeled['pages_in_domain']
X['is_sole_page'] = labeled['is_sole_page']
# X['page_words'] = labeled['page_words']

# NOTE(review): 'about_lbl' only exists when 'about' is also a vocabulary term
# (the join adds the '_lbl' suffix on a name collision) -- verify.
y = labeled.loc[:, 'about_lbl']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X

## Train and evaluate the model
On just the labeled data

In [None]:
# specify a few models
# Each display name is paired with its estimator so the two parallel lists
# below cannot drift out of step.
_model_specs = [
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
    ("SVC", SVC(gamma=0.001, C=100.)),
    ("QDA", QuadraticDiscriminantAnalysis()),
]

names = [label for label, _ in _model_specs]
classifiers = [estimator for _, estimator in _model_specs]

In [None]:
# build dataframe for output metrics
# one row per model name (default 0..n-1 index), accuracy starts at 0.0
eval_df = pd.DataFrame({"Name": names})
eval_df['Accuracy'] = np.float64(0)
display (eval_df)

In [None]:
# build evaluation outputs (currently limited to accuracy)
# Cross-validate every model on the labeled data and record the mean score in
# the eval_df row with the same position as the model.
for i, (name, clf) in enumerate(zip(names, classifiers)):
    display (name)
    scores = cross_val_score(clf, X, y)
    avg_score = np.mean(scores)
    # DataFrame.set_value is gone from current pandas; .at is the scalar setter
    eval_df.at[i, 'Accuracy'] = avg_score

display(eval_df)
eval_df.to_clipboard()

## Grid search using MLPClassifier to tune hyperparameters

In [None]:
# Exhaustive search over MLP settings on the labeled data (all cores).
parameters = {
    'solver': ['lbfgs'],
    'max_iter': [500, 1000, 1500],
    'alpha': 10.0 ** -np.arange(1, 7),
    'hidden_layer_sizes': np.arange(5, 12),
    'random_state': list(range(10)),
}
clf_grid = GridSearchCV(MLPClassifier(), parameters, n_jobs=-1)
clf_grid.fit(X, y)

# report the best cross-validated score and the settings that produced it
print("Best score: %0.4f" % clf_grid.best_score_)
print("Using the following parameters:")
print(clf_grid.best_params_)