# extract-pages-from-mongo-v6
SanjayKAroraPhD@gmail.com <br>
December 2018

## Description
This version of the notebook extracts groups of pages from mongodb by firm_name to create firm-centric <b>about</b> page output files that can later be topic modeled.  In doing so, it removes repetitive content (e.g., repeated menu items) and garbage content (e.g., improperly parsed HTML code). 

## Change log
v4 focuses on about pages

## TODO:
* Whole process: get data, topic model and see if it looks sufficiently interesting/different
* Enhance data collection, per the following: 
    * Select a region or country — WAIT 
        * http://www.ivoclarvivadent.com: Please select your region
        * https://www.enersys.com/: PLEASE SELECT A REGION
        * https://www.m-petfilm.com/: ENGLISH
    * Crawl from focal about page only following links that look like part of the about story, maintaining ordering.  Check to see if the other links identified above are also there? 
        * http://xtalsolar.com/investors_partners.html
* Order known about us pages in the same way the links are found on a home page or about us landing page

In [31]:
# import data processing and other libraries
import csv
import sys
import requests
import os
import re
import pprint
import pymongo
import traceback
from time import sleep
import requests
import pandas as pd
import io
from IPython.display import display
import time
import numpy as np
from bs4 import BeautifulSoup
import string
import random
from urllib.parse import urlparse, urljoin
from collections import defaultdict
from collections import OrderedDict
import collections

In [32]:
from boilerpipe.extract import Extractor

In [33]:
# import sklearn
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [34]:
# MongoDB connection settings: database name, collection name and server address
MONGODB_DB = "FirmDB_20181226"
MONGODB_COLLECTION = "pages_ABOUT2"
CONNECTION_STRING = "mongodb://localhost"

# shared handles; `col` is the collection queried by the cells below
client = pymongo.MongoClient(CONNECTION_STRING)
db = client[MONGODB_DB]
col = db[MONGODB_COLLECTION]

# ABOUT_DIR holds the labeling/prediction csv files; DATA_DIR receives the per-firm text output
ABOUT_DIR = '/Users/sarora/dev/EAGER/data/orgs/about/'
DATA_DIR = '/Users/sarora/dev/EAGER/data/orgs/parsed_page_output/'
TRAINING_PERCENT = .10  # not referenced in the visible cells

PHRASE_LENGTH = 60  # not referenced in the visible cells
MIN_PARA_LEN = 5  # minimum number of words for a line to be counted in count_page_features

# pretty printer used for debugging output throughout the notebook
pp = pprint.PrettyPrinter()

In [35]:
# output urls for labeling of training data
# Iterate the cursor directly: Cursor.count() is deprecated (and removed in PyMongo 4),
# and collecting rows first avoids growing the DataFrame one row at a time.
rows = []
for result in col.find({}, {"url": 1, "firm_name": 1}):
    # both fields are read with [0]; firm_name may be absent from a document
    url = result['url'][0]
    firm_name = result['firm_name'][0] if 'firm_name' in result else ''
    rows.append((firm_name, url))

df = pd.DataFrame(rows, columns=['firm_name', 'url'])

# gid: one integer id per distinct firm_name
df['gid'] = df.groupby(['firm_name']).ngroup()

In [36]:
# sample up to 200 firms (by gid) whose pages will be hand labeled
n_groups = df.gid.nunique()
# NOTE(review): ngroup() ids start at 0, so range(1, ...) never samples group 0 — confirm this is intended
candidate_ids = range(1, n_groups)
# cap the sample size: random.sample raises ValueError when asked for more items than are available
label_ids = random.sample(candidate_ids, min(200, len(candidate_ids)))
df_label = df[df['gid'].isin(label_ids)]
with open(ABOUT_DIR + 'about_pages_to_label.csv', mode='w') as to_label:
    df_label.to_csv(to_label, index=False)

In [37]:
# read back labeled data (note that about, management/team and partners, are dichotomous)
df_about_labeled = pd.read_csv(ABOUT_DIR + 'about_pages_labeled_v4.csv')
# missing cells are replaced with 0
df_about_labeled = df_about_labeled.fillna(0)
# feature: number of labeled urls per firm_name
df_about_labeled['pages_in_domain_ftr'] = df_about_labeled.groupby(["firm_name"])["url"].transform("count")

labeled_urls = list(df_about_labeled['url']) # for training models on labeled urls below
# index by (firm_name, url); the feature frames built below are joined against this index
df_about_labeled = df_about_labeled.set_index(['firm_name', 'url'])
df_about_labeled.head()

# final test set is the rows of the original data frame without the urls in df_about_labeled 

Unnamed: 0_level_0,Unnamed: 1_level_0,about_lbl,mgmt_lbl,partners_lbl,ip_lbl,about_agg_lbl,gid,pages_in_domain_ftr
firm_name,url,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3M Innovative Properties Company,https://www.3m.com/,0.0,0.0,0.0,0.0,0.0,1.0,16
3M Innovative Properties Company,https://www.3m.com/3M/en_US/company-us/3m-science-applied-to-life/,0.0,0.0,0.0,0.0,0.0,0.0,16
3M Innovative Properties Company,https://www.3m.com/3M/en_US/company-us/about-3m/,1.0,0.0,0.0,0.0,1.0,1.0,16
3M Innovative Properties Company,https://www.3m.com/3M/en_US/company-us/about-3m/state-of-science-index-survey/,0.0,0.0,0.0,0.0,0.0,0.0,16
3M Innovative Properties Company,https://www.3m.com/3M/en_US/company-us/about-3m/technologies/,1.0,0.0,0.0,0.0,0.0,0.0,16


In [38]:
# (rows, columns) of the labeled frame
df_about_labeled.shape

(1455, 7)

## Create features to predict about pages
Create features:
1. number of domain pages (identified above)
2. whether a given page is an about us page (as opposed to a home page)
3. is home page and doesn't have any other pages (is_sole_page_ftr)
4. number of words on a page
5. number of sentences
6. title and url path fragment unigrams (also tried n-grams) with worse results
7. descriptor text around the focal link (as identified upstream when crawling and as persisted to mongodb then)

Other text-based ideas for features may be found here: https://towardsdatascience.com/understanding-feature-engineering-part-3-traditional-methods-for-text-data-f6f7d70acd41

In [27]:
# pattern regex to remove unwanted words that show up in topic models
p = re.compile(r"(\(\)|''|``|\"|null|ul|li|ol|^\.|^:|^/|\\|--|cooki|'s|corpor|busi|inc\.|ltd|co\.|compan|keyboard|product|technolog)", flags=re.IGNORECASE)

# remove html content
def is_javascript (x):
    """Heuristically decide whether a text fragment looks like javascript residue.

    The number of regex hits for code-like patterns (CDATA, function
    declarations and calls, word sequences joined by punctuation, braces,
    backslashes, line breaks, ...) is compared with the number of
    whitespace-separated words in ``x``.

    Returns True when hits / words exceeds 0.10.  Empty or whitespace-only
    input returns False (it previously raised ZeroDivisionError).
    """
    match_string = r"(CDATA|return\s+true|return\s+false|getelementbyid|function|\w+\(.*?\);|\w{2,}[\\.|:]+\w{2,}|header|hover|'\w+':\s+'\w+|\\|{|}|\r|\n|\/\/')"
    words = x.split()
    if not words:
        # nothing to measure against; treat as "not javascript"
        return False
    # capture CDATA; function declarations; function calls; word sequences separated by a period (e.g., denoting paths)
    hits = re.findall(match_string, x)
    # check to see if the regex finds some percentage of the words look like javascript patterns
    return len(hits) / float(len(words)) > .10

def clean_page_content (text_list):
    """Lazily drop fragments that parse as html or that look like javascript.

    Returns an iterator over the surviving fragments of ``text_list``, in
    their original order.
    """
    def parses_as_html (fragment):
        # BeautifulSoup finds at least one tag in the fragment
        return bool(BeautifulSoup(fragment, "html.parser").find())

    # remove whatever we think is html
    without_html = filter(lambda fragment: not parses_as_html(fragment), text_list)
    # remove content that looks like javascript; add other checks here as needed
    return filter(lambda fragment: not is_javascript(fragment), without_html)

# this method called from below
def count_page_features (result): 
    """Return (num_words, num_sentences) for one page document.

    The text is either the boilerpipe extraction of ``result['body'][0]`` or
    the cleaned ``result['full_text']``, whichever rule below selects.  A
    falsy ``result`` yields (0, 0).
    """
    if not result:
        return 0, 0

    page_text = '\n'.join(clean_page_content(result['full_text']))

    extracted = None
    if 'body' in result:
        raw_text = Extractor(extractor='DefaultExtractor', html = result['body'][0]).getText()
        # normalise non-breaking spaces and drop blank lines
        kept_lines = [ln for ln in raw_text.replace(u'\xa0', u' ').split('\n') if not re.match(r'^\s*$', ln)]
        extracted = '\n'.join(kept_lines)

    # TODO fix to split().  Counting characters currently
    prefer_extracted = bool(extracted) and len(extracted) > .5 * len(page_text)
    chosen_text = extracted if prefer_extracted else page_text

    num_words = len(chosen_text.split())

    # skip short lines (maybe a menu or simple paragraph heading), then count
    # what appears to be sentence terminators on the remaining lines
    num_sentences = sum(
        len(re.findall(r'(\.|;|\!)( |$)', line))
        for line in chosen_text.split('\n')
        if len(line) > 12 and len(line.split()) >= MIN_PARA_LEN
    )

    return num_words, num_sentences

In [28]:
# remove simple article words and punctuation (need to keep 'about')
stop_words = ['the','a'] + list(string.punctuation) 
# remove known company names for model training and evaluation in the labeled data 
# (the pattern is anchored and applied to single whitespace-split tokens, so no
# alternative may contain a space: 'martin ' could never match and is now 'martin')
remove_regex = re.compile(r'^(3m|united|states|menu|en_us|algeternal|s\d+|sarepta|skygen|nexgen|abbott|adlens|errorpage|\d{1,3}|\d{5,}|\w+\d+|\d+\w+|asten|johnson|baker|hughes|ge|bhge|biocon|egfr|gcsf|biocon|pegfilgrastim|bostik|canon|chevron|phillips|coloplast|cyberonics|microsoft|evoqua|ford|hitachi|glucanbio|hunter|douglas|kimberly|clark|lextar|fisher|lockheed|martin|lux|nec|nanocopoeia|cisco|schlumberger|weccamerica|inanobio|nanocomposix|zoetis|zygo)$', re.IGNORECASE)
# used to filter top-level header content
header_in = re.compile('(about|company|corporate|who.we.are|(^|/)vision|awards|profile|corporate|management|team|history|values|strategy|our |technology|research|commercialization)', flags=re.IGNORECASE)
# matches header tag names (h followed by digits)
header_regex = re.compile(r'h[1-9]+')

def get_domain (url):
    """Return the lower-cased network location of ``url`` without a leading 'www.'.

    Only the literal prefix is removed.  ``str.strip('www.')`` treats its
    argument as a set of characters and would also eat leading or trailing
    'w' and '.' characters of the domain itself (e.g. 'www.widgets.com' ->
    'idgets.com').
    """
    domain = urlparse(url.lower()).netloc
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    return domain

def strip_firm_name (firm_name, text):
    """Remove the words of ``firm_name`` and upper-case abbreviations from ``text``.

    Each whitespace-separated token of ``firm_name`` is removed (case
    insensitively) wherever it is followed by a whitespace character; the
    whitespace is removed with it.  Afterwards every run of capital letters
    (each optionally followed by a period) that ends in a space is removed.

    Tokens are regex-escaped so names containing metacharacters (e.g. 'C++')
    are matched literally.  An empty or missing ``firm_name`` skips the first
    step; previously it produced the pattern ``(\\s)`` and deleted all
    whitespace from ``text``.
    """
    clnd_text = text
    name_tokens = firm_name.split() if firm_name else []
    if name_tokens:
        alternatives = "|".join(re.escape(token) + r"\s" for token in name_tokens)
        strip_regex = re.compile(r"(" + alternatives + r")", re.IGNORECASE)
        clnd_text = strip_regex.sub ('', clnd_text)

    # drop abbreviations such as 'U.S. ' or 'LLC '
    more_regex = re.compile(r"([A-Z]\.?){1,} ")
    clnd_text = more_regex.sub ('', clnd_text)

    return clnd_text

# standard firm cleaning regex
def clean_firm_name (firm):
    """Delete periods, commas and common legal suffixes from ``firm``, then trim trailing whitespace."""
    suffix_pattern = re.compile(r'(\.|,| corporation| incorporated| llc| inc| international| gmbh| ltd)', re.IGNORECASE)
    return suffix_pattern.sub('', firm).rstrip()

def clean_string(in_string):
    """Lower-case ``in_string`` and drop stop words and tokens matched by ``remove_regex``.

    The surviving tokens are joined by single spaces and returned with one
    leading space.  Falsy input is returned unchanged.
    """
    if not in_string:
        return in_string
    kept_tokens = [
        token
        for token in in_string.lower().split()
        if token not in stop_words and not remove_regex.search(token)
    ]
    return ' ' + ' '.join(kept_tokens)

def get_page_path_text (url):
    """Turn the path of ``url`` into cleaned, space-separated words.

    Every path segment is cut at its first '.', split on '-' and '_', and the
    resulting pieces are passed through ``clean_string``.
    """
    path_segments = urlparse(url.lower()).path.split ('/')
    pieces = []
    for segment in path_segments:
        stem = segment.split('.')[0] # remove page names
        # split on hyphens first, then underscores
        for hyphen_piece in stem.split('-'):
            pieces.extend(hyphen_piece.split('_'))
    return clean_string(' '.join(pieces))

# recurse through the header text to add into feature grams
def get_header_text (headers, names, index):
    """Return the cleaned, joined text of the headers named ``names[index]`` that match ``header_in``.

    When the first matching header at this level has more than four words,
    the next level in ``names`` is tried instead; '' is returned once the
    levels are exhausted.
    """
    cleaned = []
    for header in headers:
        if header.name == names[index]:
            cleaned.append(clean_string(header.text))
    matching = [text for text in cleaned if header_in.search(text)]

    # short (or no) header text at this level: use it as is
    if not matching or len(matching[0].split()) <= 4:
        return ' '.join (matching)

    # long header text: try the next header level, if any
    if len(names) > (index + 1):
        return get_header_text (headers, names, index + 1)
    return ''

# load page data and create features (this method kicks everything else off)
def process_firms (urls): 
    """Build text and count features for every url that can be found in mongodb.

    Args:
        urls: iterable of page urls; each is looked up by "url" and then by
            "orig_url".

    Returns:
        (firm_text_features, firm_count_features), two dicts keyed by url:
        the first maps to the concatenated descriptor / url-path / title /
        header text with the firm name stripped, the second to
        [is_about, num_words, num_sentences].  Urls that cannot be found, or
        whose document has no 'html', are reported and skipped.
    """
    firm_text_features = {}
    firm_count_features = {} # ['is_about', 'num_words', 'num_sentences']

    for url in urls:
        result = col.find_one({"url": url}) or col.find_one({"orig_url": url})
        if not result: # just can't find the page
            print ('Cannot find ' + url)
            continue

        if 'html' not in result:
            print ('Cannot find html for', url)
            continue

        # --------------------
        # text based features
        # --------------------
        soup = BeautifulSoup(result['html'][0], 'lxml')
        text_parts = []

        # text from the text wrapping the link
        descriptor = result['descriptor'][0].replace ('|', '').replace('None', '')
        if descriptor:
            text_parts.append(clean_string(descriptor))

        # path text within the url
        path_text = get_page_path_text(url)
        if path_text:
            text_parts.append(path_text)

        # title from the html page
        if soup.title and soup.title.string:
            text_parts.append(clean_string(soup.title.string))

        # headers from the page
        headers = soup.find_all(header_regex, text=True)
        names = sorted(set ([header.name for header in headers]))
        text_parts.append(get_header_text (headers, names, 0))

        running_text = ''.join(text_parts)

        # firm_name may be absent from a document (the url export cell allows for this);
        # with no name there is nothing to strip
        firm_name = result['firm_name'][0] if 'firm_name' in result else ''
        if firm_name:
            running_text_clnd = strip_firm_name (firm_name, running_text)
        else:
            running_text_clnd = running_text

        firm_text_features[url] = running_text_clnd

        # --------------------
        # count based features
        # --------------------
        is_about = int(result['is_about'][0])
        num_words, num_sentences = count_page_features (result)
        firm_count_features[url] = [is_about, num_words, num_sentences]

    return firm_text_features, firm_count_features

In [29]:
# Test various methods and regex used above 
# url path -> cleaned words (page extension dropped, split on '-' and '_')
print (get_page_path_text ('http://biocon.com/biocon_aboutus_businesses.asp'))

print (get_page_path_text('http://www.google.com/path-en/path_to/page.html'))
print (re.split("\W+|_", "Testing this_thing"))
# numeric / alphanumeric tokens, known firm names and stop words are all removed
print (clean_string('3m 01	08	100	10m ford 235 1990 s129 188209 0913lk the ? about us'))
print (clean_string('3m 01	08	100	10m ford 235 1990 s129 188209 0913lk the ? about us'))
# header_in keeps only header text that looks like about-page content
pp.pprint (list(filter(header_in.search, ['about us', 'not found', 'company'])))

# firm-name words followed by whitespace are stripped, case-insensitively
print (strip_firm_name('Ford Motor Company', 'This Ford Motor Company has been around for a while.'))
print (strip_firm_name (clean_firm_name('Ford Motor Company'), 'Ford is a motor company.  It has been building vehicles for over a century. H.W.F_ Ford was a nice guy.'))

 aboutus businesses
 path en path to page
['Testing', 'this', 'thing']
 about us
 about us
['about us', 'company']
This has been around for a while.
is a company.  It has been building vehicles for over a century. H.W.F_ was a nice guy.


In [30]:
# get firm website data for n-gram processing AND grab count features
# both results are dicts keyed by url; urls that cannot be found are printed and skipped
labeled_firm_text_features, labeled_firm_count_features = process_firms (labeled_urls)

Cannot find http://www.genomichealth.com/en-US/


In [None]:
# testing: spot-check the features of one known about page; expected output is
# pages about us about us – about us 
# [1, 882, 19]
print(labeled_firm_text_features['https://nanocomposix.com/pages/about-us'])
print(labeled_firm_count_features['https://nanocomposix.com/pages/about-us'])

In [None]:
# freeze the key order in a list so the corpus rows and the index below line up
urls = list(labeled_firm_text_features.keys())
print (len(urls))
corpus = [labeled_firm_text_features[url] for url in urls]

# unigram
ubv = TfidfVectorizer(min_df=0., max_df=1.)
# you can set the n-gram range to 1,2 to get unigrams as well as bigrams (performs worse than just unigrams)
# ubv = TfidfVectorizer(ngram_range=(1,2)) 

ubv_matrix = ubv.fit_transform(corpus)

ubv_matrix = ubv_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when it exists
if hasattr(ubv, 'get_feature_names_out'):
    vocab = ubv.get_feature_names_out()
else:
    vocab = ubv.get_feature_names()
# one row per url, one tf-idf column per vocabulary term
ubv_df = pd.DataFrame(ubv_matrix, columns=vocab)
ubv_df.index = urls
ubv_df.index.name='url'
ubv_df.head()

In [None]:
# count features as a frame: one row per url, columns in the order process_firms stores them
count_df = pd.DataFrame.from_dict(labeled_firm_count_features, orient='index', columns = ['is_about_ftr', 'num_words_ftr', 'num_sentences_ftr'])
count_df.index.name = 'url'
count_df.head()

In [None]:
# labeled frame was processed above; shown again here for clarity before merging
df_about_labeled.head()

## Merge labeled and feature data

In [None]:
# merge datasets (features and labeled data)
print(ubv_df.shape)
print(df_about_labeled.shape)
print(count_df.shape)

# ubv_df is indexed by url, df_about_labeled by (firm_name, url); inner join keeps urls present in both
merged = ubv_df.join(df_about_labeled, how='inner')

labeled = merged.join(count_df, how='inner')
# totals per index level 0 (presumably firm_name — verify) and each page's share of that total
labeled['num_words_firm_ftr'] = labeled['num_words_ftr'].groupby(level=0).transform('sum')
labeled['share_of_words_ftr'] = labeled['num_words_ftr'] / labeled['num_words_firm_ftr']

# NOTE(review): a group whose pages total 0 sentences gives 0/0 = NaN here — confirm how the models handle it
labeled['num_sentences_firm_ftr'] = labeled['num_sentences_ftr'].groupby(level=0).transform('sum')
labeled['share_of_sentences_ftr'] = labeled['num_sentences_ftr'] / labeled['num_sentences_firm_ftr']

print(labeled.shape)
labeled.head()

In [None]:
# check for missing urls in the final labeled data frame
# Build the lookup once, from the urls actually present in the rows:
# index.levels may still list values that no row uses, and rebuilding a list
# for every url made this check quadratic.
present_urls = set(labeled.index.get_level_values(1)) # feature data
for url in labeled_urls: # labeled data
    if url not in present_urls:
        print ('Missing', url)

labeled.xs('https://nanocomposix.com/pages/about-us', level=1)

In [None]:
# labeled train/test split
print (len(ubv_df.columns))
# NOTE(review): the slice starts at column 1, so the first vocabulary column is left out — confirm this is intended
# .copy() so the feature columns added below go onto an independent frame, not a slice of `labeled`
X = labeled.iloc[:,1:len(ubv_df.columns)].copy()
       
# other in other non text-based features
# 1. number of domain pages (identified above)
# 2. is home page and doesn't have any other pages (is_sole_page)
# 3. whether a given page is an about us page (as opposed to a home page)
# 4. number of words on a page (and share of words)
# 5. number of sentences (and share of sentences)

# float division on purpose: np.reciprocal on an integer column does integer
# division and returns 0 for every count greater than 1
# NOTE(review): the prediction matrix must compute this feature the same way
X['pages_in_domain_ftr'] = 1.0 / labeled['pages_in_domain_ftr']
# X['is_sole_page_ftr'] = labeled['is_sole_page_ftr']
X['is_about_ftr'] = labeled['is_about_ftr']
# X['share_of_words_ftr'] = labeled['share_of_words_ftr']
# X['num_sentences_ftr'] = labeled['num_sentences_ftr']
X['share_of_sentences_ftr'] = labeled['share_of_sentences_ftr']
X.to_csv(ABOUT_DIR + 'X.csv', index = True) # for manual inspection

# normalize
# X = (X - X.mean()) / X.std()
# X = (X - X.min()) / (X.max() - X.min())

# target: the hand-assigned about label
y = labeled.loc[:,'about_lbl']
print (X.shape)
print (y.shape)
X.head()

## Train and evaluate the model
On just the labeled data

In [None]:
# specify a few models
# `names` and `classifiers` are parallel lists: names[k] labels classifiers[k]

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", 
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "SVC", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(gamma=0.001, C=100.), 
    QuadraticDiscriminantAnalysis()]

In [None]:
# build dataframe for output metrics 
# one row per model name, positionally indexed; Accuracy starts at 0 and is filled in by the evaluation loop
eval_df = pd.DataFrame (names,index=(range(len(names))), columns=["Name"])
eval_df['Accuracy'] = np.float64(0)

In [None]:
# build evaluation outputs (currently limited to accuracy)
for i, (name, clf) in enumerate(zip(names, classifiers)):
    display (name)
    scores = cross_val_score(clf, X, y)
    avg_score = np.mean(scores)
    # DataFrame.set_value was removed in pandas 1.0; .at is the scalar setter
    eval_df.at[i, 'Accuracy'] = avg_score
    
display(eval_df)
eval_df.to_clipboard()
# Neural net works best

## Grid search using MLPClassifier to tune hyperparameters
The above results clearly show that a type of feed-forward neural network is the most accurate type of model

In [None]:
# candidate hidden_layer_sizes for the MLP grid search; uncomment entries to widen the search
hls = []
# hls.append([20,])
# hls.append([70,])
hls.append([100,])
# hls.append([50,50])
# hls.append([70,70,70])
# hls.append([40,40,40])
# hls.append([10,10,10])
# hls.append([50,50,50,50])
pp.pprint(hls)

In [None]:
# grid for the MLP: alpha takes the values 10**-1 and 10**-2; the other settings are fixed to a single value
parameters = {'solver': ['adam'], 'max_iter': [40], 'alpha': 10.0 ** -np.arange(1, 3), 'hidden_layer_sizes': hls, 'random_state':[20]}
clf_grid = GridSearchCV(MLPClassifier(), parameters, n_jobs=-1)
clf_grid.fit(X,y)

print("Best score: %0.4f" % clf_grid.best_score_)
print("Using the following parameters:")
print(clf_grid.best_params_)

In [None]:
# train neural net model with best hyperparameter configuration
clf = MLPClassifier(alpha=0.1, hidden_layer_sizes=(100,), max_iter=40, random_state=20, solver='adam')
clf.fit(X, y)

# predictions are made on the same X the model was fit on, so this confusion matrix describes training data
y_hat = clf.predict(X)
print(confusion_matrix(y, y_hat))

In [None]:
# grid for an RBF-kernel SVC over C and gamma; this rebinds `parameters` and `clf_grid` from the MLP search
parameters = [ {'C': [100, 500, 1000, 2000], 'gamma': [.01, 0.001], 'kernel': ['rbf']},]
clf_grid = GridSearchCV(SVC(), parameters, n_jobs=-1)
clf_grid.fit(X,y)

print("Best score: %0.4f" % clf_grid.best_score_)
print("Using the following parameters:")
print(clf_grid.best_params_)

In [None]:
# train SVC model with best hyperparameter configuration
# this rebinds `clf`, which is the model later used to predict the unlabeled pages
clf = SVC(C=1000., gamma=0.001, kernel='rbf')
clf.fit(X, y)

# predictions are made on the same X the model was fit on, so this confusion matrix describes training data
y_hat = clf.predict(X)
print(confusion_matrix(y, y_hat))

In [None]:
# print all instances where predictions don't match labels (for inspection)
# key is an index tuple of X; key[1] is printed as the url (presumably (firm_name, url) — verify)
for key, y_i, y_hat_i in zip(list(X.index), y, y_hat):
    if y_i != y_hat_i:
        print(key[1], 'has been classified as ', y_hat_i, 'but should be ', y_i) 

## Predict about pages for unlabeled data

In [None]:
# prepare domain level features 
# rows of df whose url was not hand labeled; .copy() so the column added below
# is written to an independent frame rather than to a filtered view of df
df_predict = df[~df['url'].isin(labeled_urls)].copy()
# feature: number of unlabeled urls per firm_name
df_predict['pages_in_domain_ftr'] = df_predict.groupby(["firm_name"])["url"].transform("count")

df_predict = df_predict.set_index(['firm_name', 'url'])
df_predict = df_predict.fillna(0)

df_predict.sort_values(by=['gid']).head()

In [None]:
# check to see whether there are duplicate urls
# note: there should be because different assignees may map to the same domain (see error above)
# counts each (firm_name, url) index entry and prints the five most frequent
counter=collections.Counter(df_predict.index)
most_common = counter.most_common(5)
pp.pprint (most_common)

In [None]:
# prepare n-gram and count features
# urls are de-duplicated with set() before being looked up in mongodb
unlabeled_firm_text_features, unlabeled_firm_count_features = process_firms (set(df_predict.index.get_level_values('url')))

In [None]:
# freeze the key order in a list so the corpus rows and the index below line up
prediction_urls = list(unlabeled_firm_text_features.keys())

pred_corpus = [unlabeled_firm_text_features[url] for url in prediction_urls]

# transform only: reuse the vocabulary and idf weights fitted on the labeled corpus
ubv_prediction_matrix = ubv.transform(pred_corpus)

ubv_prediction_matrix = ubv_prediction_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when it exists
if hasattr(ubv, 'get_feature_names_out'):
    vocab = ubv.get_feature_names_out()
else:
    vocab = ubv.get_feature_names()
ubv_prediction_df = pd.DataFrame(ubv_prediction_matrix, columns=vocab)
ubv_prediction_df.index = prediction_urls
ubv_prediction_df.index.name='url'
ubv_prediction_df.head()

In [None]:
# count features for the unlabeled pages: one row per url, columns in the order process_firms stores them
count_pred_df = pd.DataFrame.from_dict(unlabeled_firm_count_features, orient='index', columns = ['is_about_ftr', 'num_words_ftr', 'num_sentences_ftr'])
count_pred_df.index.name = 'url'
count_pred_df.head()

In [None]:
# merge datasets (features and unlabeled page data)
print(ubv_prediction_df.shape)
print(df_predict.shape)
print(count_pred_df.shape)

predict_merged = ubv_prediction_df.join(df_predict, how='inner')

unlabeled = predict_merged.join(count_pred_df, how='inner')
# totals per index level 0 and each page's share of that total
# NOTE(review): a group whose pages total 0 sentences gives 0/0 = NaN here — confirm how the model handles it
unlabeled['num_sentences_firm_ftr'] = unlabeled['num_sentences_ftr'].groupby(level=0).transform('sum')
unlabeled['share_of_sentences_ftr'] = unlabeled['num_sentences_ftr'] / unlabeled['num_sentences_firm_ftr']

# sort only: assigning the result of .head() back here used to cut the
# prediction set down to its first five rows
unlabeled = unlabeled.sort_values(by=['gid'])
print(unlabeled.shape)
unlabeled.head()

In [None]:
# build the prediction matrix with the same column layout as the training matrix X
# .copy() so the feature columns added below go onto an independent frame, not a slice of `unlabeled`
X_test = unlabeled.iloc[:,1:len(ubv_prediction_df.columns)].copy()

print (X.shape)
print (X_test.shape) # should be the same number of cols

# float division on purpose: np.reciprocal on an integer column does integer
# division and returns 0 for every count greater than 1
# NOTE(review): the training matrix X must compute this feature the same way
X_test['pages_in_domain_ftr'] = 1.0 / unlabeled['pages_in_domain_ftr']
X_test['is_about_ftr'] = unlabeled['is_about_ftr']
X_test['share_of_sentences_ftr'] = unlabeled['share_of_sentences_ftr']
X_test.to_csv(ABOUT_DIR + 'X_test.csv', index = True) # for manual inspection

X_test.head()

In [None]:
# predict with newly constructed X
# uses whichever model `clf` was bound to last (the SVC if the cells above ran in order)
y_predicted = clf.predict(X_test)

In [None]:
# write to file
# newline='' is what the csv module expects of the file object it writes to;
# without it, platforms that translate line endings emit an extra blank row per record
with open(ABOUT_DIR + 'about_predicted_and_labels.csv', mode='w', newline='') as about_file:
    about_writer = csv.writer(about_file, delimiter=',', quotechar='"')
    about_writer.writerow(['firm_name', 'url', 'is_about'])
    # output predicted values to file
    for fn, u, predicted_value in zip(X_test.index.get_level_values('firm_name'), X_test.index.get_level_values('url'), y_predicted):
        # print (fn + ' with url ' + u + ' has predicted value ' + str(predicted_value))
        about_writer.writerow([clean_firm_name(fn), u, predicted_value])
    # and the labeled ones too...
    for fn, u, labeled_value in zip(X.index.get_level_values('firm_name'), X.index.get_level_values('url'), y):
        # print (fn + ' with url ' + u + ' has predicted value ' + str(labeled_value))
        about_writer.writerow([clean_firm_name(fn), u, labeled_value])

## Extract data from mongodb
* Now that we know which pages are about pages, extract from mongodb and output for topic modeling
* For now, construct paragraphs from different pages by ordering urls by their length.  In the future, might want to construct paragraphs in their 'natural' sequential order as they would appear on a home page or landing page

In [None]:
# combine both labeled and predicted frames
print (X_test.shape)
print(X.shape)

# DataFrame.append was removed in pandas 2.0; concat stacks the rows in the same order
combined = pd.concat([X_test, X])
print (combined.shape)
print (len(y_predicted))
print (len(y))
abouts = pd.DataFrame(index=combined.index)

# predicted values first, then hand labels, matching the row order of `combined`
abouts['is_about'] = list(y_predicted) + list(y)
abouts = abouts.reset_index()
abouts

In [None]:
# gather unique firm_names from the combined labeled + predicted frame
firm_names = set(abouts['firm_name'])
print (len(firm_names))
pp = pprint.PrettyPrinter()
pp.pprint(firm_names)

In [None]:
def get_ordered_about_urls (firm_name):
    """Return the about-page urls of ``firm_name``, grouped under their parent urls.

    Urls flagged ``is_about == 1`` in ``abouts`` are sorted by length.  Each
    url is filed under every ancestor (the url cut at successive '/' from the
    right) that is itself one of the firm's about urls, or under itself when
    it has no such ancestor.  The groups are then flattened in insertion
    order without duplicates.

    A first url whose path yields no text (a home page) is dropped, but only
    when other pages remain.

    Returns:
        list of urls, or None when the firm has no about urls.
    """
    urls = list (abouts.loc[(abouts['firm_name'] == firm_name) & (abouts['is_about'] == 1), 'url'])
    urls.sort(key = len)
    known_urls = set(urls) # constant-time membership tests in the loop below

    index = {}
    for url in urls:
        path_fragments = len(url.split('/'))
        added = False
        for i in range(1, path_fragments):
            key_phrase = url.rsplit('/', maxsplit=i)[0]
            if key_phrase in known_urls or (key_phrase + '/') in known_urls:
                index.setdefault(key_phrase, OrderedDict())[url] = 1
                added = True
        if not added:
            index.setdefault(url, OrderedDict())[url] = 1

    # flatten the groups, keeping the first occurrence of each url
    return_urls = []
    seen = set ()
    for tree_urls in index.values():
        for fu in tree_urls:
            if fu not in seen:
                return_urls.append(fu)
                seen.add(fu)

    if not return_urls:
        return None

    # finally remove home page if it exists and if there are other pages to draw on
    first_page_path = get_page_path_text (return_urls[0])
    if first_page_path in (' ', '') and len(return_urls) > 1:
        print (first_page_path + 'empty')
        return_urls.pop(0)
    return return_urls

# try the ordering on a single firm and show the result
test_urls = get_ordered_about_urls ('Previvo Genetics')
print ('Ordered urls')
pp.pprint (test_urls)

In [None]:
# iterate through firm urls and return concatenated string
def get_content (urls): 
    """Fetch the stored text of each url and return it as one string.

    For every url found in mongodb the boilerpipe extraction of
    ``result['body'][0]`` is used when it is longer than half the cleaned
    ``full_text``; otherwise the cleaned ``full_text`` is used.  Non-empty
    page texts are joined with a newline so the last word of one page does
    not fuse with the first word of the next.  Urls that cannot be found are
    reported and skipped.
    """
    page_texts = []
    for url in urls:
        print ('\tWorking on ' + url)
        result = col.find_one( {"url": url} )
        if not result:
            print ('Cannot find url: ' + url)
            continue

        clnd_text = '\n'.join(clean_page_content(result['full_text']))
        boilerpipe = None

        if 'body' in result:
            extractor = Extractor(extractor='DefaultExtractor', html = result['body'][0])
            lines = extractor.getText().replace(u'\xa0', u' ').split('\n')
            # drop blank lines
            boilerpipe = '\n'.join(line for line in lines if not re.match(r'^\s*$', line))

        # TODO fix to split().  Counting characters currently 
        if boilerpipe and (len(boilerpipe) > .5 * len(clnd_text)):
            print ('\t\tUsing boilerplate')
            page_texts.append(boilerpipe)
        else:
            print ('\t\tUsing clnd_text')
            page_texts.append(clnd_text)

    return '\n'.join(text for text in page_texts if text)

In [None]:
# regex test 
# runs a javascript-detection pattern over a sample string; the matches are kept in `regex` and not printed here
regex = re.findall(r"(CDATA|return\s+true|return\s+false|getelementbyid|function|\w+\(.*?\);|\w{2,}[\\.|:]+\w{2,}|'\w+':\s+'\w+|\\|{|}|\r|\n|\/\/')", 
                   "CDATA function contact-us getelementbyid javascript.function linker:autoLink www.littlekidsinc.com fxnCall(param.param); email@dextr.us 'type': 'image' return true return false rev7bynlh\\u00252bvcgrjg\\ {height}") # last part is words sequences separated by punct

In [None]:
# fetch and print the combined text of the test firm's ordered about urls
test_site_text = get_content (test_urls)
print (test_site_text)

In [None]:
# run process_firm and write to file
for firm_name in firm_names: 
    print ("Working on " + firm_name)
    about_urls = get_ordered_about_urls(firm_name)
    if not about_urls:
        print ("\tCouldn't find any urls for firm!")
        # Fall back to the firm's pages that are not flagged as about pages
        # (is_about_ftr == 0).  `combined` is indexed by (firm_name, url) and
        # carries is_about_ftr; the previous lookup used df_predict, which has
        # no such column, and appended to a value that may be None.
        firm_urls = combined.xs(firm_name, level='firm_name')
        home_page = firm_urls.loc[firm_urls['is_about_ftr'] == 0]
        about_urls = list(home_page.index)
        
    about_text = get_content (about_urls)
    
    if about_text: 
        firm_clnd = clean_firm_name(firm_name) # standard cleaning code throughout project
        about_clnd = strip_firm_name (firm_name, about_text)
        # '/' cannot appear in a file name; replace it with '|'
        file = firm_clnd.replace('/', '|') + '.txt'
        with io.open(DATA_DIR + file,'w',encoding='utf8') as f:
            f.write (about_clnd)
    else:
        print ("\tCouldn't find any text for firm!")