In [1]:
# Create simple web measures, including number of pages and mean words per page
# Sanjay K Arora
# November 2018

# Input: A single collection
# Output: (Eventually) a file with firm name, (pipe delimited unique domains), total pages, total words, (and average words per page)

import pprint
import sys
import pprint
import pymongo
import csv
import pandas as pd
import re

In [11]:
# Setting up database connection information
CONNECTION_STRING = 'mongodb://localhost'
MONGODB_DB = "FirmDB_20181116"
TARGET_COLLECTION = "pages_COMBINED"
OUTF = "../../data/analysis/measures/simple_web_measures_v1.csv"

client = pymongo.MongoClient(CONNECTION_STRING)
db = client[MONGODB_DB]

pp = pprint.PrettyPrinter()

In [29]:
def get_pages ():
    target_col = db[TARGET_COLLECTION]
    pipeline = [ { "$match": { "firm_name" : { "$exists": "true", "$ne": "null" }} },
                { "$group": {"_id":"$firm_name" , "number":{"$sum":1}} } ]
    pages_by_firm_name = list(target_col.aggregate(pipeline))
    print ('Found ' + str(len(pages_by_firm_name)) + ' firm names with pages')
    return pages_by_firm_name

def get_num_words(name):
    target_col = db[TARGET_COLLECTION]
    results = target_col.find ({ "firm_name" : name })
    num_words = 0
    for result in results:
        print ('\tWorking on', result['url'][0])
        if 'full_text' in results:
            num_words += [len(t) for t in text for text in result['full_text']]
        # else continue
    return num_words

def print_measures():
    f_out = open(OUTF, 'w')
    csv_out = csv.writer(f_out)
    csv_out.writerow(['firm_name', 'num_pages'])
    for firm_name, m in measures.items():
        pages = m['pages']
        csv_out.writerow([firm_name, pages])
        
# standard firm cleaning regex
def clean_firm_name (firm):
    firm_clnd = re.sub('(\.|,| corporation| incorporated| llc| inc| international| gmbh| ltd)', '', firm, flags=re.IGNORECASE).rstrip()
    return firm_clnd

In [14]:
pages_by_firm_name = get_pages ()

Found 1186 firm names with pages


In [28]:
words_by_firm_name = {}
for rec in pages_by_firm_name:
    name = rec['_id'][0]
    print ('Working on', name)
    words_by_firm_name[name] = get_num_words(name) # not cleaned text, but probably a good proxy

Working on Shimadzu Corporation
	Working on  https://www.shimadzu.com/an/index.html


NameError: name 'full_text' is not defined

In [15]:
measures = {} # key is firm

# pp.pprint(pages_by_firm_name)
for rec in pages_by_firm_name:
    firm_name = clean_firm_name(rec['_id'][0])
    measures[firm_name] = {}
    measures[firm_name]['pages'] = int(rec['number'])
    measures[firm_name]['num_words'] = words_by_firm_name[rec]
# pp.pprint(measures)

In [16]:
print_measures()