# Article Crawler
Packages:
- selenium
- selenium-stealth

This is a web crawler that was tested on the landing pages of tech company diversity pages. 
It's intended to run naive but also has options for excluding specific url paths 
It crawls all pages 3 levels deep by default. 

Attempts to most relevant target hrefs within visible main content by: 
- excluding all from footer/head/header/nav
- only urls from domains with partial match to domain
- should also download pdfs when crawled
- excludes duplicates by checking urls, excluding parameters, checking for same html content

See comments for further details.

In [11]:
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium_stealth import stealth # https://github.com/diprajpatra/selenium-stealth
import time 
from datetime import datetime 
datetime.now().strftime('%d/%m/%Y %H:%M:%S')
import random
import pandas as pd

from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# enable browser logging
d = DesiredCapabilities.CHROME
d['loggingPrefs'] = { 'browser':'ALL' }

options = webdriver.ChromeOptions()
options.add_argument("--window-size=1920,1080")
options.add_argument("--headless") 

### gpu timeout fixes (https://stackoverflow.com/a/52340526/13079526) ###

options.add_argument("start-maximized")# https://stackoverflow.com/a/26283818/1689770
options.add_argument("enable-automation")#https://stackoverflow.com/a/43840128/1689770
options.add_argument("--no-sandbox")#https://stackoverflow.com/a/50725918/1689770
options.add_argument("--disable-dev-shm-usage")#https://stackoverflow.com/a/50725918/1689770
options.add_argument("--disable-browser-side-navigation")#https://stackoverflow.com/a/49123152/1689770
options.add_argument("--disable-gpu")#//https://stackoverflow.com/questions/51959986/how-to-solve-selenium-chromedriver-timed-out-receiving-message-from-renderer-exc

###

options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

driver = webdriver.Chrome(chrome_options=options, executable_path='/Users/am/chromedriver',desired_capabilities=d)

#see docs if we need to edit this (https://github.com/diprajpatra/selenium-stealth)
stealth(driver,
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        )

  driver = webdriver.Chrome(chrome_options=options, executable_path='/Users/am/chromedriver',desired_capabilities=d)
  driver = webdriver.Chrome(chrome_options=options, executable_path='/Users/am/chromedriver',desired_capabilities=d)


# Functions
- dget sleeps 3 secs, scrolls to bottom of page
- analyze_divpar, see: https://github.com/joephofhuis/DivPAR
- added additional logging structures for divanalyzer for debugging

In [12]:
import re
import json
#import logging

def dget(url,sleep_time=3):
    driver.get(str(url))
    time.sleep((float(sleep_time)/2))
    height = driver.execute_script("return document.documentElement.scrollHeight")
    driver.execute_script("window.scrollTo(0, " + str(height) + ");")
    time.sleep((float(sleep_time)/2))    

#### This runs automated coding on any body of text ####
# this function is abstracted from (https://github.com/joephofhuis/DivPAR/tree/main/src/analysis) and uses the searchstrings.json therein 
# commented out lines for handling text files from previous software
# added verbosity and temporary log output
def analyze_divpar(document, searchstrings_json_path, phraseoffset=4):
    regexes = {k: [re.compile(v2, re.I) for v2 in v] for k,v in json.load(open(searchstrings_json_path)).items()}
    frames = ['marketperspective', 'moralperspective', 'innovationperspective']
    #allfiles =  glob('../../data/raw-private/**/**/*.txt.gz')
    #outputfile = 'automatedcoding.csv'

    text = document.replace("\n", " ").replace("\r", " ")
    text = " ".join(text.split())
    phrases = text.split('.')

    #with open(outputfile, mode='w', newline='') as fo:
        #fieldnames = ['country', 'year', 'company', 'marketperspective', 'moralperspective','innovationperspective']
        #writer = csv.DictWriter(fo, fieldnames=fieldnames)
        #writer.writeheader()
        #for f in tqdm(allfiles):
        #    document = gzip.open(f, mode='rt').read()
    matches_form = {}
    # Create output log dict
    end_log={
        'analyzer_event_id':[] 
        ,'analyzer_event_type':[]
        ,'frame':[]
        , 'phrase':[]
        , 'trigger_regex':[]
        , 'triggers':[]
        , 'phrase_count':[]
        , 'match_tried':[]
        , 'match':[]
        , 'analyzer_event_type:':[]
        }
    
    for frame in frames:
        matches_form[frame] = 0
        phrase_count = 0
        trigger = None
        for index,phrase in enumerate(phrases):
            if not trigger:
                triggers=set()
                for trigger_regex in regexes['trigger']:
                    trigger = any(trigger_regex.finditer(phrase))
                    triggers.update(trigger_regex.findall(phrase))
                    #end_log.append('(*) ~Trying trigger~ "[{}]" with phrase: "[{}]" for frame: "[{}]. Trying next trigger…"'.format(trigger_regex,phrase,frame))
                    #end_log['analyzer_event_id'].append(index)
                    #end_log['analyzer_event_type'].append('trig-try')
                    #end_log['frame'].append(frame)
                    #end_log['phrase'].append(phrase)
                    #end_log['trigger_regex'].append(trigger_regex.pattern)
                    #end_log['triggers'].append(triggers)
                    #end_log['phrase_count'].append(phrase_count)
                    #end_log['match'].append(None)
                    #end_log['match_tried'].append(None)
                #end_log.append('(\X/) _No Triggers for phrase: "[{}]" for frame: "[{}]". Trying next phrase…'.format(trigger_regex,phrase,frame))
                #end_log['analyzer_event_id'].append(index)
                #end_log['analyzer_event_type'].append('no-trigs')
                #end_log['frame'].append(frame)
                #end_log['phrase'].append(phrase)
                #end_log['trigger_regex'].append(trigger_regex.pattern)
                #end_log['triggers'].append(triggers)
                #end_log['phrase_count'].append(phrase_count)
                #end_log['match'].append(None)
                #end_log['match_tried'].append(None)

            if trigger:
                phrase_count += 1
                #end_log.append('((!))) |||Found Trigger||| "[{}]" for phrase: "[{}]" for frame: "[{}]". Searching nearby terms'.format(trigger_regex,phrase,frame))
                end_log['analyzer_event_id'].append(index)
                end_log['analyzer_event_type'].append('trig-found')
                end_log['frame'].append(frame)
                end_log['phrase'].append(phrase)
                end_log['trigger_regex'].append(trigger_regex.pattern)
                end_log['triggers'].append(triggers)
                end_log['phrase_count'].append(phrase_count)
                end_log['match'].append(None)
                end_log['match_tried'].append(None)

                for rex in regexes[frame]:
                    #end_log.append('((!))) Frame: "[{}]"|||Searching for additional terms in phrase: "[{}]". Using Regex: "[{}]"'.format(frame,phrase,rex))
                    #end_log['analyzer_event_id'].append(index)
                    #end_log['analyzer_event_type'].append('validate-found-trig')
                    #end_log['frame'].append(frame)
                    #end_log['phrase'].append(phrase)
                    #end_log['trigger_regex'].append(trigger_regex.pattern)
                    #end_log['triggers'].append(triggers)
                    #end_log['phrase_count'].append(phrase_count)
                    #end_log['match_tried'].append(rex)
                    #end_log['match'].append(None)
                    match = rex.findall(phrase)
                    if match:
                        #logging.warning("** MATCH FOR " + frame + " **\n")
                        #logging.warning("TRIGGER \n%s" % trigger)
                        #logging.warning("MATCH USED:\n%s" % rex)
                        #logging.warning("PHRASE WE ARE ANALYSING:\n%s" % phrase)
                        #logging.warning("MATCH FOUND IN PHRASE:\n%s\n\n" % match)
                        trigger = None  #resets trigger before breaking so loop can restart. unnecesary in this function
                        matches_form[frame] += 1
                        #end_log.append('(√√√) [{}]:|||Trigger>>Phrase>>Match "[{}]">"[{}]" for frame: "[{}]" + 1 Frame Count'.format(frame,trigger_regex,phrase,match))
                        end_log['analyzer_event_id'].append(index)
                        end_log['analyzer_event_type'].append('match-counted')
                        end_log['frame'].append(frame)
                        end_log['phrase'].append(phrase)
                        end_log['trigger_regex'].append(trigger_regex.pattern)
                        end_log['triggers'].append(triggers)
                        end_log['phrase_count'].append(phrase_count)
                        end_log['match_tried'].append(rex)
                        end_log['match'].append(match)
                        break
                #end_log.append('(X) [{}]:|||No Matches Found "[{}]">"[{}]" for frame: "[{}]" + 1 Frame Count'.format(frame,trigger_regex,phrase,match))
                end_log['analyzer_event_id'].append(index)
                end_log['analyzer_event_type'].append('trig-not-valid')
                end_log['frame'].append(frame)
                end_log['phrase'].append(phrase)
                end_log['trigger_regex'].append(trigger_regex.pattern)
                end_log['triggers'].append(triggers)
                end_log['phrase_count'].append(phrase_count)
                end_log['match_tried'].append(rex)
                end_log['match'].append(match)
                if phrase_count > phraseoffset:
                    #end_log.append('Phrase count exceeded phrase offset of {} | Frame>Trigger>Phrase>Match: {}>{}>{}>{}\nMoving on to next phrase'.format(phraseoffset,frame,trigger_regex,phrase,match))
                    end_log['analyzer_event_id'].append(index)
                    end_log['analyzer_event_type'].append('exceeded-phraseoffset')
                    end_log['frame'].append(frame)
                    end_log['phrase'].append(phrase)
                    end_log['trigger_regex'].append(trigger_regex.pattern)
                    end_log['triggers'].append(triggers)
                    end_log['phrase_count'].append(phrase_count)
                    end_log['match_tried'].append(rex)
                    end_log['match'].append(match)
                    phrase_count = 0
                    trigger = None
            #print(f)
            #print(matches_form)
            #print('\n\n\n')
            
            #country, company, year = os.path.split(f)[1].split('_')
            #matches_form['country'] = country
            #matches_form['year'] = int(year[:4])
            #matches_form['company'] = company
            #writer.writerow(matches_form)
        
        
    return [matches_form,end_log] #The logging is all placeholder for now: Use actual data structures next


# Nested Loops, 3 Crawl Levels
- Crawls articles and hrefs therein 3 levels deep
- Analyzes as it crawls/scrapes
- Constructs dicts from divanalyzer logging, the scraped results, and the divanalyzer scores. 
- Optional CSV handler instead of urls dict therein

To dos: 
- abstract crawling to function so can be contained with `while depth <= i`

In [18]:
import pandas as pd
from bs4 import BeautifulSoup
from htmldate import find_date
import re
import csv
import gzip
import json
import os
#from tqdm import tqdm
from glob import glob

urls= {
    'diversity.fb.com':['https://diversity.fb.com/','2021'],
    #'salesforce.com/company/equality':['https://www.salesforce.com/company/equality/','2021'],
    #'newsroom.pinterest.com/en/post/pinterest-2021-inclusion-diversity-report':['https://newsroom.pinterest.com/en/post/pinterest-2021-inclusion-diversity-report','2021'],
    #'apple.com/diversity':['https://www.apple.com/diversity/','2021'],
    #'airbnb.com/resources/hosting-homes/t/diversity-inclusion-41':['https://www.airbnb.com/resources/hosting-homes/t/diversity-inclusion-41','2021'], # should we be using this instead? (https://news.airbnb.com/an-update-on-diversity-and-belonging-at-airbnb/)
    'aboutamazon.com/workplace/diversity-inclusion':['https://www.aboutamazon.com/workplace/diversity-inclusion','2021'],
    #'news.linkedin.com/2021/october/2021-workforce-diversity-report':['https://news.linkedin.com/2021/october/2021-workforce-diversity-report','2021']
}

### Optional CSV input handler in place of above urls dict, Structure your .CSV like: ###
## example_page_name, full_url, content_year
# e.g.: diversity.fb, https://diversity.fb.com/, 2021

## Constructs the same dict like urls above from .CSV input
#import csv
#with open('/Users/am/tst.csv') as file:
#    urls={i.split(',')[0]:[s.strip() for s in i.split(',')[1:3]] for i in file}


## Table for all scraped data and divanalysis scores
data = {'company':[]
        ,'page':[]
        ,'root_report_date':[]
        ,'date':[]
        ,'marketperspective':[]
        ,'moralperspective':[]
        ,'innovationperspective':[]
        ,'word_count_alnum':[]
        ,'word_count':[]
        , 'total_links':[]
        , 'document_text':[]
    }

#### Excluded Terms List #### 
## If you observe the scraper wasting time on any specific directory types, you can nip them by adding to the excluded terms list. This will ensure no urls with these terms in them are crawled.
# e.g. Here linkedin, links leading to social network part of their site, which led to crawling dozens of irrelevant Linkedin profiles.
excluded_terms = ['linkedin.com/in/','linkedin.com/company/','login','signup','/privacy','redirect','/_created/','/_shop/']


# list for duplicate text body handler #
htmls=[]

# Crawler 3 depth or can be limited to 2 depth #
limit = False

divpar_analysis_outputs = []

## Todo: ##
# - functionalize parts so that it is no longer a nested loop, but can run like: `while depth <= n`
#def bget(url):
#    return [all_hrefs, all_text]

for url,date in urls.values():
    print("first loop for ",url)
    ## Access website and initialize soup object with selenium driver's page source
    print('Accessing: ',url)
    dget(url)
    domain = ''.join(url.split('/')[2].split('.')[:-1])
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    #remove unwanted sections of sites
    [el.decompose() for el in soup.find_all('head')+soup.find_all('header')+soup.find_all('footer')+soup.find_all('nav')]
    #build list remaining hrefs using subdomain any-matching and joining relative links to domain
    all_hrefs = {el.get('href','').split('?')[0].split('#')[0] for el in soup.find_all(['a']) if 'http' in el.get('href','') and len([s for s in el.get('href','') if s == '/'])>=2 and any(match in url.split('/')[2].split('.')[:-1] for match in el.get('href','').split('/')[2].split('.')[:-1] if match != 'www') == True}
    relative_links = ['/'.join(url.split('/')[:3])+relative_link.split('?')[0].split('#')[0] for relative_link in {el.get('href','') for el in soup.find_all(['a']) if not any(substring in el.get('href','') for substring in ['#','http','mailto:','.zip','javascript:void(0)'])}]
    all_hrefs.update(relative_links)
    all_hrefs = {href for href in all_hrefs if href not in data['page']+[val[0] for val in urls.values()] and not any(substring in href for substring in excluded_terms)}
    #join remaining text body
    all_text = ' '.join([b.get_text(separator='\n') for b in soup.find_all('body')])
    
    ## Run divpar analysis function store data ##
    # Using joephofhuis's DivPAR analysis.py, searchstrings.json: https://github.com/joephofhuis/DivPAR/blob/main/src/analysis/analysis.py
    divpar_analysis = analyze_divpar(all_text, '/Users/am/DivPAR-main/src/analysis/searchstrings.json',4)
    divpar_coding = divpar_analysis[0]
    divpar_analysis[1]['company'] = [domain for n in divpar_analysis[1]['analyzer_event_id']]
    divpar_analysis[1]['page'] = ['root' for n in divpar_analysis[1]['analyzer_event_id']]
    divpar_analysis_outputs.append(divpar_analysis[1]) # getting data dicts for each divpar analysis run
    data['company'].append(domain)
    data['page'].append('url')
    data['date'].append(date)
    data['root_report_date'].append(date) ## This is the 1st level crawl of root pages, the date of these is defined by the user's input .CSV or urls dict ##
    data['marketperspective'].append(divpar_coding['marketperspective'])
    data['moralperspective'].append(divpar_coding['moralperspective'])
    data['innovationperspective'].append(divpar_coding['innovationperspective'])
    data['word_count_alnum'].append(len([text for text in all_text.split(' ') if text.isalnum()])) #alphanumeric word count
    data['word_count'].append(len([text for text in all_text.split(' ')])) #split on whitespace word count
    data['total_links'].append(len(all_hrefs))
    data['document_text'].append(all_text)
    htmls.append(html)
    
    ## Debug (For testing a few urls from the first level crawl) ##
   # all_hrefs = ['https://www.apple.com/newsroom/2021/03/apples-worldwide-developers-conference-is-back-in-its-all-online-format/',
 #'https://www.apple.com/newsroom/2021/03/australian-primary-school-drives-innovation-and-creativity-with-ipad/','https://www.apple.com/diversity/pdf/2020-Final-Certified-Apple-EEO-1-Report.pdf','https://www.apple.com/newsroom/2022/03/apples-coda-wins-historic-oscar-for-best-picture-at-the-academy-awards/'] #debug

        ####### Hrefs Loop ########

    for i,href in enumerate(href for href in all_hrefs if len([s for s in href if s == '/'])>=2): #double check if this rule is neccesary anymore
        #print(i,'\t',href) #debug

        ## Access href and initialize soup object with selenium driver's page source
        href_domain = ''.join(href.split('/')[2].split('.')[:-1])
        href_url = '/'.join(href.split('/')[2:])
        print('{}: [{} of {}] Accessing {} '.format(href,i,len([href for href in all_hrefs if len([s for s in href if s == '/'])>=2]),href))
        try:
            dget(href)
        except Exception as e:
            print("GET Error: {}".format(e))
            
            data['company'].append(domain)
            data['page'].append(href)
            data['date'].append(None)
            data['root_report_date'].append(date)
            data['marketperspective'].append(None)
            data['moralperspective'].append(None)
            data['innovationperspective'].append(None)
            data['word_count_alnum'].append(None) #alphanumeric word count
            data['word_count'].append(None) #split on whitespace word count
            data['total_links'].append(None)
            data['document_text'].append(None)
            htmls.append(html)
            continue

        html = driver.page_source
        if html in htmls: # if the page source has not changed or if is duplicate, resets loop and outputs Nones
            html = None
            print("Driver Could Not Load Page or Duplicate Text Body Found - Check URL")
            
            data['company'].append(domain)
            data['page'].append(href)
            data['date'].append(None)
            data['root_report_date'].append(date)
            data['marketperspective'].append(None)
            data['moralperspective'].append(None)
            data['innovationperspective'].append(None)
            data['word_count_alnum'].append(None) #alphanumeric word count
            data['word_count'].append(None) #split on whitespace word count
            data['total_links'].append(None)
            data['document_text'].append(None)
            htmls.append(html)
            continue
            
        soup = BeautifulSoup(html, "html.parser")

        #remove unwanted sections of sites
        [el.decompose() for el in soup.find_all('head')+soup.find_all('header')+soup.find_all('footer')+soup.find_all('nav')]
        #build list of remaining hrefs using subdomain any-matching and joining relative links to domain
        try:
            all_hrefs = {el.get('href','').split('?')[0].split('#')[0] for el in soup.find_all(['a']) if 'http' in el.get('href','') and len([s for s in el.get('href','') if s == '/'])>=2 and any(match in href.split('/')[2].split('.')[:-1] for match in el.get('href','').split('/')[2].split('.')[:-1] if match != 'www') == True}
            relative_links = ['/'.join(href.split('/')[:3])+relative_link.split('?')[0].split('#')[0] for relative_link in {el.get('href','') for el in soup.find_all(['a']) if not any(substring in el.get('href','') for substring in ['#','http','mailto:','.zip','javascript:void(0)'])}]
            all_hrefs.update(relative_links)
            all_hrefs = {href for href in all_hrefs if href not in data['page']+[val[0] for val in urls.values()] and not any(substring in href for substring in excluded_terms)}
        except Exception as e:
            print("Href Error: {} ".format(e))
        #join remaining text body
        all_text = ' '.join([b.get_text(separator='\n') for b in soup.find_all('body')])
        ## Use use year from url, if none, use htmldate to get best guess of article date (see: https://github.com/adbar/htmldate)
        article_year = [re.findall('(?<=[^0-9])[1-2][0-9]{3}(?=[^0-9]|$)', href) or [None]][0][0]
        #use htmldate to get next best guess of article date (see: https://github.com/adbar/htmldate)
        if article_year is None:
            try:
                article_year = find_date(html)[:4] #remove [:4] for full date
            except Exception as e:
                print('No year found or htmldate failed… Check if content exists on this web page. Error [{}]'.format(e))
                article_year = None
        #run divpar analysis function abstracted from: https://github.com/joephofhuis/DivPAR/blob/main/src/analysis/analysis.py
        divpar_analysis = analyze_divpar(all_text, '/Users/am/DivPAR-main/src/analysis/searchstrings.json',4)
        divpar_coding = divpar_analysis[0]
        
        # adding company + page column to analysis outputs
        divpar_analysis[1]['company'] = [domain for n in divpar_analysis[1]['analyzer_event_id']]
        divpar_analysis[1]['page'] = [href for n in divpar_analysis[1]['analyzer_event_id']]
        divpar_analysis_outputs.append(divpar_analysis[1]) # getting data dicts for each divpar analysis run
        
        data['company'].append(domain)
        data['page'].append(href)
        data['date'].append(article_year)
        data['root_report_date'].append(date)
        data['marketperspective'].append(divpar_coding['marketperspective'])
        data['moralperspective'].append(divpar_coding['moralperspective'])
        data['innovationperspective'].append(divpar_coding['innovationperspective'])
        data['word_count_alnum'].append(len([text for text in all_text.split(' ') if text.isalnum()])) #alphanumeric word count
        data['word_count'].append(len([text for text in all_text.split(' ') if text.isalnum()])) #split on whitespace word count
        data['total_links'].append(len(all_hrefs))
        data['document_text'].append(all_text)
        htmls.append(html)

            ####### 3rd level loop ######## ## remove the docttext filter if need to crawl a 4th level, it's neccesary at this level, but we need to build it so that it doesn't stop future crawling which we can observe by adding it above. 
        if limit == True:
            break
        else:
            for i,href in enumerate(href for href in all_hrefs if len([s for s in href if s == '/'])>=2): #double check if this rule is neccesary anymore
                #print(i,'\t',href) #debug

                ## Access href and initialize soup object with selenium driver's page source
                href_domain = ''.join(href.split('/')[2].split('.')[:-1])
                href_url = '/'.join(href.split('/')[2:])
                print('{}: [{} of {}] Accessing {} '.format(href,i,len([href for href in all_hrefs if len([s for s in href if s == '/'])>=2]),href))
                if href in data['page']:
                    print('Duplicate Href, Skipping Crawl')
                    continue
                try:
                    dget(href)
                except Exception as e:
                    print("GET Error: {}".format(e))
                    
                    data['company'].append(domain)
                    data['page'].append(href)
                    data['date'].append(None)
                    data['root_report_date'].append(date)
                    data['marketperspective'].append(None)
                    data['moralperspective'].append(None)
                    data['innovationperspective'].append(None)
                    data['word_count_alnum'].append(None) #alphanumeric word count
                    data['word_count'].append(None) #split on whitespace word count
                    data['total_links'].append(None)
                    data['document_text'].append(None)
                    htmls.append(html)
                    continue
                html = driver.page_source
                
                if html in htmls: # if the page source has not changed or if is duplicate, resets loop and outputs Nones (this happens to handle opening pdf pages in --headless mode well by spotting that the driver page source has not changed. This intially caused duplicate text entries since last page source is unchanged thus logged a second time)
                    html = None
                    print("Driver Could Not Load Page or Duplicate Text Body Found - Check URL (if pdf, data went to downloads folder")
                    
                    data['company'].append(domain)
                    data['page'].append(href)
                    data['date'].append(None)
                    data['root_report_date'].append(date)
                    data['marketperspective'].append(None)
                    data['moralperspective'].append(None)
                    data['innovationperspective'].append(None)
                    data['word_count_alnum'].append(None) #alphanumeric word count
                    data['word_count'].append(None) #split on whitespace word count
                    data['total_links'].append(None)
                    data['document_text'].append(None)
                    htmls.append(html)
                    continue
                    
                soup = BeautifulSoup(html, "html.parser")

                #remove unwanted sections of sites
                [el.decompose() for el in soup.find_all('head')+soup.find_all('header')+soup.find_all('footer')+soup.find_all('nav')]
                #build list of remaining hrefs using subdomain any-matching and joining relative links to domain
                try:
                    all_hrefs = {el.get('href','').split('?')[0].split('#')[0] for el in soup.find_all(['a']) if 'http' in el.get('href','') and len([s for s in el.get('href','') if s == '/'])>=2 and any(match in href.split('/')[2].split('.')[:-1] for match in el.get('href','').split('/')[2].split('.')[:-1] if match != 'www') == True}
                    relative_links = ['/'.join(href.split('/')[:3])+relative_link.split('?')[0].split('#')[0] for relative_link in {el.get('href','') for el in soup.find_all(['a']) if not any(substring in el.get('href','') for substring in ['#','http','mailto:','.zip','javascript:void(0)'])}]
                    all_hrefs.update(relative_links)
                    all_hrefs = {href for href in all_hrefs if href not in data['page']+[val[0] for val in urls.values()] and not any(substring in href for substring in excluded_terms)}
                except Exception as e:
                    print("Href Error: {} ".format(e))
                #join remaining text body
                all_text = ' '.join([b.get_text(separator='\n') for b in soup.find_all('body')])

                if all_text in data['document_text']: # if the page source has not changed or if is duplicate, resets loop and outputs Nones
                    html = None
                    print("Found duplicate text body from alternate url, skipping to next crawl…")
                    data['company'].append(domain)
                    data['page'].append(href)
                    data['date'].append(None)
                    data['root_report_date'].append(date)
                    data['marketperspective'].append(None)
                    data['moralperspective'].append(None)
                    data['innovationperspective'].append(None)
                    data['word_count_alnum'].append(None) #alphanumeric word count
                    data['word_count'].append(None) #split on whitespace word count
                    data['total_links'].append(None)
                    data['document_text'].append('DUPLICATE')
                    htmls.append(html)
                    continue

                ## Use use year from url, if none, use htmldate to get best guess of article date (see: https://github.com/adbar/htmldate)
                article_year = [re.findall('(?<=[^0-9])[1-2][0-9]{3}(?=[^0-9]|$)', href) or [None]][0][0]
                #use htmldate to get next best guess of article date (see: https://github.com/adbar/htmldate)
                if article_year is None:
                    try:
                        article_year = find_date(html)[:4] #remove [:4] for full date
                    except Exception as e:
                        print('No year found or htmldate failed… Check if content exists on this web page. Error [{}]'.format(e))
                        article_year = None
                #run divpar analysis function abstracted from: https://github.com/joephofhuis/DivPAR/blob/main/src/analysis/analysis.py
                divpar_analysis = analyze_divpar(all_text, '/Users/am/DivPAR-main/src/analysis/searchstrings.json',4)
                divpar_coding = divpar_analysis[0]
                
                # adding company + page column to analysis outputs
                divpar_analysis[1]['company'] = [domain for n in divpar_analysis[1]['analyzer_event_id']]
                divpar_analysis[1]['page'] = [href for n in divpar_analysis[1]['analyzer_event_id']]
                divpar_analysis_outputs.append(divpar_analysis[1]) # getting data dicts for each divpar analysis run
                
                data['company'].append(domain)
                data['page'].append(href)
                data['date'].append(article_year)
                data['root_report_date'].append(date)
                data['marketperspective'].append(divpar_coding['marketperspective'])
                data['moralperspective'].append(divpar_coding['moralperspective'])
                data['innovationperspective'].append(divpar_coding['innovationperspective'])
                data['word_count_alnum'].append(len([text for text in all_text.split(' ') if text.isalnum()])) #alphanumeric word count
                data['word_count'].append(len([text for text in all_text.split(' ') if text.isalnum()])) #split on whitespace word count
                data['total_links'].append(len(all_hrefs))
                data['document_text'].append(all_text)
                htmls.append(html)
                
            

first loop for  https://diversity.fb.com/
Accessing:  https://diversity.fb.com/
https://diversity.fb.com/./initiatives/in-our-communities/: [0 of 5] Accessing https://diversity.fb.com/./initiatives/in-our-communities/ 
https://about.fb.com/wp-content/uploads/2020/07/Civil-Rights-Audit-Final-Report.pdf: [0 of 14] Accessing https://about.fb.com/wp-content/uploads/2020/07/Civil-Rights-Audit-Final-Report.pdf 
Driver Could Not Load Page or Duplicate Text Body Found - Check URL (if pdf, data went to downloads folder
https://diversity.fb.com/initiative/codefwd/: [1 of 14] Accessing https://diversity.fb.com/initiative/codefwd/ 
https://diversity.fb.com/initiative/tech-prep/: [2 of 3] Accessing https://diversity.fb.com/initiative/tech-prep/ 
https://connectivity.fb.com/: [3 of 2] Accessing https://connectivity.fb.com/ 
https://diversity.fb.com/initiative/facebook-university/: [4 of 14] Accessing https://diversity.fb.com/initiative/facebook-university/ 
https://diversity.fb.com/initiative/boost-

# Results From Crawl/Analysis

In [26]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

### Constructs dataframe from all analyzed articles ###
df = pd.DataFrame.from_dict(data)  # use when new pull completed
# Exports that to .CSV
out_path = str(os.getcwd())+"/"+str(datetime.now())+"_DivCrawl_output.csv"
df.to_csv(out_path,index=False)

## View Which .pdfs found in hrefs by crawler ##
linked_pdf_df = df[df['page'].str.contains('pdf')][df.columns[:-1]]

## View results of some company's articles by year
#df[df['company'].str.contains('apple') & df['date'].str.contains('2022')][df.columns][:-2]

### View Only Articles w/ Matches DF 
with_matches_df = df[(df['moralperspective']>0) | (df['marketperspective']>0) | (df['innovationperspective']>0)].drop_duplicates()

### Totals by Company
grouped_companies_df = df[df.columns[:-1]].groupby(['company']).sum()
#df[df.columns[:-1]].drop_duplicates()#.groupby(['company']).sum()


# View dataframe without text column (due to size)
#df[df.columns[:-1]]
grouped_companies_df

Unnamed: 0_level_0,marketperspective,moralperspective,innovationperspective,word_count_alnum,word_count,total_links
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
diversityfb,0.0,8.0,0.0,18423.0,18561.0,386.0
wwwaboutamazon,3.0,46.0,5.0,83081.0,83378.0,1262.0


# DivPAR Coded Articles Dataframes

In [29]:
## Divpar analysis output logger dataframes and csv exports

## Merges all analysis output dicts into one dict ##
d0={k:[] for k in divpar_analysis_outputs[0]}
for output in [output for output in divpar_analysis_outputs if output is not None]:
    for k,v in output.items():
        d0[k]+=v
## Turns divanalysis logs into df
analysis_logs_df = pd.DataFrame.from_dict(d0, orient='index').transpose()

## View Analysis logs Where Match Counted ## 
analysis_logs_matches_df = analysis_logs_df[analysis_logs_df['analyzer_event_type']=='match-counted']
analysis_logs_matches_df.head(50)
# Exports that to .CSV
out_path = str(os.getcwd())+"/"+str(datetime.now())+"_divanalyze_matches_log.csv"
analysis_logs_matches_df.to_csv(out_path,index=False)
# View Analysis logs where trigger found but not validated by a Frame perspective
analysis_logs_invalid_trigs_df = analysis_logs_df[analysis_logs_df['analyzer_event_type']=='trig-not-valid']
# Exports that to .CSV
out_path = str(os.getcwd())+"/"+str(datetime.now())+"_divanalyze_invalid_trigs_log.csv"
analysis_logs_invalid_trigs_df.to_csv(out_path,index=False)


analysis_logs_matches_df.head()

Unnamed: 0,analyzer_event_id,analyzer_event_type,frame,phrase,trigger_regex,triggers,phrase_count,match_tried,match,analyzer_event_type:,company,page
89,18,match-counted,moralperspective,Representation matters and we are committed to creating tech teams that match the diversity of the world,\bdiversity,"{(diversity , , ), diversity}",1,"re.compile('(creat(e){0,1}(ing){0,1}|improv(e){0,1}(ing){0,1}) (\\s*\\w+\\s*\\W\\s*){0,5}(diversity|cultural diversity|diverse workforce)', re.IGNORECASE)","[(creating, , ing, , , the , diversity)]",,diversityfb,https://diversity.fb.com/./initiatives/in-our-communities/
244,18,match-counted,moralperspective,Supplier Diversity Program Manager I’m a program manager within Facebook’s finance organization focused on Supplier Diversity,\bdiversity,"{Diversity, (Diversity , , ), (Diversity, , )}",1,"re.compile('(diversity|inclusion)(\\s*\\w+\\s*\\W\\s*){0,5}(guideline(s){0,1}|program(s){0,1}|benchmark(s){0,1}|criteria|criterion)', re.IGNORECASE)","[(Diversity, a , program, , , )]",,diversityfb,https://diversity.fb.com/initiative/supplier-diversity-program/
333,3,match-counted,moralperspective,"Visit Site Externs can be matched with one of the following eight Facebook teams: Finance Operations Diversity and Inclusion (HR) Recruiting Programs and Operations IT Service Desk Media Partnerships Information Security Facilities Operations Communications For full descriptions of the externship opportunities within each team and more information about qualifying for the program, visit the Facebook Summer Academy website",\bdiversity,"{Diversity and Inclusion, Diversity, (Diversity , , )}",1,"re.compile('(diversity|inclusion)(\\s*\\w+\\s*\\W\\s*){0,5}(guideline(s){0,1}|program(s){0,1}|benchmark(s){0,1}|criteria|criterion)', re.IGNORECASE)","[(Diversity, Recruiting , Programs, , s, )]",,diversityfb,https://diversity.fb.com/initiative/facebook-summer-academy/
369,2,match-counted,moralperspective,"During this course, participants will learn about diversity and inclusion-related strategies and tactics to: Help All Team Members Feel Valued Participants learn how to create a sense of belonging for all team members, particularly those from underrepresented groups",\bdiversity,"{(diversity , , ), diversity, diversity and inclusion}",1,"re.compile('(diverse composition|fair balance|composition is appropriate|feel respected|feel valued)', re.IGNORECASE)",[Feel Valued],,diversityfb,https://diversity.fb.com/initiative/managing-inclusion/
558,2,match-counted,moralperspective,"Since 2014, we’ve publicly reported Facebook’s diversity metrics and shared our plans to better support communities of color, women, members of the LGBTQ+ community and others",\bdiversity,"{(different, , , , , , , backgrounds, , s, , ), Diversity, (Diversity , , )}",3,"re.compile('(diversity|inclusion)(\\s*\\w+\\s*\\W\\s*){0,5}(program|standard(s){0,1}legislation|regulation|policy|policies|plan(s){0,1})', re.IGNORECASE)","[(diversity, our , plans, , s)]",,diversityfb,https://about.fb.com/news/2021/07/facebook-diversity-report-2021/
