In [131]:
#%pip install tldextract
import json
import os
import time
from urllib.parse import urlencode

import tldextract
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [132]:
def _get_company_data(file_path="../assets/company_data.txt"):
    # Read the flat file using a context manager
    with open(file_path, "r") as f:
        data = f.readlines()

    source_data = []
    no_url = 0

    for counter, row in enumerate(data):
        if counter > 0:  # Skip the header row
            row = row.strip().split("\t")
            company = {
                'company_name': row[0],
                'legal_name': row[1],
                'url': row[2]
            }
            if not company['url']:
                no_url += 1
            source_data.append(company)

    print(f"Number of companies: {len(source_data)}")
    print(f"Number of companies without URL: {no_url}")

    url_oriented_data = {}
    for company in source_data:
        url = company['url']
        tld = _extract_tld(url)
        if tld:
            if tld not in url_oriented_data:
                url_oriented_data[tld] = {}
                url_oriented_data[tld]['companies'] = []
            url_oriented_data[tld]['companies'].append(company)


    print(f"Number of companies with unique URL: {len(url_oriented_data)}")
    return url_oriented_data



def _extract_tld(url):
    if url is None or url == "":
        return None
    
    if not url.startswith("http"):
        url = f"https://{url}"
        
    extracted = tldextract.extract(url)
    # Combine the domain and suffix (TLD)
    tld = f"{extracted.domain}.{extracted.suffix}"
    return tld

In [133]:
def setup():
    """Create one asset directory per company domain and seed its JSON file.

    For every domain returned by ``_get_company_data`` the directory
    ``../assets/<domain>`` is created if needed, and ``company_data.json`` is
    written into it only when that file does not exist yet, so existing files
    are never overwritten.
    """
    company_data = _get_company_data()
    print(f"found {len(company_data)} viable company URLs")

    created = 0
    for domain, record in company_data.items():
        # The directory may already exist from an earlier run.
        os.makedirs(f"../assets/{domain}", exist_ok=True)
        target = f"../assets/{domain}/company_data.json"
        if os.path.exists(target):
            continue
        with open(target, "w") as out:
            out.write(json.dumps(record))
        created += 1

    print(f"Wrote {created} new company data files")


In [134]:
def process_url(data, driver):
    """Run the Google site search for the first company stored in ``data``.

    Args:
        data: dict with a ``"companies"`` list; the ``'url'`` of its first
            entry is reduced to a domain with ``_extract_tld``.
        driver: browser driver object forwarded to ``query_google``.

    Returns:
        The log dict returned by ``query_google``.
    """
    print(data)
    first_company = data["companies"][0]
    domain = _extract_tld(first_company['url'])
    result = query_google(domain, driver)
    return result


def query_google(url, driver, search_terms="family maternity leave paternity HR policy", wait_seconds=15):
    """Load a Google search restricted to ``url`` and capture the result page.

    Args:
        url: Domain used in the ``site:`` filter (e.g. ``"example.com"``).
        driver: Object exposing ``get(address)`` and a ``page_source``
            attribute (a Selenium WebDriver in this notebook).
        search_terms: Space-separated search terms placed before the
            ``site:`` filter.
        wait_seconds: Seconds to sleep after navigation before reading the
            page source.

    Returns:
        dict with ``"timestamp"`` (``time.time()`` at call start) plus, on
        success, ``"url"`` (the Google address that was loaded) and
        ``"page_source"``. On failure it contains ``"error"`` with the
        exception text; ``"url"`` is present only if the address was built
        before the failure.
    """
    log = {}
    log["timestamp"] = time.time()
    try:
        if not url:
            # Fail explicitly rather than searching for "site:None".
            raise ValueError("no domain available for the site: filter")

        query = f"{search_terms} site:{url}"
        # urlencode escapes every parameter; for ordinary domains the result is
        # the same address the hand-built string produced (spaces -> "+", ":" -> "%3A").
        goog_url = "https://www.google.com/search?" + urlencode(
            {"q": query, "oq": query, "sourceid": "chrome", "ie": "UTF-8"}
        )
        log["url"] = goog_url
        driver.get(goog_url)

        # Wait for results to load
        time.sleep(wait_seconds)
        print("Results loaded")
        log["page_source"] = driver.page_source
    except Exception as e:
        print(f"An error occurred while querying Google: {e}")
        log["error"] = str(e)
    return log


In [137]:
def process(assets_dir="../assets", driver_path="./chromedriver"):
    """Crawl every pending ``company_data.json`` found under ``assets_dir``.

    A file is pending unless its ``"status"`` is ``"complete"``. For each
    pending file the result of ``process_url`` is appended to its ``"crawl"``
    list. The status is set to ``"complete"`` only when that result has no
    ``"error"`` key, so failed crawls are picked up again on the next run.

    Args:
        assets_dir: Directory tree to walk for ``company_data.json`` files.
        driver_path: Path to the chromedriver executable.
    """
    service = Service(executable_path=driver_path)

    # Initialize the Chrome driver with the Service object
    driver = webdriver.Chrome(service=service)

    try:
        for root, _dirs, files in os.walk(assets_dir):
            if "company_data.json" not in files:
                print("No company data file found")
                continue

            data_path = os.path.join(root, "company_data.json")
            try:
                # Read and close the file before it is rewritten below.
                with open(data_path, "r") as f:
                    data = json.load(f)

                if data.get("status") == "complete":
                    print("Already crawled")
                    continue

                response = process_url(data, driver)
                data.setdefault("crawl", []).append(response)
                if "error" not in response:
                    data["status"] = "complete"

                # Write to a sibling temp file and swap it in, so an interrupt
                # during the write cannot leave a truncated JSON file behind.
                tmp_path = data_path + ".tmp"
                with open(tmp_path, "w") as f:
                    json.dump(data, f)
                os.replace(tmp_path, data_path)
            except Exception as e:
                # Best effort: report the failure and move on to the next company.
                print(f"An error occurred while processing company data: {e}")
    finally:
        # Always close the browser, including on KeyboardInterrupt.
        driver.quit()


In [138]:
# Step 1: create ../assets/<domain>/ for each company domain and write its
# company_data.json if that file is not there yet.
setup()
# Step 2: walk ../assets and run the Google query for every company_data.json
# whose status is not "complete"; files already marked complete are skipped.
process()

Number of companies: 43893
Number of companies without URL: 21259
Number of companies with unique URL: 17804
found 17804 viable company URLs
Wrote 0 new company data files
No company data file found
Already crawled
Already crawled
Already crawled
Already crawled
Already crawled
Already crawled
{'companies': [{'company_name': 'DORCHESTER MINERALS  -LP', 'legal_name': 'Dorchester Minerals LP', 'url': 'www.dmlp.net'}]}
Results loaded
{'companies': [{'company_name': 'TELLABS INC', 'legal_name': 'Tellabs Inc', 'url': 'www.tellabs.com'}]}
Results loaded
{'companies': [{'company_name': 'AMB PROPERTY CORP', 'legal_name': 'AMB Property Corp', 'url': 'www.amb.com'}]}
Results loaded
{'companies': [{'company_name': 'ACD SYSTEMS INTL INC', 'legal_name': 'ACD Systems International Inc', 'url': 'www.acdsee.com'}]}
Results loaded
{'companies': [{'company_name': 'WESTBURY BANCORP INC', 'legal_name': 'Westbury Bancorp Inc', 'url': 'www.westburybankwi.com'}]}
Results loaded
{'companies': [{'company_name'

KeyboardInterrupt: 