In [1]:
import requests
from dotenv import load_dotenv
import os
load_dotenv()


from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

Set parameters

In [9]:
class BingSearchAPI():

    def __init__(self) -> None:

        # replace with list of keys once i run out
        self.subscription_key_free  = os.getenv("BING_SEARCH_API_KEY_FREE_1")
        self.subscription_key_paid = os.getenv("BING_SEARCH_API_KEY_PAID_1")
        self.headers = {"Ocp-Apim-Subscription-Key": self.subscription_key_free}             
            
        self.search_url = "https://api.bing.microsoft.com/v7.0/search"

        self.params = {
            "q": "Inser Query here",
            "count": 20,  # Number of search results to return
            "offset": 0,  # The offset for pagination
            "mkt": "en-US",  # Region
            "safesearch": "Moderate"  # Safe search filter
        }

        self.search_results = {}
        self.webpages = {}

    def search(self, query: str) -> dict:
        self.params["q"] = query
        response = requests.get(self.search_url, headers=self.headers, params=self.params)

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(f"Request failed: {e}")
            return "Bing Search has failed"
        
        self.search_results = response.json()

        # get dict of site names and urls
        for result in self.search_results['webPages']['value']:
            print("Result is ", result)
            print(result["siteName"])
            self.webpages[result["siteName"]] = result["url"]
    
        return self.search_results
    
    def get_top_webpages(self) -> list:
        return self.webpages
    

In [3]:
class WebScraper():
    def __init__(self) -> None:
        # Set up Chrome driver with webdriver manager
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')  # Run headless for no browser window
        self.options.add_argument('--disable-gpu')  # Disable GPU acceleration
        self.options.add_argument('--no-sandbox')  # Required for some Linux environments

        # Automatically download and use ChromeDriver
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=self.options)

        # for using requests
        self.requests_headers =  {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

    def extract_text_with_selenium(self,url):
        try:
            # Open the URL in the browser
            self.driver.get(url)
            # Let the page load
            time.sleep(2) 
            # Get page source and parse it after it has fully loaded
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            text = soup.get_text(separator='\n')
            return text
        except Exception as e:
            print(f"An error occurred: {e}")
            return "No content"
        
    def extract_text_with_requests(self,url):
        try:
            # Fetch the content from the URL with headers
            response = requests.get(url, headers=self.requests_headers)
            response.raise_for_status()  # Check if the request was successful

            # Parse text
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text(separator='\n')  # Using '\n' to preserve some structure
            return text
        except Exception as e:
            print(f"Error fetching the URL: {e}")
            return "No content"
        
    def scrape_webpage(self, url: str) -> str:
        # use both methods and return the one that works
        text_1 = self.extract_text_with_selenium(url)
        text_2 = self.extract_text_with_requests(url)

        if len(text_1) > len(text_2):
            return text_1
        else:
            return text_2

In [10]:
bing_searcher = BingSearchAPI()
site_scraper = WebScraper()

Test for one web page

In [11]:
r = bing_searcher.search("Dancing Goats Coffee ARR, LLC Registered Address")
urls = bing_searcher.get_top_webpages()
urls

Result is  {'id': 'https://api.bing.microsoft.com/api/v7/#WebPages.0', 'name': 'DANCING GOATS COFFEE ARR, LLC - Bizapedia', 'url': 'https://www.bizapedia.com/ga/dancing-goats-coffee-arr-llc.html', 'datePublished': '2024-08-16T00:00:00.0000000', 'datePublishedDisplayText': 'Aug 16, 2024', 'isFamilyFriendly': True, 'displayUrl': 'https://www.bizapedia.com/ga/dancing-goats-coffee-arr-llc.html', 'snippet': "DANCING GOATS COFFEE ARR, LLC. DANCING GOATS COFFEE ARR, LLC is a Georgia Domestic Limited-Liability Company filed on July 5, 2022. The company's filing status is listed as Active/Compliance and its File Number is 22146246. The Registered Agent on file for this company is Bob J Goldberg and is located at 1600 Parkwood Circle Suite 400, Atlanta ...", 'dateLastCrawled': '2024-09-27T18:07:00.0000000Z', 'language': 'en', 'isNavigational': True, 'richFacts': [{'label': {'text': 'Location'}, 'items': [{'text': 'WA'}], 'hint': {'text': 'ADDRESS:LOCATIONGENERAL'}}], 'noCache': False, 'siteName'

{'Bizapedia': 'https://www.bizapedia.com/addresses/200-market-st-ne-olympia-wa-98501.html',
 'OpenCorporates': 'https://opencorporates.com/companies/us_pa/2639070',
 'Georgia Company Directory': 'https://www.georgiacompanyregistry.com/companies/dancing-goats-coffee-arr-llc/',
 'Dun & Bradstreet': 'https://www.dnb.com/contact-directory/contact-profile.zoe.cb802a4479b3264f89a76231ec17f223.html',
 'Dancing Goats® Coffee': 'https://www.dancinggoats.com/pages/wholesale-info',
 'MenuPix': 'https://www.menupix.com/atlanta/restaurants/31628996/Dancing-Goats-Coffee-Bar-Atlanta-GA',
 'Buzzfile': 'https://www.buzzfile.com/business/Dancing-Goats-Coffee-Arr,-LLC-360-753-3391'}

In [22]:
r

{'_type': 'SearchResponse',
 'queryContext': {'originalQuery': 'Dancing Goats Coffee ARR, LLC Registered Address',
  'askUserForLocation': True},
 'webPages': {'webSearchUrl': 'https://www.bing.com/search?q=Dancing+Goats+Coffee+ARR%2c+LLC+Registered+Address',
  'totalEstimatedMatches': 147000,
  'value': [{'id': 'https://api.bing.microsoft.com/api/v7/#WebPages.0',
    'name': 'DANCING GOATS COFFEE ARR, LLC - Bizapedia',
    'url': 'https://www.bizapedia.com/ga/dancing-goats-coffee-arr-llc.html',
    'datePublished': '2024-08-16T00:00:00.0000000',
    'datePublishedDisplayText': 'Aug 16, 2024',
    'isFamilyFriendly': True,
    'displayUrl': 'https://www.bizapedia.com/ga/dancing-goats-coffee-arr-llc.html',
    'snippet': "DANCING GOATS COFFEE ARR, LLC. DANCING GOATS COFFEE ARR, LLC is a Georgia Domestic Limited-Liability Company filed on July 5, 2022. The company's filing status is listed as Active/Compliance and its File Number is 22146246. The Registered Agent on file for this company

In [16]:
website_info = ""
for key, value in urls.items():
    print(f"Contents of the website of {key} with url {value}")
    result = site_scraper.scrape_webpage(value)
    print(len(result))
    website_info = website_info + f"\n contents from website with name: {key}" + result

Contents of the website of Bizapedia with url https://www.bizapedia.com/addresses/200-market-st-ne-olympia-wa-98501.html
3699
Contents of the website of OpenCorporates with url https://opencorporates.com/companies/us_pa/2639070
2993
Contents of the website of Georgia Company Directory with url https://www.georgiacompanyregistry.com/companies/dancing-goats-coffee-arr-llc/
8565
Contents of the website of Dun & Bradstreet with url https://www.dnb.com/contact-directory/contact-profile.zoe.cb802a4479b3264f89a76231ec17f223.html
1435
Contents of the website of Dancing Goats® Coffee with url https://www.dancinggoats.com/pages/wholesale-info
8983
Contents of the website of MenuPix with url https://www.menupix.com/atlanta/restaurants/31628996/Dancing-Goats-Coffee-Bar-Atlanta-GA
8988
Contents of the website of Buzzfile with url https://www.buzzfile.com/business/Dancing-Goats-Coffee-Arr,-LLC-360-753-3391
Error fetching the URL: 403 Client Error: Forbidden for url: https://www.buzzfile.com/business

In [18]:
website_info

"\n contents from website with name: Bizapedia\n\n\n\n\n\n200 Market St Ne Olympia, WA 98501-6965 - Bizapedia Address Profile\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n200 MARKET ST NE OLYMPIA, WA 98501-6965\nSponsored Links\n\n\n\n\n\n\n\n\nThere are 9 companies that have an address matching 200 Market St Ne Olympia, WA 98501-6965.\nThe companies are Challain Inc Spc, Challain Inc, Dgcb Atl Ponce City Market LLC, Dgcb Atl 33 Peachtree Place LLC, Dgcb Atl Buckhead Exchange LLC, Tdgcb LLC, Challain Inc, Dancing Goats Coffee Arr LLC, and Dancing Goats Coffee.\nCHALLAIN, INC., SPC\nWASHINGTON WA SOCIAL PURPOSE CORPORATION\nWRITE REVIEW\nAddress:\xa0\xa0\n200 Market St Ne\nOlympia, WA 98501-6965 \nAddress\xa0Types:\xa0\xa0\nPrincipal, Mailing, and Registered Agent\nRegistered\xa0Agent:\xa0\xa0\nDavid J Wasson\nFiled:\xa0\xa0\nMay 21, 1990\nFile\xa0Number:\xa0\xa0\n601250414\nContact Us About The Company Profile For Challain, Inc., Spc\nCHALLAIN, INC.\nFLO

Clean text

In [20]:
import re
def clean_text(text):
    # Replace multiple newlines with a single newline
    text = re.sub(r'\n+', '\n', text)
    # Remove leading/trailing whitespace from each line
    text = '\n'.join(line.strip() for line in text.splitlines())
    # Remove extra spaces between words
    text = re.sub(r'\s+', ' ', text)
    return text

In [21]:
cleaned_text = clean_text(website_info)
print(cleaned_text)

 contents from website with name: Bizapedia 200 Market St Ne Olympia, WA 98501-6965 - Bizapedia Address Profile 200 MARKET ST NE OLYMPIA, WA 98501-6965 Sponsored Links There are 9 companies that have an address matching 200 Market St Ne Olympia, WA 98501-6965. The companies are Challain Inc Spc, Challain Inc, Dgcb Atl Ponce City Market LLC, Dgcb Atl 33 Peachtree Place LLC, Dgcb Atl Buckhead Exchange LLC, Tdgcb LLC, Challain Inc, Dancing Goats Coffee Arr LLC, and Dancing Goats Coffee. CHALLAIN, INC., SPC WASHINGTON WA SOCIAL PURPOSE CORPORATION WRITE REVIEW Address: 200 Market St Ne Olympia, WA 98501-6965 Address Types: Principal, Mailing, and Registered Agent Registered Agent: David J Wasson Filed: May 21, 1990 File Number: 601250414 Contact Us About The Company Profile For Challain, Inc., Spc CHALLAIN, INC. FLORIDA FOREIGN PROFIT CORPORATION WRITE REVIEW Address: 200 Market St Ne Olympia, WA 98501-6965 Address Types: Principal and Mailing Registered Agent: Cross Street Corporate Servi