In [67]:
import requests
import re
from bs4 import BeautifulSoup


In [68]:
def get_company_name_from_opengraph(urls: list[str], timeout: float = 2) -> list[str]:
    """Collect the Open Graph ``og:site_name`` value advertised by each page.

    Every URL is fetched with ``requests.get`` and parsed with BeautifulSoup.
    The function is best-effort: a URL that fails to download or parse is
    reported on stdout and skipped, so the returned list can be shorter than
    ``urls`` and is not index-aligned with it.

    Args:
        urls: Page URLs to fetch.
        timeout: Seconds passed to ``requests.get`` for each request.

    Returns:
        The whitespace-stripped ``og:site_name`` values, in input order, for
        the URLs that provided a non-empty one.
    """
    company_names = []

    for url in urls:
        try:
            # Fetch the HTML content of the webpage
            response = requests.get(url, timeout=timeout)

            # Parse the HTML content
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find the Open Graph site-name meta tag
            og_site_name = soup.find('meta', property='og:site_name')

            # .get() instead of ['content']: a tag without a content attribute
            # is treated as "not found" rather than raising KeyError.
            content = og_site_name.get('content') if og_site_name is not None else None
            site_name = content.strip() if content else ""

            if site_name:
                company_names.append(site_name)
                print(site_name, url)
            else:
                print("No Open Graph meta tags found for the URL:", url)
        except Exception as e:
            # Include the URL so a failure can be traced back to its input.
            print("Error:", e, url)

    return company_names

def get_company_name_from_html_title_tag(urls: list[str], timeout: float = 2) -> list[str]:
    """Collect the text of the HTML ``<title>`` element of each page.

    Every URL is fetched with ``requests.get`` and parsed with BeautifulSoup.
    The function is best-effort: a URL that fails to download or parse is
    reported on stdout and skipped, so the returned list can be shorter than
    ``urls`` and is not index-aligned with it.

    Args:
        urls: Page URLs to fetch.
        timeout: Seconds passed to ``requests.get`` for each request. Without
            a timeout a single unresponsive host can block the whole batch.

    Returns:
        The whitespace-stripped title texts, in input order, for the URLs
        whose page had a non-empty ``<title>``.
    """
    company_names = []

    for url in urls:
        try:
            # Fetch the HTML content of the webpage
            response = requests.get(url, timeout=timeout)

            # Parse the HTML content
            soup = BeautifulSoup(response.text, 'html.parser')

            # find() returns None when the page has no <title>; check before
            # touching .text so that case reaches the "not found" message.
            title_tag = soup.find('title')
            title_text = title_tag.text.strip() if title_tag is not None else ""

            if title_text:
                company_names.append(title_text)
                print(title_text, url)
            else:
                print("No title tag found for the URL:", url)
        except Exception as e:
            # Include the URL so a failure can be traced back to its input.
            print("Error:", e, url)

    return company_names

def get_domain_name_using_regex(urls: list[str]) -> list[str]:
    """Pull the dotted host text out of each URL with a regex and title-case it.

    An optional ``http(s)://`` scheme and an optional ``www.`` prefix are
    skipped; the captured text keeps every remaining dotted label, so
    ``https://www.tinybeans.com/`` yields ``"Tinybeans.Com"``. The pattern is
    applied with ``search`` (unanchored). Inputs with no match, or that are
    not strings, are reported on stdout and left out of the result.

    Args:
        urls: URL strings to scan.

    Returns:
        Title-cased captures, in input order, for the inputs that matched.
    """
    host_pattern = re.compile(r"(?:https?://)?(?:www\.)?([^./]+(?:\.[^./]+)+)")
    titled_hosts = []

    for link in urls:
        try:
            # search() gives None when nothing matches; the resulting
            # attribute error on .group is handled by the except below.
            titled = host_pattern.search(link).group(1).title()
            titled_hosts.append(titled)
            print(titled, link)
        except Exception as e:
            print("Error:", e)

    return titled_hosts


def get_domain_name_using_urllib(urls: list[str]) -> list[str]:
    """Derive a title-cased name from the host of each URL, without any I/O.

    The host is split into dot-separated labels, a leading ``www`` label is
    dropped, and the second-to-last label is used (or the only label for a
    single-label host such as ``localhost``). For example
    ``https://www.welcome.northmarq.com/about/team`` yields ``"Northmarq"``.

    Limitation: only the last label is treated as the suffix, so a host under
    a multi-part suffix (``bbc.co.uk``) yields the suffix part (``"Co"``).

    URLs from which no host can be extracted are reported on stdout and
    skipped, so the result can be shorter than ``urls``.

    Args:
        urls: URL strings; a scheme-less form such as ``www.example.com/x``
            is accepted.

    Returns:
        Title-cased names, in input order, for the URLs that had a host.
    """
    from urllib.parse import urlparse

    company_names = []

    for url in urls:
        try:
            # Parse the URL
            parsed_url = urlparse(url)
            if not parsed_url.netloc and "://" not in url:
                # urlparse only recognises a host after "//"; a scheme-less
                # URL would otherwise end up with an empty netloc.
                parsed_url = urlparse("//" + url)

            # hostname, unlike netloc, excludes credentials and port.
            hostname = parsed_url.hostname or ""
            labels = [label for label in hostname.split('.') if label]

            # Drop "www" only when it is the leading label, so hosts that
            # merely contain the text "www." are left intact.
            if len(labels) > 1 and labels[0] == "www":
                labels = labels[1:]

            if not labels:
                print("No domain found for the URL:", url)
                continue

            # "name.tld" -> "name"; a single-label host is used as is.
            domain_name = labels[-2] if len(labels) > 1 else labels[0]
            domain_name = domain_name.title()
            company_names.append(domain_name)
            print(domain_name, url)
        except Exception as e:
            # Include the URL so a failure can be traced back to its input.
            print("Error:", e, url)

    return company_names


In [69]:
# Sample URLs used to exercise the name-extraction helpers defined above.
# Most entries are bare home pages; the last one carries extra subdomains and
# a path.
company_urls = [
    "https://www.tinybeans.com/",
    "https://barkbox.com/",
    "https://www.warbyparker.com/",
    "https://www.awaytravel.com/",
    "https://www.glossier.com/",
    "https://www.amazon.com/",
    "https://www.google.com/",
    "https://www.apple.com/",
    "https://www.microsoft.com/",
    "https://www.meta.com/",
    "https://www.meundies.com/",
    "https://casper.com/",
    "https://www.allbirds.com/",
    "https://www.chubbiesshorts.com/",
    "https://www.bird.co/",
    "https://www.tesla.com/",
    "https://www.netflix.com/",
    "https://www.coca-colacompany.com/",
    "https://www.walmart.com/",
    "https://www.nike.com/",
    "https://www.canva.com/",
    "https://www.figma.com/",
    "https://www.grammarly.com/",
    "https://www.notion.so/",
    "https://trello.com/",
    "https://www.disney.com/",
    "https://www.ibm.com/",
    "https://www.mcdonalds.com/",
    "https://www.starbucks.com/",
    "https://www.airbnb.com/",
    "https://www.welcome.northmarq.com/about/team"
]

# Only one extraction strategy is active; the others are kept commented out.
# The first two download every page with requests.get, while the last two
# only parse the URL string. Each helper prints one line per URL as it goes.
# company_names = get_company_name_from_opengraph(urls=company_urls)
# company_names = get_company_name_from_html_title_tag(urls=company_urls)
# company_names = get_domain_name_using_regex(urls=company_urls)
company_names = get_domain_name_using_urllib(urls=company_urls)

Tinybeans https://www.tinybeans.com/
Barkbox https://barkbox.com/
Warbyparker https://www.warbyparker.com/
Awaytravel https://www.awaytravel.com/
Glossier https://www.glossier.com/
Amazon https://www.amazon.com/
Google https://www.google.com/
Apple https://www.apple.com/
Microsoft https://www.microsoft.com/
Meta https://www.meta.com/
Meundies https://www.meundies.com/
Casper https://casper.com/
Allbirds https://www.allbirds.com/
Chubbiesshorts https://www.chubbiesshorts.com/
Bird https://www.bird.co/
Tesla https://www.tesla.com/
Netflix https://www.netflix.com/
Coca-Colacompany https://www.coca-colacompany.com/
Walmart https://www.walmart.com/
Nike https://www.nike.com/
Canva https://www.canva.com/
Figma https://www.figma.com/
Grammarly https://www.grammarly.com/
Notion https://www.notion.so/
Trello https://trello.com/
Disney https://www.disney.com/
Ibm https://www.ibm.com/
Mcdonalds https://www.mcdonalds.com/
Starbucks https://www.starbucks.com/
Airbnb https://www.airbnb.com/
Northmar