In [30]:
from bs4 import BeautifulSoup
from validate_email import validate_email

import requests
import os
import re
from requests.exceptions import MissingSchema

In [31]:
def get_site_text(url):
    try:
        response = requests.get(url)
    except MissingSchema:
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')

    return soup.get_text()

In [32]:
def get_links(url):
    try:
        response = requests.get(url)
    except MissingSchema:
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')

    urls = [url]
    for link in soup.find_all('a'):
        if link not in urls:
                urls.append(link.get('href'))

    return urls

In [33]:
base_url = "https://aardwolfarchers.org.uk"
filename = "aardwol.txt"

In [34]:
list_of_urls = get_links(base_url)
print(list_of_urls)

['https://aardwolfarchers.org.uk', '#main', 'https://aardwolfarchers.org.uk/', 'https://aardwolfarchers.org.uk/', 'https://aardwolfarchers.org.uk/about/', 'https://aardwolfarchers.org.uk/course/', 'https://aardwolfarchers.org.uk/books/', 'https://aardwolfarchers.org.uk/contact/', 'https://aardwolfarchers.org.uk/resources/', 'https://aardwolfarchers.org.uk/lessons/', 'https://aardwolfarchers.org.uk/', 'https://aardwolfarchers.org.uk/course/', 'https://pindersschoolwear.com/', 'https://aardwolfarchers.org.uk/about/', 'https://aardwolfarchers.org.uk/course/', 'https://aardwolfarchers.org.uk/books/', 'https://aardwolfarchers.org.uk/contact/', 'https://aardwolfarchers.org.uk/lessons/', 'https://www.kadencewp.com/', 'https://aardwolfarchers.org.uk/', 'https://aardwolfarchers.org.uk/about/', 'https://aardwolfarchers.org.uk/course/', 'https://aardwolfarchers.org.uk/books/', 'https://aardwolfarchers.org.uk/contact/', 'https://aardwolfarchers.org.uk/resources/', 'https://aardwolfarchers.org.uk/l

In [35]:
def format_string(string):
    string = string.strip()
    string = os.linesep.join([s for s in string.splitlines() if s])
    return string

In [36]:
with open(filename, "w") as file:
    for site in list_of_urls:
        formatted_string = format_string(get_site_text(site) + "\n\n")
        file.write(formatted_string)

In [37]:
def email_finder(file_addr) -> list[str]:
    with open(file_addr, "r") as file:
        emails = re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", file.read())

        return emails

In [38]:
def email_checker(emails, email) -> bool:

    valid_email = validate_email(email, check_format=True, check_blacklist=True, check_dns=True, dns_timeout=10, check_smtp=True, smtp_timeout=10)
    if not valid_email:
        print("email is invalid")
        return False

    if email not in emails:
        print("email not found on website")
        return False
    else:
        print("email is found on website")
        return True



In [39]:
def company_email_finder(emails):
    #TODO just a temp solution, could also compare the company name to email to get most similar or ask an ai to guess which would be the most likely email
    most_common_email = max(set(emails), key=emails.count)
    print("Adding new email")
    return most_common_email

In [40]:
company_email = "ardwolfarchers@gmail.com"
list_of_emails = email_finder(filename)
print(list_of_emails)
if company_email == "":
    company_email = company_email(list_of_emails)
    is_email_correct = True
else:
    is_email_correct = email_checker(list_of_emails, company_email)

print(is_email_correct)

['aardwolfarchers@gmail.com', 'aardwolfarchers@gmail.com', 'aardwolfarchers@gmail.com', 'aardwolfarchers@gmail.com', 'aardwolfarchers@gmail.com', 'info@pindersschoolwear.co.ukwww.pindersschoolwear.com', 'aardwolfarchers@gmail.com', 'aardwolfarchers@gmail.com', 'aardwolfarchers@gmail.com', 'aardwolfarchers@gmail.com', 'aardwolfarchers@gmail.com', 'aardwolfarchers@gmail.com']
email is invalid
False


In [41]:
def email_checker_soup_test(base_url, filename=f"{base_url}.txt", company_email=""):

    list_of_urls = get_links(base_url)
    print(list_of_urls)

    with open(filename, "w") as file:
        for site in list_of_urls:
            formatted_string = format_string(get_site_text(site) + "\n\n")
            file.write(formatted_string)

    list_of_emails = email_finder(filename)
    if company_email == "":
        print("company email has been updated")
        company_email = company_email(list_of_emails)
        is_email_correct = True
    else:
        is_email_correct = email_checker(list_of_emails, company_email)

    return is_email_correct, company_email