Frederick Lancia
Eutopia Task 1
April 28

In [2]:
import re
from bs4 import BeautifulSoup as bs
from pip._vendor import requests
import pandas as pd
from googletrans import Translator

# link_type is a class for finding a link for a certain type
# such as for the contact page
# it takes a list of key words that would indicate a high likelihood
# of being a link for such a page.
# it looks for those words first on the text over links and then
# in the url itself
# it makes the assumption that the text over a link is a slightly
# better indicator of the link's type than the url itself

class link_type:
    """Pick the link most likely to lead to a given kind of page.

    Each candidate link is a two-item sequence ``[anchor_text, url]``.
    Key words are tried in the order given; for each key word the anchor
    texts are searched before the URLs, on the assumption that the text
    over a link is a slightly better indicator than the URL itself.

    Args:
        words: key words (most specific first) that indicate the page type.
        links: optional list of ``[anchor_text, url]`` candidates. When
            omitted, the module-level ``links`` list is used, which is how
            the existing callers work.
    """

    def check_all_words(self):
        """Return the first link matching any key word, or "N/A" if none does."""
        for word in self.words:
            # position 0 is the text over the link, position 1 the url itself
            for position in (0, 1):
                final_link = self.check(word, position)
                if final_link != "N/A":
                    return final_link
        return "N/A"

    def check(self, word, num):
        """Return the url of the first link containing ``word``, else "N/A".

        Args:
            word: key word to look for (matched case-insensitively).
            num: 0 to search the texts over links, 1 to search the urls.
        """
        candidates = links if self.links is None else self.links
        # lower-case the key word too: translated key words are often
        # capitalised and would otherwise never match the lower-cased text
        needle = word.lower()
        for item in candidates:
            if needle in item[num].lower():
                return str(item[1])
        return "N/A"

    def __init__(self, words, links=None):
        self.words = words
        self.links = links

# paragraphs_finder is a class that takes the link to a page and breaks
# the text into a list of strings of paragraphs
# some groups of text appear as paragraphs on the site, but are separated
# by different elements on the html. This keeps those groups of text
# together and returns them to the way they appear on the site

class paragraphs_finder:
    """Fetch a page and split its visible text into paragraph strings.

    Args:
        link: url of the page to fetch, or "N/A" when no page was found.
    """

    def check(self):
        """Return the text of every paragraph/heading element on the page.

        Returns:
            A list with one string per ``p`` / ``h1``-``h6`` element, in
            document order. An empty string is returned when the link is
            "N/A" or the page cannot be fetched, so callers that iterate
            over the result simply see no paragraphs.
        """
        if self.link == "N/A":
            return ""
        try:
            request = requests.get(self.link)
        except requests.exceptions.RequestException as error:
            # one unreachable secondary page should not abort the whole run;
            # treat it the same as a page that was never found
            print("could not fetch {}: {}".format(self.link, error))
            return ""

        # use html parser to find all elements which might contain text
        soup = bs(request.content)
        elements = soup.findAll(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

        # keep the text of each element together as one entry, because texts
        # which appear together on the visual site do not always do so in
        # the html
        return [element.text for element in elements]

    def __init__(self, link):
        self.link = link

# translate_list is a class that takes a list of strings, appends a translated
# version of each string (in the language of the current site), and returns
# the combined list

class translate_list:
    """Extend a list of English strings with their translations.

    Relies on the module-level ``translator`` and on the module-level
    ``language`` (the destination language code) being set by the caller.
    """

    def __init__(self, list):
        self.list = list

    def trans(self):
        """Return a new list: the original strings followed by their translations."""
        combined = self.list.copy()
        combined += [
            translator.translate(term, dest=language, src='en').text
            for term in self.list
        ]
        return combined

# initialize translator for translating texts
translator = Translator()
# read data from given excel file containing company names and links
company_data = pd.read_excel(r'InputData.xlsx')

# create a list for storing language information, so we don't have to translate
# more than once for a single language
languages = []
languages_and_translations = []

# initialize the final SQL-like dataframe to be returned at the end of the program
# every result column starts as "N/A" and is filled in per company
new_df = company_data.copy()
new_df["about_us_page"] = "N/A"
new_df["contact_page"] = "N/A"
new_df["privacy_policy"] = "N/A"
new_df["terms_and_conditions"] = "N/A"
new_df["english_description"] = "N/A"
new_df["phone_numbers"] = "N/A"
new_df["emails"] = "N/A"

# define some generic patterns now which will be used later and don't rely on
# variables
# NOTE: phone_pattern has two capture groups, so re.findall returns
# (number_with_optional_plus, digits_only) tuples rather than plain strings
phone_pattern = r'([\+]?(\d{10,12}))'
remove_spaces_pattern = r'[\r\n\- ]+'
email_pattern = r'[^@ \t\r\n]+@[^@ \t\r\n]+\.[^@ \t\r\n]+'

# set lists of English key words to be used for finding these desired data pieces
privacy_words = ['privacy policy', 'privacy', 'cookie', 'cookies', 'policy',
                'impressum', 'imprint', 'legal']
terms_and_conditions_words = ['terms', 'terms and conditions', 'terms of service',
                'conditions', 'legal']
contact_words = ['contact', 'reach out', 'in touch', 'impressum', 'imprint',
                'contact us']
# a missing comma used to fuse 'about us' and 'mission' into the single,
# useless key word 'about usmission'
about_words1 = ['about', 'about us', 'mission', 'what we do', 'who we are']
about_words2 = ['find out more', 'learn more', 'more information', 'read more']
about_words3 = ['history', 'people', 'description', 'overview']
description_words = ["we"]

# load the cached per-language key word lists
# NOTE(review): read_pickle executes whatever the pickle contains; only load a
# language_words.pkl that this program wrote itself
# NOTE(review): rows already stored in the cache keep the key words they were
# built with; delete language_words.pkl if the lists above change
language_words_df = pd.read_pickle('language_words.pkl')

# To run the program without the language file, comment out the read_pickle
# line above and uncomment the following lines:
#
# english_list = [privacy_words, terms_and_conditions_words, contact_words,
#                 about_words1, about_words2, about_words3, description_words]
# starter_data = [['en', english_list]]
# language_words_df = pd.DataFrame(starter_data, columns = ['language', 'words'])

# Main scraping loop: for every company row, fetch the home page, detect the
# site language, locate the contact / privacy / terms / about pages, extract a
# one-sentence description plus phone numbers and emails, and store the
# results in new_df. The results are exported to Excel at the very end.

# pattern for stripping line breaks inside a paragraph; it does not depend on
# the site, so it is defined once instead of once per paragraph
remove_new_line_pattern = '[\r\n]{1,}'

# these links return SSLError: HTTPSConnectionPool, so they are skipped
unreachable_sites = (
    'https://www.airtight.ai/',
    'https://agrodronegroup.ru/',
    'https://fotonow.ai/',
    'https://www.finsu.co.uk/',
    'https://www.eupravnik.eu/',
    'https://www.m-flowers.com/',
    'https://www.relade.eu/',
)

# iterate over each row of the given document, recall that each row represents
# a company and website
for i, row in company_data.iterrows():

    company_link = row['website'] # extract link to home page

    if company_link in unreachable_sites:
        continue

    company_name = row['name'] # extract company name

    # use beautiful soup api to parse html of site
    try:
        home_request = requests.get(company_link)
    except requests.exceptions.RequestException as error:
        # an unreachable home page leaves this row at its "N/A" defaults
        # instead of aborting the run for every remaining company
        print("skipping {}: {}".format(company_link, error))
        continue
    home_content = home_request.content
    home_soup = bs(home_content)

    # split visual text into paragraphs, because some bits of text which
    # should read together are in different elements
    # use first paragraph to detect language of website
    # assume that language remains constant throughout
    paragraphs = home_soup.findAll(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

    # nearly all pages will have some paragraphs, if not, assume English
    # (language must stay a module-level name: translate_list reads it)
    if paragraphs:
        language = translator.translate(paragraphs[0].text).src
    else:
        language = 'en'

    # this translator API is extremely slow, so instead of translating the
    # entire site into English, translate the words used to identify the
    # desired data into the language of the site.
    # The English words are kept as well, in case parts of the site still
    # use English. For example, many foreign sites still use English URLs.

    if language not in set(language_words_df['language']):
        # data for this language doesn't yet exist in the data frame
        # always translate from the English base lists, which are never
        # reassigned inside this loop
        new_language_list = [translate_list(privacy_words).trans(),
                            translate_list(terms_and_conditions_words).trans(),
                            translate_list(contact_words).trans(),
                            translate_list(about_words1).trans(),
                            translate_list(about_words2).trans(),
                            translate_list(about_words3).trans(),
                            translate_list(description_words).trans()]
        new_row_data = pd.DataFrame([[language, new_language_list]],
                            columns=['language', 'words'])
        language_words_df = pd.concat([language_words_df, new_row_data],
                            ignore_index=True)
    # isolate the row of the language this site is in into its own dataframe
    language_row = language_words_df.loc[language_words_df['language'] == language]

    # extract this site's word lists under their own names, so the English
    # base lists stay intact for the next new language
    site_word_lists = language_row.iat[0, 1]
    site_privacy_words = site_word_lists[0]
    site_terms_and_conditions_words = site_word_lists[1]
    site_contact_words = site_word_lists[2]
    site_about_words1 = site_word_lists[3]
    site_about_words2 = site_word_lists[4]
    site_about_words3 = site_word_lists[5]
    site_description_words = site_word_lists[6]

    # find all links to the site on the home page
    # all links should be found in 'a' or 'p' elements
    # (links must stay a module-level name: link_type reads it)
    links = []
    phone_numbers = []
    emails = []

    for a in home_soup.find_all(['a', 'p'], href=True):
        link = a['href']
        text = a.text

        # if the link contains the company link, it's definitely
        # what we want
        if company_link in link:
            links.append([text, link])
            continue

        # otherwise this could be an incomplete link to the site
        # or it could be a phone number or email

        # see if an email address can be found in the link or its text
        # drop a leading "mailto:" and any "?subject=..." query so that only
        # the address itself is captured from the href
        link_address = re.sub(r'^mailto:', '', link, flags=re.IGNORECASE)
        link_address = link_address.split('?', 1)[0]
        new_emails = re.findall(email_pattern, text, re.IGNORECASE)
        new_emails.extend(re.findall(email_pattern, link_address, re.IGNORECASE))
        emails.extend(new_emails)

        # see if phone number can be found in the link or its text
        # first, remove spaces, dashes, returns, etc
        text_without_spaces = re.sub(remove_spaces_pattern, '', text)
        # group(0) is the whole match, so each number is stored as one plain
        # string regardless of the capture groups inside phone_pattern
        new_numbers = [match.group(0) for match in
                       re.finditer(phone_pattern, text_without_spaces)]
        new_numbers.extend(match.group(0) for match in
                           re.finditer(phone_pattern, link))
        phone_numbers.extend(new_numbers)

        # if no numbers or emails were found, and it doesn't contain
        # a period, indicating a .com of some sort, it's probably an
        # incomplete link to the site, so save it to links
        if not new_numbers and not new_emails and '.' not in link:
            links.append([text, company_link + link])

    # use found links and set key words to find links to desired pages of site
    contact_link = link_type(site_contact_words).check_all_words()
    privacy_link = link_type(site_privacy_words).check_all_words()
    terms_and_conditions_link = link_type(
        site_terms_and_conditions_words).check_all_words()

    # about us is a more complex concept such that a site may have more than one
    # page about the company, but we must find the best one. That is why we need
    # to look at different key words in a specific order. It also has a longer
    # list of words because there are many semantically similar phrases.
    # So, words are tested in order of 3 groups
    about_link = link_type(site_about_words1).check_all_words()
    if about_link == "N/A":
        about_link = link_type(site_about_words2).check_all_words()
        if about_link == "N/A":
            about_link = link_type(site_about_words3).check_all_words()

    # break text in all relevant pages into paragraphs for parsing
    # we will then use these links and their paragraphs to find desired data
    company_link_plus = [company_link, paragraphs_finder(company_link).check()]
    contact_link_plus = [contact_link, paragraphs_finder(contact_link).check()]
    privacy_link_plus = [privacy_link, paragraphs_finder(privacy_link).check()]
    about_link_plus = [about_link, paragraphs_finder(about_link).check()]

    # create lists for finding each page link
    # let them contain links and the paragraphs on those pages
    # the links are ordered from most to least likely to find the desired information
    # contact_info_links is for finding phone numbers and emails
    # for example, it's highly likely that they will be found on contact page,
    # that's why contact links come first, and about page links second
    contact_info_links = [contact_link_plus, about_link_plus, company_link_plus]
    # for finding description phrase
    description_links = [about_link_plus, company_link_plus]
    # add company name now to helpful words for finding the description
    # because this didn't need to be translated
    description_words_plus = [company_name] + site_description_words

    # holds the description sentence(s) once found
    description = []

    # start looking for description on each page
    for page in description_links:
        # each element of description_links contains both the link and
        # the paragraphs of that page
        link = page[0]
        paragraphs_text = page[1]

        # look for key words in paragraphs
        for word in description_words_plus:
            # look for the start of a paragraph or a new sentence that contains
            # the key word and at least 2 words following, and capture the
            # whole sentence. Examples of sentences of this form:
            #   we are trying to do x
            #   Grofit is selling x
            # the key word is escaped because a company name may contain
            # characters that are special in a regular expression
            sentence_pattern = ('[.!?]?[ ]*([^.!?]*' + re.escape(str(word))
                                + ' [^.!? ]+ [^.!?]+[.!?])')
            for paragraph in paragraphs_text:
                # remove new lines to simplify and improve results
                paragraph = re.sub(remove_new_line_pattern, '', paragraph)
                description = re.findall(sentence_pattern, paragraph,
                                         re.IGNORECASE)
                if description:
                    # if there is no about page yet, there's a pretty good
                    # chance this page is it
                    if about_link == "N/A":
                        about_link = link
                    break
            if description: break
        if description: break

    # translate non English descriptions to English, keeping only the
    # translated text rather than the translator's result objects
    if language != "en":
        description = [translator.translate(sentence, src=language).text
                       for sentence in description]

    # now try to find contact info: email addresses and phone numbers
    for page in contact_info_links:
        paragraphs_text = page[1]

        for paragraph in paragraphs_text:
            # use a generic email pattern to find emails in paragraphs
            emails.extend(re.findall(email_pattern, paragraph, re.IGNORECASE))

            # remove spaces, dashes, and new lines before looking for phone numbers
            paragraph = re.sub(remove_spaces_pattern, '', paragraph)

            # use a basic phone number pattern to find phone numbers in paragraphs
            phone_numbers.extend(match.group(0) for match in
                                 re.finditer(phone_pattern, paragraph))
        # if phone numbers have been found on one page, assume that different numbers
        # will not be found on another
        if phone_numbers: break

    # the same address or number is often repeated in the link text, the href
    # and on several pages; keep the first occurrence of each, in order
    emails = list(dict.fromkeys(emails))
    phone_numbers = list(dict.fromkeys(phone_numbers))

    # add all values from this company to its row of the dataframe
    new_df.at[i, "phone_numbers"] = phone_numbers
    new_df.at[i, "emails"] = emails
    new_df.at[i, "contact_page"] = contact_link
    new_df.at[i, "english_description"] = description
    new_df.at[i, "about_us_page"] = about_link
    new_df.at[i, "privacy_policy"] = privacy_link
    new_df.at[i, "terms_and_conditions"] = terms_and_conditions_link

    # update the language_words pickle file in case a new language was added
    language_words_df.to_pickle('language_words.pkl')

# export final results
new_df.to_excel(r'EutopiaTask1Results.xlsx', index = False, header=True)


  language                                              words
0       en  [[privacy policy, privacy, cookie, cookies, po...
  language                                              words
0       en  [[privacy policy, privacy, cookie, cookies, po...
  language                                              words
0       en  [[privacy policy, privacy, cookie, cookies, po...
  language                                              words
1       ru  [[privacy policy, privacy, cookie, cookies, po...
  language                                              words
1       ru  [[privacy policy, privacy, cookie, cookies, po...
  language                                              words
0       en  [[privacy policy, privacy, cookie, cookies, po...
  language                                              words
2       it  [[privacy policy, privacy, cookie, cookies, po...
  language                                              words
0       en  [[privacy policy, privacy, cookie, cookies, po...
  langua

ConnectTimeout: _ssl.c:1108: The handshake operation timed out