# Eutopia test task

In [1]:
import json
import re
from urllib.parse import urlparse

import pandas as pd
import requests

from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from tqdm.auto import tqdm
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from nltk.tokenize import wordpunct_tokenize

In [2]:
# https://chromedriver.chromium.org/downloads

In [3]:
WEBDRIVER = "./chromedriver"

## Task 1

- email (regex)
- phone number
- **short description**
- URL: contact pages
- URL: pages (both Terms and Conditions and Privacy Policy)
- URL: about us pages (this page can also be under Mission, Who we are and similar)

Problems:
- languages
- connection is not secure

**Input data source**

In [4]:
companies = pd.read_excel("InputData.xlsx", sheet_name="Companies")

In [5]:
companies

Unnamed: 0,client_id,name,website
0,42322,Grofit,https://www.grofit.eu/
1,8861,Corrosionradar,https://www.corrosionradar.com/
2,59607,Codio Impact,https://www.codioimpact.com/
3,37252,Wasteout,https://app.wasteout.ru/
4,58593,Agrionica,https://www.agrionica.com/
...,...,...,...
95,56347,Wasteant,https://www.wasteant.com/
96,41264,Sustainiq,https://sustainiq.com/
97,46329,Scoutvision,https://scoutvision.de/?page_id=88&lang=en
98,33854,Legacy Root,https://www.solidwater.life/


In [6]:
companies["website"].apply(lambda x: re.findall("\.([a-z]{2,6})[/$]", x)[0]).value_counts().index

Index(['com', 'de', 'eu', 'it', 'uk', 'io', 'ai', 'dk', 'org', 'nl', 'ru',
       'fr', 'co', 'at', 'ch', 'net', 'pt', 'energy', 'bio', 'app', 'pl',
       'eco', 'live', 'tech', 'earth', 'life'],
      dtype='object')

**Class for information extraction**

In [11]:
class CompanyInfo:
    
    EMAIL_REGEX = re.compile(r'([\w\-]+(?:@|[\(\-]at[\)\-])[\w\-]+\.[a-z]+)', re.MULTILINE | re.IGNORECASE)
    
    # about text stop words that are not part of description
    STOP_WORDS = {"cookie", "cookies", "visitor", "website"}
    
    # word that are likely to be in a company description
    OK_WORDS = {
        "platform", "crowdfunding", "sustainable", "solution", "esg", 
        "decision", "energ", "believe", "marketplace", "project", "growing",
        "renewable", "achieve", "vision", "emissions", "green", "world",
        "waste", "natural",
    }
    
    def __init__(self):
        "Set up a selenium"
        
        options = webdriver.ChromeOptions()

        prefs = {
          "translate_whitelists": {
              "ru": "en", "fr": "en", "de": "en", "de": "en", 
              "it": "en", "nl": "en", "pt": "en", "pl": "en"
          },
          "translate":{"enabled": "true"}
        }
        options.add_experimental_option("prefs", prefs)
        options.add_argument('--ignore-ssl-errors=yes')
        options.add_argument('--ignore-certificate-errors')

        self.driver = webdriver.Chrome(WEBDRIVER, options=options)
        self.driver.set_page_load_timeout(20)
        
    def process(self, url):
        "Main function to process data: calls other functions"
        
        self.main_url = url
        self.url = url
        self.get_page()
        
        terms, privacy, cookies = self.__privacy_and_terms()
        email = self.__find_email()
        contact = self.__get_contact_url()
        about = self.__get_about_url()
        numbers = self.__get_number()
        about_text = self.__get_about_text()
        
        # if there is no short description on the main page, go to about
        if not about_text and about:
            for url in about:
                self.url = url
                self.get_page()
                about_text = self.__get_about_text()
                
                if about_text: break
        
        # if there is no email on the main page, go to contact and other pages
        if not email:
            for url_list in [contact, about, [terms, privacy, cookies]]:
                if url_list is None:
                    continue
                for url in url_list:
                    self.url = url
                    self.get_page()
                    email = self.__find_email()
                    
                    if email: break
                if email: break
        
        # result dictionary with full information
        return {
            "url": self.main_url,
            "privacy": privacy,
            "terms": terms,
            "cookies": cookies,
            "email": email if email else None,
            "contact": contact if contact else None,
            "about": about if about else None,
            "about_text": about_text,
            "numbers": numbers if numbers else None
        }
    
    def get_page(self):
        # page - BS page to search in
        # html is used where is is difficult to extract information using BS
        try:
            self.driver.get(self.url)
            self.page = BeautifulSoup(self.driver.page_source)
            self.html = str(self.page.find("body"))
        except:
            self.page = None
            self.html = ""
    
    @staticmethod
    def check_text(what:list, where:list):
        # not a good one...
        for source in where:
            for text in what:
                if text in source:
                    return True
        return False
    
    def __privacy_and_terms(self):
        terms, privacy, cookies = None, None, None

        if self.page is None:
            return terms, privacy, cookies
        
        links = self.page.find_all("a")
        
        if len(links) is None:
            return {}
        
        for link in links:
            text = link.get_text().lower() or "-"
            link = link.attrs.get("href") or "-"
            link = self.__process_one_url(self.main_url, link) or "-"
            
            # we look for key words in the link itself or in the link text
            if self.check_text(["terms", "termos", "conditions", "legal", "voorwaarden"], [text, link]):
                terms = link
            
            if self.check_text(["cookie"], [text, link]):
                privacy = link
            
            if self.check_text(["privac", "datenschutz"], [text, link]):
                cookies = link
        
        return terms, privacy, cookies
    
    def __find_email(self):
        if not self.html:
            return None
        
        result = []
        for email in set(self.EMAIL_REGEX.findall(self.html)):

            # too short
            if len(email) <= 4:
                continue
                
            # exclude image names
            elif email.split(".")[-1].lower() in {"jpg", "png", "gif"}:
                continue
                
            # (at) instead of @
            email = re.sub("[\-\(]at[\-\)]", "@", email)
            result.append(email)

        return result
    
    def __get_contact_url(self):
        if not self.page:
            return None
        
        links = self.page.find_all("a")
        
        if len(links) is None:
            return []
        
        result = []
        for link in links:
            text = link.get_text().lower() or "-"
            link = link.attrs.get("href") or "-"
            
            # look for key words
            if self.check_text(["contact", "kontakt"], [text, link]):
                result.append(link)
        
        if len(result) == 0 and "contact" in self.page.get_text().lower():
            result = [self.main_url]
        
        return list(set(self.__process_urls(self.main_url, result)))
            
    def __get_about_url(self):
        if not self.page:
            return None
        
        links = self.page.find_all("a")
        if len(links) is None:
            return []
        
        result = []
        for link in links:
            text = link.get_text().lower() or "-"
            link = link.attrs.get("href") or "-"
            
            # key words
            if self.check_text(["about", "mission", "who", "philosophie"], [text, link]):
                result.append(link)
        return list(set(self.__process_urls(self.main_url, result)))
    
    def __get_number(self):
        if not self.page:
            return None
        
        numbers = re.findall(
            "(?:(?:tel|phone|call|contact):?[\s]*)?(\+?[\d]+[\s\d]{6,})", 
            self.page.get_text(), 
            flags=re.IGNORECASE|re.MULTILINE
        )
        numbers = [self.__process_number(i) for i in numbers]
        result = [i for i in numbers if len(i) > 9]
        return result
    
    def __get_about_text(self):
        if not self.page:
            return None
        
        texts = [i.strip() for i in self.page.get_text(separator="\n\n").split("\n\n")]
        texts = [i for i in texts if len(i) > 100]
        
        result_ok, result_maybe = [], []
        for t in texts:
            bag_of_words = set([i.lower() for i in wordpunct_tokenize(t) if i.isalpha()])
            
            # skip paragraphs with stop words (cookies etc)
            if len(bag_of_words & self.STOP_WORDS):
                pass
            # likeky candidates for short description
            elif len(bag_of_words & self.OK_WORDS):
                result_ok.append(t)
            # unknown
            else:
                result_maybe.append(t)
        
        # if there are ikely candidates, take 3
        if result_ok:
            return "\n\n".join(result_ok[:3])
        # otherwise, take at least something
        elif result_maybe:
            return "\n\n".join(result_maybe[:3])
        
        return None
            
    
    @staticmethod
    def __process_one_url(main_url, url):
        "Process partial urls"
        
        main_url = urlparse(main_url)
        main_url = f"{main_url.scheme}://{main_url.hostname}"
        
        current = url.strip()

        if current.startswith("//"):
            current = "https:" + url
        elif current.startswith("/"):
            current = main_url.strip("/") + url
        elif current == "#":
            current = main_url
        elif current.startswith("#"):
            current = main_url.strip("/") + url
        elif "#" in current:
            current_ = current.split("#")[0]
            if current_ != main_url:
                current = current_

        if main_url in current:
            return current.strip("/")
        return None
        
    def __process_urls(self, main_url, urls_to_process):
        "Process urls and keep unique"
        result = []
        not_ = []
        seen = set()
        
        for url in urls_to_process:
            new_url = self.__process_one_url(main_url, url)
            if new_url:
                result.append(new_url)
            
        return list(sorted(set(result)))
    
    @staticmethod
    def __process_number(text):
        "remove spaces and linebreaks from number"
        text = re.sub("[\n\s]", "", text.strip())
        return text

In [12]:
ci = CompanyInfo()

full = []
for url in tqdm(companies["website"].values):
    full.append(ci.process(url))

  self.driver = webdriver.Chrome(WEBDRIVER, options=options)


  0%|          | 0/100 [00:00<?, ?it/s]

Not much, but something. Some websites are not accessible.

In [13]:
pd.DataFrame(full).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   url         100 non-null    object
 1   privacy     31 non-null     object
 2   terms       36 non-null     object
 3   cookies     55 non-null     object
 4   email       78 non-null     object
 5   contact     68 non-null     object
 6   about       54 non-null     object
 7   about_text  89 non-null     object
 8   numbers     33 non-null     object
dtypes: object(9)
memory usage: 7.2+ KB


In [14]:
for row in full:
    for key in row:
        if row[key] == []:
            row[key] = None

In [15]:
with open("task_1_result.json", "w") as f:
    json.dump(full, f, indent=4)

## Task 2

1. The amount of the round as float (preferably denominated in original currency - in the case
above SEK);
2. The date of the round as Timestamp - the article publication date;
3. The investors taking part in the round.

In [25]:
articles = pd.read_excel("InputData.xlsx", sheet_name="Fround")
articles.head()

Unnamed: 0,client_id,name,website,news_url
0,61776,Glanris,https://www.glanris.com/,https://www.businesswire.com/news/home/2021033...
1,391,24M Technologies,https://24-m.com/,https://www.businesswire.com/news/home/2022011...
2,391,24M Technologies,https://24-m.com/,https://www.businesswire.com/news/home/2022011...
3,58945,Aspen Power,https://aspenpower.com/,https://www.businesswire.com/news/home/2022022...
4,58125,Induction Food Systems,https://www.inductionfoodsystems.com/,https://www.businesswire.com/news/home/2021120...


In [125]:
options = webdriver.ChromeOptions()

options.add_argument('--ignore-ssl-errors=yes')
options.add_argument('--ignore-certificate-errors')

driver = webdriver.Chrome(WEBDRIVER, options=options)
driver.set_page_load_timeout(10)

  driver = webdriver.Chrome(WEBDRIVER, options=options)


**Get pages**

In [126]:
pages = []
for url in tqdm(articles["news_url"]):
    try:
        driver.get(url)
        page = BeautifulSoup(driver.page_source)
        html = str(page.find("body"))
        pages.append((url, page, html))
    except:
        print(url)

  0%|          | 0/50 [00:00<?, ?it/s]

https://techcrunch.com/2016/09/21/brightfarms-raises-30-1-million-to-set-up-futuristic-greenhouses-across-the-u-s/?ncid=rss&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+Techcrunch+%28TechCrunch%29
https://www.prnewswire.com/news-releases/brightfarms-secures-100-million-series-e-round-of-funding-to-expand-high-tech-indoor-farming-across-the-us-301156013.html
https://www.marketscreener.com/news/latest/Croatia-Investment-Plan-for-Europe-EIB-supports-Croatian-Rimac-Automobili-in-development-of-high--27722419/
https://www.globenewswire.com/news-release/2022/02/15/2385270/0/en/CEMATRIX-Announces-USD-4-Million-Strategic-Investment-Commitment-to-Glavel-Inc.html
https://www.globenewswire.com/en/news-release/2021/11/22/2339064/0/en/Announcing-the-Innovators-in-Big-Idea-Ventures-Latest-Accelerator-Cohorts.html


In [35]:
# ! python -m spacy download en_core_web_sm

In [128]:
import spacy

nlp = spacy.load("en_core_web_sm")

**Extract information using spacy**

In [161]:
KEY_WORDS = {"raise", "award", "receive", "funding", "round", "invest"}

In [172]:
def extract_fround_info(url, page):
    body = page.find("div", {"itemprop": "articleBody"})
    if len(str(body)) < 500:
        body = page
    
    # remove information in brackets
    text = re.sub("\(.*?\)", " ", body.get_text(separator=" "))
    
    # replace 1m or 2.4b with million or billion
    text = re.sub("([\d])([mb])([\W])", "\g<1> \g<2>illion \g<3>", text, re.IGNORECASE)
    
    parsed = nlp(text)
    result = []
    
    for s in parsed.sents:
        
        labels = {e.label_ for e in s.ents}
        
        for word in KEY_WORDS:
            if word in s.text:
                
                # take only information from sentences with money
                # as they tend to be the relevent ones
                start = False
                
                if "MONEY" not in labels:
                    continue
                
                for e in s.ents:
                    if e.label_ == "MONEY":
                        start = True
                        result.append({"money": e.text.strip(), "url": url, "inverstors": []})
                    # names of investors come after money
                    elif start == True and \
                        e.label_ in {"ORG", "PERSON", "GPE"} and \
                        e.text[0] == e.text[0].upper():
                            result[-1]["inverstors"].append(e.text.strip())
                break
    return result

**Extract information**

It is better to do it as we download pages, but it was more convenient to debug it

In [162]:
data = []

for url, page, html in pages:
    res = extract_fround_info(url, page)
    data.extend(res)

In [164]:
pd.DataFrame(data).head()

Unnamed: 0,money,url,inverstors
0,$2 million,https://www.businesswire.com/news/home/2021033...,"[Riceland, Innova]"
1,$2.8 million,https://www.businesswire.com/news/home/2021033...,[]
2,$9 million,https://www.businesswire.com/news/home/2022011...,[]
3,$120 million,https://www.businesswire.com/news/home/2022022...,"[Ultra Capital, Redball Power]"
4,"up to $250,000",https://www.businesswire.com/news/home/2021120...,"[NREL, Innovation and Entrepreneurship Center]"


In [165]:
CURRENCY = {
    "$": "USD",
    "£": "GBR"
}

In [166]:
def process_money(money):
    # lowercase, process ., separators
    money = money.lower()
    
    if re.match(".*[\d],0{2,}.*", money):
        money = money.replace(",", "")
    else:
        money = money.replace(",", ".")
    
    # find currency if there is one
    possible_currency = set(money) & set(CURRENCY)
    if len(possible_currency) == 1:
        currency = CURRENCY.get(list(possible_currency)[0])
    else:
        currency ="unk"
    
    # extract value
    value = re.findall("[\d\.]+", money)
    if value:
        value = value[0]
    else:
        value = 0
    
    # get multiplier (million or billion)
    if "mil" in money:
        multiplier = 10e6
    elif "bil" in money:
        multiplier = 10e9
    else:
        multiplier = 1
    
    # compute real value
    value = float(value) * multiplier
    
    # if the value is too small, we don't have enough information
    if value < 1000:
        value = None
        
    return value, currency 

In [173]:
money = []

for row in data:
    value, currency = process_money(row["money"])
    if value:
        money.append({"value": value, "currency": currency, "investors": row["inverstors"], "url": row["url"]})

In [174]:
money = pd.DataFrame(money)
money["investors"] = money["investors"].apply(json.dumps)

money.to_csv("task_2_result.tsv", sep="\t")