In [1]:
import ast
from dataclasses import dataclass
from pathlib import Path
from typing import Union, Optional

import pandas as pd
from pydantic import BaseModel
import requests
from tqdm.notebook import tqdm

import dandelion
import dp



In [2]:
# Stem of the data file whose lines are URLs to download instead of literal
# text; iter_dataset_entries compares each "*.txt" file's stem against it.
URL_DATASET_FILE_STEM = "html"


class DatasetEntry(BaseModel):
    """Fields shared by every dataset entry, whatever its content is."""

    # Running number assigned while the data files are iterated.
    id: int
    # Stem of the data file the entry was read from (e.g. "phrase", "word").
    kind: str


class DatasetHtmlEntry(DatasetEntry):
    """Dataset entry backed by a web page that has been saved to disk.

    Attributes:
        url: Address the page was requested from.
        html_file: Path of the local file holding the saved page.
    """

    url: str
    html_file: str

    @property
    def content(self) -> str:
        """Return the saved page as text.

        Raises:
            OSError: If ``html_file`` cannot be opened.
            UnicodeDecodeError: If the saved bytes are not valid UTF-8.
        """
        # The page is stored as raw bytes, so the decoding has to be picked
        # here. Naming UTF-8 explicitly keeps the result independent of the
        # locale of the machine that runs the notebook.
        with open(self.html_file, "r", encoding="utf-8") as html_f:
            return html_f.read()

    @property
    def content_type(self) -> str:
        """Return the content type label, always ``"html"``."""
        return "html"


class DatasetTextEntry(DatasetEntry):
    """Dataset entry whose content is a literal piece of text."""

    # The text to be analysed.
    text: str

    @property
    def content(self) -> str:
        """Return the stored text unchanged."""
        return self.text

    @property
    def content_type(self) -> str:
        """Return the content type label, always ``"text"``."""
        return "text"


class ScoringResult(BaseModel):
    """Outcome of running one dataset entry through both extraction services.

    For each service exactly one of three things is normally recorded: the
    returned data, the error reported by the service, or the text of an
    exception raised by the calling script. Fields that do not apply are
    ``None``.
    """

    # The dataset entry that was scored.
    entry: DatasetEntry
    # Data returned by Dandelion when the call succeeded.
    dandelion_result: Optional[dict] = None
    # Error reported by Dandelion. The scoring loop reads its "code" key, so
    # it is a mapping rather than text; plain strings remain accepted.
    dandelion_error: Optional[Union[str, dict]] = None
    # "<exception type>: <message>" when the Dandelion call raised.
    dandelion_script_exception: Optional[str] = None
    # Data returned by the DP service when the call succeeded.
    dp_result: Optional[dict] = None
    # Error reported by the DP service; accepts the same shapes as
    # dandelion_error (NOTE(review): confirm what dp.extract_dp returns).
    dp_error: Optional[Union[str, dict]] = None
    # "<exception type>: <message>" when the DP call raised.
    dp_script_exception: Optional[str] = None


def save_csv(df: pd.DataFrame, path: Union[Path, str] = "dataset.csv"):
    """Write ``df`` to a CSV file without its index.

    Args:
        df: Frame to persist.
        path: Destination file. Defaults to ``dataset.csv`` in the current
            working directory.
    """
    df.to_csv(path, index=False)


def _parse_literal_cell(cell):
    """Turn one raw CSV cell back into the Python value it was written from.

    Empty cells become ``None``. Cells that are not valid Python literals
    (for example a plain error message) are returned unchanged.
    """
    if not isinstance(cell, str):
        return cell
    text = cell.strip()
    if not text:
        # ast.literal_eval("") raises SyntaxError, and most cells are empty
        return None
    try:
        return ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError):
        return cell


def load_csv(path: Union[Path, str] = "dataset.csv"):
    """Read a scoring CSV and restore its structured columns.

    Structured values are written to CSV as their ``repr``; the columns
    listed below are parsed back with ``ast.literal_eval``.

    Args:
        path: File to read. Defaults to ``dataset.csv`` in the current
            working directory.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    literal_columns = (
        "entry",
        "dandelion_result",
        "dandelion_error",
        "dp_result",
        "dp_error",
        # plural spellings used by an earlier version of this loader;
        # converters for columns that are absent are simply not applied
        "dandelion_results",
        "dp_results",
    )
    converters = {column: _parse_literal_cell for column in literal_columns}
    return pd.read_csv(path, converters=converters)


def save_html(path: Union[Path, str], content: bytes):
    """Store ``content`` at ``path`` as raw bytes.

    Args:
        path: Destination file; an existing file is overwritten.
        content: Bytes to write.

    Returns:
        ``path`` converted to ``str``.
    """
    Path(path).write_bytes(content)
    return str(path)


def download_page(
    url: str,
    html_storage_dir: Union[Path, str],
    page_id: int,
    force_download: bool = False,
    timeout: float = 30.0,
):
    """Fetch ``url`` into ``<html_storage_dir>/<page_id>.html`` and return that path.

    A file that already exists is reused without any network access unless
    ``force_download`` is true.

    Args:
        url: Address of the page to fetch.
        html_storage_dir: Directory holding the saved pages; created when
            missing.
        page_id: Identifier used as the file name.
        force_download: Fetch again even if the file already exists.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Path of the saved page as ``str``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    path = Path(html_storage_dir) / f"{page_id}.html"
    if path.exists() and not force_download:
        return str(path)

    # Without this the write below fails on a fresh checkout
    path.parent.mkdir(parents=True, exist_ok=True)

    # requests waits forever by default; bound the wait so one dead host
    # cannot stall the whole run.
    # NOTE(review): the HTTP status is not checked, so an error page is
    # saved and reused like a normal one.
    content = requests.get(url, timeout=timeout).content
    return save_html(path, content)


def iter_dataset_entries(data_dir: Union[Path, str], html_storage_dir: Union[Path, str], force_download_html: bool = False):
    """Yield one dataset entry per non-blank line of every ``*.txt`` file in ``data_dir``.

    Lines are stripped before use. Ids start at 1 and keep increasing across
    files. Lines of the file whose stem equals ``URL_DATASET_FILE_STEM`` are
    treated as URLs: the page is downloaded (or reused from
    ``html_storage_dir``) and a ``DatasetHtmlEntry`` is yielded. Every other
    line yields a ``DatasetTextEntry``.

    Args:
        data_dir: Directory containing the ``*.txt`` data files.
        html_storage_dir: Directory where downloaded pages are kept.
        force_download_html: Download pages again even when already saved.

    NOTE(review): files are visited in the order ``glob`` returns them, and
    saved pages are named after the running id, so ids and cached pages are
    only stable while that order and the file contents stay the same.
    """
    entry_id = 0

    for data_path in tqdm(Path(data_dir).glob("*.txt"), desc="Data files"):
        holds_urls = data_path.stem == URL_DATASET_FILE_STEM
        with data_path.open("r", encoding="utf-8") as lines:
            for raw_line in tqdm(lines, desc=f"{data_path}"):
                value = raw_line.strip()
                if not value:
                    # blank lines do not consume an id
                    continue

                entry_id += 1
                if not holds_urls:
                    yield DatasetTextEntry(id=entry_id, kind=data_path.stem, text=value)
                    continue

                saved_page = download_page(value, html_storage_dir, entry_id, force_download_html)
                yield DatasetHtmlEntry(id=entry_id, kind=data_path.stem, url=value, html_file=saved_page)


def score(
    data_dir: Union[Path, str],
    html_storage_dir: Union[Path, str],
    force_download_html: bool = False,
    dp_url: str = "http://10.11.1.6:9999/",
):
    """Run every dataset entry through Dandelion and the DP service.

    The two services are called independently for each entry. A failure of
    one, whether an error in the response or an exception raised by the
    client code, is recorded in the result and does not stop the other call
    or the loop.

    The loop stops early when Dandelion answers with an authentication or
    forbidden error. The entry being processed at that moment is not
    recorded.

    Args:
        data_dir: Directory containing the ``*.txt`` data files.
        html_storage_dir: Directory where downloaded pages are kept.
        force_download_html: Download pages again even when already saved.
        dp_url: Base URL of the DP service.

    Returns:
        List of ``ScoringResult``, one per processed entry.
    """
    results = []

    # A single session is shared by every request of the run
    with requests.Session() as sess:
        for entry in iter_dataset_entries(data_dir, html_storage_dir, force_download_html):
            dandelion_data = dandelion_error = dandelion_exception = None
            try:
                # entry.content is read inside the try so that a failure to
                # load the content is recorded like any other client failure
                dandelion_result = dandelion.extract_dandelion(sess, entry.content, entry.content_type, include=dandelion.ALL_INCLUDE_OPTIONS)
                if dandelion_result.error:
                    dandelion_error = dandelion_result.error
                    if dandelion_error.get("code") in ("error.authenticationError", "error.forbiddenError"):
                        print("Exceeded API token limit, come back tomorrow!")
                        break
                else:
                    dandelion_data = dandelion_result.data
            except Exception as e:
                dandelion_exception = f"{type(e)}: {e}"

            dp_data = dp_error = dp_exception = None
            try:
                dp_result = dp.extract_dp(sess, dp_url, entry.content, entry.content_type)
                if dp_result.error:
                    dp_error = dp_result.error
                else:
                    dp_data = dp_result.data
            except Exception as e:
                dp_exception = f"{type(e)}: {e}"

            results.append(
                ScoringResult(
                    entry=entry,
                    dandelion_result=dandelion_data,
                    dandelion_error=dandelion_error,
                    dandelion_script_exception=dandelion_exception,
                    dp_result=dp_data,
                    dp_error=dp_error,
                    dp_script_exception=dp_exception,
                )
            )

    return results


In [3]:
# Score every entry found under data/, reusing pages already saved in data/htmls.
scores = score(data_dir="data", html_storage_dir="data/htmls", force_download_html=False)
# Bare last expression so the notebook displays the collected results.
scores

Data files: 0it [00:00, ?it/s]

data/phrase.txt: 0it [00:00, ?it/s]

  utils.DeprecatedIn35,


data/word.txt: 0it [00:00, ?it/s]

data/wikidata_entity.txt: 0it [00:00, ?it/s]

data/html.txt: 0it [00:00, ?it/s]

[ScoringResult(entry=DatasetTextEntry(id=1, kind='phrase', text='sure!'), dandelion_result={'time': 1, 'annotations': [], 'lang': 'en', 'timestamp': '2022-07-07T16:49:47.752'}, dandelion_error=None, dandelion_script_exception=None, dp_result={'annotations': [], 'unlisted_annotations': [], 'lang': 'en', 'timestamp': '2022-07-07T16:49:48.036557'}, dp_error=None, dp_script_exception=None),
 ScoringResult(entry=DatasetTextEntry(id=2, kind='phrase', text='have a good one!'), dandelion_result={'time': 1, 'annotations': [], 'lang': 'en', 'timestamp': '2022-07-07T16:49:48.115'}, dandelion_error=None, dandelion_script_exception=None, dp_result={'annotations': [], 'unlisted_annotations': [], 'lang': 'en', 'timestamp': '2022-07-07T16:49:48.221052'}, dp_error=None, dp_script_exception=None),
 ScoringResult(entry=DatasetTextEntry(id=3, kind='phrase', text='is this item in stock?'), dandelion_result={'time': 0, 'annotations': [], 'lang': 'en', 'timestamp': '2022-07-07T16:49:48.296'}, dandelion_error

In [9]:
# One row per scored entry; rows missing either service's result are dropped.
pd.DataFrame(s.dict() for s in scores).dropna(axis=0, subset=["dandelion_result", "dp_result"])

Unnamed: 0,entry,dandelion_result,dandelion_error,dandelion_script_exception,dp_result,dp_error,dp_script_exception
0,"{'id': 1, 'kind': 'phrase', 'text': 'sure!'}","{'time': 1, 'annotations': [], 'lang': 'en', '...",,,"{'annotations': [], 'unlisted_annotations': []...",,
1,"{'id': 2, 'kind': 'phrase', 'text': 'have a go...","{'time': 1, 'annotations': [], 'lang': 'en', '...",,,"{'annotations': [], 'unlisted_annotations': []...",,
2,"{'id': 3, 'kind': 'phrase', 'text': 'is this i...","{'time': 0, 'annotations': [], 'lang': 'en', '...",,,"{'annotations': [], 'unlisted_annotations': []...",,
3,"{'id': 4, 'kind': 'phrase', 'text': 'my bad'}","{'time': 0, 'annotations': [], 'lang': 'en', '...",,,"{'annotations': [], 'unlisted_annotations': []...",,
4,"{'id': 5, 'kind': 'word', 'text': 'at'}","{'time': 0, 'annotations': [], 'lang': 'en', '...",,,"{'annotations': [], 'unlisted_annotations': []...",,
5,"{'id': 6, 'kind': 'word', 'text': 'it's'}","{'time': 1, 'annotations': [], 'lang': 'en', '...",,,"{'annotations': [], 'unlisted_annotations': []...",,
6,"{'id': 7, 'kind': 'word', 'text': 'space'}","{'time': 0, 'annotations': [], 'lang': 'en', '...",,,"{'annotations': [], 'unlisted_annotations': []...",,
7,"{'id': 8, 'kind': 'word', 'text': 'death'}","{'time': 0, 'annotations': [], 'lang': 'en', '...",,,"{'annotations': [], 'unlisted_annotations': []...",,
8,"{'id': 9, 'kind': 'word', 'text': 'come on'}","{'time': 1, 'annotations': [], 'lang': 'en', '...",,,"{'annotations': [], 'unlisted_annotations': []...",,
9,"{'id': 10, 'kind': 'word', 'text': 'musk?'}","{'time': 1, 'annotations': [], 'lang': 'en', '...",,,"{'annotations': [], 'unlisted_annotations': []...",,
