In [15]:
import pandas as pd
import requests
from tqdm import tqdm

import dandelion

In [16]:
!wget https://files.deeppavlov.ai/deeppavlov_data/entity_linking/test_dataset.pickle

--2022-06-27 20:53:31--  https://files.deeppavlov.ai/deeppavlov_data/entity_linking/test_dataset.pickle
Resolving files.deeppavlov.ai (files.deeppavlov.ai)... 178.63.27.41
Connecting to files.deeppavlov.ai (files.deeppavlov.ai)|178.63.27.41|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17668715 (17M) [application/octet-stream]
Saving to: ‘test_dataset.pickle’


2022-06-27 20:53:33 (9.26 MB/s) - ‘test_dataset.pickle’ saved [17668715/17668715]



In [17]:
import pickle

with open("test_dataset.pickle", "rb") as dataset_f:
    dataset = pickle.load(dataset_f)

full_dataset = []
for kind, value_list in dataset.items():
    data_rows = [{"kind": kind, "full_text": value} for value in value_list]
    full_dataset += data_rows

dataset_df = pd.DataFrame(full_dataset)
slice_size = 3500
max_text_len = dataset_df["full_text"].str.len().max()
max_slices = max_text_len // slice_size

for s_idx in range(max_slices):
    start = s_idx * slice_size
    stop = start + slice_size
    dataset_df[f"text_{s_idx}"] = dataset_df["full_text"].str.slice(start, stop)

dataset_df = dataset_df.melt(id_vars=["kind", "full_text"], value_name="text")
dataset_df.replace("", float("NaN"), inplace=True)
dataset_df.dropna(subset=["text"], inplace=True)
dataset_df.reset_index(inplace=True)
dataset_df[["kind", "text"]].to_csv("test_dataset.csv", sep="|", index=False)

In [18]:
!rm test_dataset.pickle

In [19]:
dataset_df = pd.read_csv("test_dataset.csv", sep="|")
dataset_df

Unnamed: 0,kind,text
0,news_texts,Chicago church embraces ‘The Gospel According ...
1,news_texts,Opinion | Endorsements for Montgomery County C...
2,news_texts,"Arch Manning, nephew of Peyton and Eli Manning..."
3,news_texts,"NBA draft winners and losers: Banchero, Smith ..."
4,news_texts,"Gaetz, Brooks, Biggs, Gohmert, Perry accused o..."
...,...,...
6101,blog_texts,shing minerality on the palate and blossom sce...
6102,blog_texts,he QR code located on the site or on any 2020 ...
6103,blog_texts,baker’s chocolate with a hint of acidity.Fresh...
6104,blog_texts,egan with eco-friendly packaging — sustainable...


In [20]:
def run_dandelion(df, fresh_start=True):
    results = []

    with requests.Session() as sess:
        for idx, row in tqdm(df.iterrows(), total=len(df)):
            if not pd.isna(row.get("dandelion_results", float("NaN"))):
                continue

            data = error = exception = float("NaN")
            try:
                result = dandelion.extract_dandelion(sess, row["text"])
                if result.get("error"):
                    error = result
                    if error.get("code") in ["error.authenticationError", "error.forbiddenError"]:
                        print("Exceeded API token limit, come back tomorrow!")
                        break
                else:
                    data = result
            except Exception as e:
                exception = f"{type(e)}: {e}"
            results.append({"index": idx, "dandelion_results": data, "dandelion_error": error, "script_exception": exception})

    if results:
        results_df = pd.DataFrame(results)
        results_df.set_index("index", inplace=True)
        if fresh_start:
            results_df = pd.merge(df, results_df, left_index=True, right_index=True)
        else:
            df.update(results_df, overwrite=False)
            results_df = df

        results_df.to_csv("test_dataset_dandelion.csv", sep="|", index=False)
    else:
        results_df = df

    return results_df


In [27]:
# Fresh run
dandelion_results_df = run_dandelion(dataset_df, fresh_start=True)
dandelion_results_df


  0%|          | 0/6106 [00:00<?, ?it/s][A

Exceeded API token limit, come back tomorrow!





Unnamed: 0,kind,text
0,news_texts,Chicago church embraces ‘The Gospel According ...
1,news_texts,Opinion | Endorsements for Montgomery County C...
2,news_texts,"Arch Manning, nephew of Peyton and Eli Manning..."
3,news_texts,"NBA draft winners and losers: Banchero, Smith ..."
4,news_texts,"Gaetz, Brooks, Biggs, Gohmert, Perry accused o..."
...,...,...
6101,blog_texts,shing minerality on the palate and blossom sce...
6102,blog_texts,he QR code located on the site or on any 2020 ...
6103,blog_texts,baker’s chocolate with a hint of acidity.Fresh...
6104,blog_texts,egan with eco-friendly packaging — sustainable...


In [26]:
# Run again using dandelion_results_df to update missing results
dandelion_results_df = pd.read_csv("test_dataset_dandelion.csv", sep="|")
dandelion_results_df = run_dandelion(dandelion_results_df, fresh_start=False)
dandelion_results_df


  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [00:00<00:02,  3.15it/s][A
 33%|███▎      | 3/9 [00:00<00:01,  4.05it/s][A
 56%|█████▌    | 5/9 [00:00<00:00,  5.05it/s][A
 78%|███████▊  | 7/9 [00:00<00:00,  6.09it/s][A
100%|██████████| 9/9 [00:00<00:00,  9.07it/s][A


Unnamed: 0,kind,text,dandelion_results_x,dandelion_error_x,script_exception_x,dandelion_results_y,dandelion_error_y,script_exception_y
0,news_texts,Bavarian castle hosting G-7 was Nazi vacation ...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
1,news_texts,Warzone Season 2 Reloaded loadout tips: A guid...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
2,news_texts,All-Met history - The Washington Post Accessib...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
3,news_texts,'Elden Ring' and 'Lost Ark' tackle fantasy RPG...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
4,news_texts,Outlook http://www.washingtonpost.com/pb/outl...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
5,news_texts,Bavarian castle hosting G-7 was Nazi vacation ...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
6,news_texts,"Allstate, Progressive drop Maine insurance fir...",,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
7,news_texts,Mishael Morgan is 1st Black lead winner at Day...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
8,blog_texts,Why Everything is Suddenly Getting More Expens...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",


In [23]:
df1 = pd.read_csv("test_dataset_dandelion_1.csv", sep="|")
df1

Unnamed: 0,kind,text,dandelion_results,dandelion_error,script_exception
0,news_texts,Chicago church embraces ‘The Gospel According ...,"{'time': 96, 'annotations': [{'start': 0, 'end...",,
1,news_texts,Opinion | Endorsements for Montgomery County C...,"{'time': 92, 'annotations': [{'start': 27, 'en...",,
2,news_texts,"Arch Manning, nephew of Peyton and Eli Manning...","{'time': 137, 'annotations': [{'start': 0, 'en...",,
3,news_texts,"NBA draft winners and losers: Banchero, Smith ...","{'time': 119, 'annotations': [{'start': 30, 'e...",,
4,news_texts,"Gaetz, Brooks, Biggs, Gohmert, Perry accused o...","{'time': 148, 'annotations': [{'start': 0, 'en...",,
...,...,...,...,...,...
6101,blog_texts,shing minerality on the palate and blossom sce...,,"{'error': True, 'status': 403, 'code': 'error....",
6102,blog_texts,he QR code located on the site or on any 2020 ...,,"{'error': True, 'status': 403, 'code': 'error....",
6103,blog_texts,baker’s chocolate with a hint of acidity.Fresh...,,"{'error': True, 'status': 403, 'code': 'error....",
6104,blog_texts,egan with eco-friendly packaging — sustainable...,,"{'error': True, 'status': 403, 'code': 'error....",


In [24]:
dandelion_results_df

Unnamed: 0,kind,text,dandelion_results_x,dandelion_error_x,script_exception_x,dandelion_results_y,dandelion_error_y,script_exception_y
0,news_texts,Bavarian castle hosting G-7 was Nazi vacation ...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
1,news_texts,Warzone Season 2 Reloaded loadout tips: A guid...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
2,news_texts,All-Met history - The Washington Post Accessib...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
3,news_texts,'Elden Ring' and 'Lost Ark' tackle fantasy RPG...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
4,news_texts,Outlook http://www.washingtonpost.com/pb/outl...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
5,news_texts,Bavarian castle hosting G-7 was Nazi vacation ...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
6,news_texts,"Allstate, Progressive drop Maine insurance fir...",,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
7,news_texts,Mishael Morgan is 1st Black lead winner at Day...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
8,blog_texts,Why Everything is Suddenly Getting More Expens...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",


In [25]:
pd.merge(df1, dandelion_results_df, left_index=True, right_index=True)

Unnamed: 0,kind_x,text_x,dandelion_results,dandelion_error,script_exception,kind_y,text_y,dandelion_results_x,dandelion_error_x,script_exception_x,dandelion_results_y,dandelion_error_y,script_exception_y
0,news_texts,Chicago church embraces ‘The Gospel According ...,"{'time': 96, 'annotations': [{'start': 0, 'end...",,,news_texts,Bavarian castle hosting G-7 was Nazi vacation ...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
1,news_texts,Opinion | Endorsements for Montgomery County C...,"{'time': 92, 'annotations': [{'start': 27, 'en...",,,news_texts,Warzone Season 2 Reloaded loadout tips: A guid...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
2,news_texts,"Arch Manning, nephew of Peyton and Eli Manning...","{'time': 137, 'annotations': [{'start': 0, 'en...",,,news_texts,All-Met history - The Washington Post Accessib...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
3,news_texts,"NBA draft winners and losers: Banchero, Smith ...","{'time': 119, 'annotations': [{'start': 30, 'e...",,,news_texts,'Elden Ring' and 'Lost Ark' tackle fantasy RPG...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
4,news_texts,"Gaetz, Brooks, Biggs, Gohmert, Perry accused o...","{'time': 148, 'annotations': [{'start': 0, 'en...",,,news_texts,Outlook http://www.washingtonpost.com/pb/outl...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
5,news_texts,Opinion | Coming not soon: ‘Top Drone’ - The W...,"{'time': 101, 'annotations': [{'start': 41, 'e...",,,news_texts,Bavarian castle hosting G-7 was Nazi vacation ...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
6,news_texts,Submit an Op-Ed - The Washington Post ...,"{'time': 18, 'annotations': [{'start': 10, 'en...",,,news_texts,"Allstate, Progressive drop Maine insurance fir...",,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
7,news_texts,These Are the Gun Measures Advancing in US Con...,"{'time': 91, 'annotations': [{'start': 40, 'en...",,,news_texts,Mishael Morgan is 1st Black lead winner at Day...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
8,news_texts,Opinion | ‘Top Gun: Maverick’ has a female pil...,"{'time': 121, 'annotations': [{'start': 11, 'e...",,,blog_texts,Why Everything is Suddenly Getting More Expens...,,"{'error': True, 'status': 414, 'code': 'error....",,,"{'error': True, 'status': 414, 'code': 'error....",
