In [1]:
import pandas as pd
import requests
from tqdm.notebook import tqdm

import dandelion



In [2]:
!wget https://files.deeppavlov.ai/deeppavlov_data/entity_linking/test_dataset.pickle

--2022-06-28 12:58:57--  https://files.deeppavlov.ai/deeppavlov_data/entity_linking/test_dataset.pickle
Resolving files.deeppavlov.ai (files.deeppavlov.ai)... 178.63.27.41
Connecting to files.deeppavlov.ai (files.deeppavlov.ai)|178.63.27.41|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17668715 (17M) [application/octet-stream]
Saving to: ‘test_dataset.pickle’


2022-06-28 12:58:59 (9.37 MB/s) - ‘test_dataset.pickle’ saved [17668715/17668715]



## Download pickled dataset and prepare it for scoring

In [3]:
import pickle

with open("test_dataset.pickle", "rb") as dataset_f:
    dataset = pickle.load(dataset_f)

full_dataset = []
for kind, value_list in dataset.items():
    data_rows = [{"kind": kind, "full_text": value} for value in value_list]
    full_dataset += data_rows

dataset_df = pd.DataFrame(full_dataset)
slice_size = 3500
max_text_len = dataset_df["full_text"].str.len().max()
max_slices = max_text_len // slice_size

for s_idx in range(max_slices):
    start = s_idx * slice_size
    stop = start + slice_size
    dataset_df[f"text_{s_idx}"] = dataset_df["full_text"].str.slice(start, stop)

dataset_df = dataset_df.melt(id_vars=["kind", "full_text"], value_name="text")
dataset_df.replace("", float("NaN"), inplace=True)
dataset_df.dropna(subset=["text"], inplace=True)
dataset_df.reset_index(inplace=True)
dataset_df[["kind", "text"]].to_csv("test_dataset.csv", sep="|", index=False)

In [4]:
!rm test_dataset.pickle

In [5]:
dataset_df = pd.read_csv("test_dataset.csv", sep="|")
dataset_df

Unnamed: 0,kind,text
0,news_texts,Chicago church embraces ‘The Gospel According ...
1,news_texts,Opinion | Endorsements for Montgomery County C...
2,news_texts,"Arch Manning, nephew of Peyton and Eli Manning..."
3,news_texts,"NBA draft winners and losers: Banchero, Smith ..."
4,news_texts,"Gaetz, Brooks, Biggs, Gohmert, Perry accused o..."
...,...,...
6101,blog_texts,shing minerality on the palate and blossom sce...
6102,blog_texts,he QR code located on the site or on any 2020 ...
6103,blog_texts,baker’s chocolate with a hint of acidity.Fresh...
6104,blog_texts,egan with eco-friendly packaging — sustainable...


In [6]:
def run_dandelion(df, fresh_start=True):
    results = []

    with requests.Session() as sess:
        for idx, row in tqdm(df.iterrows(), total=len(df)):
            if not pd.isna(row.get("dandelion_results", float("NaN"))):
                continue

            data = error = exception = float("NaN")
            try:
                result = dandelion.extract_dandelion(sess, row["text"])
                if result.error:
                    error = result.error
                    if error.get("code") in ["error.authenticationError", "error.forbiddenError"]:
                        print("Exceeded API token limit, come back tomorrow!")
                        break
                else:
                    data = result.data
            except Exception as e:
                exception = f"{type(e)}: {e}"
            results.append({"index": idx, "dandelion_results": data, "dandelion_error": error, "dandelion_script_exception": exception})

    if results:
        results_df = pd.DataFrame(results)
        results_df.set_index("index", inplace=True)
        if fresh_start:
            results_df = pd.merge(df, results_df, left_index=True, right_index=True, how="left")
        else:
            df.update(results_df, overwrite=False)
            results_df = df

        results_df.to_csv("test_dataset_dandelion.csv", sep="|", index=False)
    else:
        results_df = df

    return results_df


## Fresh run
Execute this cell only if you don't have a test_dataset_dandelion.csv yet

In [9]:
# Fresh run
dandelion_results_df = run_dandelion(dataset_df, fresh_start=True)
dandelion_results_df

  utils.DeprecatedIn35,
 18%|█▊        | 1088/6106 [04:26<15:29,  5.40it/s]  

Exceeded API token limit, come back tomorrow!


Unnamed: 0,kind,text,dandelion_results,dandelion_error,script_exception
0,news_texts,Chicago church embraces ‘The Gospel According ...,"{'time': 100, 'annotations': [{'start': 0, 'en...",,
1,news_texts,Opinion | Endorsements for Montgomery County C...,"{'time': 92, 'annotations': [{'start': 27, 'en...",,
2,news_texts,"Arch Manning, nephew of Peyton and Eli Manning...","{'time': 138, 'annotations': [{'start': 0, 'en...",,
3,news_texts,"NBA draft winners and losers: Banchero, Smith ...","{'time': 119, 'annotations': [{'start': 30, 'e...",,
4,news_texts,"Gaetz, Brooks, Biggs, Gohmert, Perry accused o...","{'time': 157, 'annotations': [{'start': 0, 'en...",,
...,...,...,...,...,...
6101,blog_texts,shing minerality on the palate and blossom sce...,,,
6102,blog_texts,he QR code located on the site or on any 2020 ...,,,
6103,blog_texts,baker’s chocolate with a hint of acidity.Fresh...,,,
6104,blog_texts,egan with eco-friendly packaging — sustainable...,,,


## Consecutive runs
Run this if you already have a test_dataset_dandelion.csv from previous runs

In [7]:
# Run again using dandelion_results_df to update missing results
dandelion_results_df = pd.read_csv("test_dataset_dandelion.csv", sep="|")
dandelion_results_df = run_dandelion(dandelion_results_df, fresh_start=False)
dandelion_results_df

  utils.DeprecatedIn35,
 38%|███▊      | 2319/6106 [03:35<10:30,  6.01it/s]  

Exceeded API token limit, come back tomorrow!


Unnamed: 0,kind,text,dandelion_results,dandelion_error,script_exception
0,news_texts,Chicago church embraces ‘The Gospel According ...,"{'time': 100, 'annotations': [{'start': 0, 'en...",,
1,news_texts,Opinion | Endorsements for Montgomery County C...,"{'time': 92, 'annotations': [{'start': 27, 'en...",,
2,news_texts,"Arch Manning, nephew of Peyton and Eli Manning...","{'time': 138, 'annotations': [{'start': 0, 'en...",,
3,news_texts,"NBA draft winners and losers: Banchero, Smith ...","{'time': 119, 'annotations': [{'start': 30, 'e...",,
4,news_texts,"Gaetz, Brooks, Biggs, Gohmert, Perry accused o...","{'time': 157, 'annotations': [{'start': 0, 'en...",,
...,...,...,...,...,...
6101,blog_texts,shing minerality on the palate and blossom sce...,,,
6102,blog_texts,he QR code located on the site or on any 2020 ...,,,
6103,blog_texts,baker’s chocolate with a hint of acidity.Fresh...,,,
6104,blog_texts,egan with eco-friendly packaging — sustainable...,,,


In [33]:
def run_deeppavlov(df, url, slice_start=None, slice_stop=None):
    df_slice = df[slice_start:slice_stop]
    results = []

    with requests.Session() as sess:
        for idx, row in tqdm(df_slice.iterrows(), total=len(df_slice)):
            data = error = exception = float("NaN")
            try:
                response = sess.post(url, json={"text": row["text"]})
                result = response.json()
                if result.get("detail"):
                    error = result
                else:
                    data = result
            except Exception as e:
                exception = f"{type(e)}: {e}"
            results.append({"index": idx, "dp_results": data, "dp_error": error, "dp_script_exception": exception})

    # if results:
    results_df = pd.DataFrame(results)
    results_df.set_index("index", inplace=True)
    results_df = pd.merge(df, results_df, left_index=True, right_index=True, how="left")

    results_df.to_csv("test_dataset_dp.csv", sep="|", index=False)
    # else:
    #     results_df = df

    return results_df

In [34]:
dp_results_df = run_deeppavlov(dataset_df, "http://10.11.1.6:9999/", 0, 2000)
dp_results_df

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




Unnamed: 0,kind,text,dp_results,dp_error,script_exception
0,news_texts,Chicago church embraces ‘The Gospel According ...,,,<class 'json.decoder.JSONDecodeError'>: Expect...
1,news_texts,Opinion | Endorsements for Montgomery County C...,"{'annotations': [{'start': 27, 'end': 52, 'spo...",,
2,news_texts,"Arch Manning, nephew of Peyton and Eli Manning...","{'annotations': [{'start': 0, 'end': 12, 'spot...",,
3,news_texts,"NBA draft winners and losers: Banchero, Smith ...","{'annotations': [{'start': 0, 'end': 3, 'spot'...",,
4,news_texts,"Gaetz, Brooks, Biggs, Gohmert, Perry accused o...","{'annotations': [{'start': 56, 'end': 61, 'spo...",,
...,...,...,...,...,...
6101,blog_texts,shing minerality on the palate and blossom sce...,,,
6102,blog_texts,he QR code located on the site or on any 2020 ...,,,
6103,blog_texts,baker’s chocolate with a hint of acidity.Fresh...,,,
6104,blog_texts,egan with eco-friendly packaging — sustainable...,,,


## Merging results and comparing

In [28]:
import ast


def try_literal_eval(df_row):
    try:
        result = ast.literal_eval(df_row)
    except:
        result = float("NaN")
    return result

dandelion_results_df = pd.read_csv("test_dataset_dandelion.csv", sep="|", converters={"dandelion_results": try_literal_eval, "dandelion_error": try_literal_eval})
dp_results_df = pd.read_csv("test_dataset_dp.csv", sep="|", converters={"dp_results": try_literal_eval, "dp_error": try_literal_eval})

comparison_df = pd.merge(dandelion_results_df, dp_results_df, left_index=True, right_index=True, how="left")
comparison_df = comparison_df.dropna(subset=["dandelion_results", "dp_results"], how="any")
comparison_df

Unnamed: 0,kind_x,text_x,dandelion_results,dandelion_error,script_exception_x,kind_y,text_y,dp_results,dp_error,script_exception_y
1,news_texts,Opinion | Endorsements for Montgomery County C...,"{'time': 92, 'annotations': [{'start': 27, 'en...",,,news_texts,Opinion | Endorsements for Montgomery County C...,"{'annotations': [{'start': 27, 'end': 52, 'spo...",,
2,news_texts,"Arch Manning, nephew of Peyton and Eli Manning...","{'time': 138, 'annotations': [{'start': 0, 'en...",,,news_texts,"Arch Manning, nephew of Peyton and Eli Manning...","{'annotations': [{'start': 0, 'end': 12, 'spot...",,
3,news_texts,"NBA draft winners and losers: Banchero, Smith ...","{'time': 119, 'annotations': [{'start': 30, 'e...",,,news_texts,"NBA draft winners and losers: Banchero, Smith ...","{'annotations': [{'start': 0, 'end': 3, 'spot'...",,
4,news_texts,"Gaetz, Brooks, Biggs, Gohmert, Perry accused o...","{'time': 157, 'annotations': [{'start': 0, 'en...",,,news_texts,"Gaetz, Brooks, Biggs, Gohmert, Perry accused o...","{'annotations': [{'start': 56, 'end': 61, 'spo...",,
5,news_texts,Opinion | Coming not soon: ‘Top Drone’ - The W...,"{'time': 101, 'annotations': [{'start': 41, 'e...",,,news_texts,Opinion | Coming not soon: ‘Top Drone’ - The W...,"{'annotations': [{'start': 0, 'end': 7, 'spot'...",,
...,...,...,...,...,...,...,...,...,...,...
1994,news_texts,ese — is itself made of upcycled items. The fl...,"{'time': 110, 'annotations': [{'start': 24, 'e...",,,news_texts,ese — is itself made of upcycled items. The fl...,"{'annotations': [{'start': 287, 'end': 293, 's...",,
1995,news_texts,"an be used safely.”Timothy Ingalsbee, executiv...","{'time': 71, 'annotations': [{'start': 62, 'en...",,,news_texts,"an be used safely.”Timothy Ingalsbee, executiv...","{'annotations': [{'start': 19, 'end': 36, 'spo...",,
1996,news_texts,by the fact that he would not correspond to r...,"{'time': 81, 'annotations': [{'start': 8, 'end...",,,news_texts,by the fact that he would not correspond to r...,"{'annotations': [{'start': 114, 'end': 118, 's...",,
1997,news_texts,s into the room where the House select committ...,"{'time': 79, 'annotations': [{'start': 32, 'en...",,,news_texts,s into the room where the House select committ...,"{'annotations': [{'start': 26, 'end': 31, 'spo...",,


In [30]:
def sample_diff(df, n=10):
    for idx, row in df.sample(n=n, axis="index").iterrows():
        dandelion_substrings = {(ann["start"], ann["end"]): ann["spot"] for ann in row["dandelion_results"]["annotations"]}
        dp_substrings = {(ann["start"], ann["end"]): ann["spot"] for ann in row["dp_results"]["annotations"]}

        negative_diff = set(dandelion_substrings) - set(dp_substrings)
        positive_diff = set(dp_substrings) - set(dandelion_substrings)
        print(f"Entry idx={idx}")
        for nd in negative_diff:
            print(f"- {dandelion_substrings[nd]}")

        for pd in positive_diff:
            print(f"+ {dp_substrings[pd]}")

        print()

sample_diff(comparison_df, n=5)

Entry idx=947
- Market
- LTV
- payback
- health
- develop
- payback
- acquire
- gold standard
- stay here forever
- behavior
- premium
- customers
- vc
- growth
- efficient
- fundamentals
- product
- companies
- macro
- growth
- customers
- growth
- valuation
- margins
- Cash
- working capital
- invest
- public markets
- profit margins
- business
- business
- healthy
- company
- market, making
- capital
- marketing
- metrics
- metric
- Growth
- market forces
- efficiency
- ARPU
- Technological advancement
- influence
- companies
- margin
- cash
- cash
- ARPU
- acquisition
- gross
- profitability
- marketing
- economics
- ROI
- growth
- cash flow
- companies
- quadrant
- margins
- gross
- business
- quadrant
- benchmarks
- Payback
- competition
- payback
- Cash
- quadrant
- customers
- email
- growth
- margin
- companies
- customers
- payback
- business
- growth
- efficiency
- business
- Payback
- payback
- economics
- LTV
- consumer preferences
- payback
- profitability
- market forces

In [43]:
# Search for text for the given idx in comparison_df

comparison_df.loc[947]["text_x"]

'The Importance of CAC Payback in Today’s Market Environment | by Parsa Saljoughian | parsa.vc | Jun, 2022 | MediumGet unlimited accessOpen in appHomeNotificationsListsStoriesWritePublished inparsa.vcParsa SaljoughianFollowJun 8·10 min readThe Importance of CAC Payback in Today’s Market EnvironmentOver the last few years, LTV / CAC has become the gold standard metric to determine the growth and profitability potential of a direct-to-consumer business. Somewhere along the way, the concept of CAC payback got lost, but it has huge implications on a company’s ability to scale efficiently. In today’s evolving macro environment, specifically highlighted by the recent emphasis on cash flow and profitability, cash efficiency is becoming much more important. In this post I’ll: 1) share why long CAC payback times can be a silent killer to growth and efficiency, 2) identify the major market forces that can erode unit economics, 3) highlight key benchmarks for a healthy business, and 4) share case