In [1]:
import pandas as pd

In [2]:
# from textprofilerbackend.transform import get_word_tokens_batch, get_byte_encoding_batch
# from textprofilerbackend.textclean import get_textcol_metadata_embeddings

In [3]:
# Load the VIS papers table from the processed/ data folder. The path is
# relative, so it resolves against the notebook's working directory.
vis_papers = pd.read_parquet(
    "../datasets/local/vis_papers/processed/vis_papers.parquet"
)

In [4]:
from nltk.tokenize import TreebankWordTokenizer as twt
from concurrent.futures import ThreadPoolExecutor


# One shared TreebankWordTokenizer instance; get_words_w_span reads this
# module-level name for every text it tokenizes.
tokenizer = twt()


def get_words_w_span(text):
    """Tokenize ``text`` and return every token with its character span.

    Relies on the module-level ``tokenizer`` and its ``span_tokenize``
    method, so the offsets index directly into ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        pandas.DataFrame with one row per token and the columns
        ``span_start``, ``span_end`` and ``word``, where
        ``word == text[span_start:span_end]``. When no token is produced the
        frame is empty but still has all three columns.
    """
    # Materialize the spans once so they can feed both the frame and the
    # word slices (span_tokenize may hand back a one-shot iterator).
    spans = list(tokenizer.span_tokenize(text))
    _df = pd.DataFrame(spans, columns=["span_start", "span_end"])

    # Slice the words directly instead of a row-wise DataFrame.apply: this
    # avoids building a Series per row, and it also works when there are no
    # tokens (apply(axis=1) on an empty frame returns a DataFrame, which
    # cannot be assigned to the single "word" column).
    _df["word"] = [text[start:end] for start, end in spans]

    return _df


def get_words_w_span_batch(arr: list[str], num_threads=8) -> list[list[str]]:
    """Split array of strings into word tokens using nltk.
    Returns: 2d array of strings"""
    with ThreadPoolExecutor(num_threads) as executor:
        return list(executor.map(get_words_w_span, arr))

In [5]:
# Keep only the abstracts that are present. The index is reset so that the
# surviving abstracts are numbered 0..n-1 by position.
abstracts = vis_papers["Abstract"]
abstracts_non_null = abstracts.dropna().reset_index(drop=True)

In [6]:
# Spot-check the tokenizer output on the first abstract.
get_words_w_span(abstracts_non_null.iloc[0])

Unnamed: 0,span_start,span_end,word
0,0,13,Accessibility
1,14,24,guidelines
2,25,30,place
3,31,43,restrictions
4,44,46,on
...,...,...,...
297,2084,2086,at
298,2087,2092,https
299,2092,2093,:
300,2093,2108,//osf.io/8kzmg/


In [7]:
# Tokenize every non-null abstract; r holds one span DataFrame per abstract,
# in the same order as abstracts_non_null.
r = get_words_w_span_batch(abstracts_non_null)

In [9]:
# Tag each per-abstract frame with its position in r so that rows remain
# attributable to their source text after concatenation.
# NOTE(review): this id is a positional index into the list that was
# tokenized; it is not read from vis_papers' own "id" column -- confirm the
# two line up before joining on it.
for i, _mydf in enumerate(r):
    _mydf["id"] = i

In [14]:
# Stack the per-text frames into one long table with a fresh 0..n-1 index.
all_results = pd.concat(r, ignore_index=True)

In [16]:
# Persist the word/span table next to the source parquet file.
all_results.to_parquet(
    "../datasets/local/vis_papers/processed/vis_papers_words_span.parquet"
)

In [13]:
# Preview of the concatenated table. This is the same expression that is
# assigned to all_results, recomputed here only for display.
pd.concat(r, ignore_index=True)

Unnamed: 0,span_start,span_end,word,id
0,0,13,Accessibility,0
1,14,24,guidelines,0
2,25,30,place,0
3,31,43,restrictions,0
4,44,46,on,0
...,...,...,...,...
613515,942,944,gt,3548
613516,944,945,;,3548
613517,945,946,&,3548
613518,946,948,gt,3548


In [18]:
# Rows of vis_papers whose Abstract is present; the original index labels
# are kept (no reset).
vis_papers_w_ab = vis_papers.loc[vis_papers["Abstract"].notnull()]

In [19]:
# List the available columns before choosing which text columns to index.
vis_papers_w_ab.columns

Index(['id', 'Conference', 'Year', 'Title', 'DOI', 'Link', 'FirstPage',
       'LastPage', 'PaperType', 'Abstract', 'AuthorNames-Deduped',
       'AuthorNames', 'AuthorAffiliation', 'InternalReferences',
       'AuthorKeywords', 'AminerCitationCount', 'CitationCount_CrossRef',
       'PubsCited_CrossRef', 'Award', 'Abstract_text_length',
       'Abstract_num_words', 'Abstract_max_word_length',
       'Abstract_avg_word_length', 'Abstract_perc_special_chars'],
      dtype='object')

In [20]:
# Narrow the table to the three text columns that get span-indexed.
df = vis_papers_w_ab.loc[
    :, ["Abstract", "AuthorNames-Deduped", "AuthorKeywords"]
]

In [21]:
# Display the selected text columns.
df

Unnamed: 0,Abstract,AuthorNames-Deduped,AuthorKeywords
0,Accessibility guidelines place restrictions on...,Laura South;Michelle Borkin,"accessibility,photosensitive epilepsy,photosen..."
1,Horizontal federated learning (HFL) enables di...,Xumeng Wang;Wei Chen 0001;Jiazhi Xia;Zhen Wen;...,"Federated learning,data heterogeneity,cluster ..."
2,"We present Rigel, an interactive system for ra...",Ran Chen;Di Weng;Yanwei Huang;Xinhuan Shu;Jiay...,"Data transformation,self-service data transfor..."
3,We developed and validated a rating scale to a...,Tingying He;Petra Isenberg;Raimund Dachselt;To...,"Aesthetics,aesthetic pleasure,validated scale,..."
4,The success of DL can be attributed to hours o...,Anjul Tyagi;Cong Xie;Klaus Mueller 0001,"Deep Learning,Neural Network Architecture Sear..."
...,...,...,...
3544,The author applied image processing and volume...,Nahum D. Gershon,
3545,The problem of presenting and gaining deeper u...,Wayne E. Fordyce;Jeffrey Ventrella,
3546,A hierarchical triangulation built from a digi...,Lori L. Scarlatos;Theodosios Pavlidis,
3547,A methodology for guiding the choice of visual...,Philip K. Robertson,


In [51]:
def parseCol(item, sep=";"):
    """Split a delimited string and record each piece's character span.

    Args:
        item: The delimited string (for example ``"Ann Lee;Bo"``). Anything
            that is not a non-empty ``str`` (``None``, ``NaN``, ``""``) is
            treated as missing.
        sep: Separator between pieces. Defaults to ``";"``.

    Returns:
        ``None`` for a missing ``item``; otherwise a pandas.DataFrame with one
        row per piece and the columns ``span_start``, ``span_end`` and
        ``item``, such that ``original[span_start:span_end] == piece``.
    """
    # A plain truthiness test lets float NaN through (NaN is truthy) and then
    # fails on .split, so check the type explicitly.
    if not isinstance(item, str) or not item:
        return None

    parts = item.split(sep)
    spans = []
    start = 0
    for part in parts:
        end = start + len(part)
        spans.append((start, end))
        # Skip over the separator to reach the start of the next piece.
        start = end + len(sep)

    _df = pd.DataFrame(spans, columns=["span_start", "span_end"])
    _df["item"] = parts

    return _df


def parseCol2(item, sep=","):
    """Split a delimited string and record each piece's character span.

    Same contract as ``parseCol`` but with a comma as the default separator.

    Args:
        item: The delimited string (for example ``"a b,c"``). Anything that
            is not a non-empty ``str`` (``None``, ``NaN``, ``""``) is treated
            as missing.
        sep: Separator between pieces. Defaults to ``","``.

    Returns:
        ``None`` for a missing ``item``; otherwise a pandas.DataFrame with one
        row per piece and the columns ``span_start``, ``span_end`` and
        ``item``, such that ``original[span_start:span_end] == piece``.
    """
    # A plain truthiness test lets float NaN through (NaN is truthy) and then
    # fails on .split, so check the type explicitly.
    if not isinstance(item, str) or not item:
        return None

    parts = item.split(sep)
    spans = []
    start = 0
    for part in parts:
        end = start + len(part)
        spans.append((start, end))
        # Skip over the separator to reach the start of the next piece.
        start = end + len(sep)

    _df = pd.DataFrame(spans, columns=["span_start", "span_end"])
    _df["item"] = parts

    return _df

In [36]:
# Single-threaded variant: yields a Series holding one span frame (or None)
# per row. The apply_batch call in a later cell reassigns r with a list.
r = df["AuthorNames-Deduped"].apply(parseCol)

In [38]:
def apply_batch(arr, func, num_threads=8):
    """Map ``func`` over ``arr`` on a thread pool and collect the results.

    Args:
        arr: Iterable of inputs.
        func: Callable applied to each element of ``arr``.
        num_threads: Maximum number of worker threads.

    Returns:
        A list of ``func(x)`` for each ``x`` in ``arr``, in input order.
    """
    pool = ThreadPoolExecutor(num_threads)
    with pool:
        outputs = [result for result in pool.map(func, arr)]
    return outputs

In [39]:
# Split every author string into per-author spans; r is a list with one
# frame (or None for a missing value) per row of df, in row order.
r = apply_batch(df["AuthorNames-Deduped"], parseCol)

In [41]:
# Tag each frame with its position in r. None entries are skipped but still
# consume a position, so ids stay aligned with the input order.
# NOTE(review): id is positional, not vis_papers' own "id" column -- confirm
# before joining on it.
for i, _mydf in enumerate(r):
    if _mydf is not None:
        _mydf["id"] = i

In [48]:
# Stack the per-row author frames into one table. pd.concat silently drops
# None entries (and raises if every entry is None).
authors = pd.concat(r, ignore_index=True)

In [47]:
# Inspect the rows that have no author keywords.
df[df["AuthorKeywords"].isna()]

Unnamed: 0,Abstract,AuthorNames-Deduped,AuthorKeywords
163,"In theory, efficient and high-quality renderin...",Ingo Wald;Nathan Morrical;Stefan Zellmann,
311,"In the last two decades, interactive visualiza...",Leilani Battle;Carlos Scheidegger,
367,Existing interactive visualization tools for d...,Xinyi Huang;Suphanut Jamonnak;Ye Zhao 0003;Boy...,
730,Textual criticism consists of the identificati...,Stefan Jänicke;David Joseph Wrisley,
741,Subspace analysis methods have gained interest...,Dominik Jäckle;Michael Blumenschein;Michael Be...,
...,...,...,...
3544,The author applied image processing and volume...,Nahum D. Gershon,
3545,The problem of presenting and gaining deeper u...,Wayne E. Fordyce;Jeffrey Ventrella,
3546,A hierarchical triangulation built from a digi...,Lori L. Scarlatos;Theodosios Pavlidis,
3547,A methodology for guiding the choice of visual...,Philip K. Robertson,


In [50]:
# Persist the author/span table next to the source parquet file.
authors.to_parquet(
    "../datasets/local/vis_papers/processed/vis_papers_authors_span.parquet"
)

In [52]:
# Split every keyword string on commas; r2 is a list with one frame (or None
# for a missing value) per row of df, in row order.
r2 = apply_batch(df["AuthorKeywords"], parseCol2)

In [54]:
# Tag each frame with its position in r2. None entries are skipped but still
# consume a position, so ids stay aligned with the input order.
# NOTE(review): id is positional, not vis_papers' own "id" column -- confirm
# before joining on it.
for i, _mydf in enumerate(r2):
    if _mydf is not None:
        _mydf["id"] = i

In [57]:
# Stack the per-row keyword frames into one table. pd.concat silently drops
# None entries (and raises if every entry is None).
keywords = pd.concat(r2, ignore_index=True)

In [67]:
# Persist the keyword/span table.
# NOTE(review): this goes to processed/w_span/, whereas the word and author
# span tables are written directly under processed/ -- confirm which
# location is intended and that the w_span directory exists.
keywords.to_parquet(
    "../datasets/local/vis_papers/processed/w_span/vis_papers_keywords_span.parquet"
)

In [69]:
# Display the final keyword/span table.
keywords

Unnamed: 0,span_start,span_end,item,id
0,0,13,accessibility,0
1,14,37,photosensitive epilepsy,0
2,38,54,photosensitivity,0
3,55,66,interaction,0
4,67,85,data visualization,0
...,...,...,...,...
11818,0,23,Grammar-directed design,3462
11819,24,56,cooperative design and modeling,3462
11820,57,75,design automation,3462
11821,76,103,human-computer interaction,3462
