In [1]:
from datetime import datetime
import os

import dask.dataframe as dd
from dask.distributed import Client
from nltk.tokenize import word_tokenize
from pyprojroot import here

In [2]:
def find_set_in_list(sent_set, terms):
    """Determine which search terms occur within at least one sentence.

    A term matches a sentence when every whitespace-separated word of the
    term is present among that sentence's tokens. Word order and adjacency
    are not checked.

    Parameters
    ----------
    sent_set : iterable of iterables of str
        One collection of tokens per sentence (sets or lists both work).
    terms : iterable of str
        Search terms; multi-word terms are split on whitespace.

    Returns
    -------
    dict
        Maps each matching term (exactly as given) to ``True``. Terms with
        no matching sentence are omitted.
    """
    matches = {}
    for trm in terms:
        # split() with no argument collapses repeated/leading/trailing
        # whitespace, so "a  b" does not yield an empty-string token that
        # could never be found in a sentence.
        trm_set = set(trm.split())
        if not trm_set:
            # A blank term has no words to look for; without this guard the
            # empty set would count as a subset of every sentence.
            continue
        # any() stops at the first sentence containing all words of the term.
        if any(trm_set.issubset(st) for st in sent_set):
            matches[trm] = True
    return matches


# Sanity check: terms whose words all appear in one sentence are reported,
# regardless of word order; unmatched terms ('tom nook') are left out.
test_sent = ['dan hello', 'hello you', 'the quick brown fox', 'my I have a word?']
test_sent_set = list(map(set, map(word_tokenize, test_sent)))
test_terms = ['hello dan', 'fox', 'tom nook', 'word']
expected_matches = {'hello dan': True, 'fox': True, 'word': True}
assert find_set_in_list(test_sent_set, test_terms) == expected_matches

In [3]:
# Phrases to search for in each paper's sentences, grouped by theme.
search_terms = (
    # timing of infection
    ["incubation period", "infectiousness period"]
    # outcomes
    + ["recovery rate"]
    + ["case fatality ratio", "case fatality rate"]
    # asymptomatic share
    + ["asymptomatic fraction", "asymptomatic proportion", "asymptomatic ratio"]
    # hospitalized share
    + ["hospitalized fraction", "hospitalized proportion"]
    + ["latent period"]
)

In [4]:
# Start a Dask client with 6 single-threaded workers and a 1GB memory limit;
# the bare expression on the last line displays the client summary.
client = Client(
    n_workers=6,
    threads_per_worker=1,
    memory_limit='1GB',
)
client

0,1
Client  Scheduler: tcp://127.0.0.1:49580  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 6  Cores: 6  Memory: 6.00 GB


In [5]:
# Location of the tokenized-sentence output, resolved from the project root,
# and a glob pattern covering its "*.part" files.
files = str(
    here("./data/db/working/kaggle/id_model_inputs/01-03-tokenize_sentences.json.gzip")
)
pth = os.path.join(files, "*.part")

In [6]:
# Load the gzip-compressed JSON partitions matched by `pth` into a Dask DataFrame.
paper_df = dd.read_json(pth, compression="gzip")

In [7]:
# Preview the first rows to confirm the expected columns (including `sent_set`) loaded.
paper_df.head()

Unnamed: 0,pid,num_authors,title,text,text_sent_lower,sent_set
0,PMC1054884,7,Recombination Every Day: Abundant Recombinatio...,As increasing numbers of full-length viral seq...,[as increasing numbers of full-length viral se...,"[[or, sequences, frequently, more, recombinant..."
1,PMC1065028,1,Why can't I visit? The ethics of visitation re...,The sudden emergence of severe acute respirato...,[the sudden emergence of severe acute respirat...,"[[sars, much, april, (, severe, emergence, in,..."
2,PMC1065064,8,Prospective evaluation of an internet-linked h...,The rate of expansion of medical knowledge is ...,[the rate of expansion of medical knowledge is...,"[[rapidly, frequently, difficult, to, new, kno..."
3,PMC1065120,4,Scanning the horizon: emerging hospital-wide t...,This series of articles provides regular surve...,[this series of articles provides regular surv...,"[[., regular, care, provides, articles, techno..."
4,PMC1065257,3,Characterization of the frameshift signal of E...,Programmed −1 ribosomal frameshifting (hereaft...,[programmed −1 ribosomal frameshifting (hereaf...,"[[particular, quantities, frameshifting, progr..."


In [8]:
# For every paper, record which search terms appear in at least one of its
# sentences. The work is applied partition by partition, calling
# find_set_in_list once per row of `sent_set`.
paper_df["found_terms"] = paper_df["sent_set"].map_partitions(
    lambda partition: partition.apply(find_set_in_list, terms=search_terms),
    meta="object",
)

In [9]:
# Write the frame (now including `found_terms`) as gzip-compressed JSON and
# report how long the write took -- about 6 minutes.
out_path = str(here("./data/db/working/kaggle/id_model_inputs/01-04-found_terms.json.gzip", warn=False))

start_time = datetime.now()
paper_df.to_json(out_path, compression="gzip")
end_time = datetime.now()

print(f'Duration: {end_time - start_time}')

Duration: 0:05:39.710007


In [10]:
# Shut down the Dask client started earlier now that the output has been written.
client.shutdown()