In [1]:
from datetime import datetime
import os

import dask.dataframe as dd
from dask.distributed import Client
import janitor
from numpy import vectorize
import pandas as pd
from pyprojroot import here

In [2]:
def prefix_dict_keys(dictionary, prefix):
    """Return a new dict whose keys are rewritten as ``"<prefix>-<key>"``.

    Spaces inside each original key are replaced with underscores before the
    prefix is attached. Values are carried over unchanged and the input
    mapping is not modified.
    """
    renamed = {}
    for old_key, value in dictionary.items():
        new_key = f"{prefix}-{old_key.replace(' ', '_')}"
        renamed[new_key] = value
    return renamed

In [3]:
# Start a Dask distributed client with 3 single-threaded workers and a
# '2GB' memory limit (the rendered summary reports 6.00 GB across 3 workers).
client = Client(n_workers=3, threads_per_worker=1, memory_limit='2GB')
# Bare expression so the notebook renders the client/cluster summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:51060  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 3  Cores: 3  Memory: 6.00 GB


In [4]:
# Project-root-relative location of the previous step's output. It is treated
# as a directory here (despite the .json.gzip suffix): the glob below matches
# the "*.part" files inside it.
files = str(here("./data/db/working/kaggle/id_model_inputs/02-found_sentences.json.gzip"))
# Glob pattern handed to dd.read_json.
pth = os.path.join(files, "*.part")

In [5]:
# Build a Dask dataframe over the gzip-compressed JSON part files matched by pth.
found_terms = dd.read_json(pth, compression="gzip")

In [6]:
# len(found_terms)  # takes about 15 seconds with 6 cores and 1 GB of RAM

In [7]:
# Materialize the Dask dataframe into a single in-memory pandas DataFrame.
df = found_terms.compute() # about 1 minute

In [8]:
# The data is now in memory as a pandas frame, so the Dask cluster is no longer needed.
# NOTE(review): the recorded output of this cell shows a "Failed to reconnect to
# scheduler" error logged after the shutdown — confirm whether client.close()
# would be a cleaner teardown for this setup.
client.shutdown()

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


In [9]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,pid,num_authors,title,text,text_sent_lower,sent_set,found_terms,found_sent_idx,found_sent
0,PMC1074749,8,Species-independent detection of RNA virus by ...,Identifying the causative agent of an infectio...,[identifying the causative agent of an infecti...,"[[its, eventual, causative, for, control, ., i...",{'incubation period': True},{'incubation period': [72]},{'incubation period': [['after the mixture was...
1,PMC1090610,1,GIDEON: a comprehensive Web-based resource for...,"As of 2005, the world is confronted by 338 gen...","[as of 2005, the world is confronted by 338 ge...","[[generic, countries, 2005, scattered, as, in,...",{'incubation period': True},{'incubation period': [12]},{'incubation period': [['the diagnosis module ...
2,PMC1181873,4,Appropriate Models for the Management of Infec...,The past decade has seen a dramatic increase i...,[the past decade has seen a dramatic increase ...,"[[seen, increase, to, health, attached, in, si...",{'latent period': True},"{'latent period': [23, 27, 48, 49, 50, 51, 52,...",{'latent period': [['the effects of n on the d...
3,PMC1215526,1,Macrophages and cytokines in the early defence...,Virus-host interactions are crucial for the ou...,[virus-host interactions are crucial for the o...,"[[., for, interactions, the, infections, are, ...",{'asymptomatic proportion': True},{'asymptomatic proportion': [83]},{'asymptomatic proportion': [['seropositivity ...
4,PMC1247620,7,Understanding the Spatial Clustering of Severe...,We used spatial and nonspatial data in this st...,[we used spatial and nonspatial data in this s...,"[[this, data, ., spatial, used, we, in, nonspa...",{'incubation period': True},"{'incubation period': [23, 53]}",{'incubation period': [['a 5-day incubation pe...


In [10]:
# For every row, map each term to the number of sentence indices recorded for it.
df['found_count'] = df["found_sent_idx"].apply(
    lambda idx_by_term: {term: len(indices) for term, indices in idx_by_term.items()}
)

In [11]:
# Prefix the keys of every dict-valued column so that, once the dicts are
# pivoted into columns, each source column yields distinctly named columns.
# reset_index() then replaces the index with a fresh 0..n-1 range and keeps
# the previous index values as an "index" column.
df = df.assign(
    **{
        column: df[column].apply(prefix_dict_keys, prefix=key_prefix)
        for column, key_prefix in [
            ("found_terms", "has"),
            ("found_count", "ct"),
            ("found_sent_idx", "idx"),
            ("found_sent", "sent"),
        ]
    }
).reset_index()

In [12]:
# Flatten each dict-valued column into its own DataFrame with one column per
# (prefixed) key; rows whose dict lacks a key get NaN in that column.
# DataFrame.iteritems() was deprecated in pandas 1.5 and removed in 2.0;
# DataFrame.items() is the equivalent (column name, Series) iterator.
normalized_cols = [
    pd.json_normalize(value)
    for _colname, value in df[['found_terms', 'found_count', 'found_sent_idx', 'found_sent']].items()
]

In [13]:
# Inspect df's index so it can be compared with the indexes of the normalized frames.
df.index

RangeIndex(start=0, stop=5928, step=1)

In [14]:
# Show the index of every normalized frame.
[frame.index for frame in normalized_cols]

[RangeIndex(start=0, stop=5928, step=1),
 RangeIndex(start=0, stop=5928, step=1),
 RangeIndex(start=0, stop=5928, step=1),
 RangeIndex(start=0, stop=5928, step=1)]

In [15]:
# make sure all the index values are the same before concat
# Index.equals() compares both length and values, so a frame with a different
# number of rows fails this assertion with the message below instead of
# raising ValueError from an elementwise `==` on mismatched lengths.
assert all(df.index.equals(frame.index) for frame in normalized_cols), \
    "every normalized frame must share df's index before column-wise concat"

In [19]:
# Frames to join side by side: the original frame first, then one flattened
# frame per dict-valued column.
concat_dfs = [df, *normalized_cols]
len(concat_dfs)

5

In [25]:
# Join the frames side by side. Rows are matched on index labels, which is
# why the indexes were checked for equality beforehand.
normalized_df = pd.concat(concat_dfs, axis="columns")

In [26]:
# Display the combined frame (pandas truncates the rendering for large frames).
normalized_df

Unnamed: 0,index,pid,num_authors,title,text,text_sent_lower,sent_set,found_terms,found_sent_idx,found_sent,...,sent-latent_period,sent-asymptomatic_proportion,sent-case_fatality_ratio,sent-recovery_rate,sent-infectiousness_period,sent-case_fatality_rate,sent-hospitalized_proportion,sent-asymptomatic_fraction,sent-hospitalized_fraction,sent-asymptomatic_ratio
0,0,PMC1074749,8,Species-independent detection of RNA virus by ...,Identifying the causative agent of an infectio...,[identifying the causative agent of an infecti...,"[[its, eventual, causative, for, control, ., i...",{'has-incubation_period': True},{'idx-incubation_period': [72]},{'sent-incubation_period': [['after the mixtur...,...,,,,,,,,,,
1,1,PMC1090610,1,GIDEON: a comprehensive Web-based resource for...,"As of 2005, the world is confronted by 338 gen...","[as of 2005, the world is confronted by 338 ge...","[[generic, countries, 2005, scattered, as, in,...",{'has-incubation_period': True},{'idx-incubation_period': [12]},{'sent-incubation_period': [['the diagnosis mo...,...,,,,,,,,,,
2,2,PMC1181873,4,Appropriate Models for the Management of Infec...,The past decade has seen a dramatic increase i...,[the past decade has seen a dramatic increase ...,"[[seen, increase, to, health, attached, in, si...",{'has-latent_period': True},"{'idx-latent_period': [23, 27, 48, 49, 50, 51,...",{'sent-latent_period': [['the effects of n on ...,...,[[the effects of n on the distribution of the ...,,,,,,,,,
3,3,PMC1215526,1,Macrophages and cytokines in the early defence...,Virus-host interactions are crucial for the ou...,[virus-host interactions are crucial for the o...,"[[., for, interactions, the, infections, are, ...",{'has-asymptomatic_proportion': True},{'idx-asymptomatic_proportion': [83]},{'sent-asymptomatic_proportion': [['seropositi...,...,,[[seropositivity to hsv-1 does not render any ...,,,,,,,,
4,4,PMC1247620,7,Understanding the Spatial Clustering of Severe...,We used spatial and nonspatial data in this st...,[we used spatial and nonspatial data in this s...,"[[this, data, ., spatial, used, we, in, nonspa...",{'has-incubation_period': True},"{'idx-incubation_period': [23, 53]}",{'sent-incubation_period': [['a 5-day incubati...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5923,36,PMC7298926,4,Impacts of epidemic outbreaks on supply chains...,How does the OSCM literature address issues re...,[how does the oscm literature address issues r...,"[[to, outbreaks, oscm, how, and, terms, of, th...",{'has-latent_period': True},{'idx-latent_period': [51]},{'sent-latent_period': [['another interesting ...,...,[[another interesting concentration is related...,,,,,,,,,
5924,37,PMC7299143,3,Significance of geographical factors to the CO...,Coronavirus disease 2019 (COVID-19) already co...,[coronavirus disease 2019 (covid-19) already c...,"[[as, significantly, already, disease, 2019, m...","{'has-incubation_period': True, 'has-recovery_...","{'idx-incubation_period': [8], 'idx-recovery_r...",{'sent-incubation_period': [['this contact tra...,...,,,,[[these workers did not observe any noticeable...,,,,,,
5925,38,PMC7299147,5,A new SAIR model on complex networks for analy...,"In the last two decades, large-scale pandemics...","[in the last two decades, large-scale pandemic...","[[two, ,, province, acute, coronaviruses, midd...","{'has-recovery_rate': True, 'has-latent_period...","{'idx-recovery_rate': [64, 69, 81, 84], 'idx-l...","{'sent-recovery_rate': [['first, we explore th...",...,"[[for an asymptomatically infected individual,...",,,"[[first, we explore the impact of recovery rat...",,,,,,
5926,39,PMC7299369,5,Current Perspective of Antiviral Strategies ag...,An understanding of the replication cycle and ...,[an understanding of the replication cycle and...,"[[antivirals, understanding, and, vaccines, of...",{'has-incubation_period': True},{'idx-incubation_period': [14]},{'sent-incubation_period': [['in most patients...,...,,,,,,,,,,


In [27]:
# Sanity check: the concatenated frame has exactly as many rows as df.
assert len(normalized_df) == len(df)

In [29]:
# Persist the combined frame as gzip-compressed JSON at a project-root-relative path.
# NOTE(review): warn=False presumably silences pyprojroot's warning when the
# target path does not exist yet — confirm against the installed pyprojroot version.
normalized_df.to_json(here("./data/db/final/kaggle/id_model_inputs/03-normalized_columns.json.gzip", warn=False), compression="gzip")