In [36]:
import re

import pandas as pd
from pyprojroot import here

In [2]:
# about 40 seconds - 2 minutes depending on system resources to load
df = pd.read_json(here("./data/db/final/kaggle/id_model_inputs/03-normalized_columns.json.gzip"), compression="gzip")

In [3]:
#df.info(memory_usage="deep")

In [4]:
#df.drop(['text', 'sent_set', 'found_sent', 'found_count'], axis='columns', inplace=True)

In [5]:
for col in df.filter(regex='num_authors|ct-').columns:
    print(col)
    df[col] = df[col].astype(pd.Int16Dtype())

num_authors
ct-incubation_period
ct-latent_period
ct-asymptomatic_proportion
ct-case_fatality_ratio
ct-recovery_rate
ct-infectiousness_period
ct-case_fatality_rate
ct-hospitalized_proportion
ct-asymptomatic_fraction
ct-hospitalized_fraction
ct-asymptomatic_ratio


In [6]:
for col in df.filter(regex='has-').columns:
    print(col)
    df[col] = df[col].astype('bool')

has-incubation_period
has-latent_period
has-asymptomatic_proportion
has-case_fatality_ratio
has-recovery_rate
has-infectiousness_period
has-case_fatality_rate
has-hospitalized_proportion
has-asymptomatic_fraction
has-hospitalized_fraction
has-asymptomatic_ratio


In [7]:
df["title"] = df["title"].astype('string')

In [8]:
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5928 entries, 0 to 5927
Data columns (total 55 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   index                         5928 non-null   int64 
 1   pid                           5928 non-null   object
 2   num_authors                   5928 non-null   Int16 
 3   title                         5928 non-null   string
 4   text                          5928 non-null   object
 5   text_sent_lower               5928 non-null   object
 6   sent_set                      5928 non-null   object
 7   found_terms                   5928 non-null   object
 8   found_sent_idx                5928 non-null   object
 9   found_sent                    5928 non-null   object
 10  found_count                   5928 non-null   object
 11  has-incubation_period         5928 non-null   bool  
 12  has-latent_period             5928 non-null   bool  
 13  has-asymptomatic_p

In [9]:
covid_papers = [
    "PMC7121484", # Early Transmission Dynamics in Wuhan
    "PMC7186508" # To mask or not to mask
]
df[df.pid.isin(covid_papers)].filter(like="sent-")

Unnamed: 0,sent-incubation_period,sent-latent_period,sent-asymptomatic_proportion,sent-case_fatality_ratio,sent-recovery_rate,sent-infectiousness_period,sent-case_fatality_rate,sent-hospitalized_proportion,sent-asymptomatic_fraction,sent-hospitalized_fraction,sent-asymptomatic_ratio
2719,"[[the incubation period distribution (i.e., th...",,,,,,,,,,
4528,"[[additionally, η accounts for the relative in...",,"[[unsurprisingly, the greater the proportion o...",,"[[additionally, η accounts for the relative in...","[[additionally, η accounts for the relative in...",,,"[[additionally, η accounts for the relative in...",[[we assume that some fraction of symptomatic ...,


## Which papers contain which KWs

In [10]:
sent_cols = df.columns[df.columns.str.contains("sent-")]
sent_cols

Index(['sent-incubation_period', 'sent-latent_period',
       'sent-asymptomatic_proportion', 'sent-case_fatality_ratio',
       'sent-recovery_rate', 'sent-infectiousness_period',
       'sent-case_fatality_rate', 'sent-hospitalized_proportion',
       'sent-asymptomatic_fraction', 'sent-hospitalized_fraction',
       'sent-asymptomatic_ratio'],
      dtype='object')

In [11]:
sent_bool = df[sent_cols].astype(bool)
sent_bool

Unnamed: 0,sent-incubation_period,sent-latent_period,sent-asymptomatic_proportion,sent-case_fatality_ratio,sent-recovery_rate,sent-infectiousness_period,sent-case_fatality_rate,sent-hospitalized_proportion,sent-asymptomatic_fraction,sent-hospitalized_fraction,sent-asymptomatic_ratio
0,True,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
5923,False,True,False,False,False,False,False,False,False,False,False
5924,True,False,False,False,True,False,False,False,False,False,False
5925,False,True,False,False,True,False,False,False,False,False,False
5926,True,False,False,False,False,False,False,False,False,False,False


In [12]:
sent_bool['num_matches'] = sent_bool.apply(sum, axis=1)

In [13]:
sent_bool['num_matches'].value_counts()

1    4898
2     803
3     169
4      43
5      11
6       3
7       1
Name: num_matches, dtype: int64

In [14]:
num_sent_matches = sent_bool.apply(sum, axis=1).value_counts(dropna=False)
num_sent_matches

2     4898
4      803
6      169
8       43
10      11
12       3
14       1
dtype: int64

In [15]:
len(sent_bool)

5928

In [16]:
num_sent_matches.sum()

5928

In [17]:
counts = sent_bool.groupby(list(sent_cols))['num_matches'].sum()

In [18]:
count_df = pd.DataFrame(counts, dtype=int)
count_df.columns = ["count"]
count_df = count_df.reset_index()
count_df['contain_ct'] = count_df.filter(like='sent-').apply(sum, axis=1)

In [19]:
counts = count_df.sort_values(by=["count"], ascending=False)

In [20]:
def color_pos(val):
    color = 'red' if val is True else 'grey'
    return 'color: %s' % color

In [21]:
counts.style.applymap(color_pos)

Unnamed: 0,sent-incubation_period,sent-latent_period,sent-asymptomatic_proportion,sent-case_fatality_ratio,sent-recovery_rate,sent-infectiousness_period,sent-case_fatality_rate,sent-hospitalized_proportion,sent-asymptomatic_fraction,sent-hospitalized_fraction,sent-asymptomatic_ratio,count,contain_ct
66,True,False,False,False,False,False,False,False,False,False,False,2774,1
7,False,False,False,False,False,False,True,False,False,False,False,920,1
72,True,False,False,False,False,False,True,False,False,False,False,626,2
14,False,False,False,False,True,False,False,False,False,False,False,546,1
30,False,False,True,False,False,False,False,False,False,False,False,164,1
4,False,False,False,False,False,False,False,True,False,False,False,160,1
50,False,True,False,False,False,False,False,False,False,False,False,160,1
118,True,True,False,False,False,False,False,False,False,False,False,156,2
81,True,False,False,False,True,False,False,False,False,False,False,154,2
98,True,False,True,False,False,False,False,False,False,False,False,102,2


In [22]:
count_df.sort_values(['contain_ct'] + sent_cols.tolist()).style.applymap(color_pos)

Unnamed: 0,sent-incubation_period,sent-latent_period,sent-asymptomatic_proportion,sent-case_fatality_ratio,sent-recovery_rate,sent-infectiousness_period,sent-case_fatality_rate,sent-hospitalized_proportion,sent-asymptomatic_fraction,sent-hospitalized_fraction,sent-asymptomatic_ratio,count,contain_ct
0,False,False,False,False,False,False,False,False,False,False,True,29,1
1,False,False,False,False,False,False,False,False,False,True,False,17,1
2,False,False,False,False,False,False,False,False,True,False,False,32,1
4,False,False,False,False,False,False,False,True,False,False,False,160,1
7,False,False,False,False,False,False,True,False,False,False,False,920,1
12,False,False,False,False,False,True,False,False,False,False,False,26,1
14,False,False,False,False,True,False,False,False,False,False,False,546,1
23,False,False,False,True,False,False,False,False,False,False,False,70,1
30,False,False,True,False,False,False,False,False,False,False,False,164,1
50,False,True,False,False,False,False,False,False,False,False,False,160,1


In [23]:
7263-5928

1335

In [24]:
counts["count"].sum()

7263

In [25]:
len(df.pid.unique())

5928

In [26]:
sent_long = df.filter(regex="pid|sent-").melt(id_vars='pid')
sent_long

Unnamed: 0,pid,variable,value
0,PMC1074749,sent-incubation_period,[[after the mixture was heated to 99°c for 4 m...
1,PMC1090610,sent-incubation_period,[[the diagnosis module is designed to generate...
2,PMC1181873,sent-incubation_period,
3,PMC1215526,sent-incubation_period,
4,PMC1247620,sent-incubation_period,"[[a 5-day incubation period, consistent with a..."
...,...,...,...
65203,PMC7298926,sent-asymptomatic_ratio,
65204,PMC7299143,sent-asymptomatic_ratio,
65205,PMC7299147,sent-asymptomatic_ratio,
65206,PMC7299369,sent-asymptomatic_ratio,


In [27]:
# # this should match value above
# means "extra" papers is becuase some papers contain multiple sentence matches
sent_long.loc[sent_long['value'].astype(bool)].pid.duplicated().sum()

1335

## Filter on covid19 related terms

In [28]:
covid_papers

['PMC7121484', 'PMC7186508']

In [29]:
df[df.pid.isin(covid_papers)]

Unnamed: 0,index,pid,num_authors,title,text,text_sent_lower,sent_set,found_terms,found_sent_idx,found_sent,...,sent-latent_period,sent-asymptomatic_proportion,sent-case_fatality_ratio,sent-recovery_rate,sent-infectiousness_period,sent-case_fatality_rate,sent-hospitalized_proportion,sent-asymptomatic_fraction,sent-hospitalized_fraction,sent-asymptomatic_ratio
2719,49,PMC7121484,45,"Early Transmission Dynamics in Wuhan, China, o...",The earliest cases were identified through the...,[the earliest cases were identified through th...,"[[pathogen, through, days, for, to, without, )...",{'has-incubation_period': True},"{'idx-incubation_period': [24, 28, 40, 63]}",{'sent-incubation_period': [['the incubation p...,...,,,,,,,,,,
4528,28,PMC7186508,8,To mask or not to mask: Modeling the potential...,Under the ongoing COVID-19 pandemic (caused by...,[under the ongoing covid-19 pandemic (caused b...,"[[us, as, prevention, are, general, although, ...","{'has-incubation_period': True, 'has-infectiou...","{'idx-incubation_period': [58], 'idx-infectiou...","{'sent-incubation_period': [['additionally, η ...",...,,"[[unsurprisingly, the greater the proportion o...",,"[[additionally, η accounts for the relative in...","[[additionally, η accounts for the relative in...",,,"[[additionally, η accounts for the relative in...",[[we assume that some fraction of symptomatic ...,


In [33]:
df[df.pid.isin(covid_papers)].text_sent_lower.str.get(0) # same as regex filtering method

2719    the earliest cases were identified through the...
4528    under the ongoing covid-19 pandemic (caused by...
Name: text_sent_lower, dtype: object

In [34]:
filter_words_lower_regex = [
    r".*covid\s?-?\s?19.*", # covid-19
    r".*sars-?cov-?2.*",    # sars-cov-2
    r".*ncov.*",            # ncov
    r".*coronavirus.*",     # coronavirus
]
pattern = "|".join(filter_words_lower_regex)
pattern

'.*covid\\s?-?\\s?19.*|.*sars-?cov-?2.*|.*ncov.*|.*coronavirus.*'

In [37]:
r = re.compile(pattern)
r

re.compile(r'.*covid\s?-?\s?19.*|.*sars-?cov-?2.*|.*ncov.*|.*coronavirus.*',
re.UNICODE)

In [39]:
df.shape

(5928, 55)

In [41]:
df.columns

Index(['index', 'pid', 'num_authors', 'title', 'text', 'text_sent_lower',
       'sent_set', 'found_terms', 'found_sent_idx', 'found_sent',
       'found_count', 'has-incubation_period', 'has-latent_period',
       'has-asymptomatic_proportion', 'has-case_fatality_ratio',
       'has-recovery_rate', 'has-infectiousness_period',
       'has-case_fatality_rate', 'has-hospitalized_proportion',
       'has-asymptomatic_fraction', 'has-hospitalized_fraction',
       'has-asymptomatic_ratio', 'ct-incubation_period', 'ct-latent_period',
       'ct-asymptomatic_proportion', 'ct-case_fatality_ratio',
       'ct-recovery_rate', 'ct-infectiousness_period', 'ct-case_fatality_rate',
       'ct-hospitalized_proportion', 'ct-asymptomatic_fraction',
       'ct-hospitalized_fraction', 'ct-asymptomatic_ratio',
       'idx-incubation_period', 'idx-latent_period',
       'idx-asymptomatic_proportion', 'idx-case_fatality_ratio',
       'idx-recovery_rate', 'idx-infectiousness_period',
       'idx-case_fata

In [45]:
from tqdm import tqdm

In [48]:
tqdm.pandas(desc="sentence")

matched_papers = []
for pidx, paper_text in enumerate(tqdm(df.text_sent_lower)):
    for sidx, sentence in enumerate(paper_text):
        if r.match(sentence.lower()):
            #print(f"paper: {pidx}, sentence: {sidx}")
            matched_papers.append(df.iloc[pidx])
            break

100%|█████████████████████████████████████████████████████████████████████████████| 5928/5928 [00:07<00:00, 836.05it/s]


In [44]:
len(matched_papers)

4054

In [49]:
# look for pattern matches all lower case
has_covid_term = df['text_sent_lower'].apply(lambda x: any([True for sent in x if r.match(sent.lower())]))

In [50]:
covid_kw_sent = df[has_covid_term]

In [51]:
covid_kw_sent.shape

(4054, 55)

In [52]:
covid_kw_sent[covid_kw_sent.pid.isin(covid_papers)]

Unnamed: 0,index,pid,num_authors,title,text,text_sent_lower,sent_set,found_terms,found_sent_idx,found_sent,...,sent-latent_period,sent-asymptomatic_proportion,sent-case_fatality_ratio,sent-recovery_rate,sent-infectiousness_period,sent-case_fatality_rate,sent-hospitalized_proportion,sent-asymptomatic_fraction,sent-hospitalized_fraction,sent-asymptomatic_ratio
2719,49,PMC7121484,45,"Early Transmission Dynamics in Wuhan, China, o...",The earliest cases were identified through the...,[the earliest cases were identified through th...,"[[pathogen, through, days, for, to, without, )...",{'has-incubation_period': True},"{'idx-incubation_period': [24, 28, 40, 63]}",{'sent-incubation_period': [['the incubation p...,...,,,,,,,,,,
4528,28,PMC7186508,8,To mask or not to mask: Modeling the potential...,Under the ongoing COVID-19 pandemic (caused by...,[under the ongoing covid-19 pandemic (caused b...,"[[us, as, prevention, are, general, although, ...","{'has-incubation_period': True, 'has-infectiou...","{'idx-incubation_period': [58], 'idx-infectiou...","{'sent-incubation_period': [['additionally, η ...",...,,"[[unsurprisingly, the greater the proportion o...",,"[[additionally, η accounts for the relative in...","[[additionally, η accounts for the relative in...",,,"[[additionally, η accounts for the relative in...",[[we assume that some fraction of symptomatic ...,


In [53]:
covid_kw_sent[covid_kw_sent.pid.isin(covid_papers)].filter(like="sent-incu").iloc[0][0]

[['the incubation period distribution (i.e., the time delay from infection to illness onset) was estimated by fitting a log-normal distribution to data on exposure histories and onset dates in a subset of cases with detailed information available.',
  'we used an informative prior distribution for the serial interval based on the serial interval of sars with a mean of 8.4 and a standard deviation of 3.8.11\nanalyses of the incubation period, serial interval, growth rate, and r0 were performed with the use of matlab software (mathworks).',
  'we examined data on exposures among 10 confirmed cases, and we estimated the mean incubation period to be 5.2 days (95% confidence interval [ci], 4.1 to 7.0); the 95th percentile of the distribution was 12.5 days (95% ci, 9.2 to 18) (figure 2a).',
  'our preliminary estimate of the incubation period distribution provides important evidence to support a 14-day medical observation period or quarantine for exposed persons.']]

## explode sentences

In [54]:
covid_kw_sent.pid.duplicated().any() # should be false

False

In [55]:
sent_cols.to_list()

['sent-incubation_period',
 'sent-latent_period',
 'sent-asymptomatic_proportion',
 'sent-case_fatality_ratio',
 'sent-recovery_rate',
 'sent-infectiousness_period',
 'sent-case_fatality_rate',
 'sent-hospitalized_proportion',
 'sent-asymptomatic_fraction',
 'sent-hospitalized_fraction',
 'sent-asymptomatic_ratio']

In [56]:
covid_kw_sent.columns

Index(['index', 'pid', 'num_authors', 'title', 'text', 'text_sent_lower',
       'sent_set', 'found_terms', 'found_sent_idx', 'found_sent',
       'found_count', 'has-incubation_period', 'has-latent_period',
       'has-asymptomatic_proportion', 'has-case_fatality_ratio',
       'has-recovery_rate', 'has-infectiousness_period',
       'has-case_fatality_rate', 'has-hospitalized_proportion',
       'has-asymptomatic_fraction', 'has-hospitalized_fraction',
       'has-asymptomatic_ratio', 'ct-incubation_period', 'ct-latent_period',
       'ct-asymptomatic_proportion', 'ct-case_fatality_ratio',
       'ct-recovery_rate', 'ct-infectiousness_period', 'ct-case_fatality_rate',
       'ct-hospitalized_proportion', 'ct-asymptomatic_fraction',
       'ct-hospitalized_fraction', 'ct-asymptomatic_ratio',
       'idx-incubation_period', 'idx-latent_period',
       'idx-asymptomatic_proportion', 'idx-case_fatality_ratio',
       'idx-recovery_rate', 'idx-infectiousness_period',
       'idx-case_fata

In [59]:
non_sent_cols = ["pid", "title"]#, "publish_time_dt", 'doi', 'url']
non_sent_cols

['pid', 'title']

In [60]:
covid_kw_sent_subset = covid_kw_sent[non_sent_cols + sent_cols.to_list()]
covid_kw_sent_subset

Unnamed: 0,pid,title,sent-incubation_period,sent-latent_period,sent-asymptomatic_proportion,sent-case_fatality_ratio,sent-recovery_rate,sent-infectiousness_period,sent-case_fatality_rate,sent-hospitalized_proportion,sent-asymptomatic_fraction,sent-hospitalized_fraction,sent-asymptomatic_ratio
0,PMC1074749,Species-independent detection of RNA virus by ...,[[after the mixture was heated to 99°c for 4 m...,,,,,,,,,,
3,PMC1215526,Macrophages and cytokines in the early defence...,,,[[seropositivity to hsv-1 does not render any ...,,,,,,,,
5,PMC1262710,Molecular signature of clinical severity in re...,[[after an incubation period from 2 to 10 days...,,,,,,,,,,
6,PMC1276795,A simple and rapid approach for screening of S...,[[the thermal profile consists of an initial i...,,,,,,,,,,
8,PMC1298938,Functional and Genetic Analysis of Coronavirus...,,,,"[[more recently, a coronavirus has been identi...",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5923,PMC7298926,Impacts of epidemic outbreaks on supply chains...,,[[another interesting concentration is related...,,,,,,,,,
5924,PMC7299143,Significance of geographical factors to the CO...,[[this contact transmissible disease has an av...,,,,[[these workers did not observe any noticeable...,,,,,,
5925,PMC7299147,A new SAIR model on complex networks for analy...,,"[[for an asymptomatically infected individual,...",,,"[[first, we explore the impact of recovery rat...",,,,,,
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,"[[in most patients, the median time of incubat...",,,,,,,,,,


In [89]:
non_sent_cols + [sent_cols[0]]

['pid', 'title', 'sent-incubation_period']

In [93]:
covid_kw_sent_subset[non_sent_cols + [sent_cols[0]]].explode(sent_cols[0]).explode(sent_cols[0])

Unnamed: 0,pid,title,sent-incubation_period
0,PMC1074749,Species-independent detection of RNA virus by ...,after the mixture was heated to 99°c for 4 min...
3,PMC1215526,Macrophages and cytokines in the early defence...,
5,PMC1262710,Molecular signature of clinical severity in re...,"after an incubation period from 2 to 10 days, ..."
6,PMC1276795,A simple and rapid approach for screening of S...,the thermal profile consists of an initial inc...
8,PMC1298938,Functional and Genetic Analysis of Coronavirus...,
...,...,...,...
5923,PMC7298926,Impacts of epidemic outbreaks on supply chains...,
5924,PMC7299143,Significance of geographical factors to the CO...,this contact transmissible disease has an aver...
5925,PMC7299147,A new SAIR model on complex networks for analy...,
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,"in most patients, the median time of incubatio..."


In [199]:
tqdm.pandas(desc="explode")

exploded_df_list = []

for idx, sc in enumerate(tqdm(sent_cols)):
    print(idx, sc)
    exploded = covid_kw_sent_subset[non_sent_cols + [sc]].explode(sc).explode(sc)
    exploded = exploded.rename({sc: "sentence"}, axis='columns')
    exploded['match_type'] = sc
    display(exploded)
    exploded_df_list.append(exploded)

  from pandas import Panel
  0%|                                                                                           | 0/11 [00:00<?, ?it/s]

0 sent-incubation_period


Unnamed: 0,pid,title,sentence,match_type
0,PMC1074749,Species-independent detection of RNA virus by ...,after the mixture was heated to 99°c for 4 min...,sent-incubation_period
3,PMC1215526,Macrophages and cytokines in the early defence...,,sent-incubation_period
5,PMC1262710,Molecular signature of clinical severity in re...,"after an incubation period from 2 to 10 days, ...",sent-incubation_period
6,PMC1276795,A simple and rapid approach for screening of S...,the thermal profile consists of an initial inc...,sent-incubation_period
8,PMC1298938,Functional and Genetic Analysis of Coronavirus...,,sent-incubation_period
...,...,...,...,...
5923,PMC7298926,Impacts of epidemic outbreaks on supply chains...,,sent-incubation_period
5924,PMC7299143,Significance of geographical factors to the CO...,this contact transmissible disease has an aver...,sent-incubation_period
5925,PMC7299147,A new SAIR model on complex networks for analy...,,sent-incubation_period
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,"in most patients, the median time of incubatio...",sent-incubation_period


1 sent-latent_period


Unnamed: 0,pid,title,sentence,match_type
0,PMC1074749,Species-independent detection of RNA virus by ...,,sent-latent_period
3,PMC1215526,Macrophages and cytokines in the early defence...,,sent-latent_period
5,PMC1262710,Molecular signature of clinical severity in re...,,sent-latent_period
6,PMC1276795,A simple and rapid approach for screening of S...,,sent-latent_period
8,PMC1298938,Functional and Genetic Analysis of Coronavirus...,,sent-latent_period
...,...,...,...,...
5925,PMC7299147,A new SAIR model on complex networks for analy...,"4, we realize that the net increment rate of s...",sent-latent_period
5925,PMC7299147,A new SAIR model on complex networks for analy...,and throughout the spread of covid-19 in wuhan...,sent-latent_period
5925,PMC7299147,A new SAIR model on complex networks for analy...,"thus, the opinion that the latent period can b...",sent-latent_period
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,,sent-latent_period


 18%|███████████████                                                                    | 2/11 [00:00<00:00, 15.75it/s]

2 sent-asymptomatic_proportion


Unnamed: 0,pid,title,sentence,match_type
0,PMC1074749,Species-independent detection of RNA virus by ...,,sent-asymptomatic_proportion
3,PMC1215526,Macrophages and cytokines in the early defence...,seropositivity to hsv-1 does not render any pr...,sent-asymptomatic_proportion
5,PMC1262710,Molecular signature of clinical severity in re...,,sent-asymptomatic_proportion
6,PMC1276795,A simple and rapid approach for screening of S...,,sent-asymptomatic_proportion
8,PMC1298938,Functional and Genetic Analysis of Coronavirus...,,sent-asymptomatic_proportion
...,...,...,...,...
5923,PMC7298926,Impacts of epidemic outbreaks on supply chains...,,sent-asymptomatic_proportion
5924,PMC7299143,Significance of geographical factors to the CO...,,sent-asymptomatic_proportion
5925,PMC7299147,A new SAIR model on complex networks for analy...,,sent-asymptomatic_proportion
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,,sent-asymptomatic_proportion


3 sent-case_fatality_ratio


Unnamed: 0,pid,title,sentence,match_type
0,PMC1074749,Species-independent detection of RNA virus by ...,,sent-case_fatality_ratio
3,PMC1215526,Macrophages and cytokines in the early defence...,,sent-case_fatality_ratio
5,PMC1262710,Molecular signature of clinical severity in re...,,sent-case_fatality_ratio
6,PMC1276795,A simple and rapid approach for screening of S...,,sent-case_fatality_ratio
8,PMC1298938,Functional and Genetic Analysis of Coronavirus...,"more recently, a coronavirus has been identifi...",sent-case_fatality_ratio
...,...,...,...,...
5923,PMC7298926,Impacts of epidemic outbreaks on supply chains...,,sent-case_fatality_ratio
5924,PMC7299143,Significance of geographical factors to the CO...,,sent-case_fatality_ratio
5925,PMC7299147,A new SAIR model on complex networks for analy...,,sent-case_fatality_ratio
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,,sent-case_fatality_ratio


 36%|██████████████████████████████▏                                                    | 4/11 [00:00<00:00, 15.90it/s]

4 sent-recovery_rate


Unnamed: 0,pid,title,sentence,match_type
0,PMC1074749,Species-independent detection of RNA virus by ...,,sent-recovery_rate
3,PMC1215526,Macrophages and cytokines in the early defence...,,sent-recovery_rate
5,PMC1262710,Molecular signature of clinical severity in re...,,sent-recovery_rate
6,PMC1276795,A simple and rapid approach for screening of S...,,sent-recovery_rate
8,PMC1298938,Functional and Genetic Analysis of Coronavirus...,,sent-recovery_rate
...,...,...,...,...
5925,PMC7299147,A new SAIR model on complex networks for analy...,these indicate that increasing the recovery ra...,sent-recovery_rate
5925,PMC7299147,A new SAIR model on complex networks for analy...,"in summary, the best way to curb the spread of...",sent-recovery_rate
5925,PMC7299147,A new SAIR model on complex networks for analy...,increasing the recovery rate and the removed r...,sent-recovery_rate
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,,sent-recovery_rate


5 sent-infectiousness_period


Unnamed: 0,pid,title,sentence,match_type
0,PMC1074749,Species-independent detection of RNA virus by ...,,sent-infectiousness_period
3,PMC1215526,Macrophages and cytokines in the early defence...,,sent-infectiousness_period
5,PMC1262710,Molecular signature of clinical severity in re...,,sent-infectiousness_period
6,PMC1276795,A simple and rapid approach for screening of S...,,sent-infectiousness_period
8,PMC1298938,Functional and Genetic Analysis of Coronavirus...,,sent-infectiousness_period
...,...,...,...,...
5923,PMC7298926,Impacts of epidemic outbreaks on supply chains...,,sent-infectiousness_period
5924,PMC7299143,Significance of geographical factors to the CO...,,sent-infectiousness_period
5925,PMC7299147,A new SAIR model on complex networks for analy...,,sent-infectiousness_period
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,,sent-infectiousness_period


 55%|█████████████████████████████████████████████▎                                     | 6/11 [00:00<00:00, 15.97it/s]

6 sent-case_fatality_rate


Unnamed: 0,pid,title,sentence,match_type
0,PMC1074749,Species-independent detection of RNA virus by ...,,sent-case_fatality_rate
3,PMC1215526,Macrophages and cytokines in the early defence...,,sent-case_fatality_rate
5,PMC1262710,Molecular signature of clinical severity in re...,,sent-case_fatality_rate
6,PMC1276795,A simple and rapid approach for screening of S...,,sent-case_fatality_rate
8,PMC1298938,Functional and Genetic Analysis of Coronavirus...,,sent-case_fatality_rate
...,...,...,...,...
5923,PMC7298926,Impacts of epidemic outbreaks on supply chains...,,sent-case_fatality_rate
5924,PMC7299143,Significance of geographical factors to the CO...,,sent-case_fatality_rate
5925,PMC7299147,A new SAIR model on complex networks for analy...,,sent-case_fatality_rate
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,,sent-case_fatality_rate


7 sent-hospitalized_proportion


Unnamed: 0,pid,title,sentence,match_type
0,PMC1074749,Species-independent detection of RNA virus by ...,,sent-hospitalized_proportion
3,PMC1215526,Macrophages and cytokines in the early defence...,,sent-hospitalized_proportion
5,PMC1262710,Molecular signature of clinical severity in re...,,sent-hospitalized_proportion
6,PMC1276795,A simple and rapid approach for screening of S...,,sent-hospitalized_proportion
8,PMC1298938,Functional and Genetic Analysis of Coronavirus...,,sent-hospitalized_proportion
...,...,...,...,...
5923,PMC7298926,Impacts of epidemic outbreaks on supply chains...,,sent-hospitalized_proportion
5924,PMC7299143,Significance of geographical factors to the CO...,,sent-hospitalized_proportion
5925,PMC7299147,A new SAIR model on complex networks for analy...,,sent-hospitalized_proportion
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,,sent-hospitalized_proportion


 73%|████████████████████████████████████████████████████████████▎                      | 8/11 [00:00<00:00, 16.13it/s]

8 sent-asymptomatic_fraction


Unnamed: 0,pid,title,sentence,match_type
0,PMC1074749,Species-independent detection of RNA virus by ...,,sent-asymptomatic_fraction
3,PMC1215526,Macrophages and cytokines in the early defence...,,sent-asymptomatic_fraction
5,PMC1262710,Molecular signature of clinical severity in re...,,sent-asymptomatic_fraction
6,PMC1276795,A simple and rapid approach for screening of S...,,sent-asymptomatic_fraction
8,PMC1298938,Functional and Genetic Analysis of Coronavirus...,,sent-asymptomatic_fraction
...,...,...,...,...
5923,PMC7298926,Impacts of epidemic outbreaks on supply chains...,,sent-asymptomatic_fraction
5924,PMC7299143,Significance of geographical factors to the CO...,,sent-asymptomatic_fraction
5925,PMC7299147,A new SAIR model on complex networks for analy...,,sent-asymptomatic_fraction
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,,sent-asymptomatic_fraction


9 sent-hospitalized_fraction


Unnamed: 0,pid,title,sentence,match_type
0,PMC1074749,Species-independent detection of RNA virus by ...,,sent-hospitalized_fraction
3,PMC1215526,Macrophages and cytokines in the early defence...,,sent-hospitalized_fraction
5,PMC1262710,Molecular signature of clinical severity in re...,,sent-hospitalized_fraction
6,PMC1276795,A simple and rapid approach for screening of S...,,sent-hospitalized_fraction
8,PMC1298938,Functional and Genetic Analysis of Coronavirus...,,sent-hospitalized_fraction
...,...,...,...,...
5923,PMC7298926,Impacts of epidemic outbreaks on supply chains...,,sent-hospitalized_fraction
5924,PMC7299143,Significance of geographical factors to the CO...,,sent-hospitalized_fraction
5925,PMC7299147,A new SAIR model on complex networks for analy...,,sent-hospitalized_fraction
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,,sent-hospitalized_fraction


 91%|██████████████████████████████████████████████████████████████████████████▌       | 10/11 [00:00<00:00, 16.41it/s]

10 sent-asymptomatic_ratio


Unnamed: 0,pid,title,sentence,match_type
0,PMC1074749,Species-independent detection of RNA virus by ...,,sent-asymptomatic_ratio
3,PMC1215526,Macrophages and cytokines in the early defence...,,sent-asymptomatic_ratio
5,PMC1262710,Molecular signature of clinical severity in re...,,sent-asymptomatic_ratio
6,PMC1276795,A simple and rapid approach for screening of S...,,sent-asymptomatic_ratio
8,PMC1298938,Functional and Genetic Analysis of Coronavirus...,,sent-asymptomatic_ratio
...,...,...,...,...
5923,PMC7298926,Impacts of epidemic outbreaks on supply chains...,,sent-asymptomatic_ratio
5924,PMC7299143,Significance of geographical factors to the CO...,,sent-asymptomatic_ratio
5925,PMC7299147,A new SAIR model on complex networks for analy...,,sent-asymptomatic_ratio
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,,sent-asymptomatic_ratio


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 16.39it/s]


In [200]:
explode_df = pd.concat(exploded_df_list)

In [201]:
explode_df.shape

(48466, 4)

In [202]:
print(f"Number of unique pids: {explode_df.pid.unique().shape}")

Number of unique pids: (4054,)


In [203]:
explode_df.pid.unique().shape

(4054,)

In [204]:
explode_df[explode_df.pid.isin(covid_papers)]

Unnamed: 0,pid,title,sentence,match_type
2719,PMC7121484,"Early Transmission Dynamics in Wuhan, China, o...","the incubation period distribution (i.e., the ...",sent-incubation_period
2719,PMC7121484,"Early Transmission Dynamics in Wuhan, China, o...",we used an informative prior distribution for ...,sent-incubation_period
2719,PMC7121484,"Early Transmission Dynamics in Wuhan, China, o...",we examined data on exposures among 10 confirm...,sent-incubation_period
2719,PMC7121484,"Early Transmission Dynamics in Wuhan, China, o...",our preliminary estimate of the incubation per...,sent-incubation_period
4528,PMC7186508,To mask or not to mask: Modeling the potential...,"additionally, η accounts for the relative infe...",sent-incubation_period
2719,PMC7121484,"Early Transmission Dynamics in Wuhan, China, o...",,sent-latent_period
4528,PMC7186508,To mask or not to mask: Modeling the potential...,,sent-latent_period
2719,PMC7121484,"Early Transmission Dynamics in Wuhan, China, o...",,sent-asymptomatic_proportion
4528,PMC7186508,To mask or not to mask: Modeling the potential...,"unsurprisingly, the greater the proportion of ...",sent-asymptomatic_proportion
4528,PMC7186508,To mask or not to mask: Modeling the potential...,"unsurprisingly, this benefit is greater if a l...",sent-asymptomatic_proportion


In [205]:
pd.isna(None)

True

In [206]:
explode_df = explode_df[~explode_df.sentence.isna()]

In [207]:
explode_df.shape

(8873, 4)

In [208]:
explode_df.pid.unique().shape

(4054,)

In [209]:
explode_df

Unnamed: 0,pid,title,sentence,match_type
0,PMC1074749,Species-independent detection of RNA virus by ...,after the mixture was heated to 99°c for 4 min...,sent-incubation_period
5,PMC1262710,Molecular signature of clinical severity in re...,"after an incubation period from 2 to 10 days, ...",sent-incubation_period
6,PMC1276795,A simple and rapid approach for screening of S...,the thermal profile consists of an initial inc...,sent-incubation_period
10,PMC1435987,Prions Adhere to Soil Minerals and Remain Infe...,the sedimented fraction of these control sampl...,sent-incubation_period
10,PMC1435987,Prions Adhere to Soil Minerals and Remain Infe...,the 10-d increase in incubation period for mte...,sent-incubation_period
...,...,...,...,...
5794,PMC7283745,"Estimation of the basic reproduction number, a...",let r\na = c\na/c be the ratio of asymptomatic...,sent-asymptomatic_ratio
5849,PMC7290384,Forecasting COVID-19-Associated Hospitalizatio...,"more specifically, we assumed that the ratio o...",sent-asymptomatic_ratio
5870,PMC7292581,SARS-CoV-2 shedding and seroconversion among p...,"despite the positive clinical findings, six of...",sent-asymptomatic_ratio
5871,PMC7292610,COVID-19 in patients with thoracic malignancie...,one patient required icu admission and three p...,sent-asymptomatic_ratio


In [210]:
explode_df.sentence.str.len()

0        302
5        135
6        234
10       201
10       228
        ... 
5794      89
5849     170
5870     195
5871    2160
5902     414
Name: sentence, Length: 8873, dtype: int64

In [211]:
(pd.DataFrame(explode_df.match_type
    .value_counts(dropna=False))
    .reset_index()
    .rename({'index': 'keyword', 'match_type': 'sentence_count'}, axis='columns')
    .merge(pd.DataFrame(sent_cols), left_on="keyword", right_on=0, how="outer").drop(0, axis="columns")
)

Unnamed: 0,keyword,sentence_count
0,sent-incubation_period,5183
1,sent-case_fatality_rate,1550
2,sent-recovery_rate,753
3,sent-asymptomatic_proportion,402
4,sent-latent_period,309
5,sent-case_fatality_ratio,212
6,sent-hospitalized_proportion,183
7,sent-infectiousness_period,102
8,sent-asymptomatic_fraction,74
9,sent-asymptomatic_ratio,73


In [212]:
print(explode_df.shape)

(8873, 4)


In [213]:
# incubation period + day
incubation_period_day = explode_df[(explode_df.match_type == "sent-incubation_period") & (explode_df.sentence.str.contains('day|days', regex=True))]
incubation_period_day["match_type"] = "sent-incubation_period_day"
incubation_period_day

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incubation_period_day["match_type"] = "sent-incubation_period_day"


Unnamed: 0,pid,title,sentence,match_type
5,PMC1262710,Molecular signature of clinical severity in re...,"after an incubation period from 2 to 10 days, ...",sent-incubation_period_day
15,PMC1562405,Factors associated with nosocomial SARS-CoV tr...,inclusion and exclusion criteria for study cas...,sent-incubation_period_day
15,PMC1562405,Factors associated with nosocomial SARS-CoV tr...,the estimated 6 day incubation period is based...,sent-incubation_period_day
17,PMC1618839,A super-spreading ewe infects hundreds with Q ...,"taking may 4, 2003 as the day of exposure for ...",sent-incubation_period_day
17,PMC1618839,A super-spreading ewe infects hundreds with Q ...,the point-source nature of the outbreak permit...,sent-incubation_period_day
...,...,...,...,...
5921,PMC7298562,Olfactory and Oral Manifestations of COVID-19:...,estimated incubation period was calculated as ...,sent-incubation_period_day
5921,PMC7298562,Olfactory and Oral Manifestations of COVID-19:...,statistical data and figures were analyzed usi...,sent-incubation_period_day
5924,PMC7299143,Significance of geographical factors to the CO...,this contact transmissible disease has an aver...,sent-incubation_period_day
5926,PMC7299369,Current Perspective of Antiviral Strategies ag...,"in most patients, the median time of incubatio...",sent-incubation_period_day


In [214]:
incubation_period_day.shape

(2485, 4)

In [215]:
#explode_df = explode_df[explode_df.match_type != "sent-incubation_period"]
#explode_df

In [216]:
explode_df.shape

(8873, 4)

In [217]:
incubation_period_day.match_type.value_counts(dropna=False)

sent-incubation_period_day    2485
Name: match_type, dtype: int64

In [218]:
explode_df.match_type.value_counts(dropna=False)

sent-incubation_period          5183
sent-case_fatality_rate         1550
sent-recovery_rate               753
sent-asymptomatic_proportion     402
sent-latent_period               309
sent-case_fatality_ratio         212
sent-hospitalized_proportion     183
sent-infectiousness_period       102
sent-asymptomatic_fraction        74
sent-asymptomatic_ratio           73
sent-hospitalized_fraction        32
Name: match_type, dtype: int64

In [219]:
combined = pd.concat([incubation_period_day, explode_df], ignore_index=True)

In [220]:
print(combined.shape)

(11358, 4)


In [223]:
# recount table above
(pd.DataFrame(combined.match_type
    .value_counts(dropna=False))
    .reset_index()
    .rename({'index': 'keyword', 'match_type': 'sentence_count'}, axis='columns')
    .merge(pd.DataFrame(sent_cols), left_on="keyword", right_on=0, how="outer").drop(0, axis="columns")
)

Unnamed: 0,keyword,sentence_count
0,sent-incubation_period,5183
1,sent-incubation_period_day,2485
2,sent-case_fatality_rate,1550
3,sent-recovery_rate,753
4,sent-asymptomatic_proportion,402
5,sent-latent_period,309
6,sent-case_fatality_ratio,212
7,sent-hospitalized_proportion,183
8,sent-infectiousness_period,102
9,sent-asymptomatic_fraction,74
