In [1]:
import json


def reader(json_file_path: str):
    with open(json_file_path) as curr_json:
        return json.load(curr_json)


class ArticleReader:

    def __init__(self, json_file_path):
        data = reader(json_file_path)
        self.file = json_file_path
        self.body = data["body_text"]
        self.metadata = data["metadata"]
        self.id = data["paper_id"]
        self.abstract = data["abstract"]
        self.bib = data["bib_entries"]

    def get_text_parts(self):
        return "".join([x['text'] for x in self.body])

In [5]:
import os
from typing import List
from ArticleReader import ArticleReader


def gather_docs_by_keyword(data_path: str, keywords: List[str]) -> (List[str], dict):
    docs = []
    keyword_counts = {}
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith(".json"):
                curr_file_path = os.path.join(root, file)
                article = ArticleReader(curr_file_path)
                text_parts = article.get_text_parts()
                status = contains_keywords(text_parts, keywords)
                if status:
                    docs.append(os.path.join(root, file))
                    keyword_count = get_keyword_count(text_parts, keywords)
                    keyword_counts[file] = keyword_count

    return docs, keyword_counts


def contains_keywords(text_parts: str, keywords: List[str]) -> bool:
    for keyword in keywords:
        status = keyword in text_parts
        if status:
            return True
        else:
            continue

    return False


def get_keyword_count(text_parts: str, keywords: List[str]) -> int:
    keyword_count = 0
    for keyword in keywords:
        keyword_count += text_parts.count(keyword)

    return keyword_count


def get_top_docs_by_keyword(keyword_counts: dict, num_top_aricles: int) -> List[str]:
    top_doc_counts = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:num_top_aricles]
    return [x[0] for x in top_doc_counts]

def get_sentences_with_keywords(docs: List[str], keywords: List[str], n = 30) -> List[str]:
    for doc in docs:
        if n > 0:
            article = ArticleReader(doc)
            text = article.get_text_parts()
            text = text.split('. ')
            for sentence in text:
                if any(keyword in sentence for keyword in keywords):
                    print(article.metadata['title'])
                    print()
                    print(sentence)
                    print()
                    print()
                    print()
                    n = n - 1

In [6]:
docs, keyword_counts = gather_docs_by_keyword('data', ['R0'])

In [11]:
docs[0]

'data\\biorxiv_medrxiv\\biorxiv_medrxiv\\06c1b3535b83251cf92c01258b5048beeab7a460.json'

In [12]:
article = ArticleReader(docs[0])

In [15]:
article.get_text_parts()

'The basic reproductive number -R 0 -is one of the most common and most commonly misapplied numbers in public health. Nevertheless, estimating R 0 for every transmissible pathogen, emerging or endemic, remains a priority for epidemiologists the world over. Although often used to compare outbreaks and forecast pandemic risk, this single number belies the complexity that two different pathogens can exhibit, even when they have the same R 0 . Here, we show how predicting outbreak size requires both an estimate of R 0 and an estimate of the heterogeneity in the number of secondary infections. To facilitate rapid determination of outbreak risk, we propose a reformulation of a classic result from random network theory that relies on contact tracing data to simultaneously determine the first moment (R 0 ) and the higher moments (representing the heterogeneity) in the distribution of secondary infections. Further, we show how this framework is robust in the face of the typically limited amount

In [7]:
get_sentences_with_keywords(docs, ['R0'])

Beyond R 0 : the importance of contact tracing when predicting epidemics

The range of potential R0 comes from a 95% confidence interval using a early data and a classic deterministic models [18, 19] 



Quantifying the success of measles vaccination campaigns in the Rohingya refugee camps

The model tracks the number of individuals that move between the compartments each day and is run for one year using the ordinary differential equations 1-3.The transmission coefficient, β, was estimated according to the relationship β = R0/N D, where N is the total 93 population (i.e., S + I + R) and D is the average duration of infectiousness ( The model uses one infected case as the initial value for the number of individuals in the infected compartment, I(0).The initial value for the recovered compartment was calculated as (Table 1) 



A simple model to assess Wuhan lock-down effect and region efforts during COVID-19 epidemic in China Mainland

The latest estimated value of R0 and the control r

In [10]:
get_sentences_with_keywords(docs, ['school closure'])

The timing of one-shot interventions for epidemic control

After a 2 week school closure with many additional interventions in place, the number of cases in Mexico City dropped dramatically and did not significantly increase after relaxing the controls.As the disease was introduced into other geographic regions, many schools closed in response to the first observations of infection



The timing of one-shot interventions for epidemic control

The school closure provided increased time to prepare, but the overall epidemic was very similar.We can understand these historical patterns by observing that some of the most drastic social distancing interventions are unsustainable



The timing of one-shot interventions for epidemic control

These appear to have significantly reduced transmission, apparently reducing the effective reproduction number (the number of new infections per infected individual) very close or below one [6, 10] , although there is still a lot of uncertainty about the ef