In [1]:
import json


def reader(json_file_path: str):
    with open(json_file_path) as curr_json:
        return json.load(curr_json)


class ArticleReader:

    def __init__(self, json_file_path):
        data = reader(json_file_path)
        self.file = json_file_path
        self.body = data["body_text"]
        self.metadata = data["metadata"]
        self.id = data["paper_id"]
        self.abstract = data["abstract"]
        self.bib = data["bib_entries"]

    def get_text_parts(self):
        return "".join([x['text'] for x in self.body])

In [2]:
import os
from typing import List
from ArticleReader import ArticleReader


def gather_docs_by_keyword(data_path: str, keywords: List[str]) -> (List[str], dict):
    docs = []
    keyword_counts = {}
    for root, dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith(".json"):
                curr_file_path = os.path.join(root, file)
                article = ArticleReader(curr_file_path)
                text_parts = article.get_text_parts()
                status = contains_keywords(text_parts, keywords)
                if status:
                    docs.append(os.path.join(root, file))
                    keyword_count = get_keyword_count(text_parts, keywords)
                    keyword_counts[file] = keyword_count

    return docs, keyword_counts


def contains_keywords(text_parts: str, keywords: List[str]) -> bool:
    for keyword in keywords:
        status = keyword in text_parts
        if status:
            return True
        else:
            continue

    return False


def get_keyword_count(text_parts: str, keywords: List[str]) -> int:
    keyword_count = 0
    for keyword in keywords:
        keyword_count += text_parts.count(keyword)

    return keyword_count


def get_top_docs_by_keyword(keyword_counts: dict, num_top_aricles: int) -> List[str]:
    top_doc_counts = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:num_top_aricles]
    return [x[0] for x in top_doc_counts]

def get_sentences_with_keywords(docs: List[str], keywords: List[str], n = 30) -> List[str]:
    for doc in docs:
        if n > 0:
            article = ArticleReader(doc)
            text = article.get_text_parts()
            text = text.split('. ')
            for sentence in text:
                if any(keyword in sentence for keyword in keywords):
                    print(article.metadata['title'])
                    print()
                    print(sentence)
                    print()
                    print()
                    print()
                    n = n - 1

In [3]:
docs, keyword_counts = gather_docs_by_keyword('data', ['R0'])

In [4]:
docs[0]

'data/custom_license/custom_license/eb9634441822b1c1c933683fc5f45fb7c5670303.json'

In [5]:
article = ArticleReader(docs[0])

In [6]:
article.get_text_parts()

'RT-PCR/ESI-MS has previously demonstrated the capability to detect and identify respiratory viral pathogens in nasopharyngeal swabs. This study expands on previous research by performing a prospective evaluation of RT-PCR/ESI-MS to detect and identify Influenza A and B viruses compared to Prodesse ProFlu Plus and combined ProFlu Plus and Cepheid Xpert Flu. ProFlu Plus was also used as a gold standard for comparison for respiratory syncytial virus detection. Using ProFlu Plus as a gold standard, RT-PCR/ESI-MS had sensitivity and specificity of 82.1% (23/28) and 100% (258/258), respectively, for Influenza A, 100% (16/16) and 99.6% (269/270), respectively for Influenza B, and 88.6% (39/44) and 99.6% (241/242) for any Influenza virus. Using matching results from ProFlu Plus and Xpert Flu as a gold standard, RT-PCR/ESI-MS had 85.2% (23/27) and 100% (259/259) sensitivity and specificity respectively for Influenza A, 100% (14/14) and 99.6% (270/272), respectively for Influenza B virus. Overa

In [7]:
get_sentences_with_keywords(docs, ['R0'])

Prospective comparison of RT-PCR/ESI-MS to Prodesse ProFlu Plus and Cepheid GenXpert for the detection of Influenza A and B viruses

The lower sensitivity in this study could be explained by differences in the limit of detection between assays, as ProFlu Plus and Xpert Flu report limits of detection as low as 5 PFU/ml, while RT-PCR/ESI-MS has a reported limit of detection of approximately 150 copies of target per well (Chen et al., 2011a,b) .The National Center for the Study of Preparedness and Catastrophic Response: 2010-ST 061 PA0001 Grant 108822 and the project described was supported by Award Number S10RR027016 from the National Center For Research Resources



Human CD4 + memory T-lymphocyte responses to SARS coronavirus infection

In the PBMC from one recovered SARS patient (SARS patient #1 -HLA-DR03 + 08 + ), there were 17 peptide pools that produced ≥ 25 IFN-γ spot-forming cells (Singh and Raghava, 2001; Rammensee et al., 1999) 



Human CD4 + memory T-lymphocyte responses to S

In [8]:
get_sentences_with_keywords(docs, ['school closure'])

Emerging and Reemerging Infectious Disease Threats Chapter 14 Emerging and Reemerging Infectious Disease Threats

188 A variety of nonpharmaceutical interventions, including school closure, were implemented to mitigate the impact of the 2009 influenza pandemic



Emerging and Reemerging Infectious Disease Threats Chapter 14 Emerging and Reemerging Infectious Disease Threats

172 A number of countries, including the United States, promulgated school closure policies, and observational studies suggest these policies led to reductions in transmission and in the occurrence of respiratory illness



The epidemic of 2019-novel-coronavirus (2019-nCoV) pneumonia and insights for emerging infectious diseases in the future

In addition, many other compulsory measures limiting population mobility, such as cancellation of mass gatherings, school closures, work-from-home arrangements, was taken to reduce withinpopulation contact rates.Human coronaviruses (CoVs) could cause respiratory, gastrointest