In [1]:
# Setup cell: makes the project modules importable from this notebook and
# defines the dataset locations used by the cells below.

import os
import sys

# This path should point to the current project folder. It is the only value
# that needs editing on another machine: the data paths are derived from it.
project_path = 'D:/Personal_Projects/Public/uh-trec-covid/uh-trec-covid/'

# Sample dataset folder inside the project, and the pdf_json parses inside it
raw_data_folder = project_path + 'dataset_sample/'
pdf_path = raw_data_folder + 'document_parses/pdf_json/'

# Put the project folder first on the module search path
sys.path.insert(0, os.path.abspath(project_path))


In [2]:
import json
import os
import io
import pandas as pd
import pickle


class CovidDatasetManager:
    """Access layer over a paper dataset made of a metadata CSV and per-paper JSON parses.

    Two dictionaries are maintained, both keyed by ``cord_uid``:

    * ``metadata_dict`` maps to ``{'title', 'abstract', 'pdf_file', 'pmc_file'}``,
      where the two file entries are paths relative to ``data_folder_path``
      (or NaN when the CSV had no value).
    * ``paper_dict`` maps to ``{'title', 'text', 'cord_uid'}``, where ``text`` is
      the list of body paragraphs read from the paper's JSON parse.

    Both dictionaries can be saved to / restored from pickle files placed in
    ``data_folder_path``.
    """

    # Single message used for every "document unavailable" condition
    _NOT_FOUND_MESSAGE = "Provided cord_uid does not match any document in our dataset"

    def __init__(self, metadata_file_path, data_folder_path):
        """Store the dataset locations; nothing is read from disk here.

        Args:
            metadata_file_path: path of the metadata CSV file.
            data_folder_path: folder holding the JSON parses and the pickle caches.
        """
        self.metadata_file_path = metadata_file_path
        self.data_folder_path = data_folder_path
        self.metadata_dict = {}
        self.paper_dict = {}

    def _path(self, relative_path):
        """Return ``relative_path`` located inside the data folder.

        ``os.path.join`` is used so the folder works with or without a trailing
        separator.
        """
        return os.path.join(self.data_folder_path, relative_path)

    @staticmethod
    def _select_doc_file(metadata):
        """Return the parse to read for a metadata entry: PMC first, then PDF.

        Returns ``None`` when the entry has neither file.
        """
        for key in ('pmc_file', 'pdf_file'):
            candidate = metadata[key]
            if not pd.isna(candidate):
                return candidate
        return None

    def _load_doc_for_uid(self, cord_uid):
        """Read the JSON parse of ``cord_uid`` and return ``{'title', 'text'}``.

        Raises:
            Exception: if the id is unknown, the entry has no file, or the
                selected file does not exist on disk.
        """
        if cord_uid not in self.metadata_dict:
            raise Exception(self._NOT_FOUND_MESSAGE)

        doc_file = self._select_doc_file(self.metadata_dict[cord_uid])
        # Check the file exists on disk before trying to parse it
        if doc_file is None or not os.path.isfile(self._path(doc_file)):
            raise Exception(self._NOT_FOUND_MESSAGE)

        return self._load_doc_from_json_(doc_file)

    # Reads the metadata file and store its information in a dict
    def load_metadata_from_csv(self):
        """Fill ``metadata_dict`` from the metadata CSV.

        The CSV is read from ``metadata_file_path``; if that is not an existing
        file, ``metadata.csv`` inside the data folder is used instead. Columns
        are selected by name, so extra or reordered columns do not matter.

        Rows with neither a pdf nor a pmc parse are skipped, and the first
        accepted row wins when a ``cord_uid`` is repeated. Existing entries of
        ``metadata_dict`` are kept.

        Raises:
            KeyError: if one of the required columns is missing from the CSV.
        """
        csv_path = self.metadata_file_path
        if not csv_path or not os.path.isfile(csv_path):
            csv_path = self._path('metadata.csv')

        df = pd.read_csv(csv_path, low_memory=False, dtype=str)

        needed_columns = ['cord_uid', 'title', 'abstract', 'pdf_json_files', 'pmc_json_files']
        rows = df[needed_columns].itertuples(index=False, name=None)

        for cord_uid, title, abstract, pdf_json_files, pmc_json_files in rows:
            # Check there are no repeated keys
            if cord_uid in self.metadata_dict:
                continue
            # Check there is a pdf or pmc file corresponding to that key
            if pd.isna(pdf_json_files) and pd.isna(pmc_json_files):
                continue
            self.metadata_dict[cord_uid] = {'title': title,
                                            'abstract': abstract,
                                            'pdf_file': pdf_json_files,
                                            'pmc_file': pmc_json_files}

    # Saves metadata dict to disk
    def save_metadata_as_pickle(self):
        """Write ``metadata_dict`` to ``metadata.pickle`` in the data folder."""
        with open(self._path('metadata.pickle'), 'wb') as file_handle:
            pickle.dump(self.metadata_dict, file_handle, pickle.HIGHEST_PROTOCOL)

    # Loads the previously saved metadata dict
    def load_metadata_from_pickle(self):
        """Replace ``metadata_dict`` with the content of ``metadata.pickle``.

        NOTE: ``pickle.load`` can execute arbitrary code; only load files this
        class wrote itself.
        """
        with open(self._path('metadata.pickle'), 'rb') as file_handle:
            self.metadata_dict = pickle.load(file_handle)

    # Loads a document data from a json file
    def _load_doc_from_json_(self, doc_file_path):
        """Parse one JSON file (path relative to the data folder).

        Returns:
            ``{'title': str, 'text': list of str}`` taken from the JSON
            ``metadata.title`` field and the ``text`` of each ``body_text`` item.
        """
        # The context manager closes the file even if parsing fails
        with io.open(file=self._path(doc_file_path), mode='r', encoding='utf-8') as doc_json_file:
            doc_json = json.load(doc_json_file)

        paper_title = doc_json['metadata']['title']
        paper_body_text = [paragraph['text'] for paragraph in doc_json['body_text']]

        return {'title': paper_title, 'text': paper_body_text}

    # Given a cord_uid returns a document in dict format from its json file. The text att is a list of str
    def get_document_from_jsom(self, cord_uid):
        """Read a document from disk, keeping its paragraphs as a list.

        Returns:
            ``{'title': str, 'text': list of str, 'cord_uid': cord_uid}``.

        Raises:
            Exception: if the id is unknown or its file is not on disk.
        """
        doc = self._load_doc_for_uid(cord_uid)
        doc['cord_uid'] = cord_uid
        return doc

    # Given a cord_uid returns a document in dict format from its json file. The text att is a single str
    def get_document_from_jsom_no_paragraph_list(self, cord_uid):
        """Read a document from disk with its paragraphs joined into one string.

        The paragraphs are concatenated with no separator.

        Returns:
            ``{'cord_uid': cord_uid, 'title': str, 'text': str}``.

        Raises:
            Exception: if the id is unknown or its file is not on disk.
        """
        pre_doc = self._load_doc_for_uid(cord_uid)
        return {'cord_uid': cord_uid, 'title': pre_doc['title'], 'text': ''.join(pre_doc['text'])}

    # Creates an in-memory dict with all the documents (can be memory-expensive)
    def create_papers_dict(self):
        """Load every document whose selected file exists on disk into ``paper_dict``.

        Entries whose file is missing are skipped silently. Existing entries of
        ``paper_dict`` are kept unless overwritten by the same ``cord_uid``.
        """
        for cord_uid, metadata in self.metadata_dict.items():
            doc_file = self._select_doc_file(metadata)
            # Check if a document was found in the folders
            if doc_file is None or not os.path.isfile(self._path(doc_file)):
                continue

            doc = self._load_doc_from_json_(doc_file)
            doc['cord_uid'] = cord_uid
            self.paper_dict[cord_uid] = doc

    # Saves to disc the in-memory dict of documents
    def save_docs_dict_as_pickle(self):
        """Write ``paper_dict`` to ``docs_dict.pickle`` in the data folder."""
        with open(self._path('docs_dict.pickle'), 'wb') as file_handle:
            pickle.dump(self.paper_dict, file_handle, pickle.HIGHEST_PROTOCOL)

    # Loads the previously saved dict of documents
    def load_docs_dict_from_pickle(self):
        """Replace ``paper_dict`` with the content of ``docs_dict.pickle``.

        NOTE: ``pickle.load`` can execute arbitrary code; only load files this
        class wrote itself.
        """
        with open(self._path('docs_dict.pickle'), 'rb') as file_handle:
            self.paper_dict = pickle.load(file_handle)

    # Given a cord_uid returns a document in dict format from the in-memory dict. The text att is a list of str
    def get_document_from_dict(self, cord_uid):
        """Return the stored document for ``cord_uid`` (the same dict object, not a copy).

        Raises:
            Exception: if the id is not in ``paper_dict``.
        """
        if cord_uid not in self.paper_dict:
            raise Exception(self._NOT_FOUND_MESSAGE)

        return self.paper_dict[cord_uid]

    # Given a cord_uid returns a document in dict format from the in-memory dict. The text att is a single str
    def get_document_from_dict_no_paragraph_list(self, cord_uid):
        """Return a new dict for ``cord_uid`` with its paragraphs joined into one string.

        The paragraphs are concatenated with no separator.

        Raises:
            Exception: if the id is not in ``paper_dict``.
        """
        if cord_uid not in self.paper_dict:
            raise Exception(self._NOT_FOUND_MESSAGE)

        pre_doc = self.paper_dict[cord_uid]
        return {'cord_uid': cord_uid, 'title': pre_doc['title'], 'text': ''.join(pre_doc['text'])}


In [3]:

# Build a manager over the sample dataset and parse its metadata CSV
cov_dm = CovidDatasetManager(
    metadata_file_path=raw_data_folder + 'metadata.csv',
    data_folder_path=raw_data_folder,
)
cov_dm.load_metadata_from_csv()

# Show the metadata entry stored for one cord_uid
print(cov_dm.metadata_dict['69gftii4'])


{'title': 'The gene of an archaeal α-l-fucosidase is expressed by translational frameshifting', 'abstract': 'The standard rules of genetic translational decoding are altered in specific genes by different events that are globally termed recoding. In Archaea recoding has been unequivocally determined so far only for termination codon readthrough events. We study here the mechanism of expression of a gene encoding for a α-l-fucosidase from the archaeon Sulfolobus solfataricus (fucA1), which is split in two open reading frames separated by a −1 frameshifting. The expression in Escherichia coli of the wild-type split gene led to the production by frameshifting of full-length polypeptides with an efficiency of 5%. Mutations in the regulatory site where the shift takes place demonstrate that the expression in vivo occurs in a programmed way. Further, we identify a full-length product of fucA1 in S.solfataricus extracts, which translate this gene in vitro by following programmed −1 frameshift

In [4]:

# Write the metadata dict to metadata.pickle in the data folder ...
cov_dm.save_metadata_as_pickle()

# ... then read it back through a second, independent manager
cov_dm2 = CovidDatasetManager(
    metadata_file_path=raw_data_folder + 'metadata.csv',
    data_folder_path=raw_data_folder,
)
cov_dm2.load_metadata_from_pickle()

# Prints whether the restored entry's pmc_file is missing (NaN)
print(pd.isna(cov_dm2.metadata_dict['69gftii4']['pmc_file']))


False


In [5]:
# Read one document directly from its JSON parse on disk; 'text' is a list of paragraphs
doc = cov_dm.get_document_from_jsom('69gftii4')
doc


{'title': 'The gene of an archaeal α-l-fucosidase is expressed by translational frameshifting',
 'text': ['Translation is optimally accurate and the correspondence between the nucleotide and the protein sequences are often considered as an immutable dogma. However, the genetic code is not quite universal: in certain organelles and in a small number of organisms the meaning of different codons has been reassigned and all the mRNAs are decoded accordingly. More surprisingly, the standard rules of genetic decoding are altered in specific genes by different events that are globally termed recoding (1). In all cases, translational recoding occurs in competition with normal decoding, with a proportion of the ribosomes not obeying to the ‘universal’ rules. Translational recoding has been identified in both prokaryotes and eukaryotes. It has crucial roles in the regulation of gene expression and includes stop codon readthrough, ribosome hopping and ±1 programmed frameshifting [for reviews see 

In [6]:
# Load every document whose file exists on disk into cov_dm.paper_dict (can be memory-expensive)
cov_dm.create_papers_dict()


In [7]:
# for k, v in cov_dm.paper_dict.items():
#     print(v)


In [8]:
# Write the in-memory documents to docs_dict.pickle in the data folder
cov_dm.save_docs_dict_as_pickle()

# Read the same pickle back, replacing cov_dm.paper_dict
cov_dm.load_docs_dict_from_pickle()


In [9]:
# for k, v in cov_dm.paper_dict.items():
#     print(v)


In [10]:
# Look the document up in the in-memory dict; 'text' is a list of paragraphs
cov_dm.get_document_from_dict('69gftii4')


{'title': 'The gene of an archaeal α-l-fucosidase is expressed by translational frameshifting',
 'text': ['Translation is optimally accurate and the correspondence between the nucleotide and the protein sequences are often considered as an immutable dogma. However, the genetic code is not quite universal: in certain organelles and in a small number of organisms the meaning of different codons has been reassigned and all the mRNAs are decoded accordingly. More surprisingly, the standard rules of genetic decoding are altered in specific genes by different events that are globally termed recoding (1). In all cases, translational recoding occurs in competition with normal decoding, with a proportion of the ribosomes not obeying to the ‘universal’ rules. Translational recoding has been identified in both prokaryotes and eukaryotes. It has crucial roles in the regulation of gene expression and includes stop codon readthrough, ribosome hopping and ±1 programmed frameshifting [for reviews see 

In [11]:
# Same in-memory lookup, but with the paragraphs joined into a single string
cov_dm.get_document_from_dict_no_paragraph_list('69gftii4')


{'cord_uid': '69gftii4',
 'title': 'The gene of an archaeal α-l-fucosidase is expressed by translational frameshifting',
 'text': "Translation is optimally accurate and the correspondence between the nucleotide and the protein sequences are often considered as an immutable dogma. However, the genetic code is not quite universal: in certain organelles and in a small number of organisms the meaning of different codons has been reassigned and all the mRNAs are decoded accordingly. More surprisingly, the standard rules of genetic decoding are altered in specific genes by different events that are globally termed recoding (1). In all cases, translational recoding occurs in competition with normal decoding, with a proportion of the ribosomes not obeying to the ‘universal’ rules. Translational recoding has been identified in both prokaryotes and eukaryotes. It has crucial roles in the regulation of gene expression and includes stop codon readthrough, ribosome hopping and ±1 programmed framesh

In [13]:
# Demonstrates the failure path: in the recorded run this cord_uid could not be
# resolved to a document. The error is caught and printed so the notebook still
# runs from top to bottom.
try:
    print(cov_dm.get_document_from_jsom('uo98qzm2'))
except Exception as error:
    print('{}: {}'.format(type(error).__name__, error))

Exception: Provided cord_uid does not match any document in our dataset