In [1]:
# Root of the local project checkout; every other path below is derived from it.
# NOTE(review): this is an absolute, machine-specific location - adjust it when
# running the notebook on another machine.
PARENT = "/usr/local/git/kaggle-covid-19/"
DATA_RAW = PARENT + "data/raw/CORD-19-research-challenge/"
DATA_INTERIM = PARENT + "data/interim/"

# PARENT already ends with "/", so the sub-path must not start with one
# (the previous value expanded to ".../kaggle-covid-19//reports/figures").
FIGURE = PARENT + "reports/figures"

# CSV index shipped alongside the raw dataset.
METADATA = DATA_RAW + "metadata.csv"


In [None]:
def read_textual_file(file_path):
    """
    Read the file at ``file_path`` and return its contents as a ``str``.

    The file is decoded explicitly as UTF-8. Without an explicit encoding,
    ``open`` falls back to the platform's locale encoding, so the same file
    could decode differently (or fail) from one machine to the next.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file to read.

    Returns
    -------
    str
        The full decoded contents of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid UTF-8.
    """
    # Text mode always yields ``str``, so no bytes-to-str conversion is
    # needed after reading.
    with open(file_path, encoding='utf-8') as f:
        return f.read()

In [12]:
import pandas as pd

def detail_csv_file(file_path, **kwargs):
    """
    Load a CSV file into a DataFrame and print a short summary of it.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, handed to ``pd.read_csv``.
    **kwargs
        Forwarded unchanged to ``pd.read_csv``; they are also echoed to
        stdout before the file is read.

    Returns
    -------
    pandas.DataFrame
        The loaded frame. Its shape and columns are printed before returning.
    """
    print("Kwargs: ", kwargs)

    frame = pd.read_csv(file_path, **kwargs)

    # Build both report lines first, then emit them in order.
    report = [
        "DataFrame Size: {}".format(frame.shape),
        "DataFrame Columns: {}".format(frame.columns),
    ]
    for line in report:
        print(line)

    return frame

In [None]:
import json

def read_json_file(file_path):
    """
    Parse the JSON document stored at ``file_path``.

    The file is opened explicitly as UTF-8 so that parsing does not depend on
    the platform's locale encoding.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the document (dict, list, str,
        number, bool or None).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    with open(file_path, encoding='utf-8') as f:
        return json.load(f)

def write_json_file(file_path, data):
    """
    Serialise ``data`` as JSON into the file at ``file_path``.

    The file is opened in write mode, so any existing content is replaced.
    Returns None.
    """
    with open(file_path, mode='w') as sink:
        json.dump(data, fp=sink)

In [4]:
class Authors:
    """
    One author record, built from a dict of author fields.

    Attributes
    ----------
    author_position : int
        Position value supplied by the caller.
    first, middle, last, suffix, affiliation, email
        Values looked up under the same-named keys of ``json_data``; a key
        that is absent yields ``None``.
    """

    def __init__(self, author_position, json_data):
        position_ok = isinstance(author_position, int)
        assert position_ok, "author position signified by numeric"
        payload_ok = isinstance(json_data, dict)
        assert payload_ok, "json_data should be dict python type"

        self.author_position = author_position
        # Copy every known author field across, in a fixed order.
        for field in ("first", "middle", "last", "suffix", "affiliation", "email"):
            setattr(self, field, json_data.get(field))



class Abstract:
    """
    One abstract paragraph, built from a dict of paragraph fields.

    Attributes
    ----------
    para_position : int
        Position value supplied by the caller.
    text, cite_spans, ref_spans
        Values looked up under the same-named keys of ``json_data``; a key
        that is absent yields ``None``.
    """

    def __init__(self, para_position, json_data):
        assert isinstance(para_position, int), "paragraph position signified by numeric"
        assert isinstance(json_data, dict), "json_data should be dict python type"

        fetch = json_data.get  # missing keys come back as None
        self.para_position = para_position
        self.text = fetch("text")
        self.cite_spans = fetch("cite_spans")
        self.ref_spans = fetch("ref_spans")

class BodyText:
    """
    One paragraph of running text, built from a dict of paragraph fields.

    Attributes
    ----------
    text, cite_spans, ref_spans, section
        Values looked up under the same-named keys of ``json_data``; a key
        that is absent yields ``None``.
    """

    def __init__(self, json_data):
        assert isinstance(json_data, dict), "json_data should be dict python type"
        # Assign all four fields in one unpacking step, left to right.
        self.text, self.cite_spans, self.ref_spans, self.section = (
            json_data.get(key)
            for key in ("text", "cite_spans", "ref_spans", "section")
        )

class BibEntries:
    """
    One bibliography entry, built from its reference name and a field dict.

    Attributes
    ----------
    bib_ref : str
        Name of the reference as supplied by the caller (e.g. "BIBREF0").
    authors : list of Authors
        One ``Authors`` object per item under the "authors" key, numbered
        from 0. Empty when the key is missing, null or an empty list.
    ref_id, title, year, venue, volume, issn, pages, other_ids,
    cite_spans, ref_spans, section
        Values looked up under the same-named keys of ``json_data``; a key
        that is absent yields ``None``.
    """

    def __init__(self, bib_ref, json_data):
        assert isinstance(bib_ref, str), "bib_ref needs to be the name of the bibreference e.g. BIBREF0"
        assert isinstance(json_data, dict), "json_data should be dict python type"
        self.bib_ref = bib_ref
        self.ref_id = json_data.get("ref_id")
        self.title = json_data.get("title")
        # A missing or null "authors" value used to raise TypeError inside
        # enumerate(); treat it as "no authors" instead.
        raw_authors = json_data.get("authors") or []
        self.authors = [Authors(author_position=i, json_data=x) for i, x in enumerate(raw_authors)]
        self.year = json_data.get("year")
        self.venue = json_data.get("venue")
        self.volume = json_data.get("volume")
        self.issn = json_data.get("issn")
        self.pages = json_data.get("pages")
        self.other_ids = json_data.get("other_ids")
        # NOTE(review): the next three keys mirror the paragraph classes;
        # confirm that bibliography entries actually carry them.
        self.cite_spans = json_data.get("cite_spans")
        self.ref_spans = json_data.get("ref_spans")
        self.section = json_data.get("section")

class KaggleCovidJSON:
    """
    Object view of one parsed full-text JSON document.

    Attributes
    ----------
    paper_id
        Value of the top-level "paper_id" key.
    title
        Value of "title" inside the "metadata" object.
    authors : list of Authors or None
        Built from "authors" inside "metadata"; None when missing or empty.
    abstract : list of Abstract or None
        Built from the top-level "abstract" list, falling back to an
        "abstract" list inside "metadata"; None when neither has content.
    body_text : list of BodyText or None
        Built from the top-level "body_text" list; None when missing or empty.
    bib_entries : list of BibEntries or None
        Built from the items of the top-level "bib_entries" dict; None when
        missing or empty.
    ref_entries
        Raw value of the top-level "ref_entries" key.
    back_matter : list of BodyText or None
        Built from the top-level "back_matter" list; None when missing or empty.
    """

    def __init__(self, json_data):
        assert isinstance(json_data, dict), "the json data should be dict python type"
        # Work on a detached copy obtained by a JSON round-trip. json.dumps
        # already renders Python None as JSON null, so no textual substitution
        # is needed. The earlier str.replace('None', 'null') rewrote the word
        # "None" inside ordinary text as well, corrupting titles and paragraphs.
        json_data = json.loads(json.dumps(json_data))
        self.paper_id = json_data.get("paper_id")

        # A missing or null "metadata" object is treated as empty instead of
        # failing on the attribute lookups below.
        _meta_data = json_data.get("metadata") or {}

        self.title = _meta_data.get("title")

        raw_authors = _meta_data.get("authors")
        if raw_authors:
            self.authors = \
              [Authors(author_position=i, json_data=x) for i, x in enumerate(raw_authors)]
        else:
            self.authors = None

        # The abstract is looked up at the top level of the document first;
        # the "metadata" location is kept as a fallback for inputs that nest
        # it there.
        raw_abstract = json_data.get("abstract") or _meta_data.get("abstract")
        if raw_abstract:
            self.abstract = \
              [Abstract(para_position=i, json_data=x) for i, x in enumerate(raw_abstract)]
        else:
            self.abstract = None

        raw_body = json_data.get("body_text")
        if raw_body:
            self.body_text = [BodyText(json_data=x) for x in raw_body]
        else:
            self.body_text = None

        raw_bib = json_data.get("bib_entries")
        if raw_bib:
            self.bib_entries = [BibEntries(bib_ref=x, json_data=y) for x, y in raw_bib.items()]
        else:
            self.bib_entries = None

        self.ref_entries = json_data.get("ref_entries")

        raw_back = json_data.get("back_matter")
        if raw_back:
            self.back_matter = [BodyText(json_data=x) for x in raw_back]
        else:
            self.back_matter = None