# Load Json Papers

In [9]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
%load_ext autoreload
%autoreload 2
from cord.cord19 import ResearchPapers, load_json

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
#research_papers = ResearchPapers.from_data_dir()
research_papers = ResearchPapers.from_pickle()

In [3]:
jpaper = research_papers[3].full_text_path()

In [4]:
jpath = 'data/CORD-19-research-challenge/custom_license/custom_license/aecbc613ebdab36753235197ffb4f35734b5ca63.json'

## Loading Json

In [21]:
import json

def load_json(json_file):
    with open(json_file, 'r') as f:
        return json.load(f)

paper_json = load_json(jpath)

pp.pprint(paper_json)

{   'abstract': [   {   'cite_spans': [],
                        'ref_spans': [],
                        'section': 'Abstract',
                        'text': 'Middle-aged female identical twins, one of '
                                'whom had systemic lupus erythematosus (SLE), '
                                'were evaluated for immunologic reactivity to '
                                'previous antigenic challenges, including '
                                'primary immunization with a foreign antigen, '
                                'keyhole limpet hemocyanin (KLH). These two '
                                'women had lived together for all of their 58 '
                                'years and neither was receiving '
                                'anti-inflammatory or immunosuppressive drugs '
                                'at the time of these studies. Both twins '
                                'demonstrated comparable 7s and 19s humoral '
                 

## Get Body and Abstract

In [73]:
import collections
from functools import  partial

def get_text(paper, text_key):
    body_dict = collections.defaultdict(list)
    for rec in paper[text_key]:
        body_dict[rec['section']].append(rec['text'])
    
    body = ''
    for section, text_sections in body_dict.items():
        body += section + '\n\n'
        for text in text_sections:
            body += text + '\n\n'
    return body

get_body = partial(get_text, text_key='body_text')
get_abstract = partial(get_text, text_key='abstract') 
body = get_body(paper_json)
abstract = get_abstract(paper_json)
#print(body)
print(abstract)

Abstract

Middle-aged female identical twins, one of whom had systemic lupus erythematosus (SLE), were evaluated for immunologic reactivity to previous antigenic challenges, including primary immunization with a foreign antigen, keyhole limpet hemocyanin (KLH). These two women had lived together for all of their 58 years and neither was receiving anti-inflammatory or immunosuppressive drugs at the time of these studies. Both twins demonstrated comparable 7s and 19s humoral antibody response to KLH, as well as similar viral antibody titers. However, the twin with SLE was anergic to common antigens, streptokinase-streptodornase, Trichophyton and Candida; furthermore delayed hypersensitivity to KLH did not develop after immunization. This observed discrepancy between humoral and cellular immunity in genetically similar subjects may be significant in the pathogenesis of SLE.

Reports of an increased incidence of systemic lupus erythematosus (SLE), other connective tissue diseases, and sero

## Authors

In [96]:
def author_name(author_json):
    first = author_json.get('first')
    middle = "".join(author_json.get('middle'))
    last = author_json.get('last')
    if middle:
        return ' '.join([first, middle, last])
    return ' '.join([first, last])

def get_affiliation(author_json):
    affiliation = author_json['affiliation']
    institution = affiliation.get('institution', '')
    location = affiliation.get('location')
    if location:
        location = ' '.join(location.values())
    return f'{institution}, {location}'

def get_authors(paper, include_affiliation=False):
    if include_affiliation:
        return [f'{author_name(a)}, {get_affiliation(a)}'
                   for a in paper['metadata']['authors']]
    else:
        return [author_name(a) for a in paper['metadata']['authors']]

authors = get_authors(paper_json, True)
authors

['Carolyn M Brunner, University of Virginia, Charlottesville Virginia',
 'A David, University of Virginia, Charlottesville Virginia',
 ' Horwitz, University of Virginia, Charlottesville Virginia',
 'K Mary, University of Virginia, Charlottesville Virginia',
 ' Shann, University of Virginia, Charlottesville Virginia',
 'Benjamin A Sturgill, University of Virginia, Charlottesville Virginia',
 'S John, University of Virginia, Charlottesville Virginia',
 ' Davis, University of Virginia, Charlottesville Virginia',
 'Virginia Charlottesville, University of Virginia, Charlottesville Virginia']

In [81]:
paper_json['metadata']['authors']

[{'first': 'Carolyn',
  'middle': ['M'],
  'last': 'Brunner',
  'suffix': '',
  'affiliation': {'laboratory': '',
   'institution': 'University of Virginia',
   'location': {'settlement': 'Charlottesville', 'region': 'Virginia'}},
  'email': ''},
 {'first': 'A',
  'middle': [],
  'last': 'David',
  'suffix': '',
  'affiliation': {'laboratory': '',
   'institution': 'University of Virginia',
   'location': {'settlement': 'Charlottesville', 'region': 'Virginia'}},
  'email': ''},
 {'first': '',
  'middle': [],
  'last': 'Horwitz',
  'suffix': '',
  'affiliation': {'laboratory': '',
   'institution': 'University of Virginia',
   'location': {'settlement': 'Charlottesville', 'region': 'Virginia'}},
  'email': ''},
 {'first': 'K',
  'middle': [],
  'last': 'Mary',
  'suffix': '',
  'affiliation': {'laboratory': '',
   'institution': 'University of Virginia',
   'location': {'settlement': 'Charlottesville', 'region': 'Virginia'}},
  'email': ''},
 {'first': '',
  'middle': [],
  'last': 'Sha

In [10]:
pp.pprint(jpaper.paper)

{   'abstract': [   {   'cite_spans': [],
                        'ref_spans': [],
                        'section': 'Abstract',
                        'text': 'Middle-aged female identical twins, one of '
                                'whom had systemic lupus erythematosus (SLE), '
                                'were evaluated for immunologic reactivity to '
                                'previous antigenic challenges, including '
                                'primary immunization with a foreign antigen, '
                                'keyhole limpet hemocyanin (KLH). These two '
                                'women had lived together for all of their 58 '
                                'years and neither was receiving '
                                'anti-inflammatory or immunosuppressive drugs '
                                'at the time of these studies. Both twins '
                                'demonstrated comparable 7s and 19s humoral '
                 

## JPaper

In [22]:
jpaper

## Abstract

In [15]:
jpaper.paper['abstract'][0]

{'text': 'Middle-aged female identical twins, one of whom had systemic lupus erythematosus (SLE), were evaluated for immunologic reactivity to previous antigenic challenges, including primary immunization with a foreign antigen, keyhole limpet hemocyanin (KLH). These two women had lived together for all of their 58 years and neither was receiving anti-inflammatory or immunosuppressive drugs at the time of these studies. Both twins demonstrated comparable 7s and 19s humoral antibody response to KLH, as well as similar viral antibody titers. However, the twin with SLE was anergic to common antigens, streptokinase-streptodornase, Trichophyton and Candida; furthermore delayed hypersensitivity to KLH did not develop after immunization. This observed discrepancy between humoral and cellular immunity in genetically similar subjects may be significant in the pathogenesis of SLE.',
 'cite_spans': [],
 'ref_spans': [],
 'section': 'Abstract'}

## Section

In [18]:
jpaper.paper['body_text']

[{'text': 'The patient (Fo, ) was a 58 year old mentally retarded white woman, born in a rural area of southwestern Virginia.',
  'cite_spans': [{'start': 12, 'end': 16, 'text': '(Fo,', 'ref_id': None}],
  'ref_spans': [],
  'section': ''},
 {'text': 'In July 1967 she was referred to the University of Virginia Hospital (UVH) because of edema of the legs and facial swelling of recent onset. She admitted having arthralgias but denied having recent rash, pleurisy or hair loss. Previous medical history revealed that a systolic heart murmur had been heard in 1962. A hemogram and urinalysis at that time were normal. In June 1963 she had been admitted to her local hospital with congestive heart failure attributed to mitral insufficiency.',
  'cite_spans': [],
  'ref_spans': [],
  'section': ''},
 {'text': 'Laboratory studies included a positive lupus erythematosus cell preparation, and she received a brief course of prednisone therapy. A pruritic rash of her neck and trunk was recorded in Jun

## Text

In [30]:
jpaper.get_text()

"The patient (Fo, ) was a 58 year old mentally retarded white woman, born in a rural area of southwestern Virginia. \n In July 1967 she was referred to the University of Virginia Hospital (UVH) because of edema of the legs and facial swelling of recent onset. She admitted having arthralgias but denied having recent rash, pleurisy or hair loss. Previous medical history revealed that a systolic heart murmur had been heard in 1962. A hemogram and urinalysis at that time were normal. In June 1963 she had been admitted to her local hospital with congestive heart failure attributed to mitral insufficiency. \n Laboratory studies included a positive lupus erythematosus cell preparation, and she received a brief course of prednisone therapy. A pruritic rash of her neck and trunk was recorded in June 1964. \n Family members included a healthy twin sister (Case 2), who was identical in appearance. \n The sisters shared the phenotype Gm (3, 5, 13, 14) ; Inv(-l)* and common red blood cell groups (t