In [None]:
import tensorflow as tf
tf.__version__

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
# make sure we can see/access data
!ls ~/CORD19v3

In [None]:
# Root of the local CORD-19 v3 download and the metadata index stored under it.
root_path = '/home/eugene_chuvyrov/CORD19v3/'
metadata_path = f'{root_path}/metadata.csv'

# Identifier columns are parsed as str rather than left to pandas' type inference.
id_column_dtypes = dict.fromkeys(['pubmed_id', 'Microsoft Academic Paper ID', 'doi'], str)
meta_df = pd.read_csv(metadata_path, dtype=id_column_dtypes)
meta_df.head()

In [None]:
meta_df.info()

In [None]:
# Collect every paper JSON below the data root (searched recursively).
# glob makes no ordering promise (it depends on the file system), so sort the
# result: all_json[0], the processing order and the copy that a later
# drop_duplicates keeps are then the same on every run.
all_json = sorted(glob.glob(f'{root_path}/**/*.json', recursive=True))
len(all_json)

In [None]:
# define a class to hold info about each paper
class FileReader:
    """Parsed view of a single paper JSON file.

    Attributes set by ``__init__``:
        paper_id:   value of the top-level ``paper_id`` key.
        abstract:   ``text`` of every entry under ``abstract`` and ``Abstract``,
                    joined with newlines ('' when neither key is present).
        body_text:  ``text`` of every ``body_text`` entry, each wrapped between a
                    ``--BEGIN PARAGRAPH---`` line and an ``--END PARAGRAPH---``
                    line, joined with newlines ('' when the key is absent).
        references: list holding the ``title`` of every ``bib_entries`` value
                    ([] when the key is absent).
    """

    def __init__(self, file_path):
        """Read ``file_path`` and extract id, abstract, body text and reference titles.

        Args:
            file_path: path of the JSON file to load.

        Raises:
            KeyError: if ``paper_id`` is missing, or an entry lacks ``text`` / ``title``.
            json.JSONDecodeError: if the file is not valid JSON.
        """
        # Decode explicitly as UTF-8 instead of the platform default encoding,
        # and release the file handle as soon as the document is parsed.
        with open(file_path, encoding='utf-8') as file:
            content = json.load(file)

        self.paper_id = content['paper_id']

        # Abstract: the section may appear under either spelling of the key.
        abstract_parts = []
        for key in ('abstract', 'Abstract'):
            abstract_parts.extend(entry['text'] for entry in content.get(key, []))

        # Body text: surround each paragraph with marker lines so paragraphs
        # can be told apart again after everything is joined into one string.
        # A missing section is treated like a missing abstract: as empty.
        body_parts = []
        for entry in content.get('body_text', []):
            body_parts.extend(('--BEGIN PARAGRAPH---', entry['text'], '--END PARAGRAPH---'))

        # References: titles only, in the order the entries appear in the file.
        self.references = [
            reference['title'] for reference in content.get('bib_entries', {}).values()
        ]

        self.abstract = '\n'.join(abstract_parts)
        self.body_text = '\n'.join(body_parts)

    def __repr__(self):
        """Return the paper id with the first 200 characters of abstract and body."""
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
    
# Smoke test: parse the first discovered file and print its marker-wrapped body text.
first_row = FileReader(all_json[0])
print(first_row.body_text)

In [None]:
def get_breaks(content, length):
    """Re-join the space-separated words of ``content`` with periodic ``<br>`` tags.

    A running total of word lengths (spaces are not counted) is kept. A word
    that pushes the total above ``length`` is prefixed with ``<br>`` and the
    total restarts at 0; every other word is prefixed with a single space.
    The returned string therefore always begins with a separator.
    """
    pieces = []
    running_chars = 0

    for word in content.split(' '):
        running_chars += len(word)
        if running_chars > length:
            # line is full: break before this word and start counting afresh
            pieces.append("<br>" + word)
            running_chars = 0
        else:
            pieces.append(" " + word)

    return "".join(pieces)

In [None]:
# Loop through all papers and collect one record per paper that has a metadata row.
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'references': [], 'title': [], 'journal': [], 'abstract_summary': []}

# Report progress about every 10% of the files; max(1, ...) keeps the modulo
# below from dividing by zero when fewer than 10 files were found.
progress_step = max(1, len(all_json) // 10)

for idx, entry in enumerate(all_json):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)

    # Metadata rows whose sha equals this paper's id; looked up once and reused below.
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue

    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    dict_['references'].append(content.references)

    # also create a column for the summary of abstract to be used in a plot
    abstract_words = content.abstract.split(' ')
    if len(content.abstract) == 0:
        # no abstract provided
        dict_['abstract_summary'].append("Not provided.")
    elif len(abstract_words) > 100:
        # abstract is too long for the plot: keep the first 100 words and append "..."
        summary = get_breaks(' '.join(abstract_words[:100]), 40)
        dict_['abstract_summary'].append(summary + "...")
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, 40)
        dict_['abstract_summary'].append(summary)

    # Only the first matching metadata row is used for authors, title and journal.
    raw_authors = meta_data['authors'].values[0]
    if isinstance(raw_authors, str):
        authors = raw_authors.split(';')
        if len(authors) > 2:
            # more than 2 authors may be a problem when plotting, so take the first 2 and append "..."
            dict_['authors'].append(". ".join(authors[:2]) + "...")
        else:
            # authors will fit in plot
            dict_['authors'].append(". ".join(authors))
    else:
        # author field is not a string (e.g. a missing value): store it unchanged
        dict_['authors'].append(raw_authors)

    # add the title information
    title = meta_data['title'].values[0]
    dict_['title'].append(title)

    # add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])

df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'references', 'abstract_summary'])
df_covid.head()

In [None]:
# Whitespace-delimited token counts for the abstract and the body.
# body_text still contains the paragraph marker lines inserted by FileReader
# at this point, so those marker tokens are included in body_word_count.
for text_col, count_col in (('abstract', 'abstract_word_count'), ('body_text', 'body_word_count')):
    df_covid[count_col] = df_covid[text_col].apply(lambda text: len(text.split()))
df_covid.head()

In [None]:
df_covid.info()

In [None]:
df_covid['abstract'].describe(include='all')

In [None]:
# remove duplicates: rows whose abstract AND body_text are both identical
#  (the first occurrence is kept; the frame is modified in place)
df_covid.drop_duplicates(['abstract', 'body_text'], inplace=True)
df_covid['abstract'].describe(include='all')

In [None]:
df_covid.head()

In [None]:
df_covid.describe()

In [None]:
# Drop, in place, every row that has a missing value in any column.
# NOTE(review): authors/title/journal are copied from the metadata CSV, so this
#  presumably removes papers lacking any of those fields - confirm that is intended.
df_covid.dropna(inplace=True)
df_covid.info()

In [None]:
# take a butcher's knife and kill all special characters in our docs
#  this is brutal and leads to a massive information loss in our scientific corpus of data
#  but this will do for initial modeling
import re

# Keep ASCII letters, digits and whitespace only.
#  The upper-case range must be spelled A-Z: the range "A-z" also spans [ \ ] ^ _ and `,
#  so those characters were never removed. The raw string keeps "\s" from being
#  read as an (invalid) string escape.
special_chars = re.compile(r'[^a-zA-Z0-9\s]')

for text_column in ('body_text', 'abstract'):
    df_covid[text_column] = df_covid[text_column].apply(lambda text: special_chars.sub('', text))

In [None]:
# everything will be lowered (lowered-case that is)
def lower_case(input_str):
    """Return ``input_str`` converted to lower case."""
    return input_str.lower()

# Lower-case both free-text columns, one value at a time.
for text_column in ('body_text', 'abstract'):
    df_covid[text_column] = df_covid[text_column].apply(lower_case)

In [None]:
# since the training data was classified manually (i.e., by reading each paragraph and selecting an appropriate category),
#  this is a modest attempt at prioritizing which articles to read first
#  here, the ones I'll read first are the ones most frequently referenced by other papers in the corpus
#  a la Google PageRank, much simplified
#  an obvious weakness of this approach is that it prioritizes older literature, which may contain outdated facts
def create_article_refs_counts(df_covid19):
    """Count how many times each reference title occurs across all rows.

    Args:
        df_covid19: frame whose ``references`` column holds, per row, an
            iterable of reference titles.

    Returns:
        dict mapping each title to its number of occurrences, in order of
        first appearance.
    """
    tally = {}
    reference_lists = df_covid19['references']

    # walk the rows by index label and bump the counter of every title cited
    for row_label in df_covid19.index:
        for cited_title in reference_lists[row_label]:
            tally[cited_title] = tally.get(cited_title, 0) + 1

    return tally

In [None]:
from datetime import datetime, timedelta
from dateutil.parser import parse

def is_date(string, fuzzy=False):
    """
    Tell whether ``string`` is a str that parses with the format '%Y-%m-%d'.

    Non-str input yields False. The ``fuzzy`` argument is accepted but not used.
    """
    if isinstance(string, str):
        try:
            datetime.strptime(string, '%Y-%m-%d')
        except ValueError:
            return False
        return True

    return False
    
def get_top100_referenced_articles_metadata(max_candidates=30000):
    """Match the most frequently cited reference titles against the metadata table.

    Reference titles are counted over ``df_covid`` with
    ``create_article_refs_counts``; the ``max_candidates`` most frequent ones are
    compared with every title in ``meta_df`` after removing all whitespace and
    lower-casing both sides. Despite the function's name, the result is not
    limited to 100 entries: every match is reported.

    Args:
        max_candidates: how many of the most-cited titles to consider
            (``None`` considers all of them). Defaults to 30000.

    Returns:
        dict of three parallel lists:
            article_title: the reference title as it was cited.
            meta_link: the ``doi`` of the matching metadata row, prefixed with
                ``http://doi.org/`` when it is a string without that prefix;
                non-string values are passed through unchanged.
            frequency: how often the title was cited.
        Entries are ordered by metadata row, then by descending frequency.
    """
    dict_top100_ = {'article_title': [], 'meta_link': [], 'frequency': []}
    # now find most frequently referenced articles
    article_references = create_article_refs_counts(df_covid)
    sorted_article_refs = sorted(article_references.items(), key=lambda x: x[1], reverse=True)

    # Slicing copes with a corpus that has fewer than max_candidates distinct
    # titles (indexing a fixed range would raise IndexError).
    # Candidates are grouped by normalised title so each metadata row costs one
    # dict lookup instead of a scan over every candidate.
    candidates_by_title = {}
    for article_title, frequency in sorted_article_refs[:max_candidates]:
        if not isinstance(article_title, str):
            continue
        frequent_title = ''.join(article_title.split()).lower()
        candidates_by_title.setdefault(frequent_title, []).append((article_title, frequency))

    for index, row in meta_df.iterrows():
        meta_title = row['title']

        # a non-string title (e.g. a missing value) can never equal a cited title
        if not isinstance(meta_title, str):
            continue
        meta_title = ''.join(meta_title.split()).lower()

        matches = candidates_by_title.get(meta_title)
        if not matches:
            continue

        meta_link = row['doi']
        # more unexpected data types handling
        if isinstance(meta_link, str):
            if not meta_link.startswith('http://doi.org/'):
                meta_link = 'http://doi.org/' + meta_link

        for article_title, frequency in matches:
            dict_top100_['article_title'].append(article_title)
            dict_top100_['meta_link'].append(meta_link)
            dict_top100_['frequency'].append(frequency)

    return dict_top100_

In [None]:
# Build the frame of matched references, most frequently cited first.
top100md = get_top100_referenced_articles_metadata()
df_top100 = (
    pd.DataFrame(top100md, columns=['article_title', 'meta_link', 'frequency'])
    .sort_values(by='frequency', ascending=False)
)

In [None]:
top100md.keys()

In [None]:
df_top100.head()

In [None]:
# now produce a data frame listing the articles, one paragraph per row
#  the unit of classification will be a paragraph of text, not an article
#  something that may generalize well (or not)
def get_training_set():
    """Explode ``df_covid`` into one training record per body-text paragraph.

    ``body_text`` is split on the ``begin paragraph`` marker. The closing
    ``end paragraph`` marker left at the end of each piece is removed, and
    pieces that are empty after cleaning (such as the text before the first
    marker) are skipped.

    Citation fields (``article_title``, ``meta_link``, ``frequency``):
    ``df_covid`` as assembled in this notebook has no such columns, so reading
    them straight from the row raised KeyError. They are taken from the row when
    those columns exist; otherwise from the ``df_top100`` entry whose title
    equals the paper's title after whitespace removal and lower-casing. A paper
    without a match keeps its own title, gets ``None`` as link and a frequency of 0.

    Returns:
        dict of equal-length lists keyed by paper_id, article_title, meta_link,
        abstract, paragraph_text, frequency, authors, references, journal and
        abstract_summary; one list position per paragraph.
    """
    dict_trainingset_ = {'paper_id': [], 'article_title': [], 'meta_link': [], 'abstract': [], 'paragraph_text': [], 'frequency': [], 'authors': [], 'references': [], 'journal': [], 'abstract_summary': []}

    citation_columns = ('article_title', 'meta_link', 'frequency')
    has_citation_columns = all(column in df_covid.columns for column in citation_columns)

    # Citation metadata keyed by normalised title; only needed when df_covid
    # does not already carry it. setdefault keeps the first entry per title.
    top100_by_title = {}
    if not has_citation_columns:
        for cited in zip(*(df_top100[column] for column in citation_columns)):
            cited_title = cited[0]
            if isinstance(cited_title, str):
                top100_by_title.setdefault(''.join(cited_title.split()).lower(), cited)

    end_marker = 'end paragraph'

    for index, row in df_covid.iterrows():
        if has_citation_columns:
            article_title, meta_link, frequency = (row[column] for column in citation_columns)
        else:
            covid_title = row['title']
            # some unexpected data types handling
            if isinstance(covid_title, str):
                lookup_key = ''.join(covid_title.split()).lower()
            else:
                lookup_key = None
            article_title, meta_link, frequency = top100_by_title.get(
                lookup_key, (covid_title, None, 0)
            )

        for piece in row['body_text'].split('begin paragraph'):
            paragraph = piece.strip()
            # every piece still ends with the closing marker of its paragraph
            if paragraph.endswith(end_marker):
                paragraph = paragraph[:-len(end_marker)].strip()
            if not paragraph:
                continue

            dict_trainingset_['article_title'].append(article_title)
            dict_trainingset_['meta_link'].append(meta_link)
            dict_trainingset_['frequency'].append(frequency)

            dict_trainingset_['paper_id'].append(row['paper_id'])
            dict_trainingset_['abstract'].append(row['abstract'])
            dict_trainingset_['paragraph_text'].append(paragraph)
            dict_trainingset_['authors'].append(row['authors'])
            dict_trainingset_['references'].append(row['references'])
            dict_trainingset_['journal'].append(row['journal'])
            dict_trainingset_['abstract_summary'].append(row['abstract_summary'])

    return dict_trainingset_

In [None]:
training_set = get_training_set()

In [None]:
#print(training_set['article_title'])

In [None]:
# Print the length of each listed column; the numbers should all be equal.
checked_columns = ['abstract', 'paragraph_text', 'frequency', 'authors', 'references', 'journal', 'abstract_summary']
print(*(len(training_set[column]) for column in checked_columns))


In [None]:
# One row per paragraph, with the columns in a fixed order.
final_columns = [
    'paper_id', 'article_title', 'meta_link', 'abstract', 'paragraph_text',
    'frequency', 'authors', 'references', 'journal', 'abstract_summary',
]
final_set = pd.DataFrame(training_set, columns=final_columns)
final_set.head()


In [None]:
# Persist the paragraph-level frame as CSV; with to_csv's defaults the row index
#  is written as the first column.
final_set.to_csv('~/CORD19v3/test_v3.csv')