**JSON, Pandas, Google Cloud NLP**

A continuation of intro-notebook-covid-19. This notebook gives an overview of building more elaborate pandas data frames from the CORD-19 JSON data, which may make the data easier to use with some analytical tools.

* create a pandas dataframe with id, title, abstract, and body text
* create a pandas dataframe with id and location (when it exists)
* create a pandas dataframe with id and sentiment score (sentiment and magnitude)
* create a pandas dataframe with id and category 

**Note:** To use the Google Cloud NLP API to read sentiment scores and categories, you'll need to create a Google Cloud Platform project and link it to your Kaggle notebook.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
import json
import pandas as pd
import glob
from googleapiclient.discovery import build
import getpass

In [None]:
# glob returns matches in arbitrary, filesystem-dependent order; sort them so that
# any later slice of this list selects the same papers on every run.
json_files = sorted(glob.glob('/kaggle/input/CORD-19-research-challenge/document_parses/pdf_json/*.json'))

In [None]:
# Upper bound on how many parsed papers are loaded; raise it for a fuller run.
MAX_PAPERS = 100

# Parallel lists: index k of each describes the k-th paper read.
paper_abstracts = []
paper_titles = []
paper_texts = []
paper_ids = []
# Row-style records, possibly several per paper.
paper_authors = []    # (paper_id, first, middle, last)
paper_locations = []  # (paper_id, country)

for jf in json_files[:MAX_PAPERS]:
    # Be explicit about the encoding instead of relying on the platform default.
    with open(jf, encoding='utf-8') as json_file:
        data = json.load(json_file)

    paper_id = data['paper_id']
    paper_ids.append(paper_id)
    paper_titles.append(data['metadata']['title'])

    # Body and abstract are lists of paragraph dicts; flatten each into one string.
    paper_texts.append(" ".join(section['text'] for section in data['body_text']))
    # Fall back to an empty abstract when the key is absent rather than raising KeyError.
    paper_abstracts.append(" ".join(section['text'] for section in data.get('abstract', [])))

    for auth in data['metadata']['authors']:
        affiliation = auth['affiliation']
        # Only affiliations with exactly three keys are recorded, as before.
        # NOTE(review): presumably laboratory/institution/location - confirm against the schema.
        if len(affiliation) != 3:
            continue

        paper_authors.append((paper_id, auth['first'], " ".join(auth['middle']), auth['last']))

        # A three-key affiliation without a usable 'location' entry is treated as having no country.
        location = affiliation.get('location') or {}
        if 'country' in location:
            # The join is a no-op for a plain string; kept in case the value is a sequence of fragments.
            paper_locations.append((paper_id, "".join(location['country'])))
           
        


In [None]:
# Assemble one row per parsed paper from the parallel lists built above.
df_papers = pd.DataFrame(
    {
        "id": paper_ids,
        "title": paper_titles,
        "abstract": paper_abstracts,
        "body_text": paper_texts,
    }
)

In [None]:
# Preview the first ten rows of the papers frame.
df_papers.head(10)

In [None]:
# One row per (paper id, author country) record collected during parsing.
df_locations = pd.DataFrame(data=paper_locations, columns=['paper_id', 'location'])

In [None]:
# Display the paper-to-country frame.
df_locations

In [None]:
# Ask for the API key interactively so it is never stored in the notebook.
# An explicit prompt replaces getpass's default "Password: ", which is misleading here.
APIKEY = getpass.getpass('Google Cloud API key: ')

In [None]:
# Build a client for version 'v1' of the 'language' API, authenticated with the key in APIKEY.
lservice = build('language', 'v1', developerKey=APIKEY)

In [None]:
def read_sentiment(document_str):
  """Request document-level sentiment for a piece of plain text.

  Sends `document_str` as a PLAIN_TEXT document to the `analyzeSentiment`
  method of the module-level `lservice` client.

  Returns:
    A `(score, magnitude)` tuple read from the response's
    `documentSentiment` entry.
  """
  request_body = {
    'document': {
      'type': 'PLAIN_TEXT',
      'content': document_str,
    },
  }
  result = lservice.documents().analyzeSentiment(body=request_body).execute()
  doc_sentiment = result['documentSentiment']
  return doc_sentiment['score'], doc_sentiment['magnitude']


In [None]:
def read_categories(text):
  """Classify plain text into content categories.

  Sends `text` as a PLAIN_TEXT document to the `classifyText` method of the
  module-level `lservice` client.

  Returns:
    The response dict, always carrying a 'categories' list so callers can
    iterate over `result['categories']` unconditionally. If the request
    fails for any reason, the error is printed and `{'categories': []}` is
    returned instead of raising (best-effort behaviour).
  """
  try:
    response = lservice.documents().classifyText(
      body={
        'document': {
          'type': 'PLAIN_TEXT',
          'content': text }
    }).execute()
  except Exception as err:
    # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt still works.
    # Report the failure instead of hiding it, then fall back to "no categories".
    print("classifyText failed:", err)
    return {'categories': []}
  # A response may come back without a 'categories' entry; normalise it to an empty list.
  response.setdefault('categories', [])
  return response

In [None]:
# Try the sentiment helper on the body text of the first paper.
read_sentiment(df_papers.iloc[0]['body_text'])

In [None]:
# Try the category helper on the body text of the first paper.
read_categories(df_papers.iloc[0]['body_text'])

In [None]:
# How many papers to score. Kept small because every paper costs one API request.
# Six matches the previous counter-based cut-off.
SENTIMENT_SAMPLE_SIZE = 6

sentiment_scores = []  # (paper id, sentiment score, magnitude)
# head(N) replaces the manual counter/break; it selects the same leading rows.
for _, paper in df_papers.head(SENTIMENT_SAMPLE_SIZE).iterrows():
    score, magnitude = read_sentiment(paper['body_text'])
    sentiment_scores.append((paper['id'], score, magnitude))

In [None]:
# One row per scored paper.
df_sentiment_scores = pd.DataFrame(
    data=sentiment_scores,
    columns=['paper_id', 'sentiment', 'magnitude'],
)

In [None]:
# Display the sentiment scores frame.
df_sentiment_scores

In [None]:
# How many papers to classify. Kept small because every paper costs one API request.
# Six matches the previous counter-based cut-off.
CATEGORY_SAMPLE_SIZE = 6

category_probabilities = []  # (paper id, category name, confidence)
# head(N) replaces the manual counter/break; it selects the same leading rows.
for _, paper in df_papers.head(CATEGORY_SAMPLE_SIZE).iterrows():
    cat = read_categories(paper['body_text'])

    # read_categories may hand back something other than a response dict when the
    # request fails, and a response is not guaranteed to contain 'categories'.
    # Treat both cases as "no categories" instead of crashing the whole loop.
    categories = cat.get('categories', []) if isinstance(cat, dict) else []

    for c in categories:
        category_probabilities.append((paper['id'], c['name'], c['confidence']))

In [None]:
# One row per (paper, category) pair returned by the classifier.
df_categories = pd.DataFrame(
    data=category_probabilities,
    columns=['paper_id', 'category', 'confidence'],
)

In [None]:
# Display the categories frame.
df_categories

In [None]:
# Uncomment the lines below to save each data frame as a CSV file in the working directory.
#df_locations.to_csv("locations.csv",index=False)
#df_papers.to_csv("papers.csv",index=False)
#df_categories.to_csv('categories.csv',index=False)
#df_sentiment_scores.to_csv('sentiments.csv',index=False)