In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from imp import reload
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import IPython.display as ipd

from tensorflow.keras import datasets, layers, models

import textwrap
import random
import json

from simpletransformers.classification import ClassificationModel
import pandas as pd

In [None]:
import os

In [None]:
# Relative path to the directory of PMC JSON files.
# NOTE(review): none of the visible cells read `data_path`; the loading cells
# use `dir_path` instead — confirm whether this is still needed.
data_path = '../noncomm_use_subset/pmc_json/'

In [None]:
# Load up files from a directory

In [None]:
dir_path = '../noncomm_use_subset/pmc_json_test/'
filename = 'PMC1616946.xml.json'

# os.listdir() returns entries in arbitrary order; sorting keeps positional
# lookups such as articles[1] reproducible between runs and machines.
file_names = sorted(os.listdir(dir_path))
print(file_names)

print("---------" + dir_path + filename)
print()

# NOTE(review): Article is defined in a later cell of this notebook, so that
# cell must be executed before this one on a fresh kernel.
articles = []

for file in file_names:
    filepath = os.path.join(dir_path, file)
    print("++---------" + filepath)
    # JSON interchange files are UTF-8; do not depend on the platform default.
    with open(filepath, 'r', encoding='utf-8') as infile:
        json_object = json.load(infile)
    articles.append(Article(json_object))

# Check result: print the total text length of every loaded article.
for a in articles:
    print("@@@@@@\n\n")
    print(len(a.get_text()))


In [None]:
# Spot check: text of the third section (index 2) of the second article (index 1).
articles[1].sections.sections[2].get_text()

# Create a class object to represent papers/articles with contained methods

In [None]:
# Module-level stopword list consulted by normalize() at call time.
stopwords = []

def normalize(words):
    """Lowercase tokens and drop stopwords.

    Stopword matching is case-insensitive: each token is lowercased before it
    is compared against the (lowercased) entries of the module-level
    ``stopwords`` list, so "The" is removed when "the" is a stopword.

    Args:
        words: iterable of token strings.

    Returns:
        List of lowercased tokens, in input order, with stopwords removed.
    """
    blocked = {stop.lower() for stop in stopwords}
    lowered = (word.lower() for word in words)
    return [word for word in lowered if word not in blocked]

#######################################################################
# Class method for representing Section
#######################################################################

# Section class encapsulates loading, displaying, and simple helper operations 
# e.g. return text or bag of words representation
class Section:
    """One labelled block of article text.

    Attributes:
        type: value of the 'section' key ('' when absent).
        text: value of the 'text' key ('' when absent).
        normalized_text: normalize() applied to the whitespace-split text.
        bow: set of the normalized tokens.
    """

    def __init__(self, json_object):
        """Build the section from a dict-like object with 'section'/'text' keys."""
        section_type = json_object.get('section', '')
        raw_text = json_object.get('text', '')
        tokens = normalize(raw_text.split())

        self.type = section_type
        self.text = raw_text
        self.normalized_text = tokens
        self.bow = set(tokens)

    def get_text(self):
        """Return the raw (un-normalized) section text."""
        return self.text
        
        
# Encapsulates list of sections, takes abstract and body_text JSON list 
# and generates subsections in order
class Sections:
    """Ordered list of Section objects built from an article's JSON fields.

    Consecutive body paragraphs that share a section label are merged into a
    single Section whose text is the paragraphs joined by one space. A
    paragraph with no label is folded into the section currently being built.
    """

    def __init__(self, abstract=None, body_text=None):
        """Build the section list.

        Args:
            abstract: optional dict-like object with 'section'/'text' keys; when
                truthy it becomes the first Section.
            body_text: optional iterable of dict-like paragraphs, each with
                optional 'section' and 'text' keys.
        """
        self.sections = []

        # Create a section from the abstract
        if abstract:
            self.sections.append(Section(abstract))

        current_type = ''
        paragraphs = []
        for paragraph in body_text or []:
            temp_type = paragraph.get('section', '')
            temp_text = paragraph.get('text', '')
            if temp_type and temp_type != current_type:
                # A new label closes the section collected so far.
                self._flush(current_type, paragraphs)
                current_type = temp_type
                paragraphs = []
            # Each paragraph is added exactly once; empty texts are skipped so
            # they do not introduce stray separators.
            if temp_text:
                paragraphs.append(temp_text)
        self._flush(current_type, paragraphs)

        # Placeholder kept for interface compatibility; nothing fills it here.
        self.normalized_text = []

    def _flush(self, section_type, paragraphs):
        """Append a Section for the collected paragraphs unless it is entirely empty."""
        text = ' '.join(paragraphs)
        if section_type or text:
            self.sections.append(Section({'section': section_type, 'text': text}))

    def __iter__(self):
        """Iterate over the contained Section objects in order."""
        return iter(self.sections)

    def get_text(self):
        """Return all section texts joined by newlines."""
        return '\n'.join(s.get_text() for s in self.sections)
        

#######################################################################
# Class method for representing Article (list of Sections)
#######################################################################

# Article class encapsulates loading, displaying, and simple helper operations 
# e.g. return text or bag of words representation
class Article:
    """A single paper: its id, its abstract and its ordered sections.

    Attributes:
        paperID: value of the 'paper_id' key.
        abstract: Section built from the 'abstract' value ({} when absent).
        sections: Sections built from the abstract and the 'body_text' list.
    """

    def __init__(self, json_input):
        """Populate the article from its parsed JSON.

        Args:
            json_input: dict-like parsed JSON with a 'paper_id' key and
                optional 'abstract' and 'body_text' keys.

        Raises:
            KeyError: if 'paper_id' is missing.
        """
        # Read from the parameter, not from a notebook-global of a similar
        # name, so the class works regardless of which cells ran before it.
        self.paperID = json_input['paper_id']  # Paper ID

        # NOTE(review): Section calls .get() on this value, so 'abstract' is
        # assumed to be a dict when present — confirm against the data files.
        in_abs = json_input.get('abstract', {})
        self.abstract = Section(in_abs)

        self.sections = Sections(in_abs, json_input.get('body_text', []))

    def get_text(self):
        """Return the newline-joined text of all sections."""
        return self.sections.get_text()
    
    
    #def get_bow(self, stopwords=[]):
        #return self.card + ' ' + self.answer
    
    
    # Add methods for running ML

# The LDA steps below take a list of docs, where each doc is the raw unicode text (including newlines etc.) of one article section

In [None]:
# Load the full dataset now that the loader has been debugged on the test set.
dir_path = '../noncomm_use_subset/pmc_json/'
filename = 'PMC1616946.xml.json'

# Set to an int to load only the first N files while iterating quickly;
# None loads every file in the directory.
max_articles = None

# os.listdir() returns entries in arbitrary order; sorting makes the article
# order (and everything derived from it) reproducible.
file_names = sorted(os.listdir(dir_path))
print(len(file_names))
print(file_names)

print("---------" + dir_path + filename)
print()

articles = []

num_articles = 0  # number of articles loaded so far
for file in file_names[:max_articles]:
    filepath = os.path.join(dir_path, file)
    print("++---------" + filepath)
    # JSON interchange files are UTF-8; do not depend on the platform default.
    with open(filepath, 'r', encoding='utf-8') as infile:
        json_object = json.load(infile)
    articles.append(Article(json_object))
    num_articles += 1

In [None]:
# Flatten all articles into one list of Section objects (iterating an
# article's `sections` yields its Section instances).
section_texts = []
for a in articles:
    section_texts.extend(a.sections)
doc_texts = [s.get_text() for s in section_texts]
# Section keeps its label on `.type`; it has no `.section` attribute.
sec_types = [s.type for s in section_texts]

# Filter empty docs (sections), keeping each text paired with its label.
meta_texts = [(t, doc) for t, doc in zip(sec_types, doc_texts) if doc]

In [None]:
# Number of section texts collected across all articles (empty texts included).
len(doc_texts)

In [None]:
# NOTE(review): `docs` is first assigned in a later cell of this notebook, so
# on a fresh kernel this raises NameError unless that cell has been run.
doc_texts[len(docs)+1]

In [None]:
# Text of the second collected section (index 1).
doc_texts[1]

In [None]:
# NOTE(review): `docs` is first assigned in a later cell of this notebook, so
# on a fresh kernel this raises NameError unless that cell has been run.
docs[1]

In [None]:
# Gather every Section of every article into one flat list.
sections = []
for a in articles:
    sections += list(a.sections)

# One document per section; sections whose text is empty are dropped.
docs = list(filter(None, (s.get_text() for s in sections)))

In [None]:
# Number of documents in `docs`.
len(docs)

In [None]:
# First document in `docs`.
docs[0]

In [None]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Lowercase each document, then split it into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
docs = [tokenizer.tokenize(text.lower()) for text in docs]

# Drop purely numeric tokens (tokens that merely contain digits are kept)
# and tokens that are only one or two characters long.
docs = [
    [tok for tok in tokens if not tok.isnumeric() and len(tok) > 2]
    for tokens in docs
]

In [None]:
import nltk
nltk.download('wordnet')
import gensim # conda install gensim

In [None]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

# Replace every token with the lemma returned by the WordNet lemmatizer.
lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, tokens)) for tokens in docs]

In [None]:
# Compute bigrams.
from gensim.models import Phrases

# Append detected bigrams to each document (phrases seen 20 times or more).
bigram = Phrases(docs, min_count=20)
for tokens in docs:
    # Phrases marks a merged pair by joining it with '_'; collect the marked
    # tokens first, then add them to the end of the same document.
    merged = [token for token in bigram[tokens] if '_' in token]
    tokens.extend(merged)

In [None]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Map every distinct token in the documents to an integer id.
dictionary = Dictionary(docs)

# Drop tokens that occur in fewer than 20 documents or in more than 50% of
# the documents.
extreme_limits = {'no_below': 20, 'no_above': 0.5}
dictionary.filter_extremes(**extreme_limits)

In [None]:
# Convert each tokenized document into its bag-of-words vector.
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
# Report vocabulary size and document count.
for template, collection in (
    ('Number of unique tokens: %d', dictionary),
    ('Number of documents: %d', corpus),
):
    print(template % len(collection))

In [None]:
# Train LDA model.
#from gensim.models import LdaModel
from gensim.models.ldamulticore import LdaMulticore

# Set training parameters.
num_topics = 30
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_state = 42  # Seed the model's random initialisation so runs are repeatable.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='symmetric',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    workers=5,
    random_state=random_state,
)

In [None]:
from pprint import pprint

# Ask the model for its topics together with their coherence scores.
top_topics = model.top_topics(corpus)

# Average topic coherence: the coherence values (second item of each entry)
# summed and divided by the number of topics.
coherence_scores = [entry[1] for entry in top_topics]
avg_topic_coherence = sum(coherence_scores) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

# Generate vectors and metadata file for loading and visualizing in embedding projector

In [None]:
# get_document_topics() requires a bag-of-words document; show the topic
# distribution of the first document in the corpus as an example.
model.get_document_topics(corpus[0])

In [None]:
# The model expects bag-of-words vectors, i.e. the entries of `corpus`;
# the token lists in `docs` are not valid input to get_document_topics().
vecs = [model.get_document_topics(bow) for bow in corpus]

In [None]:
# Number of raw section texts, for comparison with the corpus size.
len(doc_texts)

In [None]:
# Number of bag-of-words documents in the corpus.
len(corpus)

In [None]:
# First 300 characters of the first section text.
doc_texts[0][0:300]

In [None]:
# `corpus` was built from the non-empty section texts only, while `doc_texts`
# still contains the empty ones. Apply the same filter here so that every
# vector is labelled with the text it was actually computed from.
nonempty_texts = [text for text in doc_texts if text]
assert len(nonempty_texts) == len(corpus), (
    "doc_texts and corpus are out of sync; re-run the cells that build them"
)

vecs = []
meta_data = []
for bow, text in zip(corpus, nonempty_texts):
    # Expand the sparse (topic_id, weight) list into a dense num_topics vector.
    vecs.append(gensim.matutils.sparse2full(model.get_document_topics(bow), num_topics))
    # Keep the first 300 characters of the source text as the point's label.
    meta_data.append(text[0:300])

In [None]:
# Write one tab-separated row of topic weights per document.
out_vecs_filename = 'output_vecs_v1.tsv'
with open(out_vecs_filename, 'w') as outfile:
    for v in vecs:
        # str.join() accepts only strings, so convert each numeric weight first.
        outfile.write('\t'.join(str(weight) for weight in v) + '\n')