In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from imp import reload
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import IPython.display as ipd

from tensorflow.keras import datasets, layers, models

import textwrap
import random
import json

from simpletransformers.classification import ClassificationModel
import pandas as pd

In [3]:
import os

In [None]:
data_path = '../noncomm_use_subset/pmc_json/'

In [None]:
# Load up files from a directory

In [101]:
dir_path = '../noncomm_use_subset/pmc_json_test/'
filename = 'PMC1616946.xml.json'

print(os.listdir(dir_path))

print("---------" + dir_path + filename)
print()

articles = []

for file in os.listdir(dir_path):
    filepath = dir_path + file
    print("++---------" + filepath)
    with open(filepath, 'r') as infile:
        json_object = json.load(infile)
        #print(json.dumps(json_object, indent=2))
        a = Article(json_object)
        articles.append(a)
        
# Check result        
for a in articles:
    print("@@@@@@\n\n")
    print(len(a.get_text()))
    
            #df = pd.read_json(infile,
            #                 lines=True,
            #                 orient='columns')
    #df.head()


['PMC1616946.xml.json', 'PMC1616953.xml.json', 'PMC1616970.xml.json', 'PMC1635287.xml.json', 'PMC1636417.xml.json']
---------../noncomm_use_subset/pmc_json_test/PMC1616946.xml.json

++---------../noncomm_use_subset/pmc_json_test/PMC1616946.xml.json
++---------../noncomm_use_subset/pmc_json_test/PMC1616953.xml.json
++---------../noncomm_use_subset/pmc_json_test/PMC1616970.xml.json
++---------../noncomm_use_subset/pmc_json_test/PMC1635287.xml.json
++---------../noncomm_use_subset/pmc_json_test/PMC1636417.xml.json
@@@@@@


30635
@@@@@@


48107
@@@@@@


25672
@@@@@@


44202
@@@@@@


60627


In [98]:
articles[1].sections.sections[2].get_text()

"S.solfataricus cells were grown, and cell extracts obtained, as described previously (24,28). S.solfataricus cells were grown, and cell extracts obtained, as described previously (24,28). The expression in the E.coli strain BL21(RB791) of the wild-type gene fucA1 and of the mutant genes fucA1A [previously named FrameFuc in (24)], fucA1B, fucA1sm and fucA1tm as fusions of glutathione S-transferase (GST) and the purification of the recombinant proteins were performed as reported previously (23). The nomenclature used in this paper for the different α-fucosidase genes is listed in Table 1. For the western blot studies, equal amounts of E.coli cultures expressing the wild-type and mutant fucA1 genes, normalized for the OD600, were resuspended in SDS–PAGE loading buffer containing 0.03 M Tris–HCl buffer, pH 6.8, 3% SDS (w/v), 6.7% glycerol (w/v), 6.7% 2-mercaptoethanol (w/v) and 0.002% blue bromophenol (w/v). The samples were incubated at 100°C for 5 min (unless otherwise indicated) and we

# Create a class object to represent papers/articles with contained methods

In [100]:
stopwords = []

def normalize(words):
    
    out_words = [word.lower() for word in words if word not in stopwords]
    
    return out_words

#######################################################################
# Class method for representing Section
#######################################################################

# Section class encapsulates loading, displaying, and simple helper operations 
# e.g. return text or bag of words representation
class Section:
    # Initialize with key article fields from JSON object
    # body_text (should preserve section type)
    def __init__(self, json_object):#, label_map):
        
        #print(json_input)
        
        # Extract
        self.type = json_object.get('section', '')
        self.text= json_object.get('text', '')
        self.normalized_text=normalize(self.text.split())
        self.bow = set(self.normalized_text)
        
    def get_text(self):
        return self.text
        
        
# Encapsulates list of sections, takes abstract and body_text JSON list 
# and generates subsections in order
class Sections:
    # Initialize with key article fields from JSON object
    # body_text (should preserve section type)
    def __init__(self, abstract=[], body_text=[]):#, label_map):
        
        self.sections=[]
        
        # Create a section from the abstract
        if abstract:
            self.sections.append(Section(abstract))
        
        # Iterate through the elements of the body_text and create sections, 
        # combining sequential items of the same type
        if body_text:
            current_type = ''
            current_text = ''
            delim_text = ''
            for paragraph in body_text:
                temp_type = paragraph.get('section', '')
                temp_text = paragraph.get('text', '')
                #print(temp_text)
                if temp_type and temp_type != current_type:
                    # Start new section
                    self.sections.append(Section({'section': current_type, 'text': current_text}))
                    current_type = temp_type
                    current_text = temp_text
                
                # Continue adding to current section data
                current_text += delim_text + temp_text
                
                if delim_text == '':
                    delim_text = ' '
                    
            # If anything remaining, add section
            if current_type or current_text:
                self.sections.append(Section({'section': current_type, 'text': current_text}))
        
        # Extract
        self.normalized_text=[]
        
    def __iter__(self):
        for s in self.sections:
            yield s
    
    def get_text(self):
        return '\n'.join([s.get_text() for s in self.sections])
        

#######################################################################
# Class method for representing Article (list of Sections)
#######################################################################

# Article class encapsulates loading, displaying, and simple helper operations 
# e.g. return text or bag of words representation
class Article:
    # Initialize with key article fields from JSON representation
    # paper_id, title, authors, abstract, body_text, sections
    def __init__(self, json_input):#, label_map):
        
        # Extract
        self.paperID=json_object['paper_id'] #Paper ID
        in_abs = json_object.get('abstract', {})
        self.abstract=Section(in_abs)

        self.sections = Sections(in_abs, json_object.get('body_text', []))

    # 
    def get_text(self):
        return self.sections.get_text()
    
    
    #def get_bow(self, stopwords=[]):
        #return self.card + ' ' + self.answer
    
    
    # Add methods for running ML

# LDA methods below will take a list of docs where each is the unicode string including newlines etc. for each article

In [None]:
# Update to larger dataset for better run now that it's debugged
dir_path = '../noncomm_use_subset/pmc_json/'
filename = 'PMC1616946.xml.json'

print(os.listdir(dir_path))

print("---------" + dir_path + filename)
print()

articles = []

for file in os.listdir(dir_path):
    filepath = dir_path + file
    print("++---------" + filepath)
    with open(filepath, 'r') as infile:
        json_object = json.load(infile)
        #print(json.dumps(json_object, indent=2))
        a = Article(json_object)
        articles.append(a)

In [126]:
sections = []
for a in articles:
    sections.extend(a.sections)
docs = [s.get_text() for s in sections]

# Filter empty docs (sections)
docs = [d for d in docs if d]

In [127]:
docs[0]

'The standard triplet readout of the genetic code can be reprogrammed by signals in the mRNA to induce ribosomal frameshifting [reviewed in (1–3)]. Generally, the resulting trans-frame protein product is functional and may in some cases be expressed in equal amounts to the product of standard translation. This elaboration of the genetic code (4,5) demonstrates versatility in decoding.The standard triplet readout of the genetic code can be reprogrammed by signals in the mRNA to induce ribosomal frameshifting [reviewed in (1–3)]. Generally, the resulting trans-frame protein product is functional and may in some cases be expressed in equal amounts to the product of standard translation. This elaboration of the genetic code (4,5) demonstrates versatility in decoding. Requirements for eukaryotic ribosomal frameshifting include a shift-prone sequence at the decoding site and often a downstream secondary structure in mRNA. The majority of −1 programmed frameshift sites consist of a heptanucle

In [128]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 2] for doc in docs]

In [129]:
import nltk
nltk.download('wordnet')
import gensim # conda install gensim

[nltk_data] Downloading package wordnet to /Users/pedro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [130]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [131]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [132]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [133]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [134]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 22
Number of documents: 53


In [135]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [136]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -0.7028.
[([(0.23273917, 'gene'),
   (0.115503564, 'sequence'),
   (0.10922195, 'site'),
   (0.08608394, 'mutant'),
   (0.06438551, 'two'),
   (0.06207177, 'wild_type'),
   (0.06207177, 'wild'),
   (0.061865572, 'type'),
   (0.048662797, 'figure'),
   (0.04844857, 'protein'),
   (0.029689193, 'could'),
   (0.028351704, 'which'),
   (0.017382966, 'only'),
   (0.0123550845, 'used'),
   (0.011567903, 'these'),
   (0.003582806, 'result'),
   (0.003513739, 'using'),
   (0.00075725775, 'have'),
   (0.000601775, 'into'),
   (0.00046981158, 'cell')],
  -0.6637651146053567),
 ([(0.38496497, 'only'),
   (0.30783033, 'result'),
   (0.090352304, 'have'),
   (0.08116736, 'these'),
   (0.06413279, 'each'),
   (0.051096015, 'into'),
   (0.0012788294, 'gene'),
   (0.0012788104, 'could'),
   (0.0012787632, 'protein'),
   (0.0012786003, 'which'),
   (0.0012785437, 'sequence'),
   (0.0012785094, 'using'),
   (0.0012784905, 'used'),
   (0.0012784789, 'rna'),
   (0.0012784601, 'two