In [14]:
import re
import operator
import nltk
import math
import string
import os
import json

from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import*

In [15]:
# Notebook-wide switches.
# debug: when True, the batch keyword-extraction cell also prints its
# intermediate results (candidate scores, the sorted list and its length).
debug = False
# test: not referenced in the visible part of this file -- purpose unconfirmed.
test = True

In [16]:
def is_number(s):
    """
    Utility function to tell whether a string can be parsed as a number.
    @param s The string to check.
    @return bool True if s parses as a float (when it contains a '.') or as an int (otherwise), False if parsing raises ValueError.
    """
    try:
        # A decimal point selects float parsing; everything else must be a valid int literal.
        convert = float if '.' in s else int
        convert(s)
    except ValueError:
        return False
    else:
        return True

In [17]:
def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words, in file order. Lines whose first non-blank character is '#' are treated as comments and skipped.
    """
    stop_words = []
    # Use a context manager so the file handle is closed even if reading fails.
    with open(stop_word_file) as stop_file:
        for line in stop_file:
            if line.lstrip().startswith("#"):
                continue
            # A line may hold several whitespace-separated words; blank lines add nothing.
            stop_words.extend(line.split())
    return stop_words

In [18]:
def separate_words(text, min_word_return_size):
    """
    Utility function to split text into lower-cased words and keep only those longer than a given number of characters.
    @param text The text that must be split in to words.
    @param min_word_return_size A word is kept only if it has more characters than this value.
    @return list The kept words, in order of appearance.
    """
    # Anything that is not a letter, digit, '_', '+', '-' or '/' separates words.
    splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    normalized = (piece.strip().lower() for piece in splitter.split(text))
    # Numbers stay in the phrase text but are not counted as words, since they
    # tend to invalidate the scores of the phrases that contain them.
    return [
        word
        for word in normalized
        if len(word) > min_word_return_size and word != '' and not is_number(word)
    ]

In [44]:
def split_sentences(text):
    """
    Utility function to cut a text into sentence-like fragments.
    @param text The text that must be split in to sentences.
    @return list The fragments between delimiters. Delimiters are any one of . ! ? , ; : tab backslash double-quote ( ) apostrophe, U+2019 or U+2013, or the sequence whitespace, hyphen, whitespace, newline.
    """
    # Fragments are returned untrimmed; empty strings appear between adjacent delimiters.
    return re.split(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s\n', text)

In [20]:
def build_stop_word_regex(stop_word_file_path):
    """
    Utility function to build one case-insensitive regex that matches any stop word from a file.
    @param stop_word_file_path Path and file name of a file containing stop words (read with load_stop_words).
    @return A compiled pattern matching each stop word as a whole word.
    """
    stop_word_list = load_stop_words(stop_word_file_path)
    # Escape each word so regex metacharacters in the stop list (e.g. '.', '+')
    # are matched literally instead of altering the pattern.
    # The look-ahead keeps a stop word from matching when it is followed by
    # another word character or a hyphen (e.g. the "the" in "the-end").
    stop_word_regex_list = [
        r'\b' + re.escape(word) + r'(?![\w-])' for word in stop_word_list
    ]
    if not stop_word_regex_list:
        # Joining nothing yields '', which matches at every position and would
        # make callers split the text between every character. Use a pattern
        # that never matches instead.
        return re.compile(r'(?!)', re.IGNORECASE)
    return re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)

In [46]:
def generate_candidate_keywords(sentence_list, stopword_pattern):
    """
    Utility function to cut sentences into candidate phrases at every stop word.
    @param sentence_list The sentences to process.
    @param stopword_pattern Regex matching the stop words (passed to re.sub).
    @return list The non-empty phrases, trimmed and lower-cased, in order of appearance.
    """
    phrase_list = []
    for sentence in sentence_list:
        # Each stop word becomes a '|' marker; the sentence is then cut at the markers.
        marked = re.sub(stopword_pattern, '|', sentence.strip())
        pieces = (piece.strip().lower() for piece in marked.split("|"))
        phrase_list.extend(piece for piece in pieces if piece != "")
    return phrase_list

In [22]:
def calculate_word_scores(phraseList):
    """
    Utility function to score every word as deg(w) / freq(w).
    @param phraseList The candidate phrases; each is split with separate_words(phrase, 0).
    @return dict Maps each word to its score (a float), in order of first appearance.
    """
    frequency = {}
    degree = {}
    for phrase in phraseList:
        tokens = separate_words(phrase, 0)
        # Every word in a phrase co-occurs with the other words of that phrase.
        co_occurrences = len(tokens) - 1
        for token in tokens:
            frequency[token] = frequency.get(token, 0) + 1
            degree[token] = degree.get(token, 0) + co_occurrences

    # deg(w) also counts the word's own occurrences, hence the "+ count".
    return {
        token: (degree[token] + count) / (count * 1.0)
        for token, count in frequency.items()
    }

In [23]:
def generate_candidate_keyword_scores(phrase_list, word_score):
    """
    Utility function to score every candidate phrase as the sum of its word scores.
    @param phrase_list The candidate phrases; each is split with separate_words(phrase, 0).
    @param word_score dict Maps each word to its score; must contain every word of every phrase.
    @return dict Maps each distinct phrase to its score, in order of first appearance.
    """
    scores = {}
    for phrase in phrase_list:
        # A phrase with no countable words (e.g. only numbers) scores 0.
        scores[phrase] = sum(word_score[word] for word in separate_words(phrase, 0))
    return scores

In [24]:
class Rake(object):
    """
    Keyword extractor that chains the module-level helper functions:
    sentence splitting, candidate phrase generation, word scoring and
    phrase scoring.
    """

    def __init__(self, stop_words_path):
        """
        @param stop_words_path Path and file name of a file containing stop words.
        """
        self.stop_words_path = stop_words_path
        # Compiled once here so that run() can be called repeatedly without
        # re-reading the stop-word file.
        self.__stop_words_pattern = build_stop_word_regex(stop_words_path)

    def run(self, text):
        """
        Extract scored keyword phrases from a text.
        @param text The text to analyse.
        @return list (phrase, score) tuples sorted by descending score.
        """
        sentence_list = split_sentences(text)

        phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern)

        word_scores = calculate_word_scores(phrase_list)

        keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores)

        # dict.iteritems() does not exist in Python 3; items() works on both
        # Python 2 and Python 3.
        sorted_keywords = sorted(keyword_candidates.items(), key=operator.itemgetter(1), reverse=True)
        return sorted_keywords

In [49]:
class Subfield:
    """Pair a subfield's display name with the keywords associated with it."""

    def __init__(self, name, keywords):
        # Both arguments are stored as given (no copy, no validation).
        self.name, self.keywords = name, keywords

        
# Keyword table: one Subfield per category, each with its associated keywords.
# Many entries are truncated word stems (e.g. "diagnos", "personali"),
# presumably meant for substring matching -- TODO confirm; no matching code
# is visible in this part of the file.
diagnosis = Subfield("Diagnosis", ["diagnos", "detect", "medical imag", "mri", "radiolog", "patholog", "audit"])
prognosis = Subfield("Prognosis", ["prognos", "predict"])
drug_discovery = Subfield("Drug Discovery", ["drug discovery", "drug design", "drug development"])
treatment = Subfield("Treatment", ["treatment", "personali", "therap", "prescription", "patient care", "dosing", "individualiz", "individualis"])
surgery = Subfield("Surgery", ["surgery"])
epidemiology = Subfield("Epidemiology", ["epidemi", "pandemi"])
robotics = Subfield("Robotics", ["robot", "autonom", "neuro-prosthetics", "brain computer interface"])
smart_healthcare = Subfield("Smart Healthcare", ["smart", "administrat", "analytics", "data process", "data manag", "data science", "data collect",
                                                "care data", "patient flow", "cost", "interoperability", "privacy", "digitali",
                                                "communicat", "e-health", "emr", "decision support", "data mining", "data security"])
# All categories in one list, so callers can iterate over them.
subfields = [diagnosis, prognosis, drug_discovery, treatment, surgery, epidemiology, robotics, smart_healthcare]

In [51]:
# Batch driver: for every JSON file in `path`, print its abstract, its
# recorded subfield/classification, the keyword lists of the matching
# subfields, and the top third of the RAKE keywords extracted from the abstract.
path='D:\\A-Mannheim\\Team Project\\automatic'
files=os.listdir(path)

# The stop-word pattern does not depend on the file being processed, so it is
# built once instead of once per file.
#stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
stoppath = "SmartStoplist.txt"  #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
stopwordpattern = build_stop_word_regex(stoppath)

for file in files:
    # os.listdir returns bare names, so the directory check has to be made on
    # the joined path rather than relative to the current working directory.
    full_path = os.path.join(path, file)
    if os.path.isdir(full_path):
        continue

    # Close the file as soon as the JSON document has been parsed.
    with open(full_path) as f:
        setting = json.load(f)

    print(file)
    text = setting["coredata"]["dc:description"].replace('Abstract','')
    print(text)
    print("\n")
    print(setting["subfield"])
    print(setting["classified"])
    print("\n")
    # Show the keyword list of every subfield named in the file's "subfield" entry.
    for subfield in subfields:
        if subfield.name in setting["subfield"]:
            print(subfield.keywords)
            print('\n')

    # Split text into sentences
    sentenceList = split_sentences(text)

    # generate candidate keywords
    phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

    # calculate individual word scores
    wordscores = calculate_word_scores(phraseList)

    # generate candidate keyword scores
    keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
    if debug: print (keywordcandidates)

    sortedKeywords = sorted(keywordcandidates.items(), key=operator.itemgetter(1), reverse=True)
    if debug: print (sortedKeywords)

    totalKeywords = len(sortedKeywords)
    if debug: print (totalKeywords)
    # Only the highest-scoring third of the candidates is reported.
    print (sortedKeywords[0:(totalKeywords // 3)])
    print("\n")

https%3A%2F%2Fapi.elsevier.com%2Fcontent%2Farticle%2Fdoi%2F10.1016%2FS0933-3657%2801%2900089-6.json

               
               
                  This paper presents a method for the discovery of temporal patterns in multivariate time series and their conversion into a linguistic knowledge representation applied to sleep-related breathing disorders. The main idea lies in introducing several abstraction levels that allow a step-wise identification of temporal patterns. Self-organizing neural networks are used to discover elementary patterns in the time series. Machine learning (ML) algorithms use the results of the neural networks to automatically generate a rule-based description. At the next levels, temporal grammatical rules are inferred. This method covers one of the main “bottlenecks” in the design of knowledge-based systems, namely, the knowledge acquisition problem. An evaluation of the rules lead to an overall sensitivity of 0.762, and a specificity of 0.758.
              