In [17]:
# sentences = [sent.string.strip() for sent in doc]
import sys
import time
import json
import re
import multiprocessing
import os
from multiprocessing import Lock
from collections import deque
# from pycorenlp import StanfordCoreNLP
import spacy
from spacy.symbols import ORTH, LEMMA, POS, TAG

corpusName = "test"
FLAGS_POS_TAGGING = 1
input_path = "../../data/"+corpusName+"/intermediate/segmentation.txt"


# text = u'The trie <phrase> data structure </phrase> has many properties which make it especially attractive for representing large files of data. These properties include fast retrieval time, quick unsuccessful search determination, and finding the longest match to a given identifier. The main drawback is the space requirement. In this paper the concept of trie compaction is formalized. An exact algorithm for optimal trie compaction and three algorithms for approximate trie compaction are given, and an analysis of the three algorithms is done. The analysis indicate that for actual tries, reductions of around 70 percent in the space required by the uncompacted trie can be expected. The quality of the compaction is shown to be insensitive to the number of nodes, while a more relevant parameter is the alphabet size of the key.'


In [18]:
nlp = spacy.load('en')

start_phrase = [{ORTH: u'<phrase>', LEMMA: u'', POS: u'START_PHRASE', TAG: u'START_PHRASE'}]
end_phrase = [{ORTH: u'</phrase>', LEMMA: u'', POS: u'END_PHRASE', TAG: u'END_PHRASE'}]

nlp.tokenizer.add_special_case(u'<phrase>', start_phrase)
nlp.tokenizer.add_special_case(u'</phrase>', end_phrase)

In [19]:
def clean_text(text):
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    ## add space before and after <phrase> tags
    text = re.sub(r"<phrase>", " <phrase> ", text)
    text = re.sub(r"</phrase>", " </phrase> ", text)
    # text = re.sub(r"<phrase>", " ", text)
    # text = re.sub(r"</phrase>", " ", text)
    ## add space before and after special characters
    text = re.sub(r"([.,!:?()])", r" \1 ", text)
    ## replace multiple continuous whitespace with a single one
    text = re.sub(r"\s{2,}", " ", text)
    
    return text

def find(haystack, needle):
    """Return the index at which the sequence needle appears in the
    sequence haystack, or -1 if it is not found, using the Boyer-
    Moore-Horspool algorithm. The elements of needle and haystack must
    be hashable.

    >>> find([1, 1, 2], [1, 2])
    1

    """
    h = len(haystack)
    n = len(needle)
    skip = {needle[i]: n - i - 1 for i in range(n - 1)}
    i = n - 1
    while i < h:
        for j in range(n):
            if haystack[i - j] != needle[-j - 1]:
                i += skip.get(haystack[i], n)
                break
        else:
            return i - n + 1
    return -1

In [40]:
# tt = "A direct and unified approach is used to analyze the efficiency of batched searching of sequential and <phrase>tree-structured</phrase> files. The analysis is applicable to arbitrary search distributions, and closed-form expressions are obtained for the expected batched searching cost and savings. In particular, we consider a search distribution satisfying <phrase>Zipf's law</phrase> for sequential files and four types of uniform (random) search distribution for sequential and <phrase>tree-structured</phrase> files. These results unify and extend earlier research on batched searching and estimating block accesses for <phrase>database systems</phrase>."
tt = 'We construct a three-layer <phrase>feedforward network</phrase> (one hidden layer) which can solve the N-bit parity problem employing just two <phrase>hidden units</phrase>. We discuss the implications of employing problem constraints into <phrase>transfer functions</phrase> for general pattern classification problems.'

def process_one_doc():
    article = clean_text(tt)
    articleId = 0
    
    result = []
    
    phrases = []
    output_token_list = []
    
    # go over once 
    q = deque()
    IN_PHRASE_FLAG = False
    for token in article.split(" "):
        if token == "<phrase>":
            IN_PHRASE_FLAG = True            
        elif token == "</phrase>":
            current_phrase_list = []
            while (len(q) != 0):
                current_phrase_list.append(q.popleft())
            phrases.append(" ".join(current_phrase_list).lower())
            IN_PHRASE_FLAG = False
        else:
            if IN_PHRASE_FLAG: # in the middle of a phrase, push the (word, pos) tuple
                q.append(token)

            ## put all the token information into the output fields
            output_token_list.append(token)
            
    text = " ".join(output_token_list)
    
    doc = nlp(text)
    
    for sent in doc.sents:
#         print(sent.start, sent.end)
        sentId = 0
        NPs = []
        pos = []
        tokens = []
        for s in sent.noun_chunks:
            NPs.append(s)
            
        # get pos tag    
        for token in doc:
            tokens.append(token.text)
            pos.append(token.tag_)
        
        entityMentions = []
        # For each quality phrase, check if it's NP
        for p in phrases:
            for np in NPs:
                # find if p is a substring of np
                if np.text.find(p) != -1:
                    offset = find(tokens[np.start:np.end], p.split(" "))
                    start_offset = np.start + offset - sent.start
                    
                    ent = {
                        "text": p,
                        "start": start_offset,
                        "end": start_offset + len(p.split(" ")) - 1,
                        "type": "phrase"
                    }
                    entityMentions.append(ent)
                     
        res = {
            "articleId": articleId,
            "sentId": sentId,
            "tokens": tokens,
            "pos": pos,
            "entityMentions": entityMentions,
            "np_chunks": NPs
        }
        result.append(res)
        
        sentId += 1
        
    return result

json.dumps(process_one_doc())

a three-layer feedforward network
2 8
feedforward network
6 7
just two hidden units
23 27
hidden units
25 26
transfer functions
37 39
transfer functions
9 10


'[{"articleId": 0, "sentId": 0, "tokens": ["We", "construct", "a", "three", "-", "layer", "feedforward", "network", "(", "one", "hidden", "layer", ")", "which", "can", "solve", "the", "N", "-", "bit", "parity", "problem", "employing", "just", "two", "hidden", "units", ".", "We", "discuss", "the", "implications", "of", "employing", "problem", "constraints", "into", "transfer", "functions", "for", "general", "pattern", "classification", "problems", "."], "pos": ["PRP", "VBP", "DT", "CD", "HYPH", "NN", "NN", "NN", "-LRB-", "CD", "JJ", "NN", "-RRB-", "WDT", "MD", "VB", "DT", "NN", "HYPH", "NN", "NN", "NN", "VBG", "RB", "CD", "VBN", "NNS", ".", "PRP", "VBP", "DT", "NNS", "IN", "VBG", "NN", "NNS", "IN", "NN", "NNS", "IN", "JJ", "NN", "NN", "NNS", "."], "entityMentions": [{"text": "feedforward network", "start": 6, "end": 7, "type": "phrase"}, {"text": "hidden units", "start": 25, "end": 26, "type": "phrase"}]}, {"articleId": 0, "sentId": 0, "tokens": ["We", "construct", "a", "three", "-", "l

In [2]:
print(1)

1
