In [17]:
# sentences = [sent.string.strip() for sent in doc]
import sys
import time
import json
import re
import multiprocessing
import os
from multiprocessing import Lock
from collections import deque
# from pycorenlp import StanfordCoreNLP
import spacy
from spacy.symbols import ORTH, LEMMA, POS, TAG

corpusName = "test"
FLAGS_POS_TAGGING = 1
input_path = "../../data/"+corpusName+"/intermediate/segmentation.txt"


# text = u'The trie <phrase> data structure </phrase> has many properties which make it especially attractive for representing large files of data. These properties include fast retrieval time, quick unsuccessful search determination, and finding the longest match to a given identifier. The main drawback is the space requirement. In this paper the concept of trie compaction is formalized. An exact algorithm for optimal trie compaction and three algorithms for approximate trie compaction are given, and an analysis of the three algorithms is done. The analysis indicate that for actual tries, reductions of around 70 percent in the space required by the uncompacted trie can be expected. The quality of the compaction is shown to be insensitive to the number of nodes, while a more relevant parameter is the alphabet size of the key.'


In [18]:
nlp = spacy.load('en')

start_phrase = [{ORTH: u'<phrase>', LEMMA: u'', POS: u'START_PHRASE', TAG: u'START_PHRASE'}]
end_phrase = [{ORTH: u'</phrase>', LEMMA: u'', POS: u'END_PHRASE', TAG: u'END_PHRASE'}]

nlp.tokenizer.add_special_case(u'<phrase>', start_phrase)
nlp.tokenizer.add_special_case(u'</phrase>', end_phrase)

In [19]:
def clean_text(text):
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    ## add space before and after <phrase> tags
    text = re.sub(r"<phrase>", " <phrase> ", text)
    text = re.sub(r"</phrase>", " </phrase> ", text)
    # text = re.sub(r"<phrase>", " ", text)
    # text = re.sub(r"</phrase>", " ", text)
    ## add space before and after special characters
    text = re.sub(r"([.,!:?()])", r" \1 ", text)
    ## replace multiple continuous whitespace with a single one
    text = re.sub(r"\s{2,}", " ", text)
    
    return text

def find(haystack, needle):
    """Return the index at which the sequence needle appears in the
    sequence haystack, or -1 if it is not found, using the Boyer-
    Moore-Horspool algorithm. The elements of needle and haystack must
    be hashable.

    >>> find([1, 1, 2], [1, 2])
    1

    """
    h = len(haystack)
    n = len(needle)
    skip = {needle[i]: n - i - 1 for i in range(n - 1)}
    i = n - 1
    while i < h:
        for j in range(n):
            if haystack[i - j] != needle[-j - 1]:
                i += skip.get(haystack[i], n)
                break
        else:
            return i - n + 1
    return -1

In [68]:
# tt = "A direct and unified approach is used to analyze the efficiency of batched searching of sequential and <phrase>tree-structured</phrase> files. The analysis is applicable to arbitrary search distributions, and closed-form expressions are obtained for the expected batched searching cost and savings. In particular, we consider a search distribution satisfying <phrase>Zipf's law</phrase> for sequential files and four types of uniform (random) search distribution for sequential and <phrase>tree-structured</phrase> files. These results unify and extend earlier research on batched searching and estimating block accesses for <phrase>database systems</phrase>."
# tt = 'We construct a three-layer <phrase>feedforward network</phrase> (one hidden layer) which can solve the N-bit parity problem employing just two <phrase>hidden units</phrase>. We discuss the implications of employing problem constraints into <phrase>transfer functions</phrase> for general pattern classification problems.'
# tt = 'A review of the application of the quad tesseral representation tosupport <phrase>spatial reasoning</phrase> is presented. The principal feature of therepresentation is that it linearises multi-dimensional space, while stillproviding for the description of <phrase>individual objects</phrase> within that space andthe relationships that may exist between those objects (in any directionand through any number of dimensions). In addition the representation issupported by an arithmetic which allows the manipulation (translation etc.)of <phrase>spatial objects</phrase>. Consequently, when incorporated into a spatialreasoning system, all necessary processing can be implemented as if in onlyone dimension. This offers two significant advantages over moreconventional multi-directional approaches to <phrase>spatial reasoning</phrase>. Firstly,many of the concerns associated with the exponential increase in the numberor relations that need to be considered (as the number of dimensions underconsideration increases) are no longer relevant. Secondly, the computationalcost of manipulating and comparing <phrase>spatial objects</phrase> remains static at itsone dimensional level, regardless of the number of dimensions underconsideration.'
tt = 'Galileo, a programming language for <phrase>database applications</phrase>, is presented. Galileo is a <phrase>strongly-typed</phrase>, interactive <phrase>programming language</phrase> designed specifically to support semantic data model features (classification, aggregation, and specialization), as well as the abstraction mechanisms of modern <phrase>programming languages</phrase> (types, abstract types, and modularization). The main contributions of Galileo are (a) a flexible type system to model database structure and semantic <phrase>integrity constraints</phrase>; (b) the inclusion of type hierarchies to support the specialization abstraction mechanisms of semantic data models; (c) a modularization mechanism to structure data and operations into interrelated units (d) the integration of abstraction mechanisms into an expression-based language that allows interactive use of the database without resorting to a new stand-alone query language.Galileo will be used in the immediate future as a tool for <phrase>database design</phrase> and, in the long term, as a high-level interface for DBMSs.'
def process_one_doc():
    article = clean_text(tt)
    articleId = 0
    
    result = []
    
    phrases = []
    output_token_list = []
    
    # go over once 
    q = deque()
    IN_PHRASE_FLAG = False
    for token in article.split(" "):
        if token == "<phrase>":
            IN_PHRASE_FLAG = True            
        elif token == "</phrase>":
            current_phrase_list = []
            while (len(q) != 0):
                current_phrase_list.append(q.popleft())
            phrases.append(" ".join(current_phrase_list).lower())
            IN_PHRASE_FLAG = False
        else:
            if IN_PHRASE_FLAG: # in the middle of a phrase, push the (word, pos) tuple
                q.append(token)

            ## put all the token information into the output fields
            output_token_list.append(token)
            
    text = " ".join(output_token_list)
    
    doc = nlp(text)
    
    for sent in doc.sents:
#         print(sent.start, sent.end)
        sentId = 0
        NPs = []
        pos = []
        tokens = []
        for s in sent.noun_chunks:
            NPs.append(s)
            
        # get pos tag    
        for token in doc:
            tokens.append(token.text)
            pos.append(token.tag_)
        
        entityMentions = []
        # For each quality phrase, check if it's NP
        for p in phrases:
            for np in NPs:
                # find if p is a substring of np
                if np.text.find(p) != -1:
#                     offset = find(np.text.split(" "), p.split(" "))
                    
                    tmp = nlp(p)
                    p_tokens = [tok.text for tok in tmp]
            
                    offset = find(tokens[], p_tokens)
                    if offset == -1:
                        print("ERROR!", tokens[np.start:np.end], "->", p)
                        continue
                        
                    start_offset = np.start + offset - sent.start
                    
                    ent = {
                        "text": p,
                        "start": start_offset,
                        "end": start_offset + len(p.split(" ")) - 1,
                        "type": "phrase"
                    }
                    entityMentions.append(ent)
                    
                    print("##", " ".join([tok.text for tok in np]))
                    print(np)
                    print(np.start, np.end)
                    print(p)
                    print(ent["start"], ent["end"])
                    
                    
                    
        
        res = {
            "articleId": articleId,
            "sentId": sentId,
            "tokens": tokens,
            "pos": pos,
            "entityMentions": entityMentions,
            "np_chunks": [t.text for t in NPs]
        }
        result.append(res)
        
        sentId += 1
        
    return result

json.dumps(process_one_doc())
# process_one_doc()

AttributeError: 'spacy.tokens.span.Span' object has no attribute 'token'

In [55]:
tmp = nlp("concurrent data structure")
for token in tmp:
    print(token)

concurrent
data
structure


In [56]:
def find(haystack, needle):
    """Return the index at which the sequence needle appears in the
    sequence haystack, or -1 if it is not found, using the Boyer-
    Moore-Horspool algorithm. The elements of needle and haystack must
    be hashable.

    >>> find([1, 1, 2], [1, 2])
    1

    """
    h = len(haystack)
    n = len(needle)
    skip = {needle[i]: n - i - 1 for i in range(n - 1)}
    i = n - 1
    while i < h:
        for j in range(n):
            if haystack[i - j] != needle[-j - 1]:
                i += skip.get(haystack[i], n)
                break
        else:
            return i - n + 1
    return -1

In [61]:
find("text modern programming languages".split(" "), "programming language".split(" "))

-1