In [1]:
import ast
import json
import multiprocessing
import os
import re
import sys
import time
from collections import deque
from multiprocessing import Lock
# from pycorenlp import StanfordCoreNLP

# Corpus selection: the name doubles as the directory under ../../data/.
corpusName = "dblpv2"

# Numeric switch; nothing in the visible cells reads it.
FLAGS_POS_TAGGING = 1

# Path handed to AutoPhraseOutput further down in the notebook.
input_path = "".join(["../../data/", corpusName, "/intermediate/segmentation.txt"])

In [15]:
import spacy
from spacy.symbols import ORTH, LEMMA, POS, TAG

# English pipeline used by the exploratory cells below.
nlp = spacy.load('en')

# Tokenizer exceptions for the <phrase> / </phrase> markers, each carrying a
# dedicated TAG value (START_PHRASE / END_PHRASE).
start_phrase = [{ORTH: u'<phrase>', LEMMA: u'', POS: u'X', TAG: u'START_PHRASE'}]
end_phrase = [{ORTH: u'</phrase>', LEMMA: u'', POS: u'X', TAG: u'END_PHRASE'}]

# Register each exception under its own surface form.
for special_case in (start_phrase, end_phrase):
    nlp.tokenizer.add_special_case(special_case[0][ORTH], special_case)

# Sample abstract used to try the pipeline out.
text = u'The trie data-structure has many properties which make it especially attractive for representing large files of data. These properties include fast retrieval time, quick unsuccessful search determination, and finding the longest match to a given identifier. The main drawback is the space requirement. In this paper the concept of trie compaction is formalized. An exact algorithm for optimal trie compaction and three algorithms for approximate trie compaction are given, and an analysis of the three algorithms is done. The analysis indicate that for actual tries, reductions of around 70 percent in the space required by the uncompacted trie can be expected. The quality of the compaction is shown to be insensitive to the number of nodes, while a more relevant parameter is the alphabet size of the key.'


In [16]:
# Cleanup passes, applied strictly in this order as (pattern, replacement).
cleanup_passes = [
    (r'[^\x00-\x7F]+', ' '),          # runs of non-ASCII characters -> one space
    (r"<phrase>", " <phrase> "),      # pad the opening phrase marker
    (r"</phrase>", " </phrase> "),    # pad the closing phrase marker
    (r"([.,!:?()])", r" \1 "),        # pad the listed punctuation characters
    (r"\s{2,}", " "),                 # collapse repeated whitespace
]
for pattern, replacement in cleanup_passes:
    text = re.sub(pattern, replacement, text)

print(text)
doc = nlp(text)

The trie data-structure has many properties which make it especially attractive for representing large files of data . These properties include fast retrieval time , quick unsuccessful search determination , and finding the longest match to a given identifier . The main drawback is the space requirement . In this paper the concept of trie compaction is formalized . An exact algorithm for optimal trie compaction and three algorithms for approximate trie compaction are given , and an analysis of the three algorithms is done . The analysis indicate that for actual tries , reductions of around 70 percent in the space required by the uncompacted trie can be expected . The quality of the compaction is shown to be insensitive to the number of nodes , while a more relevant parameter is the alphabet size of the key . 


In [17]:
# One line per token: text, lemma, POS (`pos_`) and tag (`tag_`).
for token in doc:
    fields = (token.text, token.lemma_, token.pos_, token.tag_)
    print(*fields)

print("=================")

# One line per noun chunk: its text plus its `start` and `end` attributes.
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.start, chunk.end)


The the DET DT
trie trie NOUN NN
data data NOUN NN
- - PUNCT HYPH
structure structure NOUN NN
has have VERB VBZ
many many ADJ JJ
properties property NOUN NNS
which which ADJ WDT
make make VERB VBP
it -PRON- PRON PRP
especially especially ADV RB
attractive attractive ADJ JJ
for for ADP IN
representing represent VERB VBG
large large ADJ JJ
files file NOUN NNS
of of ADP IN
data datum NOUN NNS
. . PUNCT .
These these DET DT
properties property NOUN NNS
include include VERB VBP
fast fast ADJ JJ
retrieval retrieval NOUN NN
time time NOUN NN
, , PUNCT ,
quick quick ADJ JJ
unsuccessful unsuccessful ADJ JJ
search search NOUN NN
determination determination NOUN NN
, , PUNCT ,
and and CCONJ CC
finding find VERB VBG
the the DET DT
longest long ADJ JJS
match match NOUN NN
to to ADP IN
a a DET DT
given give VERB VBN
identifier identifier ADJ JJ
. . PUNCT .
The the DET DT
main main ADJ JJ
drawback drawback NOUN NN
is be VERB VBZ
the the DET DT
space space NOUN NN
requirement requirement NOUN NN
. . P

In [5]:
# Materialize the noun chunks so they can be indexed, then look at the second one.
test = list(doc.noun_chunks)
a = test[1]
first_token = doc[a.start]

print(a)
print(first_token.pos_)
print(a.end)

# NOTE(review): this comparison's result is discarded (it is not the cell's last
# expression). The printed `pos_` values in this notebook look like 'ADJ'/'DET',
# while 'DT' appears in the `tag_` column -- confirm which attribute was intended.
first_token.pos_ in ["DT"]
first_token

many properties
ADJ
9


many

In [98]:
# Dump the attribute names exposed by the chunk, then its `sentiment` value.
for value in (a.__dir__(), a.sentiment):
    print(value)

['__repr__', '__hash__', '__lt__', '__le__', '__eq__', '__ne__', '__gt__', '__ge__', '__iter__', '__len__', '__getitem__', '__new__', 'set_extension', 'get_extension', 'has_extension', 'as_doc', 'merge', 'similarity', 'get_lca_matrix', 'to_array', '_recalculate_indices', '_', 'vocab', 'sent', 'has_vector', 'vector', 'vector_norm', 'sentiment', 'text', 'text_with_ws', 'noun_chunks', 'root', 'lefts', 'rights', 'n_lefts', 'n_rights', 'subtree', 'ent_id', 'ent_id_', 'orth_', 'lemma_', 'upper_', 'lower_', 'string', 'label_', 'doc', 'start', 'end', 'start_char', 'end_char', 'label', '_vector', '_vector_norm', '__doc__', '__pyx_vtable__', '__reduce__', '__setstate__', '__str__', '__getattribute__', '__setattr__', '__delattr__', '__init__', '__reduce_ex__', '__subclasshook__', '__init_subclass__', '__format__', '__sizeof__', '__dir__', '__class__']
0.0


In [6]:
class AutoPhraseOutput(object):
    """Collect POS-sequence statistics for <phrase>-tagged text and filter phrases.

    Typical flow: call ``parse_one_doc`` on each document to fill
    ``phrase_to_pos_sequence``, then ``score_pos_sequene`` and
    ``obtain_candidate_phrase`` to select phrases, and finally the ``save_*``
    methods to persist the results.

    :param input_path: stored on the instance; no method of this class reads it.
    :param nlp: annotator object. ``parse_one_doc`` calls
        ``nlp.annotate(text, properties=...)`` and expects a dict shaped like
        ``{'sentences': [{'tokens': [{'word': ..., 'pos': ...}, ...]}, ...]}``.
    """

    def __init__(self, input_path, nlp):
        self.input_path = input_path
        self.nlp = nlp
        self.phrase_to_pos_sequence = {}  # key: lower case phrase, value: a dict of {"pos_sequence": count}
        self.pos_sequence_to_score = {}  # key: a pos sequence, value: a score in [0.0, 1.0]
        self.candidate_phrase = []

    @staticmethod
    def _normalize_text(doc):
        """Return ``doc`` cleaned up and padded so markers and punctuation stand alone."""
        ## replace non-ascii character
        doc = re.sub(r'[^\x00-\x7F]+', ' ', doc)
        ## add space before and after <phrase> tags
        doc = re.sub(r"<phrase>", " <phrase> ", doc)
        doc = re.sub(r"</phrase>", " </phrase> ", doc)
        ## add space before and after special characters
        doc = re.sub(r"([.,!:?()])", r" \1 ", doc)
        ## replace multiple continuous whitespace with a single one
        doc = re.sub(r"\s{2,}", " ", doc)
        return doc

    @staticmethod
    def _rate_pos_sequence(pos_sequence):
        """Return 1.0 when the sequence contains the substring "NN", else 0.0."""
        return 1.0 if "NN" in pos_sequence else 0.0

    def parse_one_doc(self, doc):
        """ Parse each document, update the phrase_to_pos_sequence, and convert the json format from Stanford
        CoreNLP to Ellen's sentences.json.raw format

        :param doc: raw document text, possibly containing <phrase>...</phrase> markers
        :return: a list with one dict per sentence, ``{"tokens": [...], "pos": [...]}``;
            the markers are kept as tokens and tagged START_PHRASE / END_PHRASE
        """
        doc = self._normalize_text(doc)

        res = self.nlp.annotate(doc, properties={
            "annotators": "tokenize,ssplit,pos",
            "outputFormat": "json"
        })
        output_sents = []
        # Set when any sentence ends while a phrase is still open. Checking per
        # sentence (instead of inspecting the last sentence's queue after the loop)
        # also keeps documents without sentences from failing on an undefined queue.
        has_unclosed_phrase = False
        for sent in res['sentences']:
            ## a new sentence
            output_token_list = []
            output_pos_list = []

            in_phrase = False
            q = deque()  # (word, pos) tuples of the phrase currently being read
            for token in sent['tokens']:
                word = token['word']
                pos = token['pos']
                if word == "<phrase>": # the start of a phrase
                    in_phrase = True
                    ## Mark the position of a phrase for postprocessing
                    output_token_list.append(word)
                    output_pos_list.append("START_PHRASE")
                elif word == "</phrase>": # the end of a phrase
                    ## obtain the information of current phrase and empty the queue
                    phrase = " ".join(ele[0] for ele in q).lower() # convert to lower case
                    pos_sequence = " ".join(ele[1] for ele in q)
                    q.clear()

                    ## update phrase information
                    counts = self.phrase_to_pos_sequence.setdefault(phrase, {})
                    counts[pos_sequence] = counts.get(pos_sequence, 0) + 1

                    in_phrase = False

                    ## Mark the position of a phrase for postprocessing
                    output_token_list.append(word)
                    output_pos_list.append("END_PHRASE")
                else:
                    if in_phrase: # in the middle of a phrase, push the (word, pos) tuple
                        q.append((word, pos))

                    ## put all the token information into the output fields
                    output_token_list.append(word)
                    output_pos_list.append(pos)

            ## Finish processing one sentence, add the result into output_sents
            output_sents.append({
                "tokens": output_token_list,
                "pos": output_pos_list
            })

            if q:
                has_unclosed_phrase = True

        if has_unclosed_phrase:
            print("[ERROR]: mismatched </phrase> in document: %s" % doc)

        return output_sents

    def save_phrase_to_pos_sequence(self, output_path=""):
        """Write one ``phrase<TAB>str(counts_dict)`` line per phrase to ``output_path``."""
        with open(output_path, "w") as fout:
            for phrase in self.phrase_to_pos_sequence:
                fout.write(phrase)
                fout.write("\t")
                fout.write(str(self.phrase_to_pos_sequence[phrase]))
                fout.write("\n")

    def load_phrase_to_pos_sequence(self, input_path=""):
        """Read a file written by ``save_phrase_to_pos_sequence`` into ``phrase_to_pos_sequence``.

        Lines without a tab-separated second field are skipped.

        :raises ValueError: (or SyntaxError) when the second field is not a Python literal.
        """
        with open(input_path, "r") as fin:
            for line in fin:
                line = line.strip()
                seg = line.split("\t")
                if len(seg) < 2:
                    continue
                phrase = seg[0]
                # Security: this used to be eval(), which executes arbitrary code found
                # in the file. literal_eval parses the dict literal that the save method
                # writes and rejects anything that is not a plain literal.
                pos_sequence = ast.literal_eval(seg[1])
                self.phrase_to_pos_sequence[phrase] = pos_sequence

    def score_pos_sequene(self):
        """Score every POS sequence seen so far that does not have a score yet.

        A sequence scores 1.0 when it contains the substring "NN" and 0.0 otherwise.
        Existing entries of ``pos_sequence_to_score`` are left untouched.
        """
        for pos_sequence_list in self.phrase_to_pos_sequence.values():
            for pos_sequence in pos_sequence_list:
                if pos_sequence not in self.pos_sequence_to_score:
                    self.pos_sequence_to_score[pos_sequence] = self._rate_pos_sequence(pos_sequence)

    # Correctly spelled alias; the original (misspelled) name stays for existing callers.
    score_pos_sequence = score_pos_sequene

    def obtain_candidate_phrase(self, threshold = 0.8, min_sup = 5):
        """Rebuild ``candidate_phrase`` from the collected statistics.

        A phrase is kept when its total count is at least ``min_sup`` and the
        count-weighted average of its POS-sequence scores is at least ``threshold``.

        :param threshold: minimum weighted score for a phrase to be kept
        :param min_sup: minimum total count for a phrase to be considered
        """
        print("Number of phrases before filtering = %s" % len(self.phrase_to_pos_sequence))
        # Start from an empty list so re-running (e.g. re-executing a notebook cell)
        # does not duplicate phrases or inflate the count printed below.
        self.candidate_phrase = []
        for phrase, counts in self.phrase_to_pos_sequence.items():
            freq = sum(counts.values())
            if freq < min_sup:
                continue
            phrase_score = 0
            for pos_sequence, count in counts.items():
                pos_sequence_weight = float(count) / freq
                pos_sequence_score = self.pos_sequence_to_score.get(pos_sequence)
                if pos_sequence_score is None:
                    # Not scored yet (score_pos_sequene was not run after this
                    # sequence was added): score it now instead of raising KeyError.
                    pos_sequence_score = self._rate_pos_sequence(pos_sequence)
                    self.pos_sequence_to_score[pos_sequence] = pos_sequence_score
                phrase_score += (pos_sequence_weight * pos_sequence_score)
            if phrase_score >= threshold:
                self.candidate_phrase.append(phrase)
        print("Number of phrases after filtering = %s" % len(self.candidate_phrase))

    def save_candidate_phrase(self, output_path=""):
        """Write the candidate phrases to ``output_path``, one per line."""
        with open(output_path, "w") as fout:
            for phrase in self.candidate_phrase:
                fout.write(phrase+"\n")
                
                
# No annotator is attached here (nlp=None), so parse_one_doc -- which calls
# self.nlp.annotate -- cannot be used on this instance; the load/score/filter/save
# methods do not touch nlp.
autoPhraseOutput = AutoPhraseOutput(
    input_path=input_path,
    nlp=None,
)