In [1]:
import sys
import langid
import os
import itertools
from operator import itemgetter
import networkx as nx
import stemming.porter2 as porter
from tokenizer import StanfordCoreNlpTokenizer
from stanfordcorenlp import StanfordCoreNLP
import requests

path_or_host = 'http://localhost'
url = path_or_host + ':' + str(9000)
stanford_core = StanfordCoreNLP(path_or_host)
tokenizer = StanfordCoreNlpTokenizer(path_or_host)
stem = porter.stem

def stanford_service_request(annotators=None, data=None, *args, **kwargs):
    if sys.version_info.major >= 3:
        data = data.encode('utf-8')

    properties = {'annotators': annotators, 'outputFormat': 'json'}
    params = {'properties': str(properties), 'pipelineLanguage': 'en'}
    if 'pattern' in kwargs:
        params = {"pattern": kwargs['pattern'], 'properties': str(properties), 'pipelineLanguage': 'en', 'timeout' : 180000}
    r = requests.post(url, params=params, data=data, headers={'Connection': 'close'}, timeout=180000)
    r_dict = json.loads(r.text)
    return r_dict


In [2]:
import json
import os

datasets_dir = '../datasets'
datanames = ['nus']
storiesMap = {}

for dataset in datanames:
    infilename  = os.path.join(datasets_dir, '%s-standard.json' % dataset)
    with open(infilename) as f:
        stories = json.loads(f.read())
    storiesMap[dataset] = stories
    
for d, v in storiesMap.items():
    print(d, ' ', len(v))
    
for dataset, stories in storiesMap.items():
    stories_ = [story for story in stories if len(story['entities']) > 0]
    print(dataset, 'total_stories: ' , len(stories), ', stories with entities: ', len(stories_))
    print(stories[0].keys())
    
for dataset, stories in storiesMap.items():
    print(dataset)
    dataset_dir = os.path.join('../datasets', dataset + '_post_processed')
    if not os.path.isdir(dataset_dir): 
        os.mkdir(dataset_dir)
        
    for idx, story in enumerate(stories):
        story_file = os.path.join(dataset_dir, str(idx) + '.json')
        if os.path.isfile(story_file):
            with open(story_file) as f:
                story_ = json.loads(f.read())
                if len(story_['entities']) > 0:
                    story = story_
        if not os.path.isfile(story_file) or len(story_['entities']) == 0:
            content = story['headline'] + '\n' + story['body']
            story['content'] = content
            story['suid'] = idx
            
            # stem story content
            original_words, phrases = tokenizer.tokenize(content)
            stemmed_words = [stem(word).lower() for word in original_words]
            unique_word_list = set(stemmed_words)
            story['original_words'] = original_words
            story['stemmed_words'] = stemmed_words
            story['phrases'] = phrases
            story['unique_word_list'] = list(unique_word_list)
            
            # parse story content
            try:
                dep = stanford_service_request('depparse', content)
                sentences = dep['sentences']
            except:
                print('\nfailed processing: ', idx, ' len: ', len(content))
                story['body'] = story['headline'] = ''
                story['entities'] = []
                stories[idx] = story
                continue 
                
            word_parsing_levels = {}
            dep_types_counter = {}
            for sentence in sentences:
                dependencies = sentence['basicDependencies']
                dep_to_gov_map = {}
                for dep in dependencies:
                    dep_to_gov_map[dep['dependent']] = dep['governor']
                    dep_name = stem(dep['dependentGloss']).lower()
                    if dep_name not in unique_word_list:
                        continue
                    dep_type = dep['dep']
                    if dep_name not in dep_types_counter:
                        dep_types_counter[dep_name] = {}
                    if dep_type not in dep_types_counter[dep_name]:
                        dep_types_counter[dep_name][dep_type] = 1
                    else:
                        dep_types_counter[dep_name][dep_type] += 1
                dep_level = {}
                def get_level(token_idx):
                    if token_idx == 0: return 0
                    if token_idx in dep_level: return dep_level[token_idx]
                    level = 1 + get_level(dep_to_gov_map[token_idx])
                    dep_level[token_idx] = level
                    return level
                        
                for token in sentence['tokens']:
                    word = stem(token['originalText']).lower()
                    if word not in unique_word_list:
                        continue
                    level = get_level(token['index'])
                    if word not in word_parsing_levels:
                        word_parsing_levels[word] = [level]
                    else:
                        word_parsing_levels[word].append(level)
            story['dep_types_counter'] = dep_types_counter
            for w, l in word_parsing_levels.items():
                word_parsing_levels[w] = sorted(l)
            story['word_parsing_levels'] = word_parsing_levels
            
            # all words position
            word_positions = {}
            for i, word in enumerate(stemmed_words):
                pos = i+1
                if word not in word_positions:
                    word_positions[word] = [pos]
                else:
                    word_positions[word].append(pos)
            story['word_positions'] = word_positions
            
            with open(story_file, 'w') as f:
                f.write(json.dumps(story))
        stories[idx] = story
        sys.stdout.write('\r%s story %d/%d [len %d] [entities: %d]'% (dataset, idx, len(stories), len(story['content']), len(story['entities'])))
    print('')
for dataset, stories in storiesMap.items():
    print(dataset)
    level = 0
    pos   = 0
    dep   = 0
    stemmed = 0
    for story in stories:
        stemmed_words = story['stemmed_words']
        level_ = len([s for s in stemmed_words if s in story['word_parsing_levels']])
        dep_   = len([s for s in stemmed_words if s in story['dep_types_counter']])
        pos_   = len([s for s in stemmed_words if s in story['word_positions']])
        stemmed += len(stemmed_words)
        level += level_
        pos += pos_
        dep += dep_
    print('dep type coverage: %.3f' % (dep / stemmed))
    print('level coverage   : %.3f' % (level / stemmed))
    print('pos   coverage   : %.3f' % (pos / stemmed))
    print('')
    
    grammar_parts = set()
    for story in stories:
        for w, keys in story['dep_types_counter'].items():
            grammar_parts.update(keys)
    for part in grammar_parts:
        print(part)
        

nus   183
nus total_stories:  183 , stories with entities:  183
dict_keys(['headline', 'body', 'entities', 'stemmed_content'])
nus
nus story 182/183 [len 59812] [entities: 10]
nus
dep type coverage: 1.000
level coverage   : 0.995
pos   coverage   : 1.000

advmod
csubj
neg
iobj
discourse
compound:prt
amod
punct
acl:relcl
aux
xcomp
auxpass
nmod:poss
dep
nmod
ROOT
nsubjpass
nmod:npmod
det
parataxis
nmod:tmod
csubjpass
compound
appos
advcl
case
ccomp
nsubj
mwe
acl
det:predet
dobj
root
cc:preconj
conj
cop
mark
cc
nummod


In [146]:
print([(i['dependentGloss'], i['dep']) for i in sentence['basicDependencies']])

def get_strength(d):
    def get_count(k):
        if k in d:
            return d[k]
        else:
            return 0
    return sum([get_count(k) for k in d.keys() if 'subj' in k])

keys = sorted(dep_types_counter.keys(), key = lambda e : get_strength(dep_types_counter[e]), reverse=True)
for w in keys:
    print(w, get_strength(dep_types_counter[w]))
    l = dep_types_counter[w].items()
    for t, c in l:
        print('  ', t, c)
assert(False)

[('159', 'ROOT')]
servic 26
   dobj 34
   compound 95
   nmod 100
   nsubj 25
   appos 2
   nsubjpass 1
   ROOT 3
   dep 1
   root 1
   conj 2
term 17
   nmod 77
   compound 50
   nsubj 12
   dep 9
   dobj 25
   root 3
   nsubjpass 5
   appos 2
   conj 1
set 14
   advcl 1
   nmod 22
   dep 1
   appos 1
   dobj 12
   amod 5
   nsubjpass 5
   ROOT 1
   nsubj 9
   conj 1
   compound 1
prober 14
   nmod 17
   xcomp 1
   nsubj 14
   dep 3
   conj 3
   dobj 6
   nmod:poss 1
   compound 1
document 12
   dobj 22
   nmod 52
   nsubj 6
   conj 3
   compound 12
   appos 1
   nsubjpass 6
   dep 2
   ROOT 1
j 10
   dep 12
   compound 9
   conj 9
   amod 1
   nmod 11
   nsubj 9
   appos 20
   dobj 3
   nsubjpass 1
summari 9
   compound 12
   dep 3
   xcomp 2
   nsubj 8
   dobj 21
   nmod 15
   conj 1
   nsubjpass 1
approach 7
   dep 2
   nsubj 5
   nsubjpass 2
   dobj 1
   nmod 1
   ROOT 1
techniqu 7
   conj 1
   dobj 10
   nsubjpass 3
   nmod 13
   nsubj 4
   dep 1
   ROOT 1
relationship 7
   nmod 

   amod 10
   conj 1
bias 0
   amod 25
   conj 2
   compound 4
   dobj 2
   ROOT 1
   nmod 4
   dep 1
abstract 0
   nmod 1
in 0
   case 202
   mark 10
   appos 2
   advmod 1
   dep 1
present 0
   acl:relcl 2
   ROOT 1
   compound 1
discoveri 0
   dobj 6
   compound 15
   nmod 7
   dep 2
   ROOT 2
data-intens 0
   amod 20
support 0
   ROOT 5
   compound 2
   nmod 1
   acl:relcl 3
   conj 1
   ccomp 1
   dep 1
   dobj 1
   advcl 3
first 0
   amod 16
   advmod 4
view 0
   dobj 5
   ccomp 1
uniqu 0
   amod 2
featur 0
   dobj 1
abl 0
   parataxis 1
determin 0
   xcomp 4
   conj 1
   advcl 3
   root 1
   acl 1
   ROOT 1
few 0
   amod 8
   advmod 1
interact 0
   nmod 3
target 0
   compound 32
   dobj 7
   xcomp 1
   nmod 15
   conj 1
   appos 1
   nmod:poss 1
evalu 0
   conj 3
   acl 1
   advcl 3
   ROOT 8
   xcomp 6
   nmod 2
respect 0
   nmod 11
   advmod 1
other 0
   amod 20
   nmod 4
use 0
   acl:relcl 2
   xcomp 12
   nmod 2
   ROOT 9
   acl 14
   parataxis 2
   conj 1
   advcl 3
   ccom

global 0
   dep 1
   amod 1
   conj 1
valuabl 0
   amod 1
issu 0
   dobj 1
mind 0
   nmod 1
perspect 0
   nmod 2
   dobj 1
step 0
   nmod 1
help 0
   ROOT 2
import 0
   ROOT 2
   ccomp 1
inequ 0
   compound 1
hold 0
   acl:relcl 1
   dep 2
candid 0
   compound 2
retriev 0
   parataxis 1
   ROOT 1
   xcomp 1
   conj 1
   nmod 3
top 0
   amod 2
updat 0
   compound 1
keyword-bas 0
   amod 1
uddi-directory-bas 0
   amod 1
sourcebiasedprob 0
   dobj 1
one-term 0
   amod 1
q 0
   dobj 2
top-m 0
   amod 2
update 0
   compound 1
meta-data 0
   amod 1
lower 0
   ROOT 1
   amod 1
   xcomp 1
due 0
   case 2
   advmod 1
lack 0
   nmod 1
rich 0
   nmod 1
breviti 0
   nmod 1
simplist 0
   amod 1
world 0
   dobj 1
realiti 0
   nmod 1
magnitud 0
   nmod 1
arthriti 0
   dep 4
bacteria 0
   appos 2
   conj 1
cancer 0
   appos 2
   conj 1
simplic 0
   nmod 2
anim 0
   conj 1
car 0
   conj 1
dog 0
   conj 1
eleph 0
   conj 1
frog 0
   conj 1
   appos 1
garag 0
   conj 1
helmet 0
   conj 1
   appos 1
indyc

AssertionError: 

In [5]:
import boto
from gensim.models import Word2Vec
#https://github.com/olivettigroup/materials-word-embeddings
word2vec_pretrained = Word2Vec.load("../materials-word-embeddings/bin/word2vec_embeddings-SNAPSHOT.model")

word2vec_models_map = {}
for dataset in storiesMap.keys():
    infile = os.path.join('word2vec_models', dataset)
    model = Word2Vec.load(infile)
    word2vec_models_map[dataset] = model
    print('loaded model: ', dataset)

loaded model:  nus


In [6]:
from sklearn.externals import joblib

linear_models_map = {}
for dataset in storiesMap.keys():
#     outdir = 'MLP_C_100_train_0.8_edge_models++_dep_all_a=0.0005_t'
    outdir = 'Linear_edge_models++_dep_all'
    attention_key = 'L&D_w_20_top_1.00_L=non_neigbours'
    clffile = os.path.join(outdir, attention_key + dataset + '.pkl')
    try:
        linear_models_map[dataset] = joblib.load(clffile)
    except:
        print('Failed loading model: %s' % dataset)
        continue
linear_p_models_map = {}
for dataset in storiesMap.keys():
    outdir = 'P_MODELS_pos_u_norm'
    attention_key = 'L&D_w_20_top_1.00_L=non_neigbours'
    clffile = os.path.join(outdir, attention_key + dataset + '.pkl')
    try:
        linear_p_models_map[dataset] = joblib.load(clffile)
    except:
        print('Failed loading p model: %s' % dataset)

Failed loading p model: nus


In [7]:
weighted = False
visualize = False

from sklearn.externals import joblib
        
import math
import numpy as np
import time
import statistics
from sklearn.externals import joblib

import networkx as nx
from networkx.exception import NetworkXError
from networkx.utils import not_implemented_for
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import random

class Weights:
    LINEAR_MODEL_EDGE = "jump=1,edge=Linear_train_0.8++_dep_all_a=0.0005_th=0.7"
    JUMP_POS_LINEAR_MODEL_EDGE = "jump=pos,edge=Linear_train_0.8++_dep_all_a=0.0005_th=0.7"
    JUMP_POS = "jump=pos,edge=1"
    EQUAL = "jump=1,edge=1"

window_size = 6
window_offset = math.ceil(window_size / 2)
window_offsets = [
    o for o in range(-window_offset, window_offset + 1) if o != 0
]
print_keys = True
for dataset, stories in storiesMap.items():
    print(dataset.upper())

    try:
        clf, normalizer_edge, names_edge = linear_models_map[dataset]
    except:
        print('Faild loading linear model for dataset: ', dataset)
        continue
    
    try:
        clf_u, normalizer, names = linear_p_models_map[dataset]
        print('p names', names)
    except:
        print('Faild loading linear P model for dataset: ', dataset)
#         continue
        
    try:
        word2vec_model = word2vec_models_map[dataset]
    except:
        print('Failed loading word2vec model for dataset', dataset)
        continue

    avg_time = 0
    GR = []
    PRED = []
    
    all_stories = [s.copy() for s in stories]
    all_stories = sorted(all_stories, key = lambda e : str(e['suid']))
    len_all = len(all_stories)
    len_train = round(0.8 * len_all)
    random.seed(10)
    random.shuffle(all_stories)
    
    train_stories = all_stories[:len_train]
    test_stories = all_stories[len_train:]
    print('TEST  stories [%d]' % len(test_stories))
    print([story['suid'] for story in test_stories])
    print('')
    
    for story_idx, story in enumerate(test_stories):
        if len(story['body']) == 0: continue
        start_time = time.time()
        processed = False

        phrases = story['phrases']
        stemmed_words = story['stemmed_words']
        word_positions = story['word_positions']
        word_levels = story['word_parsing_levels']
        dep_types_counter = story['dep_types_counter']
        
        entities = story['entities']
        gold_grams = set()
        for entity in entities:
            grams = stanford_core.word_tokenize(entity['id'])
            grams = [stem(token).lower() for token in grams]
            gold_grams.update(grams)
            
        # compute feature pre-reqs
        word2vec = {}
        for i, word in enumerate(stemmed_words):
            if word in word2vec.keys() or word not in word2vec_model.wv.vocab:
                continue
            w_embed = word2vec_model.wv.get_vector(word)
            word2vec[word] = w_embed
        count = {}
        pos = {}
        first_pos = {}
        
        def update_or_insert(d, k, v):
            if k in d.keys():
                d[k] += v
            else:
                d[k] = v

        for i, word in enumerate(stemmed_words):
            update_or_insert(count, word, 1)
            update_or_insert(pos, word, 1 / (i + 1))
            if not word in first_pos:
                first_pos[word] = 1/(i+1)
                
            
        def get_part_strength(d, key):
            def get_count(k):
                if k in d:
                    return d[k]
                else:
                    return 0
            s = sum([get_count(k) for k in d.keys() if key in k.lower()])
            return s
        
        def compose_features(u, v, H, H_first):
            count_u = count[u]
            count_v = count[v]
            pos_u = pos[u]
            pos_v = pos[v]
            wvec_u = word2vec[u]
            wvec_v = word2vec[v]
            u_level = 1 - 1 / np.mean(word_levels[u])
            v_level = 1 - 1 / np.mean(word_levels[v])
            s_uv = word2vec_model.wv.similarity(u, v)
            
#             parts = [
#                 'subj',
#                 'obj',
#                 'appos',
#                 'conj',
#                 'compound',
#                 'mod',
#                 'det',
#                 ]                        
#             # part of sentence
            parts = [
                'subj',
                'obj',
                'appos',
                'conj',
                'compound',
                'mod',
                'acl',
                'det',
                'neg',
                'mark',
                'auxpass',
                'ccomp',
                'mwe',
                'parataxis',
                'cop',
                'advcl',
                'cc',
                'discourse',
                'xcomp',
                'case',
                'dep',
                'aux',
                'punct',
                'root',
            ]
#             parts = [
#                 'csubj',
#                 'nsubj',
#                 'nsubjpass',
#                 'csubjpass',
#                 'dobj',
#                 'iobj',
#                 'appos',
#                 'conj',
#                 'cc:preconj',
#                 'compound',
#                 'compound:prt',
#                 'nmod',
#                 'advmod',
#                 'nummod',
#                 'nmod:tmod',
#                 'nmod:npmod',
#                 'nmod:poss',
#                 'amod',
#                 'acl',
#                 'acl:relcl',
#                 'det',
#                 'det:predet',
#                 'neg',
#                 'mark',
#                 'auxpass',
#                 'ccomp',
#                 'mwe',
#                 'parataxis',
#                 'cop',
#                 'advcl',
#                 'cc',
#                 'discourse',
#                 'xcomp',
#                 'case',
#                 'dep',
#                 'root',
#                 'aux',
#                 'punct',
#             ]
            v_parts = np.array([get_part_strength(dep_types_counter[v], part) for part in parts])
            u_parts = np.array([get_part_strength(dep_types_counter[u], part) for part in parts])

            # co-occurence count
            co_occur = H[u][v]['weight']

            # first time together
            co_first = H_first[u][v]['weight']

            # neighbours
            u_neigs = list(H[u].keys())
            v_neigs = list(H[v].keys())
            uv_neigs = [u for u in u_neigs if u in v_neigs]

            x = np.concatenate(
                (np.array([
                    count_u, count_v, pos_u, pos_v, u_level, v_level, s_uv,
                    co_occur,
                    co_first,
                    len(u_neigs),
                    len(v_neigs),
                    len(uv_neigs),
                ]), u_parts, v_parts, wvec_u, wvec_v),
                axis=0)
            x = np.expand_dims(x, axis=0)
            return x

        def valid_token(u, verbose=False):
            if u not in word_levels.keys():
                if verbose:
                    print(u, 'not in word_levels')
                return False
            if u not in word2vec_model.wv.vocab:
                if verbose:
                    print(u, 'not in vocabulary')
                return False
            return True
        
        H_first = nx.Graph()
        H_all = nx.Graph()
        for i, v in enumerate(stemmed_words):
            if len(v) <= 1 or not valid_token(v):
                continue
            for offset in window_offsets:
                idx = i + offset
                if idx < len(stemmed_words) and idx >= 0:
                    u = stemmed_words[idx]
                    if u == v:
                        continue
                    if len(u) <= 1 or not valid_token(u):
                        continue
                    if H_all.has_edge(u, v):
                        H_all[u][v]['weight'] += 1
                    else:
                        H_all.add_edge(u, v, weight=1)
                        assert(H_first.has_edge(u, v) == False)
                        H_first.add_edge(u, v, weight=1/(i+1))
        P_all = {}
        for v in H_all:
            if v not in P_all.keys():
                P_all[v] = pos[v]
                
        # create edges       
        H = nx.DiGraph()
        for i, v in enumerate(stemmed_words):
            if len(v) <= 1 or not valid_token(v):
                continue
            for offset in window_offsets:
                idx = i + offset
                if idx < len(stemmed_words) and idx >= 0:
                    u = stemmed_words[idx]
                    if u == v:
                        continue
                    if len(u) <= 1 or not valid_token(u):
                        continue
                    if H.has_edge(u, v) and H.has_edge(v, u):
                        continue                   
                    uv = compose_features(u, v, H_all, H_first)
                    uv = normalizer_edge.transform(uv)
                    uv_s = float(clf.predict(uv))
                    if uv_s >= 0.7:
                        H.add_edge(u, v, weight=uv_s)
                    vu = compose_features(v, u, H_all, H_first)
                    vu = normalizer_edge.transform(vu)
                    vu_s = float(clf.predict(vu))
                    if vu_s >= 0.7:
                        H.add_edge(v, u, weight=vu_s)
                    GR.append(int(v in gold_grams))
                    GR.append(int(u in gold_grams))
                    PRED.append(uv_s)
                    PRED.append(vu_s)
                                 
        P = {}
        for v in H:
            if v not in P.keys():
                P[v] = pos[v]
            for u in H[v]:
                if u not in P.keys():
                    P[u] = pos[u]
                
#         def compose_features_u(u):
#             count_u = count[u]
#             pos_u = pos[u]
#             fst_pos = first_pos[u]
#             wvec_u = word2vec[u]
#             u_level = np.mean(word_levels[u])
#             features_map = {
#                 'pos_u'   : pos_u,
#                 'u_level' : 1 - 1/u_level,
#                 'count_u' : count_u,
#                 'first_pos' : fst_pos
#             }
#             x = np.array([features_map[name] for name in names])
# #             x = np.concatenate((x, wvec_u), axis=0)
#             x = np.expand_dims(x, axis=0)
#             return x
#         P = {}
#         for idx, v in enumerate(stemmed_words):
#             if len(v) <= 1 or not valid_token(v) or v in P.keys():
#                 continue
#             x = compose_features_u(v)
#             x = normalizer.transform(x)
#             score = clf_u.predict(x)
#             P[v] = float(score)
#         min_w = min(P.values())
#         max_w = max(P.values())
#         if max_w <= 0:
#             print('all zero -- sidx %d, min-w: %f, max-w: %f' %(story_idx, min_w, max_w))
#         for k, v in P.items():
#             if max_w <= 0:
#                 P[k] = v - min_w
#             else:
#                 P[k] = max(0, P[k])
#         if visualize:
#             plt.clf()
#             s = sum(pos.values())
#             for k, v in pos.items():
#                 pos[k] = v / s
#             plt.hist(pos.values())
#             plt.title('pos')
#             plt.show()
             
#             plt.clf()
#             plt.hist(P.values())
#             plt.title('pred')
#             plt.show()
            
#             plt.clf()
#             s = sum(P.values())
#             for k, v in P.items():
#                 P[k] = v / s
#             plt.hist(P.values())
#             plt.title('pred norm')
#             plt.show()
#             assert(False)

        for (weights_type) in [
                Weights.JUMP_POS_LINEAR_MODEL_EDGE,
                Weights.LINEAR_MODEL_EDGE,
                Weights.EQUAL,
                Weights.JUMP_POS
        ]:
            algo = str(weights_type)
            if print_keys:
                print(algo)
            FOLDER = os.path.join('./output/', dataset, algo)
            if not os.path.exists(FOLDER): os.makedirs(FOLDER)
            keyphraseFile = os.path.join(FOLDER, str(story['suid']))
            processed = True

            if weights_type == Weights.LINEAR_MODEL_EDGE:
                calculated_page_rank = nx.pagerank(H)
            elif weights_type == Weights.JUMP_POS_LINEAR_MODEL_EDGE:
                calculated_page_rank = nx.pagerank(H, personalization=P)
            else:
                if weights_type == Weights.JUMP_POS:
                    calculated_page_rank = nx.pagerank(H_all, personalization=P_all)
                else:
                    calculated_page_rank = nx.pagerank(H_all)

            # reconstruct phrases
            def getWordScore(word):
                try:
                    return calculated_page_rank[word]
                except KeyError as e:
                    #  print('key error: ', word)
                    pass
                return 0

            phrase_scores = set()
            for phrase in phrases:
                phrase_words = [
                    stem(word).lower() for word in phrase.split("_")
                ]
                phrase_score = sum(
                    [getWordScore(word) for word in phrase_words])
                if phrase_score > 0:
                    stemmed_phrase = ' '.join(phrase_words)
                    phrase_scores.add((stemmed_phrase, phrase_score))

            # single words
            phrase_scores.update(calculated_page_rank.items())

            sorted_phrases = sorted(
                phrase_scores, key=lambda e: e[1], reverse=True)

            keyphraseFile = open(keyphraseFile, 'w')
            for phrase, score in sorted_phrases:
                keyphraseFile.write(phrase + ':::' + str(score) + '\n')
            keyphraseFile.close()

        if processed:
            elapsed_time = time.time() - start_time
            avg_time = (avg_time * story_idx + elapsed_time) / (story_idx + 1)
            elapsed = time.strftime('[%M:%S]', time.gmtime(elapsed_time))
            remaining = time.strftime(
                '[%H:%M:%S]',
                time.gmtime(avg_time * (len(stories) - story_idx - 1)))
            sys.stdout.write("\r%s [story: %d/%d] took: %s remaining: %s" %
                             (dataset, story_idx, len(test_stories), elapsed,
                              remaining))
        print_keys = False
    
    print('')
    print(classification_report(GR, PRED))
    print('')
    
    

NUS
Faild loading linear P model for dataset:  nus
TEST  stories [37]
[20, 35, 153, 79, 129, 82, 168, 138, 39, 159, 163, 80, 21, 18, 73, 13, 30, 108, 181, 155, 115, 173, 91, 54, 98, 135, 85, 162, 47, 40, 145, 100, 67, 45, 32, 105, 66]

jump=pos,edge=Linear_train_0.8++_dep_all_a=0.0005_th=0.7
jump=1,edge=Linear_train_0.8++_dep_all_a=0.0005_th=0.7
jump=1,edge=1
jump=pos,edge=1
nus [story: 36/37] took: [00:13] remaining: [00:21:21]


ValueError: Mix type of y not allowed, got types {'continuous', 'binary'}