In [1]:
import sys
import langid
import os
import itertools
from operator import itemgetter
import networkx as nx
import stemming.porter2 as porter
from tokenizer import StanfordCoreNlpTokenizer
from stanfordcorenlp import StanfordCoreNLP
import requests

# Address of the CoreNLP service; `url` adds the port used for raw HTTP requests.
path_or_host = 'http://localhost'
url = '%s:%d' % (path_or_host, 9000)

# Shared NLP helpers used by the cells below.
stanford_core = StanfordCoreNLP(path_or_host)
tokenizer = StanfordCoreNlpTokenizer(path_or_host)
stem = porter.stem

def stanford_service_request(annotators=None, data=None, *args, **kwargs):
    """POST ``data`` to the service at ``url`` and return the JSON-decoded reply.

    Parameters
    ----------
    annotators : str, optional
        Sent as the ``annotators`` entry of the ``properties`` query
        parameter (the notebook calls this with ``'depparse'``).
    data : str or bytes, optional
        Request body. On Python 3 a ``str`` is encoded as UTF-8; any other
        value (including ``None``) is handed to ``requests`` unchanged.
    *args
        Accepted for call compatibility; ignored.
    **kwargs
        Only ``pattern`` is read. When present it is forwarded as the
        ``pattern`` query parameter together with ``timeout=180000``.

    Returns
    -------
    object
        Whatever ``json.loads`` produces for the response text (callers index
        it like a dict, e.g. ``result['sentences']``).

    Raises
    ------
    ValueError
        If the response text is not valid JSON.
    requests.exceptions.RequestException
        Propagated from ``requests.post`` on connection problems.
    """
    # Local import: the cell defining this function does not import json, so
    # relying on a later cell's import would be hidden notebook state.
    import json

    # Only text needs encoding; bytes/None must not hit ``.encode``.
    if sys.version_info.major >= 3 and isinstance(data, str):
        data = data.encode('utf-8')

    properties = {'annotators': annotators, 'outputFormat': 'json'}
    params = {'properties': str(properties), 'pipelineLanguage': 'en'}
    if 'pattern' in kwargs:
        params = {"pattern": kwargs['pattern'], 'properties': str(properties), 'pipelineLanguage': 'en', 'timeout' : 180000}
    # NOTE(review): ``requests`` interprets ``timeout`` in seconds, so 180000 is
    # roughly 50 hours. The value looks like it was meant as milliseconds (as in
    # the query parameter above) - confirm before tightening it.
    r = requests.post(url, params=params, data=data, headers={'Connection': 'close'}, timeout=180000)
    return json.loads(r.text)


In [2]:
import json
import os

# Where the raw datasets live and which of them to load.
datasets_dir = '../datasets'
datanames = ['nus']

# dataset name -> list of story records read from <name>-standard.json
storiesMap = {}
for dataset in datanames:
    infilename = os.path.join(datasets_dir, dataset + '-standard.json')
    with open(infilename) as f:
        stories = json.load(f)
    storiesMap[dataset] = stories
    
# Report how many stories each dataset contains.
for d, v in storiesMap.items():
    print(d, ' ', len(v))

# Report how many stories carry at least one entity, then show the record keys.
for dataset, stories in storiesMap.items():
    stories_ = [story for story in stories if len(story['entities']) > 0]
    print(dataset, 'total_stories: ' , len(stories), ', stories with entities: ', len(stories_))
    # An empty dataset has no first record to inspect; skip it instead of
    # raising IndexError and aborting the whole cell.
    if stories:
        print(stories[0].keys())
    
# Tokenise, stem and dependency-parse every story, caching one JSON file per
# story under <datasets_dir>/<dataset>_post_processed/<idx>.json.
for dataset, stories in storiesMap.items():
    print(dataset)
    dataset_dir = os.path.join(datasets_dir, dataset + '_post_processed')
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(dataset_dir, exist_ok=True)

    for idx, story in enumerate(stories):
        story_file = os.path.join(dataset_dir, str(idx) + '.json')

        # Reuse the cached record only when it still has entities; otherwise
        # the story is processed again from the in-memory record.
        cached = None
        if os.path.isfile(story_file):
            with open(story_file) as f:
                cached = json.load(f)

        if cached is not None and len(cached['entities']) > 0:
            story = cached
        else:
            content = story['headline'] + '\n' + story['body']
            story['content'] = content
            story['suid'] = idx

            # stem story content
            original_words, phrases = tokenizer.tokenize(content)
            stemmed_words = [stem(word).lower() for word in original_words]
            unique_word_list = set(stemmed_words)
            story['original_words'] = original_words
            story['stemmed_words'] = stemmed_words
            story['phrases'] = phrases
            story['unique_word_list'] = list(unique_word_list)

            # parse story content
            try:
                parsed = stanford_service_request('depparse', content)
                sentences = parsed['sentences']
            except Exception as err:
                # ``Exception`` (not a bare except) so that interrupting the
                # kernel is not mistaken for a parse failure.
                print('\nfailed processing: ', idx, ' len: ', len(content), ' error: ', repr(err))
                story['body'] = story['headline'] = ''
                story['entities'] = []
                # Keep the record schema complete so code that reads these
                # keys for every story does not fail with KeyError.
                story['dep_types_counter'] = {}
                story['word_parsing_levels'] = {}
                story['word_positions'] = {}
                stories[idx] = story
                continue

            word_parsing_levels = {}   # stemmed word -> list of tree depths
            dep_types_counter = {}     # stemmed word -> {dependency type: count}
            for sentence in sentences:
                dep_to_gov_map = {}    # dependent token index -> governor token index
                for edge in sentence['basicDependencies']:
                    dep_to_gov_map[edge['dependent']] = edge['governor']
                    dep_name = stem(edge['dependentGloss']).lower()
                    if dep_name not in unique_word_list:
                        continue
                    type_counts = dep_types_counter.setdefault(dep_name, {})
                    type_counts[edge['dep']] = type_counts.get(edge['dep'], 0) + 1

                dep_level = {}         # memo: token index -> depth within this sentence
                def get_level(token_idx):
                    """Depth of a token: 0 for index 0, else 1 + depth of its governor."""
                    if token_idx == 0: return 0
                    if token_idx in dep_level: return dep_level[token_idx]
                    level = 1 + get_level(dep_to_gov_map[token_idx])
                    dep_level[token_idx] = level
                    return level

                for token in sentence['tokens']:
                    word = stem(token['originalText']).lower()
                    if word not in unique_word_list:
                        continue
                    word_parsing_levels.setdefault(word, []).append(get_level(token['index']))

            story['dep_types_counter'] = dep_types_counter
            story['word_parsing_levels'] = {w: sorted(l) for w, l in word_parsing_levels.items()}

            # all words position (1-based index into stemmed_words)
            word_positions = {}
            for i, word in enumerate(stemmed_words):
                word_positions.setdefault(word, []).append(i + 1)
            story['word_positions'] = word_positions

            # Serialise before opening the file so a serialisation error
            # cannot leave a truncated cache file behind.
            payload = json.dumps(story)
            with open(story_file, 'w') as f:
                f.write(payload)

        stories[idx] = story
        sys.stdout.write('\r%s story %d/%d [len %d] [entities: %d]'% (dataset, idx, len(stories), len(story['content']), len(story['entities'])))
    print('')
# Coverage report: fraction of stemmed tokens found in each per-story lookup,
# followed by every dependency type seen in the dataset.
for dataset, stories in storiesMap.items():
    print(dataset)
    level = 0
    pos   = 0
    dep   = 0
    stemmed = 0
    for story in stories:
        stemmed_words = story['stemmed_words']
        # .get(): a story whose parse failed may not carry these keys.
        levels_by_word = story.get('word_parsing_levels', {})
        deps_by_word = story.get('dep_types_counter', {})
        positions_by_word = story.get('word_positions', {})
        stemmed += len(stemmed_words)
        level += sum(1 for s in stemmed_words if s in levels_by_word)
        dep += sum(1 for s in stemmed_words if s in deps_by_word)
        pos += sum(1 for s in stemmed_words if s in positions_by_word)

    # With no tokens every counter is 0; divide by 1 so the report prints
    # 0.000 instead of raising ZeroDivisionError.
    denom = stemmed if stemmed else 1
    print('dep type coverage: %.3f' % (dep / denom))
    print('level coverage   : %.3f' % (level / denom))
    print('pos   coverage   : %.3f' % (pos / denom))
    print('')

    grammar_parts = set()
    for story in stories:
        for keys in story.get('dep_types_counter', {}).values():
            grammar_parts.update(keys)
    # Sorted so the listing is identical from run to run.
    for part in sorted(grammar_parts):
        print(part)
        

nus   183
nus total_stories:  183 , stories with entities:  183
dict_keys(['headline', 'body', 'entities', 'stemmed_content'])
nus
nus story 182/183 [len 59812] [entities: 10]
nus
dep type coverage: 1.000
level coverage   : 0.995
pos   coverage   : 1.000

det
cc:preconj
compound:prt
nummod
parataxis
auxpass
compound
neg
nmod
aux
appos
dep
dobj
cc
acl:relcl
nmod:npmod
ccomp
ROOT
advmod
punct
mark
mwe
cop
nsubj
acl
root
det:predet
nmod:tmod
case
xcomp
nmod:poss
conj
amod
advcl
csubj
discourse
iobj
nsubjpass
csubjpass


In [3]:
import boto
from gensim.models import Word2Vec
#https://github.com/olivettigroup/materials-word-embeddings
# word2vec_pretrained = Word2Vec.load("../materials-word-embeddings/bin/word2vec_embeddings-SNAPSHOT.model")

# dataset name -> Word2Vec model loaded from the word2vec_models directory
word2vec_models_map = {}
for dataset in storiesMap:
    infile = os.path.join('word2vec_models', dataset)
    word2vec_models_map[dataset] = model = Word2Vec.load(infile)
    print('loaded model: ', dataset)

loaded model:  nus


In [None]:
for dataset, stories in storiesMap.items():
    print(dataset.upper())

    try:
        clf, normalizer_edge, names_edge = linear_models_map[dataset]
    except:
        print('Faild loading linear model for dataset: ', dataset)
        continue
    
    try:
        clf_u, normalizer, names = linear_p_models_map[dataset]
        print('p names', names)
    except:
        print('Faild loading linear P model for dataset: ', dataset)
#         continue
        
    try:
        word2vec_model = word2vec_models_map[dataset]
    except:
        print('Failed loading word2vec model for dataset', dataset)
        continue

    avg_time = 0
    GR = []
    PRED = []
    
    all_stories = [s.copy() for s in stories]
    all_stories = sorted(all_stories, key = lambda e : str(e['suid']))
    len_all = len(all_stories)
    len_train = round(0.8 * len_all)
    random.seed(10)
    random.shuffle(all_stories)
    
    train_stories = all_stories[:len_train]
    test_stories = all_stories[len_train:]
    print('TEST  stories [%d]' % len(test_stories))
    print([story['suid'] for story in test_stories])
    print('')
    
    for story_idx, story in enumerate(test_stories):
        if len(story['body']) == 0: continue
        start_time = time.time()
        processed = False

        phrases = story['phrases']
        stemmed_words = story['stemmed_words']
        word_positions = story['word_positions']
        word_levels = story['word_parsing_levels']
        dep_types_counter = story['dep_types_counter']
        
        entities = story['entities']
        gold_grams = set()
        for entity in entities:
            grams = stanford_core.word_tokenize(entity['id'])
            grams = [stem(token).lower() for token in grams]
            gold_grams.update(grams)
            
        # compute feature pre-reqs
        word2vec = {}
        for i, word in enumerate(stemmed_words):
            if word in word2vec.keys() or word not in word2vec_model.wv.vocab:
                continue
            w_embed = word2vec_model.wv.get_vector(word)
            word2vec[word] = w_embed
        count = {}
        pos = {}
        first_pos = {}
        
        def update_or_insert(d, k, v):
            """Accumulate ``v`` into ``d[k]``; create the entry as ``v`` when ``k`` is new."""
            if k not in d:
                d[k] = v
                return
            d[k] += v

        for i, word in enumerate(stemmed_words):
            update_or_insert(count, word, 1)
            update_or_insert(pos, word, 1 / (i + 1))
            if not word in first_pos:
                first_pos[word] = 1/(i+1)
                
            
        def get_part_strength(d, key):
            """Sum the values of ``d`` whose lower-cased key contains ``key`` as a substring."""
            total = 0
            for label, amount in d.items():
                if key in label.lower():
                    total += amount
            return total
        
        def compose_features(u, v, H, H_first):
            """Build the feature row for the ordered word pair ``(u, v)``.

            Besides its arguments the function reads the per-story lookups of
            the enclosing scope (``count``, ``pos``, ``word2vec``,
            ``word_levels``, ``dep_types_counter``) and ``word2vec_model``.

            Parameters
            ----------
            u, v : str
                Words that are keys of all the lookups above and joined by an
                edge in both ``H`` and ``H_first``.
            H : graph-like
                ``H[u][v]['weight']`` is used as the co-occurrence count and
                ``H[u]`` / ``H[v]`` as the neighbour sets.
            H_first : graph-like
                ``H_first[u][v]['weight']`` is used as the "first time
                together" feature.

            Returns
            -------
            numpy.ndarray
                Array with a leading axis of length 1 holding, in order:
                12 scalar features, the part strengths of ``u``, the part
                strengths of ``v`` (one per entry of ``parts`` each), then the
                embeddings of ``u`` and ``v``.
            """
            count_u = count[u]
            count_v = count[v]
            pos_u = pos[u]
            pos_v = pos[v]
            wvec_u = word2vec[u]
            wvec_v = word2vec[v]
            # Mean parsing level of each word, mapped through 1 - 1/x.
            u_level = 1 - 1 / np.mean(word_levels[u])
            v_level = 1 - 1 / np.mean(word_levels[v])
            s_uv = word2vec_model.wv.similarity(u, v)

            # part of sentence: each entry is passed to get_part_strength as
            # the ``key`` to aggregate the word's dependency-type counts by.
            parts = [
                'subj',
                'obj',
                'appos',
                'conj',
                'compound',
                'mod',
                'acl',
                'det',
                'neg',
                'mark',
                'auxpass',
                'ccomp',
                'mwe',
                'parataxis',
                'cop',
                'advcl',
                'cc',
                'discourse',
                'xcomp',
                'case',
                'dep',
                'aux',
                'punct',
                'root',
            ]
            u_parts = np.array([get_part_strength(dep_types_counter[u], part) for part in parts])
            v_parts = np.array([get_part_strength(dep_types_counter[v], part) for part in parts])

            # co-occurence count
            co_occur = H[u][v]['weight']

            # first time together
            co_first = H_first[u][v]['weight']

            # neighbours; a set for v makes the shared-neighbour count linear
            # instead of a list scan per neighbour of u.
            u_neigs = list(H[u].keys())
            v_neigs = set(H[v].keys())
            n_shared = sum(1 for n in u_neigs if n in v_neigs)

            x = np.concatenate(
                (np.array([
                    count_u, count_v, pos_u, pos_v, u_level, v_level, s_uv,
                    co_occur,
                    co_first,
                    len(u_neigs),
                    len(v_neigs),
                    n_shared,
                ]), u_parts, v_parts, wvec_u, wvec_v),
                axis=0)
            # Add a leading axis of length 1 to the concatenated vector.
            return np.expand_dims(x, axis=0)

        def valid_token(u, verbose=False):
            """Return True when ``u`` is in ``word_levels`` and in the Word2Vec vocabulary.

            With ``verbose=True`` the first failing check is printed.
            """
            problem = None
            if u not in word_levels:
                problem = 'not in word_levels'
            elif u not in word2vec_model.wv.vocab:
                problem = 'not in vocabulary'

            if problem is None:
                return True
            if verbose:
                print(u, problem)
            return False
        
        H_first = nx.Graph()
        H_all = nx.Graph()
        for i, v in enumerate(stemmed_words):
            if len(v) <= 1 or not valid_token(v):
                continue
            for offset in window_offsets:
                idx = i + offset
                if idx < len(stemmed_words) and idx >= 0:
                    u = stemmed_words[idx]
                    if u == v:
                        continue
                    if len(u) <= 1 or not valid_token(u):
                        continue
                    if H_all.has_edge(u, v):
                        H_all[u][v]['weight'] += 1
                    else:
                        H_all.add_edge(u, v, weight=1)
                        assert(H_first.has_edge(u, v) == False)
                        H_first.add_edge(u, v, weight=1/(i+1))
        P_all = {}
        for v in H_all:
            if v not in P_all.keys():
                P_all[v] = pos[v]
                
        # create edges       
        H = nx.DiGraph()
        for i, v in enumerate(stemmed_words):
            if len(v) <= 1 or not valid_token(v):
                continue
            for offset in window_offsets:
                idx = i + offset
                if idx < len(stemmed_words) and idx >= 0:
                    u = stemmed_words[idx]
                    if u == v:
                        continue
                    if len(u) <= 1 or not valid_token(u):
                        continue
                    if H.has_edge(u, v) and H.has_edge(v, u):
                        continue                   
                    uv = compose_features(u, v, H_all, H_first)
                    uv = normalizer_edge.transform(uv)
                    uv_s = float(clf.predict(uv))
                    if uv_s >= 0.7:
                        H.add_edge(u, v, weight=uv_s)
                    vu = compose_features(v, u, H_all, H_first)
                    vu = normalizer_edge.transform(vu)
                    vu_s = float(clf.predict(vu))
                    if vu_s >= 0.7:
                        H.add_edge(v, u, weight=vu_s)
                    GR.append(int(v in gold_grams))
                    GR.append(int(u in gold_grams))
                    PRED.append(uv_s)
                    PRED.append(vu_s)
                                 
        P = {}
        for v in H:
            if v not in P.keys():
                P[v] = pos[v]
            for u in H[v]:
                if u not in P.keys():
                    P[u] = pos[u]
                