In [247]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk as nltk
import itertools

In [303]:
# knowledge base available at http://probase.msra.cn/dataset.aspx
# The functions below read three columns from this table: 'instance', 'concept'
# and 'relations'.
concepts = pd.read_csv('data/concepts.csv')

In [304]:
# English stop words from NLTK; passed to segment_sentence, which hands them to
# construct_graph so that stop-word segments are left out of the term graph.
swords = stopwords.words('english')

In [325]:
# short queries used to exercise the segmentation end to end
test_sentences = [
    'april in paris lyrics',
    'april in paris vacation',
    'hotel california eagles',
    'read harry potter',
    'read harry potter book',
    'watch harry potter movie',
    'manchester city beat manchester united and won the trophy',
    'niagara falls best season to visit',
    'how to hone randomized algorithms',
]

In [306]:
# creation of hash index of words from knowledge base to speed up word search
def create_hash_index(knowledgebase):
    """Return a dict whose keys are the distinct values of the 'instance' column.

    Every key maps to the constant 1; callers only use the dict for
    membership tests (``word in index``).
    """
    distinct_instances = knowledgebase['instance'].unique()
    return dict.fromkeys(distinct_instances, 1)

In [307]:
# splitting of the sentence in all possible ways
# source https://stackoverflow.com/questions/18406776/split-a-string-into-all-possible-ordered-phrases
def break_down(text):
    """Yield every ordered segmentation of ``text`` into two or more phrases.

    The text is split on whitespace; each yielded item is a list of phrases
    (words re-joined with single spaces). Segmentations with the most parts
    come first. The unsplit text as one single phrase is never yielded, so a
    text with fewer than two words yields nothing.
    """
    tokens = text.split()
    cut_points = range(1, len(tokens))  # positions between words where a cut may go
    for n_cuts in reversed(cut_points):  # most cuts (one word per phrase) first
        for cuts in itertools.combinations(cut_points, n_cuts):
            starts = (0,) + cuts
            stops = cuts + (None,)  # None lets the final slice run to the end
            yield [' '.join(tokens[a:b]) for a, b in zip(starts, stops)]

In [322]:
# calculation of the affinity of two words
def edge_weight(concepts, w1, w2):
    """Return 1 plus the dot product of the two words' 'relations' over shared concepts.

    ``concepts`` is a DataFrame with 'instance', 'concept' and 'relations'
    columns. Rows for ``w1`` and ``w2`` are selected by exact match on
    'instance' and inner-joined on 'concept'; when nothing is shared the
    result is 1.
    """
    wanted = ['concept', 'relations']
    first = concepts.loc[concepts['instance'] == w1, wanted]
    second = concepts.loc[concepts['instance'] == w2, wanted]
    shared = first.merge(second, on='concept')
    overlap = shared['relations_x'].dot(shared['relations_y'])
    return 1 + overlap

In [309]:
# composition of term graph
def construct_graph(string, h_index, stopwrds, knowledgebase):
    """Build the weighted term graph over all usable segmentations of ``string``.

    Parameters
    ----------
    string : str
        Sentence to segment; candidate splits come from ``break_down``.
    h_index : container supporting ``in``
        Known instances. A split is used only if every one of its segments
        (stop words included) is in this index.
    stopwrds : container supporting ``in``
        Segments found here are left out of the graph.
    knowledgebase : DataFrame
        Passed through to ``edge_weight`` to score pairs of segments.

    Returns
    -------
    (graph, nodes_to_words)
        ``graph`` is an array with one row ``[node_a, node_b, weight]`` per
        edge (shape ``(0, 3)`` when there are no edges); ``nodes_to_words``
        maps node id to segment text.
    """
    nodes_to_words = {}
    words_to_nodes = {}
    edges = []  # plain rows, converted to an array once at the end
    next_id = 0
    for split in break_down(string):
        # One unknown segment invalidates the whole split.
        if any(w not in h_index for w in split):
            continue
        reduced = [w for w in split if w not in stopwrds]

        new_nodes = []  # node ids first created while processing this split
        for j, w in enumerate(reduced):
            if w not in words_to_nodes:
                nodes_to_words[next_id] = w
                words_to_nodes[w] = next_id
                # Link the new node to every segment that precedes it in this split.
                for dest_w in reduced[:j]:
                    dest_v = words_to_nodes[dest_w]
                    # edge_weight matches on the instance text, so it must be
                    # given the word (dest_w), not the node id (dest_v).
                    weight = edge_weight(knowledgebase, w, dest_w)
                    edges.append([next_id, dest_v, weight])
                new_nodes.append(next_id)
                next_id += 1
            else:
                # Known node: link it to the nodes created earlier in this split.
                # NOTE(review): if the same word occurs twice in one split, its own
                # freshly created node is in new_nodes and a self-loop is added here.
                dest_v = words_to_nodes[w]
                for u in new_nodes:
                    weight = edge_weight(knowledgebase, w, nodes_to_words[u])
                    edges.append([dest_v, u, weight])
    if not edges:
        return np.empty((0, 3), dtype=int), nodes_to_words
    return np.array(edges), nodes_to_words

In [310]:
# random selection of an edge proportionally to its weight
def rnd_edg_sel(graph):
    """Pick one edge at random with probability proportional to its weight.

    ``graph`` is a 2-D array whose third column (index 2) holds edge weights.
    Returns the chosen row as a ``(1, n_columns)`` array, or ``None`` when the
    graph has no rows.

    If the running total never exceeds the random point (total weight of
    zero, or floating-point rounding at the upper end) the last edge is
    returned instead of falling through with ``None``.
    """
    if len(graph) == 0:
        return None
    running_total = np.cumsum(graph[:, 2])
    rnd_pt = np.random.random() * running_total[-1]
    # First edge whose cumulative weight passes the random point.
    hits = np.flatnonzero(running_total > rnd_pt)
    idx = hits[0] if hits.size else len(graph) - 1
    return graph[idx][np.newaxis, :]

In [311]:
def remove_edge(graph, edge):
    """Return a copy of ``graph`` without any row equal to ``edge``.

    ``graph`` is a 2-D array of edge rows; ``edge`` is a row of the same
    width (shape ``(1, n)`` or ``(n,)``). Every matching row is dropped, so
    duplicates of the edge disappear together. The input array is not modified.
    """
    # One vectorised comparison instead of growing an array row by row.
    matches = (graph == edge).all(axis=1)
    return graph[~matches]

In [312]:
# remove edges related to a particular vertex
def remove_rel_edges(graph, t):
    """Return a copy of ``graph`` without the edges that have ``t`` as an endpoint.

    An edge row is dropped when its first or second column equals ``t``.
    The input array is not modified.
    """
    # One vectorised mask instead of growing an array row by row.
    touches_t = (graph[:, 0] == t) | (graph[:, 1] == t)
    return graph[~touches_t]

In [313]:
def are_connected(graph, u, v):
    """Return True if some edge row has both ``u`` and ``v`` among its two endpoints.

    Only the first two entries of each row are inspected, and direction is
    ignored. When ``u == v`` any edge touching that vertex counts.
    """
    endpoint_pairs = (row[:2] for row in graph)
    return any(u in pair and v in pair for pair in endpoint_pairs)

In [314]:
# randomised algorithm for maximal clique calculation
def MaxCMC(graph):
    """Run one randomised pass that greedily collects edges from ``graph``.

    While edges remain: draw one with ``rnd_edg_sel``, record it, remove it,
    then drop all edges of every remaining vertex that is not connected to
    both endpoints of the drawn edge (vertices already drawn are skipped).

    Returns ``(selected_edges, mean_weight)`` where ``mean_weight`` is the
    mean of the third column of the selected edges.
    """
    # Row 0 is an uninitialised scratch row; it is sliced off before returning.
    picked = np.empty((1, graph.shape[1]), dtype=int)
    used_vertices = np.array([])
    while graph.size > 0:
        chosen = rnd_edg_sel(graph)
        u, v = chosen[0, 0], chosen[0, 1]
        used_vertices = np.append(used_vertices, np.array([u, v]))
        picked = np.append(picked, chosen, axis=0)
        graph = remove_edge(graph, chosen)
        # The vertex list is a snapshot; `graph` is rebound as vertices are pruned.
        for t in graph[:, :2].flatten():
            if t in used_vertices:
                continue
            if are_connected(graph, t, v) and are_connected(graph, t, u):
                continue
            graph = remove_rel_edges(graph, t)
    selected = picked[1:]
    return selected, selected[:, 2].mean()

In [315]:
# repetition of the randomised clique calculation k times for better precision
def CMaxC(graph, nodes_to_words, k=3):
    """Run ``MaxCMC`` ``k`` times and return the words of the heaviest result.

    Parameters
    ----------
    graph : 2-D array of edge rows ``[node_a, node_b, weight]``.
    nodes_to_words : dict mapping node id to word.
    k : number of randomised runs (default 3).

    Returns
    -------
    list
        Words of the run with the highest mean weight, ordered by node id.
        An empty list when the graph has no edges or ``k < 1``, since no
        run can produce a result in either case.
    """
    if k < 1 or graph.size == 0:
        return []
    best_cl = None
    max_w = None
    for _ in range(k):
        clique, weight = MaxCMC(graph)
        # The first run is always kept so best_cl can never stay unset.
        if best_cl is None or weight > max_w:
            best_cl, max_w = clique, weight
    word_nos = np.unique(best_cl[:, :2])  # np.unique already returns sorted ids
    return [nodes_to_words[no] for no in word_nos]

In [323]:
def segment_sentence(knowledgebase, sentence, stpwrds, hash_index=None):
    """Segment ``sentence`` into terms using the knowledge base.

    Parameters
    ----------
    knowledgebase : DataFrame used both for the instance index and for edge weights.
    sentence : str to segment.
    stpwrds : container of stop words to leave out of the term graph.
    hash_index : optional prebuilt result of ``create_hash_index(knowledgebase)``.
        Pass it when segmenting many sentences so the index is not rebuilt
        on every call; when omitted it is built here.

    Returns
    -------
    list of selected terms, as returned by ``CMaxC``.
    """
    if hash_index is None:
        hash_index = create_hash_index(knowledgebase)
    graph, ntw = construct_graph(sentence, hash_index, stpwrds, knowledgebase)
    return CMaxC(graph, ntw)

In [319]:
# NOTE(review): leftover inspection cell. In the visible source `tstr` is only
# assigned by the loop over test_sentences in a later cell, so this raises
# NameError when the notebook is run top to bottom on a fresh kernel.
tstr

'april in paris lyrics'

In [326]:
# Segment every test sentence and print the selected terms.
# NOTE(review): edge selection draws from np.random and no seed is set in the
# visible cells, so the printed segmentations may differ between runs.
for tstr in test_sentences:
    segmentation = segment_sentence(concepts, tstr, swords)
    print(segmentation)

['april', 'paris', 'lyrics']
['vacation', 'april in paris']
['eagles', 'hotel california']
['read', 'harry', 'potter']
['read', 'book', 'harry potter']
['watch', 'movie', 'harry potter']
['manchester', 'city', 'beat', 'trophy', 'manchester united']
['best', 'season', 'visit', 'niagara falls']
['hone', 'randomized', 'algorithms', 'how to']
