In [None]:
from graphrank import GraphRank, GraphUtils, TextPreprocess
import networkx as nx
import community
import os
import glob
from tqdm import tnrange, tqdm_notebook, tqdm
from timeit import default_timer as timer

from ipynb.fs.defs.run_sentence_encoder import get_embedding, EMBED, tf
embed = EMBED

In [None]:
# Shared graphrank helper instances used by the cells below.
# `gr` builds word graphs and extracts keyphrases (build_word_graph / get_keyphrases).
gr = GraphRank()
# `tp` provides preprocess_text, which is applied to every segment text.
tp = TextPreprocess()
# NOTE(review): `utils` is not referenced by any visible cell.
utils = GraphUtils()

## Define functions

In [None]:
def get_segment_list(meeting_graph):
    """Collect the text of every segment node in the meeting graph.

    Args:
        meeting_graph: graph object whose ``nodes(data=True)`` yields
            ``(node, attribute_dict)`` pairs.

    Returns:
        list: the ``"text"`` attribute of each node whose ``"label"``
        attribute equals ``"segmentId"``, in node iteration order. A matching
        node without a ``"text"`` attribute contributes ``None``.
    """
    return [
        node_attrs.get("text")
        for _, node_attrs in meeting_graph.nodes(data=True)
        if node_attrs.get("label") == "segmentId"
    ]

In [None]:
def build_word_graph(segment_list, max_segments=16):
    """Build a word graph from the leading segments of a meeting.

    Every processed segment is run through ``tp.preprocess_text`` and its POS
    tuples are handed to ``gr.build_word_graph`` with a window of 4.

    Args:
        segment_list: iterable of segment texts.
        max_segments: maximum number of segments to process. The default of
            16 matches the previously hard-coded cut-off (indices 0..15);
            pass ``None`` to process every segment.

    Returns:
        The graph returned by the last ``gr.build_word_graph`` call.
        NOTE(review): presumably ``gr`` accumulates all segments into one
        graph across calls -- confirm against the graphrank implementation.

    Raises:
        ValueError: if no segment was processed (empty ``segment_list`` or
            ``max_segments`` of 0). Previously this surfaced as an
            ``UnboundLocalError`` on the ``return`` statement.
    """
    from itertools import islice

    graph = None
    processed = 0
    for text in islice(segment_list, max_segments):
        _, pos_tuple, _ = tp.preprocess_text(text, filter_by_pos=True, stop_words=False)
        graph = gr.build_word_graph(input_pos_text=pos_tuple, window=4)
        # The keyphrases themselves are not used here.
        # NOTE(review): call kept in case get_keyphrases has side effects on
        # the graph -- confirm and drop it if it is pure.
        gr.get_keyphrases(graph_obj=graph, input_pos_text=pos_tuple)
        processed += 1

    if processed == 0:
        raise ValueError("build_word_graph needs at least one segment to build a graph")

    return graph

In [None]:
def sort_dict_by_value(dict_var, order='desc', key=None):
    """
    Sort the items of a dictionary by value, breaking ties on the dictionary key.

    Args:
        dict_var: dictionary whose items are sorted.
        order: ``'desc'`` sorts from largest to smallest; any other value
            sorts from smallest to largest.
        key: optional index/key applied to each value (``value[key]``) to get
            the quantity to sort on. When ``None`` the value itself is used.

    Returns:
        list of ``(dict_key, value)`` tuples in sorted order.
    """
    descending = (order == 'desc')

    if key is None:
        def rank_of(item):
            return (item[1], item[0])
    else:
        def rank_of(item):
            return (item[1][key], item[0])

    return sorted(dict_var.items(), key=rank_of, reverse=descending)

## Define Graph - Embeddings functions

In [None]:
def get_edge_weight(session, input_placeholder, word1 , word2, embedding_encoder):
    """Return the embedding-based weight for the edge between two words.

    Args:
        session: session object forwarded to ``get_embedding``.
        input_placeholder: placeholder forwarded to ``get_embedding``.
        word1: first word of the edge.
        word2: second word of the edge.
        embedding_encoder: encoder output forwarded to ``get_embedding``.

    Returns:
        ``1 - d`` where ``d`` is the value returned by ``get_embedding`` for
        ``[word1, word2]`` (presumably a cosine distance -- confirm against
        ``run_sentence_encoder``), or ``0`` when the lookup raises ``KeyError``.
    """
    # The lookup is the only step that can raise KeyError, so it has to sit
    # inside the try block for the "word not found" fallback to be reachable.
    try:
        cosine_dist = get_embedding(session, input_placeholder, [word1, word2], embedding_encoder)
    except KeyError:
        print("word not found: {}--{}".format(word1, word2))
        return 0
    return 1 - cosine_dist

In [None]:
def compute_edge_embeddings(word_graph):
    """Attach an embedding-based weight to every edge of ``word_graph``.

    For each edge that has no ``"edge_emb_wt"`` attribute yet, the weight is
    computed with ``get_edge_weight`` on the lower-cased node names and stored
    on the edge in place. Edges that already carry the attribute are skipped.

    Whenever the edge index is a non-zero multiple of 20 (and the edge was not
    skipped), the graph is pickled to
    ``checkpoints/word_graph_embedding_chkp<index>.pickle`` and the previous
    checkpoint file is removed.

    Args:
        word_graph: networkx graph whose edges are weighted in place.

    Returns:
        None.
    """
    # Tallied for the (currently commented-out) vocabulary summary; a weight
    # of exactly 0 is counted as out-of-vocabulary.
    in_vocab_count = 0
    out_vocab_count = 0

    checkpoint_dir = "checkpoints/"
    file_store_name = checkpoint_dir + "word_graph_embedding_chkp" + "20" + ".pickle"
    checkpoint_written = False

    input_placeholder = tf.placeholder(tf.string, shape=(None))
    word_encodings = embed(input_placeholder)

    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])

        with tqdm(total=word_graph.number_of_edges()) as pbar:
            for i, (node1, node2, attr) in enumerate(word_graph.edges.data()):
                if attr.get("edge_emb_wt") is not None:
                    # Already weighted: still advance the bar so it can reach its total.
                    pbar.update(1)
                    continue

                start = timer()
                emb_edge_weight = get_edge_weight(session, input_placeholder, node1.lower(), node2.lower(), word_encodings)
                if emb_edge_weight == 0:
                    out_vocab_count += 1
                else:
                    in_vocab_count += 1
                # add_edge on an existing edge updates its attributes.
                word_graph.add_edge(node1, node2, edge_emb_wt=emb_edge_weight)
                end = timer()

                pbar.set_description('processed: {}; time taken: {}'.format((1 + i), (end - start)))
                pbar.update(1)

                if i % 20 == 0 and i != 0:
                    # Writing the pickle fails if the directory does not exist yet.
                    os.makedirs(checkpoint_dir, exist_ok=True)
                    if os.path.exists(file_store_name):
                        os.remove(file_store_name)
                    file_store_name = checkpoint_dir + "word_graph_embedding_chkp" + str(i) + ".pickle"
                    nx.write_gpickle(word_graph, file_store_name)
                    checkpoint_written = True

            # Only report a checkpoint that was actually written in this run.
            if checkpoint_written:
                print("created checkpoint: {}...".format(file_store_name))

#         print("###########################")
#         print("Total in_vocab_words = {}".format(in_vocab_count))
#         print("Total out_of_vocab_words = {}".format(out_vocab_count))
#         print("Percentage of out_of_vocab = {}".format((out_vocab_count/in_vocab_count)*100))

## Functions to store computed values in graph

Build graph and compute edge embeddings

The parameters are as follows:
1. `meeting_data`: path to the pickled graph object that holds the meeting data.
2. `segment_list`: list of segment texts. A single word graph is built from these segments and then ranked; leave it empty to use the segments stored in the meeting graph.
3. `test_segment`: a single segment for running quick tests.

In [None]:
# `graph_id` and `meeting_data` are parameters here
# `meeting_data` is the path of the pickled meeting graph: `data_dir` + `graph_id`.
data_dir = "data/"
graph_id = "01DB8DEW0YFYK0ZBP2Q3XR2YT1_5f89df0e-3631-4c64-a7ff-3bf0264c830f"
meeting_data = data_dir + graph_id

# Leave empty to have the next cell load the segments from the meeting graph.
segment_list = []

In [None]:
# NOTE(review): read_gpickle unpickles the file, so only load meeting data
# that comes from a trusted source.
meeting_graph = nx.read_gpickle(meeting_data)

# Fall back to the segments stored in the meeting graph when none were supplied.
if len(segment_list) == 0:
    segment_list = get_segment_list(meeting_graph)
word_graph = build_word_graph(segment_list)

# Compute edge embeddings
compute_edge_embeddings(word_graph)

In [None]:
# PageRank driven by the embedding-based edge weights stored in "edge_emb_wt".
biased_pagerank = nx.pagerank(word_graph, weight="edge_emb_wt")
# PageRank with networkx's default edge-weight attribute.
unbiased_pagerank = nx.pagerank(word_graph)

# Both rankings as lists of (word, score) tuples, highest score first.
sorted_biased_rank = sort_dict_by_value(biased_pagerank)
sorted_unbiased_rank = sort_dict_by_value(unbiased_pagerank)

In [None]:
# Store the embedding-weighted PageRank score and its 0-based rank position
# on every node (add_node on an existing node updates its attributes).
for biased_rank, (word, biased_pagerank_score) in enumerate(sorted_biased_rank):
    word_graph.add_node(
        word,
        weighted_pagerank_val=biased_pagerank_score,
        biased_pagerank=biased_rank,
    )

In [None]:
# Store the default PageRank score and its 0-based rank position on every
# node (add_node on an existing node updates its attributes).
for unbiased_rank, (word, unbiased_pagerank_score) in enumerate(sorted_unbiased_rank):
    word_graph.add_node(
        word,
        original_pagerank_val=unbiased_pagerank_score,
        original_rank=unbiased_rank,
    )

In [None]:
def get_segment_wise_ranking(word_graph, segment_list):
    """Yield, per segment, how each keyphrase ranks with and without edge weights.

    For every segment text the keyphrases are extracted twice from
    ``word_graph`` via ``gr.get_keyphrases``: once without a ``weight``
    argument and once with ``weight="edge_emb_wt"``. The two result lists are
    matched on the phrase (first element of each entry).

    Args:
        word_graph: graph passed to ``gr.get_keyphrases``.
        segment_list: iterable of segment texts.

    Yields:
        list of ``(phrase, original_rank, weighted_rank, segment_text)``
        tuples for one segment, ordered by weighted rank. Ranks are the
        0-based positions of the phrase in the respective keyphrase list.
    """
    for segment_text in segment_list:
        _, pos_tuple, _ = tp.preprocess_text(segment_text, filter_by_pos=True, stop_words=False)
        plain_keyphrases = gr.get_keyphrases(graph_obj=word_graph, input_pos_text=pos_tuple, post_process=True)
        weighted_keyphrases = gr.get_keyphrases(word_graph, input_pos_text=pos_tuple, post_process=True, weight="edge_emb_wt")

        # Pair every weighted phrase with each identical phrase in the plain list.
        yield [
            (plain_entry[0], plain_rank, weighted_rank, segment_text)
            for weighted_rank, weighted_entry in enumerate(weighted_keyphrases)
            for plain_rank, plain_entry in enumerate(plain_keyphrases)
            if plain_entry[0] == weighted_entry[0]
        ]

### View the difference in keyphrase rank based on weighted PageRank

In [None]:
# print("Word \t\t ===> Index | Original rank | Weighted rank | Difference")
# for seg in list(get_segment_wise_ranking(word_graph, segment_list[:])):
#     print("========New Segment=========")
#     print(seg[0][-1])
#     for i, (word, orig_rank, wt_rank, text) in enumerate(seg):
#         diff = orig_rank - wt_rank
#         print("{}".format(word))
#         print("{} | {} | {} | {}".format(i, orig_rank, wt_rank, diff))
#         print()

## Test for segment relevance scoring

In [None]:
def get_segment_word_similarity(session, input_placeholder, sentence, word, embedding_encoder):
    """Score how closely a keyphrase matches the segment it came from.

    Args:
        session: session object forwarded to ``get_embedding``.
        input_placeholder: placeholder forwarded to ``get_embedding``.
        sentence: segment text.
        word: keyphrase to compare against the segment.
        embedding_encoder: encoder output forwarded to ``get_embedding``.

    Returns:
        ``1 - d`` where ``d`` is the value returned by ``get_embedding`` for
        ``[sentence, word]`` (presumably a cosine distance -- confirm against
        ``run_sentence_encoder``), or ``0`` when the lookup raises ``KeyError``.
    """
    # The lookup is the only step that can raise KeyError, so it has to sit
    # inside the try block for the "word not found" fallback to be reachable.
    try:
        cosine_dist = get_embedding(session, input_placeholder, [sentence, word], embedding_encoder)
    except KeyError:
        print("word not found: {}--{}".format(sentence, word))
        return 0
    return 1 - cosine_dist

In [None]:
# Flatten the per-segment rankings into one list of
# (phrase, original_rank, weighted_rank, segment_text) tuples.
segment_kw = []
for segment_ranking in get_segment_wise_ranking(word_graph, segment_list):
    segment_kw.extend(
        (word, orig_rank, new_rank, text)
        for word, orig_rank, new_rank, text in segment_ranking
    )

In [None]:
input_placeholder = tf.placeholder(tf.string, shape=(None))
word_encodings = embed(input_placeholder)

# phrase -> similarity between the phrase and its segment. A phrase that
# occurs in several segments keeps the score of the last one processed.
kw_segment_rank = {}

with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    for word, orig_rank, new_rank, segment in segment_kw:
        seg_rank = get_segment_word_similarity(session, input_placeholder, segment, word, word_encodings)
        kw_segment_rank[word] = seg_rank
    

### Form a dictionary with all the scores and keyphrases for a segment

#### Add unweighted and weighted pagerank scores/rank for each phrase in the example segment

`segment_rank_output["result"]` maps each keyphrase to its list of ranks.

In [None]:
# "result" maps phrase -> [original_rank, weighted_rank]; "text" ends up
# holding the segment text of the last entry in segment_kw.
segment_rank_output = {"result": {}}
for word, orig_rank, new_rank, text in segment_kw:
    segment_rank_output["text"] = text
    segment_rank_output["result"][word] = [orig_rank, new_rank]

#### Add local relevance rank to results dictionary

In [None]:
# Phrases ordered by segment similarity, most similar first.
sorted_kw_segment_rank = sort_dict_by_value(kw_segment_rank)

# Append the 0-based local relevance rank as the third entry of each rank list.
for local_rank, seg_rank in enumerate(sorted_kw_segment_rank):
    word = str(seg_rank[0])
    previous_ranks = segment_rank_output["result"][word]
    segment_rank_output["result"][word] = [previous_ranks[0], previous_ranks[1], local_rank]

#### The values in each rank list are ordered as follows:

`unweighted_pagerank, weighted_pagerank, local_relevance_rank, pagerank_boosted_rank`

In [None]:
# Display the phrase -> rank list mapping built so far.
segment_rank_output["result"]

## Test ranking when boosted with pagerank

In [None]:
# Each entry has the form `(word, cosine similarity with segment)`
sorted_kw_segment_rank[:20]

In [None]:
# Boost each phrase's segment-similarity score with the summed weighted
# PageRank of its words.
boosted_rank = {}
for i in sorted_kw_segment_rank:
    pagerank_score = 0
    phrase = i[0]
    seg_score = i[1]
    for sing_word in phrase.split():
        try:
            # `.nodes` replaces the `.node` alias, which networkx 2.4 removed.
            pagerank_score += word_graph.nodes[sing_word]["weighted_pagerank_val"]
        except KeyError:
            # Crude singularisation: retry without the last character.
            singular_word = sing_word[:-1]
            try:
                pagerank_score += word_graph.nodes[singular_word]["weighted_pagerank_val"]
            except KeyError:
                # Unknown word: add a small floor instead of overwriting the
                # score accumulated from the phrase's earlier words.
                pagerank_score += 0.0001
    # print(pagerank_score)
    boosted_rank[phrase] = (pagerank_score + seg_score)
    

#### View pagerank-boosted ranks

In [None]:
# Phrases ordered by boosted score, highest first; show the top 20.
sorted_boosted_rank = sort_dict_by_value(boosted_rank)
sorted_boosted_rank[:20]

**Update result dictionary with new ranks**

#### The values in each rank list are ordered as follows:

`unweighted_pagerank, weighted_pagerank, local_relevance_rank, pagerank_boosted_rank`

In [None]:
# Append the 0-based PageRank-boosted rank as the fourth entry of each rank list.
for boosted_position, seg_rank in enumerate(sorted_boosted_rank):
    word = str(seg_rank[0])
    previous_ranks = segment_rank_output["result"][word]
    segment_rank_output["result"][word] = [
        previous_ranks[0],
        previous_ranks[1],
        previous_ranks[2],
        boosted_position,
    ]

In [None]:
print(segment_rank_output["text"])
# Sort phrases ascending by the third entry of their rank list (index 2,
# the local relevance rank).
sort_dict_by_value(segment_rank_output["result"], order="asc", key=2)