In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib widget

In [3]:
def sort_by_value(item_list, order='desc'):
    """
    Sort ``(key, value)`` pairs by value, breaking ties on the key.

    Args:
        item_list: Iterable of indexable pairs; element ``[1]`` is the primary
            sort key and element ``[0]`` is the tie-breaker.
        order: ``'desc'`` sorts from highest to lowest; any other value sorts
            from lowest to highest.

    Returns:
        A new list holding the items in the requested order.
    """
    descending = order == 'desc'

    def value_then_key(pair):
        return pair[1], pair[0]

    ranked = list(item_list)
    ranked.sort(key=value_then_key, reverse=descending)
    return ranked

In [5]:
from graphrank import GraphRank, GraphUtils, TextPreprocess

In [6]:
import networkx as nx

In [7]:
# Shared graphrank helper instances; `gr` and `tp` are reused by the cells below.
gr = GraphRank()
tp = TextPreprocess()
utils = GraphUtils()  # NOTE(review): not referenced in the visible cells

In [11]:
# Relative path of the pickled meeting graph that the next cell loads.
meeting_data = "data/01DB8DEW0YFYK0ZBP2Q3XR2YT1_5f89df0e-3631-4c64-a7ff-3bf0264c830f"

In [12]:
# nx.read_gpickle was removed in networkx 3.0. Loading the pickle directly
# works on old and new networkx releases alike, and the context manager
# closes the file as soon as the graph is read.
# NOTE(review): unpickling executes arbitrary code - only load trusted graph files.
import pickle

with open(meeting_data, "rb") as graph_file:
    meeting_graph = pickle.load(graph_file)

In [13]:
# Total node count of the loaded meeting graph.
meeting_graph.number_of_nodes()

108

## Query for all the transcripts

In [14]:
# Collect the "text" attribute of every node whose "label" is "segmentId".
segment_list = [
    node_attrs.get("text")
    for _, node_attrs in meeting_graph.nodes(data=True)
    if node_attrs.get("label") == "segmentId"
]

In [15]:
# Display every collected segment text (the output can be very long).
segment_list

["I was born in involved like many other engineers. I grew reading section how traffic was actually very involved with this house and crackers and little that I know that this works a major invoice on my life did you time section to do you remember the moment when the while basically saving the world asks careful for something computer make itself and the machine just does it and is sales how why the real want know like this example for the okay Google and ask the application service mostly requirement and get the job done just like that today I want to talk about technology based on other which one is a small step in that direction when happens we have an opportunity to like your users and bring more engagement to Europe. However, we can't do it need help as Google. We have been working on important organizing the also information to make a university. The access to help with that we build the knowledge that knowledge information about ent*ties and their relationships one of the inter

In [17]:
# Feed every segment into the word graph and keep the concatenated POS output.
# `word_graph` is rebound on each pass, so only the object returned by the last
# build_word_graph call is kept.
# NOTE(review): presumably GraphRank accumulates all segments into one graph - confirm.
complete_pos_tuple = []
for segment_text in segment_list:
    _tokens, segment_pos, _filtered_pos = tp.preprocess_text(
        segment_text, filter_by_pos=True, stop_words=False
    )
    word_graph = gr.build_word_graph(input_pos_text=segment_pos, window=4)
    complete_pos_tuple += segment_pos

In [18]:
# Node count of the word graph returned by the last build_word_graph call.
word_graph.number_of_nodes()

401

In [19]:
# Edge count of the same word graph.
word_graph.number_of_edges()

1947

In [20]:
# Baseline keyphrases, computed before any embedding weights are attached to
# the graph. The rank-comparison cell further down reads this variable as the
# "original" ranking.
keyphrases = gr.get_keyphrases(graph_obj=word_graph, input_pos_text=complete_pos_tuple)

In [21]:
# Display the baseline (phrase, score) pairs.
keyphrases

[('music store data', 0.03315189249335202),
 ('music store website', 0.03122256358861395),
 ('music store site', 0.029762772806879077),
 ('Thailand website mobile app', 0.026227663195228006),
 ('music artists', 0.026191046795835594),
 ('music app', 0.02501793882048923),
 ('music action online', 0.02474068360177135),
 ('music website', 0.02375342399231673),
 ('music Sean likes', 0.021678508787485472),
 ('sample open source implementations second ponens', 0.021370466835350377),
 ('music selection', 0.02120387549981612),
 ('help satisfy user requests North', 0.020712010622034975),
 ('music card', 0.019383139560869636),
 ('enjoy music', 0.019356613781768),
 ('music recommendations', 0.019259092588691774),
 ('mobile web', 0.01915477235079502),
 ('find machine graph', 0.01873916092497588),
 ('case standard', 0.018488735330739362),
 ('term link data refers', 0.018150883187183175),
 ('artist web', 0.01795778232021638),
 ('open source database', 0.017587478865610816),
 ('free time', 0.016788832

In [25]:
# Preview the keyphrases of the first 11 segments, each followed by a blank line.
for sub_text in segment_list[:11]:
    _tokens, segment_pos, _filtered_pos = tp.preprocess_text(
        sub_text, filter_by_pos=True, stop_words=False
    )
    segment_keyphrases = gr.get_keyphrases(graph_obj=word_graph, input_pos_text=segment_pos)
    print(sub_text, segment_keyphrases, sep="\n\n", end="\n\n")

I was born in involved like many other engineers. I grew reading section how traffic was actually very involved with this house and crackers and little that I know that this works a major invoice on my life did you time section to do you remember the moment when the while basically saving the world asks careful for something computer make itself and the machine just does it and is sales how why the real want know like this example for the okay Google and ask the application service mostly requirement and get the job done just like that today I want to talk about technology based on other which one is a small step in that direction when happens we have an opportunity to like your users and bring more engagement to Europe. However, we can't do it need help as Google. We have been working on important organizing the also information to make a university. The access to help with that we build the knowledge that knowledge information about ent*ties and their relationships one of the interes

## Word Embedding

In [26]:
import pickle
import numpy as np
from scipy.spatial.distance import cosine
from spacy.lang.en.stop_words import STOP_WORDS
stop_words = list(STOP_WORDS)  # NOTE(review): not referenced in the visible cells

#---------------------------------#
# Load the reference vectors (`mind_vec`) and the word -> embedding lookup
# (`embedding_dict`). The context managers close each file right after it is
# read instead of leaking the handle.
# NOTE(review): pickle.load executes arbitrary code - only load trusted files.
with open('data/mind_vector_300dim.pkl', 'rb') as mind_file:
    mind_vec = pickle.load(mind_file)
with open('data/feature_dict.pkl', 'rb') as feature_file:
    embedding_dict = pickle.load(feature_file)

#---------------------------------#

def get_node_weight (word):
    """
    Score a word by its closest match among the rows of ``mind_vec``.

    Args:
        word: Key to look up in the module-level ``embedding_dict``.

    Returns:
        ``1 - d``, where ``d`` is the smallest cosine distance between the
        word's embedding and any row of ``mind_vec``. Returns 0 (after printing
        a notice) when the word is missing from ``embedding_dict``, and 0 when
        no row yields a comparable distance (for example ``mind_vec`` has no
        rows).
    """
    no_match = float("inf")
    min_dist = no_match
    try:
        # Look the embedding up once instead of once per reference row.
        word_vec = embedding_dict[word]
        for i in range(mind_vec.shape[0]):
            curr_dist = cosine(word_vec, mind_vec[i])
            # NaN distances never compare as smaller, so they are skipped.
            if curr_dist < min_dist:
                min_dist = curr_dist
    except KeyError:
        print("word not found: {}".format(word))
        return 0

    # Previously a 999 sentinel leaked into the result here and produced a
    # score of -998 when there was nothing to compare against.
    if min_dist == no_match:
        return 0
    return 1 - min_dist
    
def get_edge_weight(word1, word2):
    """
    Cosine similarity between the embeddings of two words.

    Args:
        word1: First key to look up in the module-level ``embedding_dict``.
        word2: Second key to look up in ``embedding_dict``.

    Returns:
        ``1 - cosine_distance`` of the two embeddings, or 0 (after printing a
        notice) when either word is missing from ``embedding_dict``.
    """
    similarity = 0
    try:
        first_vec = embedding_dict[word1]
        second_vec = embedding_dict[word2]
        similarity = 1 - cosine(first_vec, second_vec)
    except KeyError:
        print("word not found: {}--{}".format(word1, word2))
    return similarity

In [27]:
# Display the node view; node names keep their original casing (e.g. 'Google').
word_graph.nodes

NodeView(('engineer', 'traffic', 'involved', 'house', 'cracker', 'major', 'invoice', 'life', 'time', 'remember', 'moment', 'careful', 'computer', 'machine', 'sale', 'real', 'example', 'Google', 'application', 'service', 'requirement', 'job', 'today', 'talk', 'technology', 'small', 'step', 'direction', 'opportunity', 'user', 'bring', 'engagement', 'Europe', 'help', 'university', 'access', 'build', 'knowledge', 'entties', 'relationship', 'interesting', 'language', 'concept', 'stream', 'core', 'difference', 'clear', 'accord', 'find', 'graph', 'refer', 'type', 'functionality', 'marker', 'strike', 'grasp', 'team', 'satisfy', 'introduce', 'body', 'shop', 'music', 'southeast', 'piece', 'day', 'open', 'reference', 'score', 'QA', 'awesome', 'selection', 'confident', 'correct', 'action', 'online', 'web', 'recommendation', 'discover', 'understand', 'apps', 'ready', 'explore', 'conference', 'event', 'expect', 'opinion', 'catch', 'data', 'consultant', 'voice', 'NI', 'publish', 'review', 'standard',

### Add node weights

In [28]:
in_vocab_count = 0
out_vocab_count = 0

for node in word_graph.nodes:
    # Embedding keys are looked up in lower case; node names keep their casing.
    emb_node_weight = get_node_weight(node.lower())
    # get_node_weight returns 0 for words missing from the embedding lookup.
    if emb_node_weight == 0:
        out_vocab_count += 1
    else:
        in_vocab_count += 1
    # add_node on an existing node only updates its attributes.
    word_graph.add_node(node, node_emb_wt=emb_node_weight)

# Share of out-of-vocabulary words among ALL words. The previous version
# divided by the in-vocabulary count, which overstated the percentage and
# raised ZeroDivisionError when no word was in vocabulary.
total_count = in_vocab_count + out_vocab_count
out_vocab_pct = (out_vocab_count / total_count) * 100 if total_count else 0.0

print("###########################")
print("Total in_vocab_words = {}".format(in_vocab_count))
print("Total out_of_vocab_words = {}".format(out_vocab_count))
print("Percentage of out_of_vocab = {}".format(out_vocab_pct))

word not found: cracker
word not found: invoice
word not found: careful
word not found: engagement
word not found: europe
word not found: university
word not found: entties
word not found: accord
word not found: strike
word not found: grasp
word not found: satisfy
word not found: shop
word not found: southeast
word not found: selection
word not found: confident
word not found: recommendation
word not found: discover
word not found: consultant
word not found: ni
word not found: translate
word not found: refers
word not found: leo
word not found: schema
word not found: hardest
word not found: eco
word not found: docment
word not found: harden
word not found: entty
word not found: poland
word not found: inspection
word not found: favorite
word not found: hall
word not found: crackle
word not found: science
word not found: fiction
word not found: protagonist
word not found: suitable
word not found: fulfill
word not found: schema
word not found: org
word not found: entity
word not found: am

### Add edge weights

In [29]:
# NOTE: the counters in this cell tally edges, although the printed labels
# say "words".
in_vocab_count = 0
out_vocab_count = 0

for node1, node2 in word_graph.edges:
    # Embedding keys are looked up in lower case; node names keep their casing.
    emb_edge_weight = get_edge_weight(node1.lower(), node2.lower())
    # get_edge_weight returns 0 when either endpoint has no embedding.
    if emb_edge_weight == 0:
        out_vocab_count += 1
    else:
        in_vocab_count += 1
    # add_edge on an existing edge only updates its attributes.
    word_graph.add_edge(node1, node2, edge_emb_wt=emb_edge_weight)

# Share of out-of-vocabulary edges among ALL edges. The previous version
# divided by the in-vocabulary count, which overstated the percentage and
# raised ZeroDivisionError when no edge was in vocabulary.
total_count = in_vocab_count + out_vocab_count
out_vocab_pct = (out_vocab_count / total_count) * 100 if total_count else 0.0

print("###########################")
print("Total in_vocab_words = {}".format(in_vocab_count))
print("Total out_of_vocab_words = {}".format(out_vocab_count))
print("Percentage of out_of_vocab = {}".format(out_vocab_pct))

word not found: traffic--cracker
word not found: involved--cracker
word not found: house--cracker
word not found: house--invoice
word not found: house--hall
word not found: house--crackle
word not found: cracker--major
word not found: cracker--invoice
word not found: cracker--life
word not found: major--invoice
word not found: major--crackle
word not found: major--science
word not found: invoice--life
word not found: invoice--time
word not found: invoice--remember
word not found: life--consultant
word not found: life--science
word not found: life--fiction
word not found: time--careful
word not found: time--southeast
word not found: time--denzel
word not found: time--peace
word not found: time--foundation
word not found: time--grateful
word not found: time--surprise
word not found: time--kit
word not found: time--mouse
word not found: time--brand
word not found: time--closer
word not found: time--sixty
word not found: time--displayed
word not found: time--raspberry
word not found: time-

In [62]:
# Print every edge with its embedding weight (None when the attribute is absent).
for source, target, emb_weight in word_graph.edges.data("edge_emb_wt"):
    print(source, target, emb_weight)

engineer traffic 0.29134002327919006
engineer involved 0.262100487947464
engineer house 0.24288004636764526
traffic involved 0.24905157089233398
traffic house 0.248066708445549
traffic cracker 0
involved house 0.33418357372283936
involved cracker 0
involved major 0.413006991147995
house cracker 0
house major 0.2718994915485382
house invoice 0
house talk 0.3088148832321167
house Hall 0
house close 0.32392576336860657
house crackle 0
house Works 0.3677544593811035
house card 0.1984807699918747
house point 0.29618045687675476
house leave 0.3393210768699646
house fine 0.23322954773902893
house time 0.3468265235424042
house image 0.2451685070991516
cracker major 0
cracker invoice 0
cracker life 0
major invoice 0
major life 0.3572828769683838
major time 0.3509465157985687
major crackle 0
major Works 0.3815233111381531
major influence 0.23040378093719482
major science 0
invoice life 0
invoice time 0
invoice remember 0
life time 0.43007901310920715
life remember 0.386841744184494
life moment 0

## Test PageRank on weighted edges

In [30]:
# PageRank using the embedding similarity stored in "edge_emb_wt" as the edge weight.
biased_pagerank = nx.pagerank(word_graph, weight="edge_emb_wt")

In [31]:
# Baseline PageRank with networkx's default weight argument (no embedding weights passed).
unbiased_pagerank = nx.pagerank(word_graph)

In [32]:
# Display the word -> weighted PageRank score mapping.
biased_pagerank

{'engineer': 0.0020485784853435515,
 'traffic': 0.002030714396121335,
 'involved': 0.0028264257951841855,
 'house': 0.006653506936672581,
 'cracker': 0.0005433803316099488,
 'major': 0.003767682128398588,
 'invoice': 0.0005433803316099488,
 'life': 0.005585450598656994,
 'time': 0.020971466797065824,
 'remember': 0.003306180796999024,
 'moment': 0.003707498139250854,
 'careful': 0.0005433803316099488,
 'computer': 0.010722526693456975,
 'machine': 0.006769477803900984,
 'sale': 0.002392651540894244,
 'real': 0.00411846752122115,
 'example': 0.014278105344437556,
 'Google': 0.009657029029132578,
 'application': 0.008588569006149873,
 'service': 0.0049547492438217515,
 'requirement': 0.002398127749716289,
 'job': 0.004461048531123301,
 'today': 0.008840283225321746,
 'talk': 0.005785034530325235,
 'technology': 0.002971706653854848,
 'small': 0.005240578698177135,
 'step': 0.0036047471942565372,
 'direction': 0.0032940181284880138,
 'opportunity': 0.0027197548877278916,
 'user': 0.010497

In [74]:
# Spot-check one word's weighted score; raises KeyError if 'jsonld' is not a key.
biased_pagerank['jsonld']

0.0005414190083348599

In [33]:
# One line per word: the unweighted score followed by the weighted score.
print("Word \t Unbiased_score \t Biased_score", end="\n\n")
for word, biased_score in biased_pagerank.items():
    print(word, unbiased_pagerank[word], biased_score)

Word 	 Unbiased_score 	 Biased_score

engineer 0.0012754652017666055 0.0020485784853435515
traffic 0.0015861597768892194 0.002030714396121335
involved 0.001863143338655727 0.0028264257951841855
house 0.004840923964915059 0.006653506936672581
cracker 0.0020197115394923583 0.0005433803316099488
major 0.0030168425568201053 0.003767682128398588
invoice 0.0018423589712746784 0.0005433803316099488
life 0.003993131731721086 0.005585450598656994
time 0.01437530612256021 0.020971466797065824
remember 0.002834258074754516 0.003306180796999024
moment 0.0027719920952767475 0.003707498139250854
careful 0.0016739876411861756 0.0005433803316099488
computer 0.008275732545915892 0.010722526693456975
machine 0.007061677607422549 0.006769477803900984
sale 0.002678063391049093 0.002392651540894244
real 0.0021676788031543947 0.00411846752122115
example 0.008955593260722209 0.014278105344437556
Google 0.007779841875509305 0.009657029029132578
application 0.0065013353476223905 0.008588569006149873
service 0.

In [34]:
# Order the words by PageRank score, highest first (sort_by_value defaults to
# 'desc' and breaks ties on the word itself).
sorted_biased_rank = sort_by_value(biased_pagerank.items())
sorted_unbiased_rank = sort_by_value(unbiased_pagerank.items())

In [35]:
# Store each word's weighted PageRank score ("wt_val") and its 0-based position
# in the descending ordering ("wt_rank") on the node itself.
for biased_rank, (word, biased_pagerank_score) in enumerate(sorted_biased_rank):
    word_graph.add_node(word, wt_val=biased_pagerank_score, wt_rank=biased_rank)

In [37]:
# Store each word's unweighted PageRank score ("val") and its 0-based position
# in the descending ordering ("rank") on the node itself.
for unbiased_rank, (word, unbiased_pagerank_score) in enumerate(sorted_unbiased_rank):
    word_graph.add_node(word, val=unbiased_pagerank_score, rank=unbiased_rank)

In [38]:
# Snapshot of (node, attribute-dict) pairs, used by the sorted view in the next cell.
a = list(word_graph.nodes.data())

In [39]:
# Nodes ordered by weighted rank, best (0) first.
sorted(a, key=lambda node_entry: node_entry[1]['wt_rank'])

[('time',
  {'node_emb_wt': 0.5347729921340942,
   'wt_val': 0.020971466797065824,
   'wt_rank': 0,
   'val': 0.01437530612256021,
   'rank': 1}),
 ('music',
  {'node_emb_wt': 0.4553903341293335,
   'wt_val': 0.019525167304016416,
   'wt_rank': 1,
   'val': 0.017437112407059713,
   'rank': 0}),
 ('mobile',
  {'node_emb_wt': 0.4746757745742798,
   'wt_val': 0.014317851190918513,
   'wt_rank': 2,
   'val': 0.010596381191398956,
   'rank': 2}),
 ('example',
  {'node_emb_wt': 0.4869203269481659,
   'wt_val': 0.014278105344437556,
   'wt_rank': 3,
   'val': 0.008955593260722209,
   'rank': 7}),
 ('web',
  {'node_emb_wt': 0.5377049446105957,
   'wt_val': 0.013290032309644508,
   'wt_rank': 4,
   'val': 0.009103837876169247,
   'rank': 5}),
 ('computer',
  {'node_emb_wt': 0.5091765522956848,
   'wt_val': 0.010722526693456975,
   'wt_rank': 5,
   'val': 0.008275732545915892,
   'rank': 10}),
 ('knowledge',
  {'node_emb_wt': 0.4517827033996582,
   'wt_val': 0.010661666500753668,
   'wt_rank': 6

## Compare keyphrase rank

In [42]:
# Keyphrases over the full text, without and with the embedding edge weight.
# NOTE(review): `keyphrase` is not read by the visible cells below; the rank
# comparison uses `keyphrases` from the earlier cell instead - confirm this is intended.
keyphrase = gr.get_keyphrases(word_graph, input_pos_text=complete_pos_tuple)
wt_keyphrase = gr.get_keyphrases(word_graph, input_pos_text=complete_pos_tuple, weight="edge_emb_wt")

In [43]:
# Display the (phrase, score) pairs obtained with weight="edge_emb_wt".
wt_keyphrase

[('music store data', 0.03612527527243285),
 ('music store website', 0.03550145011666796),
 ('music store site', 0.03419704761778745),
 ('Thailand website mobile app', 0.03284516652224488),
 ('music action online', 0.031582685446814814),
 ('music app', 0.028917147199882583),
 ('music website', 0.02770387601620755),
 ('mobile web', 0.026554877173696652),
 ('sample open source implementations second ponens', 0.026028346368931806),
 ('web streaming today', 0.024640277308815513),
 ('free time', 0.023655296713774886),
 ('mobile streaming accesses', 0.02269004797287696),
 ('case standard', 0.02257568758406353),
 ('music card', 0.021947717014288715),
 ('mobile applications', 0.02172382291396698),
 ('time mouse', 0.021668991823337057),
 ('help satisfy user requests North', 0.021566123775448753),
 ('open source database', 0.021557200165262264),
 ('music Sean likes', 0.02155208383396683),
 ('find machine graph', 0.020445990403337902),
 ('knowledge graph consultant', 0.020193881234620528),
 ('mid

In [44]:
# For every weighted keyphrase, find the same phrase in the baseline list and
# record (phrase, baseline position, weighted position). Both positions are
# 0-based; phrases absent from the baseline list are left out.
keyphrase_rank_list = [
    (orig_entry[0], orig_rank, wt_rank)
    for wt_rank, wt_entry in enumerate(wt_keyphrase)
    for orig_rank, orig_entry in enumerate(keyphrases)
    if orig_entry[0] == wt_entry[0]
]

In [45]:
# Two lines per phrase: the phrase, then baseline rank | weighted rank | difference.
# A positive difference means the phrase moved up under the weighted ranking.
print("Word \t\t ===> | Original rank | Weighted rank | Difference")
for word, orig_rank, wt_rank in keyphrase_rank_list:
    print("{}".format(word))
    print("{} | {} | {}".format(orig_rank, wt_rank, orig_rank - wt_rank))

Word 		 ===> | Original rank | Weighted rank | Difference
music store data
0 | 0 | 0
music store website
1 | 1 | 0
music store site
2 | 2 | 0
Thailand website mobile app
3 | 3 | 0
music action online
6 | 4 | 2
music app
5 | 5 | 0
music website
7 | 6 | 1
mobile web
15 | 7 | 8
sample open source implementations second ponens
9 | 8 | 1
web streaming today
28 | 9 | 19
free time
21 | 10 | 11
mobile streaming accesses
27 | 11 | 16
case standard
17 | 12 | 5
music card
12 | 13 | -1
mobile applications
23 | 14 | 9
time mouse
24 | 15 | 9
help satisfy user requests North
11 | 16 | -5
open source database
20 | 17 | 3
music Sean likes
8 | 18 | -10
find machine graph
16 | 19 | -3
knowledge graph consultant
22 | 20 | 2
middle capacity additional case
31 | 21 | 10
music selection
10 | 22 | -12
music recommendations
14 | 23 | -9
music artists
4 | 24 | -20
enjoy music
13 | 25 | -12
term link data refers
18 | 26 | -8
Google voice command
32 | 27 | 5
graph database cocaine
26 | 28 | -2
offer web
45 | 29 |

## Compare keyphrase ranks segment-wise

In [88]:
def get_segment_wise_ranking(word_graph, segment_list):
    """
    Lazily compare keyphrase ranks per segment, without and with edge weights.

    For each segment text, keyphrases are extracted twice from ``word_graph``
    through the module-level ``gr``: once with default settings and once with
    ``weight="edge_emb_wt"`` (both with ``post_process=True``).

    Args:
        word_graph: Graph object handed to ``gr.get_keyphrases``.
        segment_list: Iterable of segment texts.

    Yields:
        One list per segment of ``(phrase, original_rank, weighted_rank)``
        tuples, ordered by weighted rank. Ranks are 0-based positions in the
        respective keyphrase lists; phrases that appear only in the weighted
        list are omitted.
    """
    for sub_text in segment_list:
        _tokens, segment_pos, _filtered_pos = tp.preprocess_text(
            sub_text, filter_by_pos=True, stop_words=False
        )
        plain_phrases = gr.get_keyphrases(
            graph_obj=word_graph, input_pos_text=segment_pos, post_process=True
        )
        weighted_phrases = gr.get_keyphrases(
            word_graph, input_pos_text=segment_pos, post_process=True, weight="edge_emb_wt"
        )

        yield [
            (plain_entry[0], plain_rank, weighted_rank)
            for weighted_rank, weighted_entry in enumerate(weighted_phrases)
            for plain_rank, plain_entry in enumerate(plain_phrases)
            if plain_entry[0] == weighted_entry[0]
        ]

In [89]:
# Creates a generator: nothing is computed until it is iterated, and it can be
# consumed only once (re-run this cell before iterating again).
kr = get_segment_wise_ranking(word_graph, segment_list)

In [90]:
# Per segment: each phrase, then index | baseline rank | weighted rank | difference.
print("Word \t\t ===> Index | Original rank | Weighted rank | Difference")
all_segment_ranks = list(kr)  # exhaust the generator before printing any segment
for segment_ranks in all_segment_ranks:
    print("========New Segment=========")
    for index, (word, orig_rank, wt_rank) in enumerate(segment_ranks):
        print("{}".format(word))
        print(
            "{} | {} | {} | {}".format(index, orig_rank, wt_rank, orig_rank - wt_rank),
            end="\n\n",
        )

Word 		 ===> Index | Original rank | Weighted rank | Difference
time
0 | 1 | 0 | 1

find machine graph
1 | 0 | 1 | -1

example
2 | 4 | 2 | 2

application service
3 | 3 | 3 | 0

computer
4 | 5 | 4 | 1

interesting applications
5 | 6 | 5 | 1

users
6 | 2 | 6 | -4

knowledge
7 | 9 | 7 | 2

Google
8 | 8 | 8 | 0

help satisfy
9 | 7 | 9 | -2

today
10 | 16 | 10 | 6

small step
11 | 11 | 11 | 0

build
12 | 12 | 12 | 0

refer types
13 | 17 | 13 | 4

house
14 | 14 | 14 | 0

access
15 | 20 | 15 | 5

markers
16 | 13 | 16 | -3

talk
17 | 18 | 17 | 1

life
18 | 19 | 18 | 1

core
19 | 21 | 19 | 2

grasp sales team
20 | 10 | 20 | -10

major invoice
21 | 15 | 21 | -6

job
22 | 24 | 22 | 2

concept
23 | 27 | 23 | 4

clear
24 | 26 | 24 | 2

real
25 | 30 | 25 | 5

moment
26 | 23 | 26 | -3

language
27 | 28 | 27 | 1

difference
28 | 25 | 28 | -3

remember
29 | 22 | 29 | -7

direction
30 | 33 | 30 | 3

involved
31 | 31 | 31 | 0

technology
32 | 42 | 32 | 10

bring
33 | 32 | 33 | -1

opportunity
34 | 35 | 3

In [112]:
# `Graph.node` was removed in networkx 2.4; `Graph.nodes[...]` is the supported
# accessor and matches the `.nodes` usage elsewhere in this notebook.
word_graph.nodes["introduce"]

{'node_emb_wt': 0.5004165172576904,
 'wt_val': 0.0014423985014499767,
 'wt_rank': 238,
 'val': 0.0012688338995935722,
 'rank': 374}

In [126]:
# Print every (source, target, attributes) edge that touches the 'Google' node.
for edge in word_graph.edges(data=True):
    source, target, _attrs = edge
    if "Google" in (source, target):
        print(edge)

('sale', 'Google', {'weight': 1.0, 'edge_emb_wt': 0.14090129733085632})
('real', 'Google', {'weight': 1.0, 'edge_emb_wt': 0.35285669565200806})
('example', 'Google', {'weight': 1.0, 'edge_emb_wt': 0.33021947741508484})
('Google', 'application', {'weight': 1.0, 'edge_emb_wt': 0.2061760425567627})
('Google', 'service', {'weight': 1.0, 'edge_emb_wt': 0.20930157601833344})
('Google', 'requirement', {'weight': 1.0, 'edge_emb_wt': 0.09482615441083908})
('Google', 'engagement', {'weight': 1.0, 'edge_emb_wt': 0})
('Google', 'Europe', {'weight': 1.0, 'edge_emb_wt': 0})
('Google', 'help', {'weight': 1.0, 'edge_emb_wt': 0.23065628111362457})
('Google', 'university', {'weight': 1.0, 'edge_emb_wt': 0})
('Google', 'access', {'weight': 1.0, 'edge_emb_wt': 0.23188386857509613})
('Google', 'knowledge', {'weight': 1.0, 'edge_emb_wt': 0.25149667263031006})
('Google', 'language', {'weight': 1.0, 'edge_emb_wt': 0.26773056387901306})
('Google', 'concept', {'weight': 1.0, 'edge_emb_wt': 0.22262348234653473})