# TextRank (PageRank)-based Keyphrase extraction

In [38]:
from text_preprocessing import preprocess as tp
import jgtextrank as tr
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
# Using `widget` for graph interaction
# Use `inline` or `notebook` if this throws error
%matplotlib widget

In [3]:
eg_text = "Compatibility of systems of linear constraints over the set of natural numbers. " \
                       "Criteria of compatibility of a system of linear Diophantine equations, strict inequations, " \
                       "and nonstrict inequations are considered. Upper bounds for components of a minimal set of " \
                       "solutions and algorithms of construction of minimal generating sets of solutions for all " \
                       "types of systems are given. These criteria and the corresponding algorithms for " \
                       "constructing a minimal supporting set of solutions can be used in solving all the " \
                       "considered types systems and systems of mixed types."

In [4]:
ether_text = "So last week whatever 16 years with respect to the playlist and DRM key, right? " \
"So I was able to test on Safari and chrome both wearing it was able to forward the cookies. " \
"I was just like trying to trace out the cookies whether it's cool being sent in the DRM ta PA all those things. " \
"So one thing is I had tested it, but I wanted the Deep also to test from IOS app also whether we can pass the cookies. " \
"So once that is done, it is like tested it but I just want him to also confirm that part that it can send a cookies from it was have also know but what I am right now stuck is the Eco meat to a double AC p-- a Gateway. " \
"It's not able to proxying it actually. "\
"So as you spend like I was trying to do with the goatee also, there also is not able to do it the same problem is that this something which is going." \
"Hang on, okay."

## Preprocess & Tokenize text
Preprocessing handles:
- Remove punctuations
- Expand contractions
- Remove irrelevant symbols
- Change dates
- Change numbers

In [13]:
eg = tp.preprocess(eg_text, stop_words=False, word_tokenize=True, remove_punct=True)

In [14]:
eg

[['Compatibility',
  'of',
  'systems',
  'of',
  'linear',
  'constraints',
  'over',
  'the',
  'set',
  'of',
  'natural',
  'numbers',
  '.'],
 ['Criteria',
  'of',
  'compatibility',
  'of',
  'a',
  'system',
  'of',
  'linear',
  'Diophantine',
  'equations',
  ',',
  'strict',
  'inequations',
  ',',
  'and',
  'nonstrict',
  'inequations',
  'are',
  'considered',
  '.'],
 ['Upper',
  'bounds',
  'for',
  'components',
  'of',
  'a',
  'minimal',
  'set',
  'of',
  'solutions',
  'and',
  'algorithms',
  'of',
  'construction',
  'of',
  'minimal',
  'generating',
  'sets',
  'of',
  'solutions',
  'for',
  'all',
  'types',
  'of',
  'systems',
  'are',
  'given',
  '.'],
 ['These',
  'criteria',
  'and',
  'the',
  'corresponding',
  'algorithms',
  'for',
  'constructing',
  'a',
  'minimal',
  'supporting',
  'set',
  'of',
  'solutions',
  'can',
  'be',
  'used',
  'in',
  'solving',
  'all',
  'the',
  'considered',
  'types',
  'systems',
  'and',
  'systems',
  'of',


## Check Keyphrases
A Context-window of **4** is used here to build a co-occurence graph.

`log_norm_max` is used as the weighted metric for extracting keyphrases

In [42]:
tr.keywords_extraction_from_segmented_corpus(eg, 
                        top_p=1, 
                        directed=False,
                        conn_with_original_ctx=False,
                        window=4, 
                        weight_comb='log_norm_max')[0]

[('minimal set', 0.17424),
 ('linear constraints', 0.14623),
 ('types systems', 0.14062),
 ('minimal supporting set', 0.10993),
 ('mixed types', 0.10781),
 ('corresponding algorithms', 0.10772),
 ('linear diophantine equations', 0.09226),
 ('strict inequations', 0.08268),
 ('nonstrict inequations', 0.08268),
 ('set', 0.07504),
 ('natural numbers', 0.06723),
 ('systems', 0.06056),
 ('minimal', 0.06021),
 ('solutions', 0.0563),
 ('types', 0.04643),
 ('algorithms', 0.04639),
 ('compatibility', 0.04273),
 ('criteria', 0.04233),
 ('system', 0.03414),
 ('sets', 0.03223),
 ('construction', 0.0322),
 ('components', 0.02828),
 ('upper', 0.02368)]

## Build a Co-occurence graph to visually inspect nodes & edges

In [43]:
eg_proc_text = tr.preprocessing(eg_text)
eg_graph, eg_tokens = tr.build_cooccurrence_graph(eg_proc_text, window=4)

In [44]:
plt.figure(figsize=(8,6))
plt.axis('off')
pos = nx.spring_layout(eg_graph, k =0.2, iterations = 20)
nx.draw_networkx(eg_graph, pos=pos, arrows=True, with_labels=True, node_size = 15, alpha = 0.65, width = 0.2, edge_color = 'b', font_size=10)
plt.show()

FigureCanvasNbAgg()

  if cb.is_numlike(alpha):


## Graph Analysis

Graph operations: 
- Betweenness score
- Closeness score
- Degree score
- No. of Neighbors
- No. of nodes and edges

In [45]:
def sort_dict_by_value(graph_metric):
    """
    Args:
        graph_metric: `dict` form indicating the graph metric output
    Returns:
        Sorted list based on the dict values.
    """
    b = graph_metric
    return sorted(b.items(), key=lambda x: (-x[1],x[0]))

### Betweenness analysis

In [46]:
b = nx.betweenness_centrality(eg_graph)

In [47]:
sort_dict_by_value(b)

[('set', 0.46240942028985516),
 ('linear', 0.407548309178744),
 ('solutions', 0.125),
 ('algorithms', 0.11609299516908214),
 ('strict', 0.09631642512077301),
 ('systems', 0.08816425120772947),
 ('components', 0.08333333333333333),
 ('minimal', 0.08303140096618357),
 ('compatibility', 0.06748188405797101),
 ('types', 0.046799516908212546),
 ('criteria', 0.04091183574879227),
 ('diophantine', 0.028049516908212575),
 ('equations', 0.028049516908212575),
 ('constraints', 0.017814009661835748),
 ('system', 0.013466183574879227),
 ('inequations', 0.005072463768115942),
 ('construction', 0.004227053140096619),
 ('corresponding', 0.0),
 ('mixed', 0.0),
 ('natural', 0.0),
 ('nonstrict', 0.0),
 ('numbers', 0.0),
 ('sets', 0.0),
 ('supporting', 0.0),
 ('upper', 0.0)]

### Check Neighbours of nodes

In [49]:
list(eg_graph.neighbors('diophantine'))

['inequations', 'strict', 'system', 'equations', 'linear']

### Check edge connections

In [50]:
eg_graph.edges('strict')

EdgeDataView([('strict', 'inequations'), ('strict', 'diophantine'), ('strict', 'equations'), ('strict', 'nonstrict'), ('strict', 'linear')])

# Preprocessing meetings

In [19]:
import numpy as np
import pandas as pd
# from spacy.lang.en.stop_words import STOP_WORDS

In [20]:
meeting_transcripts = "data/meeting_transcripts.csv"
trans_df = pd.read_csv(meeting_transcripts)

In [21]:
meeting_id = "fe9f49175b154418b06399d34ffbf37e"

In [22]:
def preprocess_meetings(df, meeting_id, min_sent_len):
    curr_meeting_df = getCurrentMeetingByMeetingID(df, meeting_id)
    conversation_list, chat_len_list = getConversations(curr_meeting_df, min_sent_len)
    out_df = pd.DataFrame({'originalText':conversation_list, 'conversation_len':chat_len_list}, index=None)
    return out_df

def getConversations(transcriptsDF, min_sent_len=0):
    conversation_list = list()
    chat_len_list = list()

    for i in range(len(transcriptsDF)):
        t_conversation = transcriptsDF.iloc[i]['originalText']
        t_conversation_filt = set(t_conversation.split(' '))
        # filt_sent = t_conversation_filt.difference(stop_words)

        if len(t_conversation_filt) > min_sent_len:
            conversation_list.append(t_conversation)
            chat_len_list.append(len(t_conversation_filt))
    return conversation_list, chat_len_list

def getCurrentMeetingByMeetingID(meeting_df, meeting_id):
    curr_meeting_df = meeting_df[meeting_df['meetingId'] == meeting_id]
    curr_meeting_df.dropna(inplace=True, axis=0)
    return curr_meeting_df

In [23]:
out_df = preprocess_meetings(trans_df, meeting_id, min_sent_len=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [24]:
#import nltk
doc = ''
#doc_context = []
for i in range(len(out_df)):
    text = out_df.iloc[i]['originalText']
    doc = doc + text + ' '
    # doc_context.append(token)

In [25]:
doc

"Okay Right. One thing. Yeah sure. Yeah, yeah so under two that two the web is is done part, but does require coding. Can somebody knew the. Yeah yeah yeah, go ahead. Based basically list respect to the call already getting and I'm making sure that the ether recording board is disconnect. Once the meeting is over those changes is also have been tested there is one issue that I found out while testing the entire. So when ether recording client is the last one in the meeting and we should meeting command the client is client is not able to handle that and disconnect all from the call. So usually suggested what you happen is in the we used to delete the room and from the room the comes to the recording client to disconnect and also on side when you do an end meeting for all that from the side to client and that's way it works and the in the client disconnect from the call it. So we need to kind of fix that so a couple of options that we were discussing is one is handling the event like th

In [26]:
doc_graph, doc_token = tr.build_cooccurrence_graph(tr.preprocessing(doc), window=4)

In [27]:
plt.figure(figsize=(9,9))
plt.axis('off')
pos = nx.spring_layout(doc_graph, k=0.2, iterations = 20)
nx.draw_networkx(doc_graph, pos=pos, arrows=True, with_labels=True, node_size = 55, alpha = 0.65, width = 0.2, edge_color = 'b', font_size=10)
plt.show()

FigureCanvasNbAgg()

  if cb.is_numlike(alpha):


In [28]:
print("Graph nodes: {}, Graph edges: {}".format(doc_graph.number_of_nodes(), doc_graph.number_of_edges()))

Graph nodes: 290, Graph edges: 610


In [35]:
custom_stop_words = ['okay', 'yeah', 'right', 'i', 'today', 'could', 'able', 'one']

In [36]:
tr.keywords_extraction(doc, top_p =0.3, directed=False, lemma=False, window=5, 
                       conn_with_original_ctx=False, weight_comb='log_norm_max', stop_words=custom_stop_words)[0]

[('hermes service', 0.04057),
 ('key service', 0.04057),
 ('applications service', 0.04057),
 ('ether service', 0.04057),
 ('service image', 0.04057),
 ('authentication service', 0.04057),
 ('independent service', 0.04057),
 ('hermes call', 0.04017),
 ('call auditing', 0.04017),
 ('production issue', 0.0357),
 ('recording client', 0.03278),
 ('client disconnect', 0.03278),
 ('client side', 0.03278),
 ('capture code', 0.02948),
 ('browser code', 0.02948),
 ('code review', 0.02948),
 ('meeting command', 0.0273),
 ('end meeting', 0.0273),
 ('meeting number', 0.0273),
 ('service docker image', 0.0256),
 ('keyphrase extraction service', 0.0256),
 ('call url l', 0.02535),
 ('hermes repository', 0.02449),
 ('libraries model production', 0.02252),
 ('recording design', 0.02241),
 ('recording clients', 0.02241),
 ('ether recording client', 0.02068),
 ('other things', 0.0204),
 ('related things', 0.0204),
 ('event like', 0.01962),
 ('particular event', 0.01962),
 ('david capture code', 0.0186),


In [37]:
tr.keywords_extraction(doc, top_p =0.3, directed=False, lemma=False, window=5, 
                       conn_with_original_ctx=False, weight_comb='log_norm_max',
                      stop_words=custom_stop_words,
                      solver='pagerank_numpy')[0][:20]

[('hermes service', 0.04057),
 ('key service', 0.04057),
 ('applications service', 0.04057),
 ('ether service', 0.04057),
 ('service image', 0.04057),
 ('authentication service', 0.04057),
 ('independent service', 0.04057),
 ('hermes call', 0.04018),
 ('call auditing', 0.04018),
 ('production issue', 0.0357),
 ('recording client', 0.03278),
 ('client disconnect', 0.03278),
 ('client side', 0.03278),
 ('capture code', 0.02947),
 ('browser code', 0.02947),
 ('code review', 0.02947),
 ('meeting command', 0.0273),
 ('end meeting', 0.0273),
 ('meeting number', 0.0273),
 ('service docker image', 0.02559)]

In [29]:
list(doc_graph.neighbors('google'))

['website', 'analytics', 'segment']

In [30]:
list(doc_graph.neighbors('ether'))

['past',
 'scenario',
 'service',
 'comments',
 'disconnects',
 'recording',
 'client',
 'disconnect',
 'domains',
 'end',
 'hermes',
 'board',
 'dependency',
 'sure']

In [None]:
list(doc_graph.edges('google'))

In [33]:
list(doc_graph.edges('hermes'))

[('hermes', 'yeah'),
 ('hermes', 'service'),
 ('hermes', 'everybody'),
 ('hermes', 'repository'),
 ('hermes', 'ether'),
 ('hermes', 'pc'),
 ('hermes', 'right'),
 ('hermes', 'call')]

In [None]:
nx.closeness_centrality(doc_graph)

In [None]:
d = dict(nx.degree(doc_graph))

In [None]:
sorted(d.items(), key=lambda x: (-x[1], x[0]))

# Testing Custom Graph

In [6]:
%load_ext autoreload
%autoreload 2

In [7]:
%matplotlib widget

In [8]:
import jgtextrank as tr
from graphrank import GraphRank
from utils import VisualizeGraph
import networkx as nx

ModuleNotFoundError: No module named 'graphrank'

In [132]:
text = u"Compatibility of systems of linear constraints over the set of natural\
 numbers. Criteria of compatibility of a system of linear Diophantine equations\
, strict inequations, and nonstrict inequations are considered. Upper bounds fo\
r components of a minimal set of solutions and algorithms of construction of mi\
nimal generating sets of solutions for all types of systems are given. These cr\
iteria and the corresponding algorithms for constructing a minimal supporting s\
et of solutions can be used in solving all the considered types systems and sys\
tems of mixed types."

In [133]:
processed_text = list(tr.preprocessing(text))

In [134]:
def get_pos_tuple(proc_text):
    word_pos_tuple = []
    token_list = []
    for token, pos_tuple in list(proc_text):
        word_pos_tuple.append(pos_tuple)
        token_list.append(token)
    return token_list, word_pos_tuple

In [136]:
token_list, pos_tuple = get_pos_tuple(processed_text)

In [135]:
gr = GraphRank()

In [137]:
word_graph = gr.build_word_graph(pos_tuple)

[('compatibility', 'NN'), ('systems', 'NNS'), ('linear', 'JJ'), ('constraints', 'NNS'), ('set', 'NN'), ('natural', 'JJ'), ('numbers', 'NNS'), ('criteria', 'NNS'), ('compatibility', 'NN'), ('system', 'NN'), ('linear', 'JJ'), ('diophantine', 'NNP'), ('equations', 'NNS'), ('strict', 'JJ'), ('inequations', 'NNS'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('upper', 'NNP'), ('components', 'NNS'), ('minimal', 'JJ'), ('set', 'NN'), ('solutions', 'NNS'), ('algorithms', 'NN'), ('construction', 'NN'), ('minimal', 'JJ'), ('sets', 'NNS'), ('solutions', 'NNS'), ('types', 'NNS'), ('systems', 'NNS'), ('criteria', 'NNS'), ('corresponding', 'JJ'), ('algorithms', 'NN'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('solutions', 'NNS'), ('types', 'NNS'), ('systems', 'NNS'), ('systems', 'NNS'), ('mixed', 'JJ'), ('types', 'NNS')]
[('compatibility', 'NN'), ('systems', 'NNS'), ('linear', 'JJ'), ('constraints', 'NNS'), ('set', 'NN'), ('natural', 'JJ'), ('numbers', 'NNS'), ('criteria', 'NNS'), ('

In [139]:
viz_graph = VisualizeGraph()

In [140]:
viz_graph.draw_graph(word_graph)

FigureCanvasNbAgg()

  if cb.is_numlike(alpha):


In [141]:
node_weights, top_words = gr.node_weighting(pos_tuple)

[('compatibility', 'NN'), ('systems', 'NNS'), ('linear', 'JJ'), ('constraints', 'NNS'), ('set', 'NN'), ('natural', 'JJ'), ('numbers', 'NNS'), ('criteria', 'NNS'), ('compatibility', 'NN'), ('system', 'NN'), ('linear', 'JJ'), ('diophantine', 'NNP'), ('equations', 'NNS'), ('strict', 'JJ'), ('inequations', 'NNS'), ('nonstrict', 'JJ'), ('inequations', 'NNS'), ('upper', 'NNP'), ('components', 'NNS'), ('minimal', 'JJ'), ('set', 'NN'), ('solutions', 'NNS'), ('algorithms', 'NN'), ('construction', 'NN'), ('minimal', 'JJ'), ('sets', 'NNS'), ('solutions', 'NNS'), ('types', 'NNS'), ('systems', 'NNS'), ('criteria', 'NNS'), ('corresponding', 'JJ'), ('algorithms', 'NN'), ('minimal', 'JJ'), ('supporting', 'NN'), ('set', 'NN'), ('solutions', 'NNS'), ('types', 'NNS'), ('systems', 'NNS'), ('systems', 'NNS'), ('mixed', 'JJ'), ('types', 'NNS')]
[('compatibility', 'NN'), ('systems', 'NNS'), ('linear', 'JJ'), ('constraints', 'NNS'), ('set', 'NN'), ('natural', 'JJ'), ('numbers', 'NNS'), ('criteria', 'NNS'), ('

In [142]:
node_weights

{'compatibility': 0.04050130350480296,
 'systems': 0.0635444247694692,
 'linear': 0.05547927375550788,
 'constraints': 0.028669198834051886,
 'set': 0.0642777271769126,
 'natural': 0.029583342774413265,
 'numbers': 0.02982715489502955,
 'criteria': 0.05291980184834524,
 'system': 0.029216113760474803,
 'diophantine': 0.03281820309281377,
 'equations': 0.03564337646341648,
 'strict': 0.036522169851136425,
 'inequations': 0.054913438468111836,
 'nonstrict': 0.021459367118919152,
 'upper': 0.0349273631013934,
 'components': 0.03180842702430976,
 'minimal': 0.07668108188196558,
 'solutions': 0.05077793141911456,
 'algorithms': 0.05146666199512798,
 'construction': 0.027808019883344165,
 'sets': 0.02767841541539337,
 'types': 0.03950963888722571,
 'corresponding': 0.028175018149669974,
 'supporting': 0.027792493863883137,
 'mixed': 0.02800005206516726}

In [143]:
top_words

[('minimal', 0.07668108188196558),
 ('set', 0.0642777271769126),
 ('systems', 0.0635444247694692),
 ('linear', 0.05547927375550788),
 ('inequations', 0.054913438468111836),
 ('criteria', 0.05291980184834524),
 ('algorithms', 0.05146666199512798),
 ('solutions', 0.05077793141911456),
 ('compatibility', 0.04050130350480296),
 ('types', 0.03950963888722571),
 ('strict', 0.036522169851136425),
 ('equations', 0.03564337646341648),
 ('upper', 0.0349273631013934),
 ('diophantine', 0.03281820309281377),
 ('components', 0.03180842702430976),
 ('numbers', 0.02982715489502955),
 ('natural', 0.029583342774413265),
 ('system', 0.029216113760474803),
 ('constraints', 0.028669198834051886),
 ('corresponding', 0.028175018149669974),
 ('mixed', 0.02800005206516726),
 ('construction', 0.027808019883344165),
 ('supporting', 0.027792493863883137),
 ('sets', 0.02767841541539337),
 ('nonstrict', 0.021459367118919152)]

In [170]:
multi_terms = gr.retrieve_multi_keyterms(pos_tuple, token_list)

In [171]:
multi_terms

[(['compatibility'], [0.04050130350480296]),
 (['systems'], [0.0635444247694692]),
 (['linear', 'constraints'], [0.05547927375550788, 0.028669198834051886]),
 (['set'], [0.0642777271769126]),
 (['natural', 'numbers'], [0.029583342774413265, 0.02982715489502955]),
 (['criteria'], [0.05291980184834524]),
 (['system'], [0.029216113760474803]),
 (['linear', 'diophantine', 'equations'],
  [0.05547927375550788, 0.03281820309281377, 0.03564337646341648]),
 (['strict', 'inequations'], [0.036522169851136425, 0.054913438468111836]),
 (['nonstrict', 'inequations'], [0.021459367118919152, 0.054913438468111836]),
 (['upper'], [0.0349273631013934]),
 (['components'], [0.03180842702430976]),
 (['minimal', 'set'], [0.07668108188196558, 0.0642777271769126]),
 (['solutions'], [0.05077793141911456]),
 (['algorithms'], [0.05146666199512798]),
 (['construction'], [0.027808019883344165]),
 (['minimal'], [0.07668108188196558]),
 (['sets'], [0.02767841541539337]),
 (['types'], [0.03950963888722571]),
 (['corr

In [172]:
multi_words, multi_word_scores = gr.compute_multiterm_score(pos_tuple, original_tokens=token_list)

In [173]:
len(multi_word_scores)

23

In [174]:
keyphrases = gr.get_keyphrases(pos_tuple, token_list)

['compatibility', 'systems', 'linear constraints', 'set', 'natural numbers', 'criteria', 'system', 'linear diophantine equations', 'strict inequations', 'nonstrict inequations', 'upper', 'components', 'minimal set', 'solutions', 'algorithms', 'construction', 'minimal', 'sets', 'types', 'corresponding algorithms', 'minimal supporting set', 'types systems', 'mixed types']
23 23


In [175]:
keyphrases

[('minimal supporting set', 0.1687513029227613),
 ('minimal set', 0.1409588090588782),
 ('linear diophantine equations', 0.12394085331173813),
 ('types systems', 0.10305406365669491),
 ('strict inequations', 0.09143560831924827),
 ('linear constraints', 0.08414847258955976),
 ('corresponding algorithms', 0.07964168014479794),
 ('minimal', 0.07668108188196558),
 ('nonstrict inequations', 0.076372805587031),
 ('mixed types', 0.06750969095239297),
 ('set', 0.0642777271769126),
 ('systems', 0.0635444247694692),
 ('natural numbers', 0.05941049766944281),
 ('criteria', 0.05291980184834524),
 ('algorithms', 0.05146666199512798),
 ('solutions', 0.05077793141911456),
 ('compatibility', 0.04050130350480296),
 ('types', 0.03950963888722571),
 ('upper', 0.0349273631013934),
 ('components', 0.03180842702430976),
 ('system', 0.029216113760474803),
 ('construction', 0.027808019883344165),
 ('sets', 0.02767841541539337)]