In [102]:
import pandas as pd
import json

import time
import datetime
import iso8601
import pickle
from sklearn.utils import shuffle
import re

import itertools, nltk, string
from nltk.tokenize import word_tokenize
import re
import networkx as nx
#stop_words = set(nltk.corpus.stopwords.words('english'))
import numpy as np
from nltk.util import ngrams
from scipy import spatial

In [103]:
# Read the project stop-word list: one entry per line of ./long_stopwords.txt.
with open('./long_stopwords.txt') as stopword_file:
    stop_words = stopword_file.read().splitlines()

In [104]:
def formatTime(tz_time):
    """Render an ISO-8601 timestamp string as a UTC date-time string.

    Args:
        tz_time: Timestamp text accepted by ``iso8601.parse_date``.

    Returns:
        The same instant expressed in UTC and formatted with
        ``"%Y-%m-%d %H:%M:%S:%f"`` (note the ``:`` before the microseconds).
    """
    isoTime = iso8601.parse_date(tz_time)
    # ``datetime`` is imported as a module here, so the previous
    # ``datetime.utcfromtimestamp(...)`` raised AttributeError. Converting the
    # parsed value directly to UTC also avoids a lossy float round-trip.
    utc_time = isoTime.astimezone(datetime.timezone.utc)
    return utc_time.strftime("%Y-%m-%d %H:%M:%S:%f")

def getValjsonDF(df):
    """Expand the JSON strings stored in the ``value`` column into a DataFrame.

    Column names are matched after stripping surrounding whitespace. The first
    row is skipped and non-string cells (e.g. NaN) are ignored.

    Args:
        df: Frame with a column whose stripped name is ``value`` holding
            JSON-encoded strings. It is not modified.

    Returns:
        A new DataFrame with one row per decoded JSON value.
    """
    # rename() returns a new frame, so the caller's column labels are left
    # untouched (the previous version overwrote df.columns in place).
    stripped = df.rename(columns=lambda name: name.strip())
    # NOTE(review): the first row is always discarded, as before; presumably
    # it is a header/marker row in the export - confirm against the data.
    values = stripped['value'].iloc[1:]
    records = [json.loads(cell) for cell in values if isinstance(cell, str)]
    return pd.DataFrame(records)

def replaceHyphen(col_series):
    """Return the series with every ``-`` removed and each value whitespace-stripped."""
    def _drop_hyphens(value):
        without_hyphen = value.replace('-','')
        return without_hyphen.strip()

    return col_series.apply(_drop_hyphens)

def stripColNames(df):
    """Return the frame's column names as a list with surrounding whitespace removed."""
    return [column_name.strip() for column_name in df.columns]

def getregexChunks(text,grammar):
    """Chunk ``text`` with a regexp grammar and return flat CoNLL-style rows.

    The text is split into sentences, tokenised and POS-tagged with nltk, each
    tagged sentence is parsed with ``RegexpParser(grammar)``, and the
    ``tree2conlltags`` rows of all sentences are concatenated into one list.
    """
    parser = nltk.chunk.regexp.RegexpParser(grammar)
    tokenized = (nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(text))
    conll_rows = []
    for tagged_sentence in nltk.pos_tag_sents(tokenized):
        conll_rows.extend(nltk.chunk.tree2conlltags(parser.parse(tagged_sentence)))
    return conll_rows

def getCandidatePhrases(text,grammar = [r"""base: {(<JJ.*>*<NN.*>+<IN>)?<JJ>*<NN.*>+}""",
                                        r"""nounverb:{<NN.+>+<.+>{0,2}<VBG>{1}}""",
                                        r"""verbnoun:{<VBG>{1}<.+>{0,2}<NN.+>+}""",
                                       r""" nounnoun:{<NN.+>+<.+>{0,2}<NN.+>+}"""]):
    """Extract lower-cased candidate key-phrases from ``text``.

    Each pattern in ``grammar`` is applied separately through
    ``getregexChunks``; every run of consecutive non-``'O'`` rows becomes one
    phrase. Phrases that are exact stop words or consist only of punctuation
    are dropped.

    Args:
        text: Raw text to chunk.
        grammar: Iterable of nltk regexp chunk grammars. The default list is
            only read, never mutated.

    Returns:
        List of candidate phrases (may contain duplicates), in pattern order.
    """
    punct = set(string.punctuation)
    stop_set = set(stop_words)  # built once; set membership is O(1) per candidate

    candidates = []
    for pattern in grammar:
        # Group per pattern: grouping the concatenation of all patterns' rows
        # (as before) fused a chunk at the end of one pattern's output with a
        # chunk at the start of the next pattern's output.
        pattern_chunks = getregexChunks(text,pattern)
        for in_chunk, group in itertools.groupby(pattern_chunks, lambda row: row[2] != 'O'):
            if in_chunk:
                candidates.append(' '.join(word for word, pos, chunk in group).lower())

    return [cand for cand in candidates
            if cand not in stop_set and not all(char in punct for char in cand)]

def deDoupCandidateList(candidate_keyphrase_list):
    """Placeholder for candidate de-duplication; not implemented yet.

    Intended behaviour: apply stop-word filters, merge sub-phrases into their
    parent phrase, and merge overlapping phrases. Currently the argument is
    ignored and ``None`` is returned.
    """
    return None

def lambda_unpack(f):
    """Wrap ``f`` so it can be called with one iterable that is unpacked into its arguments."""
    def _call_with_unpacked(args):
        return f(*args)

    return _call_with_unpacked


def getBigramSentence(sent,embedding_dict):
    """Tokenise ``sent`` on single spaces, merging known bigrams into one token.

    Adjacent tokens ``a b`` become ``a_b`` when ``"a_b"`` is a key of
    ``embedding_dict``. The scan is left to right and greedy, so with
    overlapping bigrams the leftmost one wins. All returned tokens are
    lower-cased; the dictionary lookup uses the text as given.

    Args:
        sent: Input sentence.
        embedding_dict: Mapping whose keys include ``word_word`` bigram tokens.

    Returns:
        List of lower-cased tokens.
    """
    tokens = sent.split(' ')
    if len(tokens) < 2:
        return [sent.lower()]

    merged = []
    idx = 0
    while idx < len(tokens):
        if idx + 1 < len(tokens):
            bigram_token = tokens[idx] + '_' + tokens[idx + 1]
            # Whole-token match: the earlier substring replace could also
            # rewrite the tail of an unrelated word ("man apple" ->
            # "man_apple"), and its set iteration order made overlapping
            # bigrams non-deterministic.
            if bigram_token in embedding_dict:
                merged.append(bigram_token.lower())
                idx += 2
                continue
        merged.append(tokens[idx].lower())
        idx += 1
    return merged

def getEmbedVector(chat, feature_dict, p_list, embed_dim=None):
    """Build a concatenated power-mean embedding for a piece of text.

    The text is lower-cased and tokenised with ``getBigramSentence``. Tokens
    that are stop words or missing from ``feature_dict`` are skipped. For each
    ``p`` in ``p_list`` the p-mean of the remaining word vectors is computed
    with ``getPMeanFeatures`` and the results are concatenated.

    Args:
        chat: Input text.
        feature_dict: Mapping token -> embedding vector.
        p_list: Powers to use for the p-means.
        embed_dim: Width of one embedding, used only for the all-zero
            fallback. When ``None`` it is taken from the first vector in
            ``feature_dict`` (300 if the mapping is empty).

    Returns:
        1-D ``numpy.ndarray`` of length ``embed_dim * len(p_list)``. It is all
        zeros when no token has an embedding (previously a Python list with a
        hard-coded width of 300 per power).
    """
    tokens = getBigramSentence(chat.lower(),feature_dict)
    vec_list = [np.array(feature_dict[tok]) for tok in tokens
                if tok not in stop_words and tok in feature_dict]

    if vec_list:
        feature_vector = []
        for p in p_list:
            feature_vector += list(getPMeanFeatures(vec_list, p))
        return np.array(feature_vector)

    if embed_dim is None:
        sample_vector = next(iter(feature_dict.values()), None)
        embed_dim = len(sample_vector) if sample_vector is not None else 300
    return np.zeros(embed_dim * len(p_list))

def getPMeanFeatures(embed_list, p):
    """Compute the element-wise power mean of a collection of vectors.

    For ``p == 1`` this is the arithmetic mean. Otherwise every vector is
    raised to ``p``, averaged per dimension, and the signed ``1/p`` root is
    taken: ``|m| ** (1/p)`` for positive means and ``-|m| ** (1/p)`` otherwise,
    so negative means from odd powers keep their sign.

    Args:
        embed_list: Sequence of equal-length vectors (arrays or lists).
        p: Power of the mean.

    Returns:
        ``numpy.ndarray`` with one value per embedding dimension.
    """
    embed_mat = np.asarray(embed_list)
    if p == 1:
        return np.mean(embed_mat, axis=0)

    powered_mean = np.mean(embed_mat ** p, axis=0)
    root = np.abs(powered_mean) ** (1 / p)
    # Vectorised form of the per-element signed root; NaN means stay NaN
    # instead of turning into None entries.
    return np.where(powered_mean > 0, root, -root)

def getEmbeddingWeight(chat, weight_dict):
    """Sum the ``weight_dict`` values of the whitespace-separated tokens of ``chat``.

    Tokens that are stop words or have no entry in ``weight_dict`` contribute
    nothing. Returns ``0`` when no token qualifies.
    """
    return sum([weight_dict[token]
                for token in chat.split()
                if token not in stop_words and token in weight_dict.keys()])

def ownpow(a, b):
    """Signed power: ``a ** b`` for positive ``a``, ``-(|a| ** b)`` for ``a <= 0``.

    Values that compare neither ``> 0`` nor ``<= 0`` (e.g. NaN) yield ``None``.
    """
    if a > 0:
        return a ** b
    elif a <= 0:
        magnitude = abs(a) ** b
        return -magnitude
    return None
    
def getNormWeights(node_weights,fn,graph=None):
    """Rescale node weights in place by a graph statistic and return them.

    Args:
        node_weights: Mapping node -> weight. It is modified in place.
        fn: One of
            ``'degree_centrality'`` - divide by the node degree,
            ``'closeness'`` - divide by current-flow closeness centrality,
            ``'betweenness'`` - multiply by current-flow betweenness centrality,
            ``'degree_bet'`` - divide by degree + current-flow betweenness.
            Any other value leaves the weights unchanged.
        graph: Graph to take the statistics from. Defaults to the notebook
            global ``kpGraph``, which was previously the only option.

    Returns:
        The same ``node_weights`` mapping.

    Raises:
        ZeroDivisionError: if a divisor (degree, closeness, ...) is zero.
    """
    if graph is None:
        graph = kpGraph

    if fn == 'degree_centrality':
        node_degrees = dict(nx.degree(graph))
        for k, v in node_weights.items():
            node_weights[k] = v/node_degrees[k]
    elif fn == 'closeness':
        node_closeness = nx.current_flow_closeness_centrality(graph)
        for k, v in node_weights.items():
            node_weights[k] = v/node_closeness[k]
    elif fn == 'betweenness':
        node_betweenness = nx.current_flow_betweenness_centrality(graph)
        for k, v in node_weights.items():
            node_weights[k] = v * node_betweenness[k]
    elif fn == 'degree_bet':
        node_degrees = dict(nx.degree(graph))
        node_betweenness = nx.current_flow_betweenness_centrality(graph)
        for k, v in node_weights.items():
            node_weights[k] = v/(node_degrees[k] + node_betweenness[k])
    return node_weights

def getDirEdge(curr_node,nxt_node,personalization_dict):
    """Order two nodes so the one with the higher personalization score comes last.

    Returns ``(nxt_node, curr_node)`` when ``curr_node`` scores strictly
    higher, otherwise ``(curr_node, nxt_node)``.
    """
    curr_score = personalization_dict[curr_node]
    nxt_score = personalization_dict[nxt_node]
    return (nxt_node, curr_node) if curr_score > nxt_score else (curr_node, nxt_node)

def getDFFromDict(dict_,col_names):
    """Turn a mapping into a DataFrame with keys as the first column.

    The mapping is loaded with ``orient='index'``, the index is moved into a
    regular column, and the columns are then renamed to ``col_names``.
    """
    frame = pd.DataFrame.from_dict(dict_,orient='index')
    frame = frame.reset_index()
    frame.columns = col_names
    return frame

def getMeanOutNodeScore(node_, kpGraph):
    """Mean ``weight`` of the edges leaving ``node_``.

    Args:
        node_: Node to inspect.
        kpGraph: Directed graph whose edges carry a ``weight`` attribute.

    Returns:
        The mean outgoing edge weight, or ``0.0`` when the node has no
        outgoing edges or is not in the graph (the latter used to raise
        UnboundLocalError).
    """
    if node_ not in kpGraph.nodes():
        return 0.0
    out_weights = [kpGraph[node_][target]['weight'] for _, target in kpGraph.out_edges(node_)]
    if out_weights:
        return np.mean(out_weights)
    return 0.0

def getMeanInNodeScore(node_, kpGraph):
    """Mean ``weight`` of the edges pointing at ``node_``.

    Args:
        node_: Node to inspect.
        kpGraph: Directed graph whose edges carry a ``weight`` attribute.

    Returns:
        The mean incoming edge weight, or ``0.0`` when the node has no
        incoming edges or is not in the graph (the latter used to raise
        UnboundLocalError).
    """
    if node_ not in kpGraph.nodes():
        return 0.0
    in_weights = [kpGraph[source][node_]['weight'] for source, _ in kpGraph.in_edges(node_)]
    if in_weights:
        return np.mean(in_weights)
    return 0.0
    
def stripStops(phrase):
    """Remove at most one leading and one trailing stop word from ``phrase``.

    The phrase is split on whitespace, the first token is dropped if it is a
    stop word, then the (new) last token is dropped if it is a stop word, and
    the rest is re-joined with single spaces.

    Returns:
        The trimmed phrase; ``''`` for an empty phrase or one consisting of a
        single stop word (both used to raise IndexError).
    """
    split_tokens = phrase.split()
    if split_tokens and split_tokens[0] in stop_words:
        split_tokens = split_tokens[1:]
    # Re-check for emptiness: dropping the leading token may leave nothing.
    if split_tokens and split_tokens[-1] in stop_words:
        split_tokens = split_tokens[:-1]

    return ' '.join(split_tokens)

In [105]:
#load 
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles that come from a trusted source.
embedding_dict_path = 'data/ether_engg_embedding_dict.pkl'
with open(embedding_dict_path,'rb') as embedding_file:  # closes the handle after loading
    embedding_dict = pickle.load(embedding_file)

weight_dict_path = 'data/ether_engg_weight_dict.pkl'
with open(weight_dict_path,'rb') as weight_file:
    weight_dict = pickle.load(weight_file)

In [106]:
# Load the raw export and expand its JSON payload column into a frame.
# NOTE(review): absolute, user-specific path - this cell only runs on that machine.
df_test = getValjsonDF(
    pd.read_csv('/Users/venkat/Downloads/test_csvs/7c5a224f4fb54cdb83724612dd1088e8.csv')
)
# The last four characters of each timestamp are cut off before parsing.
df_test['createdAt'] = df_test['createdAt'].apply(
    lambda stamp: datetime.datetime.strptime(stamp[:-4],'%Y-%m-%dT%H:%M:%S.%f')
)
# Keep only rows transcribed by deepgram, ordered by creation time.
df_test = (
    df_test.loc[df_test['transcriber']=='deepgram']
    .sort_values(by='createdAt')
    .reset_index(drop=True)
)

In [107]:
#sort and get chapter subsets
#split dataframe into chapters
time_stamp_list = list(df_test['createdAt'])
chapter_df_list = []
chapter_length = datetime.timedelta(seconds=300)

if time_stamp_list:
    chp_start_time = min(time_stamp_list)
    last_time_stamp = max(time_stamp_list)
    # Half-open windows [start, start + 300s). The previous version used
    # closed windows separated by a one-second gap, so rows with sub-second
    # timestamps inside the gap were lost, and an empty window crashed max().
    while chp_start_time <= last_time_stamp:
        chp_end_time = chp_start_time + chapter_length
        in_chapter = (df_test['createdAt'] >= chp_start_time) & (df_test['createdAt'] < chp_end_time)
        chapter_subset = df_test[in_chapter]
        # Silent stretches longer than one window produce no chapter.
        if not chapter_subset.empty:
            print(len(chapter_df_list))
            chapter_df_list.append(chapter_subset)
        chp_start_time = chp_end_time

# Every transcript row must land in exactly one chapter.
assert sum([ele.shape[0] for ele in chapter_df_list])==len(df_test)

0
1
2
3
4
5
6
7
8
9


In [108]:
# for txt,time in zip(chapter_df_list[1]['originalText'],chapter_df_list[1]['createdAt']):
#     print(txt)
#     print(time)
#     print()

In [109]:
#do burst mode - aggregate all transcripts in each chapter and calculate key-phrase scores
#no pre-processing, just get the transcript level key-phrases
#curr_chapter = pd.concat([chapter_df_list[-2],chapter_df_list[-3]])
curr_chapter = chapter_df_list[4]

start = time.time()
sent_list = []
for transcript in list(curr_chapter['originalText']):
    for sent_ in nltk.sent_tokenize(transcript):
        # keep only sentences with more than five space-separated tokens
        if len(sent_.split(' '))>5:
            sent_list.append(sent_.strip())
tra_sent_list = list(sent_list)

# Extract candidates once from the aggregated chapter text. This used to run
# inside the transcript loop on the ever-growing text, re-chunking earlier
# transcripts on every iteration.
filtr_transcript = ' '.join(tra_sent_list)
stop_word_set = set(stop_words)
candidate_keys = list(set(getCandidatePhrases(filtr_transcript))-stop_word_set)

# Drop candidates whose every word is a stop word.
drop_list = [candidate for candidate in candidate_keys
             if set(candidate.split(' ')) <= stop_word_set]
print(len(candidate_keys))
candidate_keys = list(set(candidate_keys)-set(drop_list))
print(drop_list)
print(len(candidate_keys))

66
['back yeah okay', 'yeah yeah yeah', 'same thing', 'few things', 'okay yeah']
61


In [110]:
sample_text = """From a back-end perspective if we are ready to integrate. With the front end say sometime next week, \
                then we'll be okay. So basically, you know the basic slack install flow right have a very simple kind of login. \
                I mean installation process takes you to slack and authenticates and gives you the necessary permissions for installs, \
                comes back right and then from the back end all the information we need is ready so that we can go and put the other \
                pages in place right like all the other kind of corner cases. So if we do that then I think we'll be in good shape."""

tra_sent_list = []
candidate_keys = []
sent_list = []

transcript_sents = nltk.sent_tokenize(sample_text)
for sent_ in transcript_sents:
    # keep only sentences with more than five space-separated tokens
    if len(sent_.split(' '))>5:
        tra_sent_list.append(sent_.strip())
        # Collapse runs of spaces inline with re.sub; the stripSpaces helper is
        # defined in a later cell, so calling it here fails on a fresh kernel.
        sent_list.append(re.sub(' +', ' ', sent_).strip())

filtr_transcript = ' '.join(tra_sent_list)
candidate_keys=list(set(getCandidatePhrases(filtr_transcript)))

print(candidate_keys)

['basic slack install flow', 'next week', 'necessary permissions for installs', 'simple kind of login', 'installation process', 'back end', 'permissions for installs', 'authenticates', 'other kind of corner cases', 'back-end perspective', 'front end', 'other pages in place right', 'good shape']


In [111]:
# Powers used for the concatenated p-mean sentence/phrase embeddings.
p_list = [1,3,5]

sent_dist_list = []
candidate_embedding_list = []

# One embedding per retained sentence, in sent_list order.
sent_feat_list = [getEmbedVector(sentence,embedding_dict,p_list) for sentence in sent_list]
    
# for candidate in candidate_keys:
#     candidate_embedding_list.append(getEmbedVector(candidate,embedding_dict,p_list))

In [112]:
#score each key-phrase against every sentence that contains it; the phrase's
#initial score is the MEAN cosine similarity over those sentences
fallback_score = 0.0001  # used when the similarity is undefined (NaN) or no sentence matches
score_list = []
# Reset here so re-running this cell does not keep appending to an old list.
candidate_embedding_list = []
for candidate in candidate_keys:

    candidate_feats = getEmbedVector(candidate,embedding_dict,p_list)
    candidate_embedding_list.append(candidate_feats)
    curr_scores = []

    for sent_,sent_feats in zip(sent_list,sent_feat_list):
        if candidate.lower() in sent_.lower():
            curr_dist = 1-spatial.distance.cosine(candidate_feats,sent_feats)
            if np.isnan(curr_dist):
                curr_dist = fallback_score
            print(candidate,curr_dist)
            curr_scores.append(curr_dist)
    # np.mean([]) is NaN, which would poison the personalization vector.
    score_list.append(np.mean(np.array(curr_scores)) if curr_scores else fallback_score)

basic slack install flow 0.9315945826157038
next week 0.0001
necessary permissions for installs 0.5388867436736838
simple kind of login 0.5306938992884916
installation process 0.5730167863360691
back end 0.0001
permissions for installs 0.5388867436736838
authenticates 0.0001
other kind of corner cases 0.630023013490366
back-end perspective 0.8440460874184231
front end 0.0001
other pages in place right 0.574491033986358
good shape 0.8156128974994202


  dist = 1.0 - uv / np.sqrt(uu * vv)


In [113]:
# Pair every candidate phrase with its embedding and with its sentence-similarity score.
key_feat_dict = {phrase: feats for phrase, feats in zip(candidate_keys,candidate_embedding_list)}
personalization_dict = {phrase: score for phrase, score in zip(candidate_keys,score_list)}
#personalization_dict = None

In [114]:
# Display how many candidate key-phrases are currently held.
len(candidate_keys)

13

In [115]:
## build a directed graph over the candidates; edge weight = cosine similarity
## of the p-mean embeddings of the two phrases
kpGraph = nx.DiGraph()
kpGraph.add_nodes_from(candidate_keys)
print(len(kpGraph.nodes()))
edge_weight_list = []

for i, curr_node in enumerate(candidate_keys):
    for nxt_node in candidate_keys[i+1:]:
        edge_weight = 1-spatial.distance.cosine(key_feat_dict[curr_node],key_feat_dict[nxt_node])
        if np.isnan(edge_weight):
            edge_weight = 0.0
        edge_weight_list.append(edge_weight)
        # the edge points at the node with the higher personalization score
        node1,node2 = getDirEdge(curr_node,nxt_node,personalization_dict)
        if node1!=node2:
            kpGraph.add_edge(node1,node2,weight=edge_weight)

# keep only the strongest fraction of edges
edge_retain_perc = 0.1
edge_weight_list.sort(reverse=True)
# Retain at least one edge: with fewer than 10 edges the slice used to be
# empty and min() raised ValueError.
retain_count = max(1, int(len(edge_weight_list)*edge_retain_perc))
edge_weight_list = edge_weight_list[0:retain_count]
min_edge_dist = min(edge_weight_list) if edge_weight_list else 0.0
print('Minimum distance threshold: ', min_edge_dist)

edge_list = kpGraph.edges
print('Total edges formed: ', len(edge_list))

# collect first, remove afterwards, so the edge view is not mutated while iterating
drop_list = [edge for edge in edge_list
             if kpGraph[edge[0]][edge[1]]['weight']<min_edge_dist]

kpGraph.remove_edges_from(drop_list)
print('Total edges after pruning: ', len(kpGraph.edges))
#remove nodes with no edges
print('Removing following dangling nodes: ', list(nx.isolates(kpGraph)))
kpGraph.remove_nodes_from(list(nx.isolates(kpGraph)))
# despite its name, nstart is passed to pagerank as the personalization vector
nstart = {k:v for k, v in personalization_dict.items() if k in kpGraph.nodes()}
node_weights = nx.pagerank(kpGraph, alpha=0.85, max_iter=100,tol=0.0001,
                            personalization=nstart, nstart=None)
# end = time.time()

13
Minimum distance threshold:  0.44960753667639886
Total edges formed:  78
Total edges after pruning:  7
Removing following dangling nodes:  ['next week', 'simple kind of login', 'back end', 'authenticates', 'front end']


In [84]:
# Display the PageRank score computed for each node that remained in the graph.
node_weights

{'basic slack install flow': 0.14071493615745184,
 'necessary permissions for installs': 0.0633215368811905,
 'installation process': 0.06733196538137921,
 'permissions for installs': 0.11712725225964377,
 'other kind of corner cases': 0.12963310843563536,
 'back-end perspective': 0.31852794110029037,
 'other pages in place right': 0.06750519589420159,
 'good shape': 0.09583806389020712}

In [86]:
# Tabulate PageRank scores and sentence-similarity scores, highest first.
node_scores_page_rank = getDFFromDict(node_weights,['key-phrase','pr_score']).sort_values(by='pr_score',ascending=False)
node_scores_cosine = getDFFromDict(nstart,['key-phrase','cs_score']).sort_values(by='cs_score',ascending=False)
# Scale each score by its column maximum. Vectorised: the previous
# apply/lambda recomputed max() over the whole column for every row.
node_scores_page_rank['pr_score'] = node_scores_page_rank['pr_score']/node_scores_page_rank['pr_score'].max()
node_scores_cosine['cs_score'] = node_scores_cosine['cs_score']/node_scores_cosine['cs_score'].max()

In [88]:
# Join the PageRank and cosine score tables on the phrase itself.
df_final_scores = node_scores_page_rank.merge(node_scores_cosine,on='key-phrase')
df_final_scores

Unnamed: 0,key-phrase,pr_score,cs_score
0,back-end perspective,1.0,0.906023
1,basic slack install flow,0.441766,1.0
2,other kind of corner cases,0.406976,0.676285
3,permissions for installs,0.367714,0.578456
4,good shape,0.300878,0.875502
5,other pages in place right,0.211929,0.616675
6,installation process,0.211385,0.615092
7,necessary permissions for installs,0.198794,0.578456


In [235]:

# Sum of per-word weights for each phrase, scaled by the column maximum.
df_final_scores['phrase_bias'] = df_final_scores['key-phrase'].apply(lambda x: getEmbeddingWeight(x, weight_dict))
# Vectorised: max() is computed once instead of once per row.
df_final_scores['phrase_bias'] = df_final_scores['phrase_bias']/df_final_scores['phrase_bias'].max()

In [236]:
# Edge counts per phrase (+1, so a phrase without edges still gets a
# non-zero value), each scaled by its column maximum.
df_final_scores['in_edges'] = df_final_scores['key-phrase'].apply(lambda x: len(kpGraph.in_edges(x))+1)
df_final_scores['in_edges'] = df_final_scores['in_edges']/df_final_scores['in_edges'].max()
df_final_scores['out_edges'] = df_final_scores['key-phrase'].apply(lambda x: len(kpGraph.out_edges(x))+1)
df_final_scores['out_edges'] = df_final_scores['out_edges']/df_final_scores['out_edges'].max()

In [237]:
# mind_context: sentence similarity + phrase bias.
df_final_scores['mind_context'] = df_final_scores['cs_score'].add(df_final_scores['phrase_bias'])
# graph_context: PageRank + scaled in-edge count + scaled out-edge count.
df_final_scores['graph_context'] = (
    df_final_scores['pr_score']
    .add(df_final_scores['in_edges'])
    .add(df_final_scores['out_edges'])
)
df_final_scores = df_final_scores.sort_values(by='graph_context', ascending=False)

In [238]:
# Mean incoming / outgoing edge weight of every phrase in the graph.
df_final_scores['weighted_in_edge_score'] = df_final_scores['key-phrase'].apply(getMeanInNodeScore, args=(kpGraph,))
df_final_scores['weighted_out_edge_score'] = df_final_scores['key-phrase'].apply(getMeanOutNodeScore, args=(kpGraph,))

In [239]:
# Weight the mean edge scores by the scaled edge counts.
df_final_scores['weighted_in_edge_score'] = df_final_scores['weighted_in_edge_score'].mul(df_final_scores['in_edges'])
df_final_scores['weighted_out_edge_score'] = df_final_scores['weighted_out_edge_score'].mul(df_final_scores['out_edges'])
# Ratio of outgoing to incoming weighted score.
# NOTE(review): a phrase with no incoming edges has an in-score of 0, which
# makes this ratio inf (or NaN for 0/0) - confirm that is acceptable.
df_final_scores['degree_score'] = df_final_scores['weighted_out_edge_score'].div(df_final_scores['weighted_in_edge_score'])
#df_final_scores['degree_score'] = df_final_scores['degree_score'].apply(lambda x: x/max(df_final_scores['degree_score']))
#del df_final_scores['weighted_in_edge_score']
#del df_final_scores['weighted_out_edge_score']

In [245]:
# Scale both context scores by their column maximum (max computed once,
# instead of once per row inside an apply/lambda).
df_final_scores['mind_context'] = df_final_scores['mind_context']/df_final_scores['mind_context'].max()
df_final_scores['graph_context'] = df_final_scores['graph_context']/df_final_scores['graph_context'].max()

df_final_scores['final_score'] = df_final_scores['mind_context']+df_final_scores['graph_context']
df_final_scores = df_final_scores.sort_values(by='final_score', ascending=False).reset_index(drop=True)

# Trim a leading/trailing stop word, then keep only multi-word phrases.
df_final_scores['key-phrase'] = df_final_scores['key-phrase'].apply(stripStops)
df_final_scores['key_len'] = df_final_scores['key-phrase'].apply(lambda x: len(x.split(' ')))
df_final_scores = df_final_scores[df_final_scores['key_len']>1]

In [246]:
# Display the ranked key-phrase table with all intermediate score columns.
df_final_scores

Unnamed: 0,key-phrase,pr_score,cs_score,phrase_bias,in_edges,out_edges,mind_context,graph_context,weighted_in_edge_score,weighted_out_edge_score,degree_score,final_score,key_len
0,language model,0.81607,0.685153,0.692144,1.0,1.0,0.761392,1.0,0.530315,0.536167,1.011035,1.761392,2
1,time than other other food join,0.862379,0.80892,1.0,0.692308,0.176471,1.0,0.614742,0.337508,0.085775,0.254143,1.614742,6
2,start time,0.59564,0.782176,0.786196,0.653846,0.235294,0.867021,0.527253,0.338039,0.141018,0.417165,1.394274,2
3,worst time,1.0,0.853014,0.52437,0.461538,0.058824,0.76144,0.539888,0.23724,0.0,0.0,1.301328,2
4,reason calls,0.186568,0.679806,0.573345,0.346154,0.529412,0.692761,0.377169,0.15059,0.240938,1.599962,1.06993,2
5,pin people,0.3026,0.740161,0.566307,0.423077,0.235294,0.722236,0.341246,0.198351,0.110104,0.555097,1.063482,2
6,team score,0.101226,0.597441,0.756027,0.153846,0.529412,0.748219,0.278574,0.06539,0.243043,3.716819,1.026793,2
7,meeting graph,0.230415,0.923799,0.620552,0.153846,0.058824,0.853742,0.157342,0.082593,0.0,0.0,1.011083,2
8,coming by by tuesday,0.255992,0.703013,0.483123,0.346154,0.235294,0.655714,0.297379,0.178602,0.111554,0.624597,0.953093,4
9,engineering mind,0.087329,0.593257,0.608983,0.076923,0.588235,0.664617,0.267212,0.031796,0.263247,8.279304,0.931829,2


In [252]:
# Inspect the neighbours of one phrase: sources of its incoming edges and
# targets of its outgoing edges.
key_ = 'meeting graph'
print('In_Edges: ')
print([source for source, _ in kpGraph.in_edges(key_)])
print()
print('Out_Edges: ')
print([target for _, target in kpGraph.out_edges(key_)])

In_Edges: 
['language model', 'next week', 'meeting']

Out_Edges: 
[]


In [250]:
# Same neighbour inspection for a second phrase.
key_ = 'meeting'
print('In_Edges: ')
print([source for source, _ in kpGraph.in_edges(key_)])
print()
print('Out_Edges: ')
print([target for _, target in kpGraph.out_edges(key_)])

In_Edges: 
['next week']

Out_Edges: 
['meeting graph', 'language model']


In [244]:
# Check whether the word 'worst' is part of the loaded stop-word list.
'worst' in stop_words

False

In [257]:
# NOTE(review): scratch assignment; `a` is not referenced anywhere else in the visible notebook.
a = 5+5

In [27]:
# Embed a single phrase with the same embedding table and p-mean powers used for the candidates.
vec1 = getEmbedVector('other pages in place right',embedding_dict,p_list)


array([-2.22568348e-01,  2.61646181e-01,  6.21008649e-02, -3.35652456e-02,
       -2.39757970e-01, -2.60026008e-01,  2.00880215e-01, -2.24116191e-01,
        1.96480572e-01,  1.42902702e-01, -7.98235014e-02,  3.33956070e-02,
       -8.38135481e-02, -1.55800253e-01, -4.61133663e-03,  1.68200582e-01,
        7.82136917e-02,  1.12497188e-01, -1.99435670e-02, -1.02097012e-01,
        7.36215487e-02,  4.76110799e-05, -1.88313648e-02, -9.14450362e-02,
        5.40803969e-02, -9.01148543e-02, -2.23167002e-01,  1.04137652e-01,
       -9.31515768e-02, -8.61513019e-02, -2.00394899e-01,  1.83853149e-01,
       -6.23841733e-02, -8.15735292e-03, -9.13078059e-03,  3.98679152e-02,
        1.33876175e-01, -2.50617534e-01,  7.85041749e-02,  2.54105300e-01,
        6.47041947e-02, -1.63657025e-01,  3.69483292e-01,  7.16926232e-02,
        7.93766603e-02, -1.72588438e-01, -6.03112429e-02, -7.49131888e-02,
       -1.10869236e-01,  1.83374316e-01, -1.94696188e-01,  1.66751906e-01,
        1.01264343e-01, -

In [41]:
# Display the sentences retained for scoring.
sent_list

['From a back-end perspective if we are ready to integrate.',
 "With the front end say sometime next week,                 then we'll be okay.",
 'So basically, you know the basic slack install flow right have a very simple kind of login.',
 'I mean installation process takes you to slack and authenticates and gives you the necessary permissions for installs,                 comes back right and then from the back end all the information we need is ready so that we can go and put the other                 pages in place right like all the other kind of corner cases.',
 "So if we do that then I think we'll be in good shape."]

In [42]:
def stripSpaces(text):
    """Collapse every run of space characters to one space and trim both ends."""
    single_spaced = re.compile(' +').sub(' ', text)
    return single_spaced.strip()

In [43]:
# Try the space-collapsing helper on the fourth retained sentence.
stripSpaces(sent_list[3])

'I mean installation process takes you to slack and authenticates and gives you the necessary permissions for installs, comes back right and then from the back end all the information we need is ready so that we can go and put the other pages in place right like all the other kind of corner cases.'

In [91]:
import pickle
import boto3
# Serialise the key-phrase graph to bytes with the newest pickle protocol.
pickle_dumps = pickle.dumps(kpGraph, pickle.HIGHEST_PROTOCOL)

In [94]:
# Upload the pickled graph bytes to S3 under a fixed bucket/key.
s3 = boto3.resource('s3')
bucket, key = 'meetinggraphs', 'samplegraph'
graph_object = s3.Object(bucket,key)
graph_object.put(Body=pickle_dumps)

{'ResponseMetadata': {'RequestId': '7FC12B08E704D495',
  'HostId': 'S6FMYBkwWZW3QtU+VXuQhlfaMCUc6ayT96O6Uf8Ot8P9DlYckrQEgU9XPkbOlpJbO9/VuvM/uyo=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'S6FMYBkwWZW3QtU+VXuQhlfaMCUc6ayT96O6Uf8Ot8P9DlYckrQEgU9XPkbOlpJbO9/VuvM/uyo=',
   'x-amz-request-id': '7FC12B08E704D495',
   'date': 'Wed, 03 Jul 2019 06:22:32 GMT',
   'etag': '"ba4cf2df3f315881705259f9d9629a85"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"ba4cf2df3f315881705259f9d9629a85"'}

In [95]:
# Display the nodes currently in the key-phrase graph.
kpGraph.nodes()

NodeView(('basic slack install flow', 'necessary permissions for installs', 'installation process', 'permissions for installs', 'other kind of corner cases', 'back-end perspective', 'other pages in place right', 'good shape'))

In [96]:
# Remove the in-memory graph; any later cell that reads kpGraph will fail unless the graph is rebuilt first.
del kpGraph

In [99]:
# #download from s3
# file_obj = self.s3_client.download_file(file_name='s3://meetinggraphs/samplegraph')
# file_obj_bytestring = file_obj["Body"].read()
# kpGraph = pickle.loads(file_obj_bytestring)


In [101]:
# NOTE(review): kpGraph is deleted by an earlier `del kpGraph` and the S3 restore code is commented out,
# so this lookup fails when the notebook is run top to bottom.
kpGraph.add

Object `kpGraph` not found.
