In [9]:
import pandas as pd
import pickle
import numpy as np
import json 
from IPython.display import clear_output
import requests
from pytorch_pretrained_bert import BertTokenizer, BertConfig, BertModel
from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertPreTrainingHeads
from bert_utils import *
import time
from scipy import spatial

import nltk
import string,itertools
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

import unicodedata
from more_itertools import locate
from functools import reduce
import pandas as pd
import re

In [10]:
contractions = json.load(open('contractions.txt','rb'))
contractions = contractions['contractions']
    
def replaceContractions(text):
    #text = text.lower()
    c_filt_text = ''
    for word in text.split(' '):
        if word in contractions:
            c_filt_text = c_filt_text+' '+contractions[word]
        else:
            c_filt_text = c_filt_text+' '+word
    return c_filt_text.strip()

def stripText(text):
    text = replaceContractions(text.lower())
    text = re.sub('(\d+[A-z]+)|(([A-z]+\d+))',' ',text) #remove alphanumeric words
    text = re.sub('-',' ', text)
    text = re.sub('\s+',' ', text)
    text = re.sub("'",' ', text)
    return text.strip()

In [11]:
#load test data
import json
import os
test_file_list = os.listdir('./test_data/')
master_text = ''

for file in test_file_list:
    curr_file = json.load(open('test_set/'+file,'rb'))
    master_text = master_text+' '+curr_file
    
sent_bucket = master_text.split('.')
sent_bucket = [ele.strip() for ele in sent_bucket if len(ele)>5]
sent_bucket = [stripText(ele) for ele in sent_bucket]

In [12]:
def getregexChunks(text, grammar):

    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    return [(ele[0], ele[1], ele[2], ctr) for ele,ctr in zip(all_chunks,range(len(all_chunks)))]

def getCandidatePhrases(text, pos_search_pattern_list=[r"""base: {(<JJ.*>*<NN.*>+<IN>)?<JJ>*<NN.*>+}"""]):
                                       #r"""nounverb:{<NN.+>+<.+>{0,2}<VB*>{1}}""",
                                       #r"""verbnoun:{<VB*>{1}<.+>{0,2}<NN.+>+}"""]):
                                       #r""" nounnoun:{<NN.+>+<.+>{1,2}<NN.+>+}"""]):
                                       #r"""baseverb: {(<JJ.+>+<IN>)?<JJ>*<VB.*>+}"""]):
    punct = set(string.punctuation)
    all_chunks = []

    for pattern in pos_search_pattern_list:
        all_chunks+=getregexChunks(text, pattern)
    
    candidate_locs = [' '.join(str(ctr) for word, pos, 
                           chunk,ctr in group).lower() 
                  for key, group in itertools.groupby(all_chunks, 
                  lambda_unpack(lambda word, pos, chunk, ctr: chunk != 'O')) if key]
    
    candidate_phrases = [' '.join(word for word, pos, 
                           chunk,ctr in group).lower() 
                  for key, group in itertools.groupby(all_chunks, 
                  lambda_unpack(lambda word, pos, chunk, ctr: chunk != 'O')) if key]
    
    #candidate_phrases = [cand for cand in candidates_tokens if cand not in stop_words and not all(char in punct for char in cand)]

    return candidate_phrases,candidate_locs

def lambda_unpack(f):
    return lambda args: f(*args)

def getWordLevelFeats(sent,token_feat_dict,tokenizer):
    word_feat_list = []
    for word in sent.split(' '):
        tokenized_word = tokenizer.tokenize(word)
        if len(tokenized_word)==1:
            word_feat_list.append(np.array(token_feat_dict[tokenized_word]))
        else:
            tok_feats = []
            for tok in tokenized_word:
                tok_feats+=np.array(token_feat_dict[tok])
            word_feat_list.append(tok_feats)
            
    return word_feat_list

def getWordFeatsFromBertTokenFeats(sent_tokens,bert_tokens,bert_token_feats):
    #steps for merging the bert tokens to get the BERT features for actual words
    #1. iterate over the BERT base tokenizer
    #2. lookup for the actual word in the current BERT lookup postions
    #3. If found:
        #3a. the word is not tokenized further - use the current BERT features as word embedding
    #else:
        #3b. the word is tokenized in BERT - find the sequence of tokens and sum up the features to get the word vector
    base_ctr = 0
    bert_ctr = 0
    word_feat_list = []

    for word in sent_tokens:
        if bert_tokens[bert_ctr] == word:#word not further tokenized, use the same feature vector
            word_feat_list.append(np.array(bert_token_feats[bert_ctr].detach().numpy()))
            base_ctr+=1
            bert_ctr+=1
        else:
            aggr_feats = np.array(bert_token_feats[bert_ctr].detach().numpy())
            aggr_word = bert_tokens[bert_ctr]
            merge_next = True
            while merge_next and bert_ctr<len(bert_tokens)-1:
                if '#' in bert_tokens[bert_ctr+1]:
                    aggr_word = aggr_word+bert_tokens[bert_ctr+1]
                    bert_ctr+=1
                    aggr_feats+=np.array(bert_token_feats[bert_ctr].detach().numpy())
                else:
                    merge_next = False
                    bert_ctr+=1
            word_feat_list.append(aggr_feats)
    assert len(sent_tokens)==len(word_feat_list)
    return word_feat_list

def getPOSPhrases(sent_tokens,candidate_pos_tags = ['NN','NNPS','NNS','NNP','VBG','VBN','VBP','VBZ','JJ','JJR','JJS'],
                  tag_pairs = ['NN','VB','JJ']):

    candidate_pos_tags = ['NN','NNPS','NNS','NNP','VBG','VBN','VBP','VBZ','JJ','JJR','JJS']
    tag_pairs = ['NN','VB','JJ']

    sent_tokens_pos= nltk.pos_tag(sent_tokens)
    sent_tokens = [(ele[0],ele[1],ctr) for ctr,ele in zip(range(len(sent_tokens_pos)),sent_tokens_pos) if ele[1] in candidate_pos_tags and len(ele[0])>2]

    noun_tags = [(tok,ctr) for tok,pos,ctr in sent_tokens if 'NN' in pos]
    verb_tags = [(tok,ctr) for tok,pos,ctr in sent_tokens if 'VB' in pos]
    adj_tags = [(tok,ctr) for tok,pos,ctr in sent_tokens if 'JJ' in pos]

    #return dict(zip(['noun_tags','verb_tags','adj_tags'],[[noun_tags],[verb_tags],[adj_tags]]))
    return dict(zip(['noun_tags','verb_tags','adj_tags'],[noun_tags,verb_tags,adj_tags]))

def getPOSSetsForSent(text_sent_tokens):
    pos_sent = getPOSPhrases(text_sent_tokens)
    merged_pos_list = []
    
    for key in pos_sent.keys():
        merge_list = []
        phrase_list = []

        curr_candidate_tokens = [ele[0] for ele in pos_sent[key]]
        pos_tok_seq = [ele[1] for ele in pos_sent[key]]
        
        assert len(curr_candidate_tokens) == len(pos_tok_seq)

        diff_list = [pos_tok_seq[i+1]-pos_tok_seq[i] for i in range(len(pos_tok_seq)-1)]
        diff_list.append(0)

        for ctr in range(len(pos_tok_seq)):
            if diff_list[ctr]==1:
                merge_list.append(pos_tok_seq[ctr])
                merge_list.append(pos_tok_seq[ctr+1])
            else:
                phrase_list.append(merge_list)
                merge_list = []
                if pos_tok_seq[ctr] not in reduce(lambda x,y: x+y,phrase_list):
                    phrase_list.append([pos_tok_seq[ctr]])

        merged_pos_list.append([list(set(ele)) for ele in phrase_list if len(ele)>0])
    return dict(zip(pos_sent.keys(),merged_pos_list))

def posSetFeats(text_sent_tokens,token_bert_feats):
    
    assert len(text_sent_tokens)==len(token_bert_feats)
    
    sent_pos_sets = getPOSSetsForSent(text_sent_tokens)
    pos_wise_feats = []
    pos_words = []
    for pos_key in sent_pos_sets.keys():
        curr_pos_key_feats = []
        curr_pos_entity = []
        
        for pos_set in sent_pos_sets[pos_key]:
            if len(pos_set)>1:
                feat_list = []
                sent = ''
                #merge all the tags in current list
                for token in pos_set:
                    feat_list.append(token_bert_feats[token])
                    sent = sent+' '+text_sent_tokens[token]
                curr_pos_key_feats.append(sum(feat_list))
                curr_pos_entity.append(sent.strip())
            else:
                curr_pos_key_feats.append(token_bert_feats[pos_set[0]])
                curr_pos_entity.append(text_sent_tokens[pos_set[0]])
        pos_wise_feats.append(curr_pos_key_feats)
        pos_words.append(curr_pos_entity)
        
    return dict(zip(sent_pos_sets.keys(),pos_wise_feats)),pos_words 

###################### Key-phrase based cosine similarity ######################
def getKeyPhraseFeatures(kp_list, kp_loc_idx,text_feats, text_tokens):
    
    key_phrase_feats = []
    for ele,loc_list in zip(kp_list,kp_loc_idx):
        if len(ele.split(' '))==1:
            idx_val = int(loc_list[0])
            key_phrase_feats.append(getTokenFeature(ele,idx_val,text_feats,text_tokens))
        else:
            curr_feature_vec = []
            for tok,tok_idx in zip(ele.split(' '),loc_list.split(' ')):
                curr_feature_vec.append(getTokenFeature(tok,int(tok_idx),text_feats,text_tokens))
            key_phrase_feats.append(sum(curr_feature_vec))
    return key_phrase_feats
            
def getTokenFeature(token, token_idx, text_feats, text_tokens):    
    if text_tokens[token_idx]==token:
        feat_vec = text_feats[token_idx]
    else:
        #print('Token not found in the location, searching entire text.: ', token)
        if token in text_tokens:
            idx_val = text_tokens.index(token)
            feat_vec = text_feats[idx_val]
        else:
            #print('Token not found.. returning default feature vector: ', token)
            feat_vec = np.full(len(text_feats[0]),0.01)
    return feat_vec

## ----------------- Methods borrowed from BERT tokenizer -----------------

def tokenize(text, never_split = [], do_lower_case = True):
    """Tokenizes a piece of text."""
    text = _clean_text(text)
    # This was added on November 1st, 2018 for the multilingual and Chinese
    # models. This is also applied to the English models now, but it doesn't
    # matter since the English models were not trained on any Chinese data
    # and generally don't have any Chinese data in them (there are Chinese
    # characters in the vocabulary because Wikipedia does have some Chinese
    # words in the English Wikipedia.).
    orig_tokens = whitespace_tokenize(text)
    split_tokens = []
    for token in orig_tokens:
        if do_lower_case and token not in never_split:
            token = token.lower()
            token = _run_strip_accents(token)
        split_tokens.extend(_run_split_on_punc(token))

    output_tokens = whitespace_tokenize(" ".join(split_tokens))
    return output_tokens


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False

def _run_split_on_punc(text):
    """Splits punctuation on a piece of text."""
    chars = list(text)
    i = 0
    start_new_word = True
    output = []
    while i < len(chars):
        char = chars[i]
        if _is_punctuation(char):
            output.append([char])
            start_new_word = True
        else:
            if start_new_word:
                output.append([])
            start_new_word = False
            output[-1].append(char)
        i += 1

    return ["".join(x) for x in output]

def _clean_text(text):
    """Performs invalid character removal and whitespace cleanup on text."""
    output = []
    for char in text:
        cp = ord(char)
        if cp == 0 or cp == 0xfffd or _is_control(char):
            continue
        if _is_whitespace(char):
            output.append(" ")
        else:
            output.append(char)
    return "".join(output)

def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False

def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False

def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens

def _run_strip_accents(text):
    """Strips accents from a piece of text."""
    text = unicodedata.normalize("NFD", text)
    output = []
    for char in text:
        cat = unicodedata.category(char)
        if cat == "Mn":
            continue
        output.append(char)
    return "".join(output)

def getKPBasedSimilarity(text1, text2,  bert_layer = -1):
    
    
    print(text1)
    print()
    print(text2)
    print()
    
    token_feats_1,final_feats1,text1_bert_tokenized = getBERTFeatures(model, text1, attn_head_idx=layer)
    token_feats_2,final_feats2,text2_bert_tokenized = getBERTFeatures(model, text2, attn_head_idx=layer)

    text1_sent_tokens = tokenize(text1)
    text2_sent_tokens = tokenize(text2)

    merged_feats_text1 = getWordFeatsFromBertTokenFeats(text1_sent_tokens,text1_bert_tokenized,token_feats_1)
    merged_feats_text2 = getWordFeatsFromBertTokenFeats(text2_sent_tokens,text2_bert_tokenized,token_feats_2)

    #get candidate key-phrases for both sentences
    kps_sent1,kps_loc_sent1 = getCandidatePhrases(text1)
    kps_sent2,kps_loc_sent2 = getCandidatePhrases(text2)

    print(kps_sent1)
    print()
    print(kps_sent2)
    print()
    
    sent1_kp_feats = getKeyPhraseFeatures(kps_sent1,kps_loc_sent1,merged_feats_text1,text1_sent_tokens)
    sent2_kp_feats = getKeyPhraseFeatures(kps_sent2,kps_loc_sent2,merged_feats_text2,text2_sent_tokens)

    curr_max = 0
    for sent1_kp, feats1 in zip(kps_sent1,sent1_kp_feats):
        for sent2_kp, feats2 in zip(kps_sent2,sent2_kp_feats):
            if len(sent1_kp)<3 or len(sent2_kp)<3:
                curr_sim = 0.1
            else:
                curr_sim = 1-spatial.distance.cosine(feats1,feats2)
            print(sent1_kp,'<>',sent2_kp,': ',curr_sim)
            if curr_sim>curr_max:
                curr_max = curr_sim

#     print()      
#     print('-----------------------------------------------------------')
#     print('final similarity:', curr_max)
#     print('-----------------------------------------------------------')
#     print("Sentence level score: ", 1-spatial.distance.cosine(final_feats1,final_feats2))
#     print('-----------------------------------------------------------')
    
    return curr_max

def getPOSBasedSimilarity(text1, text2,  bert_layer = -1):

    token_feats_1,final_feats1,text1_bert_tokenized = getBERTFeatures(model, text1, attn_head_idx=layer)
    token_feats_2,final_feats2,text2_bert_tokenized = getBERTFeatures(model, text2, attn_head_idx=layer)

    text1_sent_tokens = tokenize(text1)
    text2_sent_tokens = tokenize(text2)

    merged_feats_text1 = getWordFeatsFromBertTokenFeats(text1_sent_tokens,text1_bert_tokenized,token_feats_1)
    merged_feats_text2 = getWordFeatsFromBertTokenFeats(text2_sent_tokens,text2_bert_tokenized,token_feats_2)

    sent1_pos_feats,sent1_pos = posSetFeats(text1_sent_tokens,merged_feats_text1)
    sent2_pos_feats,sent2_pos = posSetFeats(text2_sent_tokens,merged_feats_text2)

    #Do pos-tag wise feature similarity and take max() similarity in each pos as metric

    #get pos_tag wise cosine similarity
    pos_idx = 0 #0-noun, 1-verb, 2 - adj
    curr_pos = list(sent1_pos_feats.keys())[pos_idx]
    sent1_tags = sent1_pos[pos_idx]
    sent2_tags = sent2_pos[pos_idx]

    #print('Current POS: ',curr_pos)
    #print()
    sent1_curr_pos_feats = sent1_pos_feats[curr_pos]
    sent2_curr_pos_feats = sent2_pos_feats[curr_pos]

    curr_max = 0
    if len(sent1_curr_pos_feats)>0 and len(sent2_curr_pos_feats)>0:
        for sent1_ctr in range(len(sent1_curr_pos_feats)):
            for sent2_ctr in range(len(sent2_curr_pos_feats)):
                curr_sim = 1-spatial.distance.cosine(sent1_curr_pos_feats[sent1_ctr],sent2_curr_pos_feats[sent2_ctr])
                if curr_sim>curr_max:
                    curr_max = curr_sim
                #print(sent1_tags[sent1_ctr],'<>',sent2_tags[sent2_ctr],":",dist)
            #print()
#     else:
#         print("No %s tokens in one of the texts, skipping" % (curr_pos))
#     print('-----------------------------------------------------------')
#     print('final similarity:', curr_max)
#     print('-----------------------------------------------------------')
#     print("Sentence level score: ", 1-spatial.distance.cosine(final_feats1,final_feats2))
#     print('-----------------------------------------------------------')

    return curr_max

def getCosineSimilarity(text1, text2,  bert_layer = -1):

    token_feats_1,final_feats1,text1_bert_tokenized = getBERTFeatures(model, text1, attn_head_idx=layer)
    token_feats_2,final_feats2,text2_bert_tokenized = getBERTFeatures(model, text2, attn_head_idx=layer)

    return 1-spatial.distance.cosine(final_feats1,final_feats2)

In [13]:
def getKPBasedSimilarity_loop(tup1,tup2, text1, text2, bert_layer = -1):
    
    token_feats_1,final_feats1,text1_bert_tokenized = tup1
    token_feats_2,final_feats2,text2_bert_tokenized = tup2

    text1_sent_tokens = tokenize(text1)
    text2_sent_tokens = tokenize(text2)

    merged_feats_text1 = getWordFeatsFromBertTokenFeats(text1_sent_tokens,text1_bert_tokenized,token_feats_1)
    merged_feats_text2 = getWordFeatsFromBertTokenFeats(text2_sent_tokens,text2_bert_tokenized,token_feats_2)

    #get candidate key-phrases for both sentences
    kps_sent1,kps_loc_sent1 = getCandidatePhrases(text1)
    kps_sent2,kps_loc_sent2 = getCandidatePhrases(text2)

    sent1_kp_feats = getKeyPhraseFeatures(kps_sent1,kps_loc_sent1,merged_feats_text1,text1_sent_tokens)
    sent2_kp_feats = getKeyPhraseFeatures(kps_sent2,kps_loc_sent2,merged_feats_text2,text2_sent_tokens)

    curr_max = 0
    for sent1_kp, feats1 in zip(kps_sent1,sent1_kp_feats):
        for sent2_kp, feats2 in zip(kps_sent2,sent2_kp_feats):
            if len(sent1_kp)<3 or len(sent2_kp)<3:
                curr_sim = 0.1
            else:
                curr_sim = 1-spatial.distance.cosine(feats1,feats2)
            print(sent1_kp,'<>',sent2_kp,': ',curr_sim)
            if curr_sim>curr_max:
                curr_max = curr_sim


#     print()      
#     print('-----------------------------------------------------------')
#     print('final similarity:', curr_max)
#     print('-----------------------------------------------------------')
#     print("Sentence level score: ", 1-spatial.distance.cosine(final_feats1,final_feats2))
#     print('-----------------------------------------------------------')
    
    return curr_max

def getPOSBasedSimilarity_loop(tup1, tup2,  text1, text2, bert_layer = -1):

    token_feats_1,final_feats1,text1_bert_tokenized = tup1
    token_feats_2,final_feats2,text2_bert_tokenized = tup2

    text1_sent_tokens = tokenize(text1)
    text2_sent_tokens = tokenize(text2)

    merged_feats_text1 = getWordFeatsFromBertTokenFeats(text1_sent_tokens,text1_bert_tokenized,token_feats_1)
    merged_feats_text2 = getWordFeatsFromBertTokenFeats(text2_sent_tokens,text2_bert_tokenized,token_feats_2)

    sent1_pos_feats,sent1_pos = posSetFeats(text1_sent_tokens,merged_feats_text1)
    sent2_pos_feats,sent2_pos = posSetFeats(text2_sent_tokens,merged_feats_text2)

    #Do pos-tag wise feature similarity and take max() similarity in each pos as metric

    #get pos_tag wise cosine similarity
    pos_idx = 0 #0-noun, 1-verb, 2 - adj
    curr_pos = list(sent1_pos_feats.keys())[pos_idx]
    sent1_tags = sent1_pos[pos_idx]
    sent2_tags = sent2_pos[pos_idx]

    #print('Current POS: ',curr_pos)
    #print()
    sent1_curr_pos_feats = sent1_pos_feats[curr_pos]
    sent2_curr_pos_feats = sent2_pos_feats[curr_pos]

    curr_max = 0
    if len(sent1_curr_pos_feats)>0 and len(sent2_curr_pos_feats)>0:
        for sent1_ctr in range(len(sent1_curr_pos_feats)):
            for sent2_ctr in range(len(sent2_curr_pos_feats)):
                dist = 1-spatial.distance.cosine(sent1_curr_pos_feats[sent1_ctr],sent2_curr_pos_feats[sent2_ctr])
                if dist>curr_max:
                    curr_max = dist
                #print(sent1_tags[sent1_ctr],'<>',sent2_tags[sent2_ctr],":",dist)
            #print()
#     else:
#         print("No %s tokens in one of the texts, skipping" % (curr_pos))
#     print('-----------------------------------------------------------')
#     print('final similarity:', curr_max)
#     print('-----------------------------------------------------------')
#     print("Sentence level score: ", 1-spatial.distance.cosine(final_feats1,final_feats2))
#     print('-----------------------------------------------------------')

    return curr_max

def getCosineSimilarity_loop(tup1, tup2,  bert_layer = -1):

    token_feats_1,final_feats1,text1_bert_tokenized = tup1
    token_feats_2,final_feats2,text2_bert_tokenized = tup2

    return 1-spatial.distance.cosine(final_feats1,final_feats2)

In [14]:
# experiment_id, experiment_name
# 01daapwr6w051q9wwqy99jsgfy - Generic
# 01daaqy88qzb19jqz5prjfr76y - Engineering
# 01daaqyn9gbebc92aywnxedp0c - HR
# 01daatanxnrqa35e6004hb7mbn - Marketing
# 01daatbc3ak1qwc5nyc5ahv2xz - Product
# 01dadp74wfv607knpcb6vvxgtg - AI
# 01daayheky5f4e02qvrjptftxv - Ether Engineering

In [15]:
config_path = '/Users/venkat/Documents/mlflow/mlflow_bert/bert_mlflow_pyfunc/artifacts/bert_config.json'
mind_path = '/Users/venkat/Documents/mlflow/mlflow_bert/bert_mlflow_pyfunc/artifacts/mind.pkl'
model_path = '/Users/venkat/Documents/mlflow/mlflow_bert_deploy/mind-01daaqyn9gbebc92aywnxedp0c/artifacts/model.bin'

config = BertConfig.from_json_file(config_path)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
mind_dict = pickle.load(open(mind_path,'rb'))

model = BertForPreTrainingCustom(config)
state_dict = torch.load(model_path,map_location='cpu')

model.load_state_dict(state_dict)
model.eval()
print()




In [16]:
#calculate pair-wise cosine similarity
layer = -1

text1_list = []
text2_list = []
#pos_dist_list = []
kp_dist_list = []
cs_dist_list = []

sent_feat_list = []
for text in sent_bucket:
    sent_feat_list.append(getBERTFeatures(model, text, attn_head_idx=layer))
    
for i in range(len(sent_bucket)):
    sent1 = sent_bucket[i]
    for j in range(i+1,len(sent_bucket)):
        sent2 = sent_bucket[j]
        text1_list.append(sent1)
        text2_list.append(sent2)
        tup1 = sent_feat_list[i]
        tup2 = sent_feat_list[j]
        #pos_dist_list.append(getPOSBasedSimilarity_loop(tup1, tup2, sent1, sent2))
        kp_dist_list.append(getKPBasedSimilarity_loop(tup1, tup2,sent1, sent2))
        cs_dist_list.append(getCosineSimilarity_loop(tup1, tup2))

print()

simple network with terraform <> lot :  0.7033495306968689
simple network with terraform <> terraform :  0.8302325010299683
simple network with terraform <> compute instances :  0.7817032337188721
simple network with terraform <> firewall rules :  0.7962830662727356
gcp <> lot :  0.635649561882019
gcp <> terraform :  0.7179625034332275
gcp <> compute instances :  0.7085735201835632
gcp <> firewall rules :  0.7305143475532532
simple network with terraform <> next thing :  0.7364122271537781
simple network with terraform <> necessary credentials from gcp :  0.7998179793357849
gcp <> next thing :  0.6469058990478516
gcp <> necessary credentials from gcp :  0.7542247176170349
simple network with terraform <> gcp console menu :  0.7369287014007568
simple network with terraform <> select apis :  0.7202449440956116
simple network with terraform <> services :  0.6676577925682068
simple network with terraform <> credentials :  0.685900866985321
gcp <> gcp console menu :  0.7173702716827393
gcp 

simple network with terraform <> file :  0.5824180841445923
simple network with terraform <> sensitive information :  0.4985794126987457
simple network with terraform <> public repository :  0.5510944128036499
gcp <> file :  0.5280028581619263
gcp <> sensitive information :  0.45927754044532776
gcp <> public repository :  0.526122510433197
simple network with terraform <> hr strategies :  0.7031657099723816
simple network with terraform <> strategic recruiting :  0.7398914694786072
simple network with terraform <> retention processes :  0.7103188037872314
simple network with terraform <> organization :  0.6984660029411316
gcp <> hr strategies :  0.6309728026390076
gcp <> strategic recruiting :  0.6515409350395203
gcp <> retention processes :  0.6411375999450684
gcp <> organization :  0.6101254224777222
simple network with terraform <> clear need for increased visibility :  0.7060657143592834
simple network with terraform <> sales :  0.6747782230377197
gcp <> clear need for increased vi

compute instances <> terraform code :  0.7336243987083435
firewall rules <> json file :  0.7371116876602173
firewall rules <> computer :  0.701958417892456
firewall rules <> file :  0.6925769448280334
firewall rules <> folder :  0.6034924387931824
firewall rules <> terraform code :  0.7288258671760559
lot <> file :  0.5984790921211243
lot <> sensitive information :  0.5363341569900513
lot <> public repository :  0.5612996220588684
terraform <> file :  0.5832033753395081
terraform <> sensitive information :  0.5017414689064026
terraform <> public repository :  0.5212568640708923
compute instances <> file :  0.5822255611419678
compute instances <> sensitive information :  0.5304741263389587
compute instances <> public repository :  0.5800338387489319
firewall rules <> file :  0.6032566428184509
firewall rules <> sensitive information :  0.5490624904632568
firewall rules <> public repository :  0.6040897965431213
lot <> i :  0.1
lot <> kinds of apps i :  0.6552579402923584
terraform <> i 

terraform <> select apis :  0.6965315937995911
terraform <> services :  0.6503278613090515
terraform <> credentials :  0.6868176460266113
compute instances <> gcp console menu :  0.7211131453514099
compute instances <> select apis :  0.743038535118103
compute instances <> services :  0.7033607363700867
compute instances <> credentials :  0.7035790085792542
firewall rules <> gcp console menu :  0.7305580377578735
firewall rules <> select apis :  0.7604591250419617
firewall rules <> services :  0.7055060267448425
firewall rules <> credentials :  0.7009241580963135
lot <> create credentials button :  0.6574373841285706
lot <> choose service account key :  0.6262028217315674
terraform <> create credentials button :  0.7077409029006958
terraform <> choose service account key :  0.7077256441116333
compute instances <> create credentials button :  0.7422086000442505
compute instances <> choose service account key :  0.7294735312461853
firewall rules <> create credentials button :  0.744856655

compute instances <> today :  0.5297000408172607
compute instances <> much vying :  0.5474939942359924
compute instances <> attention from social media :  0.6163706183433533
compute instances <> engine optimization :  0.5586349368095398
compute instances <> click advertising :  0.5618466734886169
compute instances <> hair :  0.5736733675003052
firewall rules <> message :  0.5994914770126343
firewall rules <> right audience :  0.5830695033073425
firewall rules <> visibility :  0.6152640581130981
firewall rules <> sales :  0.6108180284500122
firewall rules <> profit :  0.5643664002418518
firewall rules <> offer :  0.5491023659706116
firewall rules <> today :  0.5477012395858765
firewall rules <> much vying :  0.5619986057281494
firewall rules <> attention from social media :  0.6223071217536926
firewall rules <> engine optimization :  0.6039102673530579
firewall rules <> click advertising :  0.6043628454208374
firewall rules <> hair :  0.5903322696685791
lot <> truth :  0.637623131275177

next thing <> mind :  0.6279981732368469
next thing <> many approaches :  0.6538664698600769
next thing <> frameworks :  0.5988636612892151
next thing <> same pattern :  0.6477985978126526
next thing <> single page applications :  0.6447657942771912
next thing <> spa :  0.5695654153823853
necessary credentials from gcp <> mind :  0.6022340655326843
necessary credentials from gcp <> many approaches :  0.6488940715789795
necessary credentials from gcp <> frameworks :  0.6079788208007812
necessary credentials from gcp <> same pattern :  0.6914114952087402
necessary credentials from gcp <> single page applications :  0.7069347500801086
necessary credentials from gcp <> spa :  0.6601731777191162
next thing <> next thing :  1.0
next thing <> necessary credentials from gcp :  0.8720231652259827
necessary credentials from gcp <> next thing :  0.8720231652259827
necessary credentials from gcp <> necessary credentials from gcp :  1.0
next thing <> gcp console menu :  0.6774790287017822
next thin

gcp console menu <> json file :  0.7466418147087097
gcp console menu <> computer :  0.7505035400390625
gcp console menu <> file :  0.7466803789138794
gcp console menu <> folder :  0.7320114374160767
gcp console menu <> terraform code :  0.6640741229057312
select apis <> json file :  0.7403733134269714
select apis <> computer :  0.7229674458503723
select apis <> file :  0.7085666656494141
select apis <> folder :  0.6626673936843872
select apis <> terraform code :  0.6970887780189514
services <> json file :  0.6902857422828674
services <> computer :  0.685687780380249
services <> file :  0.6516326069831848
services <> folder :  0.6218801140785217
services <> terraform code :  0.6581186056137085
credentials <> json file :  0.738647997379303
credentials <> computer :  0.7036638855934143
credentials <> file :  0.6740925312042236
credentials <> folder :  0.602500319480896
credentials <> terraform code :  0.7672092318534851
gcp console menu <> file :  0.5973021388053894
gcp console menu <> se

services <> services :  1.0
services <> credentials :  0.8008542060852051
credentials <> gcp console menu :  0.735412061214447
credentials <> select apis :  0.8268114924430847
credentials <> services :  0.8008542060852051
credentials <> credentials :  1.0
gcp console menu <> create credentials button :  0.7560587525367737
gcp console menu <> choose service account key :  0.7201825976371765
select apis <> create credentials button :  0.7664390206336975
select apis <> choose service account key :  0.749791145324707
services <> create credentials button :  0.7258326411247253
services <> choose service account key :  0.7064045071601868
credentials <> create credentials button :  0.7398147583007812
credentials <> choose service account key :  0.7952335476875305
gcp console menu <> next screen choose compute engine default service account :  0.8041345477104187
gcp console menu <> json :  0.7190468907356262
gcp console menu <> click create :  0.7501364350318909
select apis <> next screen choo

services <> sales :  0.5876718759536743
services <> profit :  0.49886441230773926
services <> offer :  0.5088088512420654
services <> today :  0.5171321630477905
services <> much vying :  0.4924747049808502
services <> attention from social media :  0.5437410473823547
services <> engine optimization :  0.5024641752243042
services <> click advertising :  0.5762853026390076
services <> hair :  0.5490598082542419
credentials <> message :  0.5406113862991333
credentials <> right audience :  0.5035713315010071
credentials <> visibility :  0.5563172698020935
credentials <> sales :  0.566875159740448
credentials <> profit :  0.4948379695415497
credentials <> offer :  0.5390015244483948
credentials <> today :  0.48304474353790283
credentials <> much vying :  0.49764347076416016
credentials <> attention from social media :  0.5378573536872864
credentials <> engine optimization :  0.4460325241088867
credentials <> click advertising :  0.519231379032135
credentials <> hair :  0.5543895959854126
g

create credentials button <> i :  0.1
create credentials button <> kinds of apps i :  0.7158718109130859
choose service account key <> i :  0.1
choose service account key <> kinds of apps i :  0.7195063233375549
create credentials button <> mind :  0.5262178182601929
create credentials button <> many approaches :  0.599086582660675
create credentials button <> frameworks :  0.5839546322822571
create credentials button <> same pattern :  0.6304763555526733
create credentials button <> single page applications :  0.6937389969825745
create credentials button <> spa :  0.6143354177474976
choose service account key <> mind :  0.4728448987007141
choose service account key <> many approaches :  0.5739457607269287
choose service account key <> frameworks :  0.5238807797431946
choose service account key <> same pattern :  0.6123532652854919
choose service account key <> single page applications :  0.6852754354476929
choose service account key <> spa :  0.6205419301986694
create credentials butt

next screen choose compute engine default service account <> file :  0.6814656257629395
next screen choose compute engine default service account <> sensitive information :  0.6090664863586426
next screen choose compute engine default service account <> public repository :  0.6256178021430969
json <> file :  0.6337155699729919
json <> sensitive information :  0.5642948150634766
json <> public repository :  0.6030157804489136
click create <> file :  0.6732158064842224
click create <> sensitive information :  0.5933160185813904
click create <> public repository :  0.613262414932251
next screen choose compute engine default service account <> i :  0.1
next screen choose compute engine default service account <> kinds of apps i :  0.7498456835746765
json <> i :  0.1
json <> kinds of apps i :  0.7154486775398254
click create <> i :  0.1
click create <> kinds of apps i :  0.7154185771942139
next screen choose compute engine default service account <> mind :  0.5846037268638611
next screen ch

next screen choose compute engine default service account <> file :  0.6814656257629395
next screen choose compute engine default service account <> sensitive information :  0.6090664863586426
next screen choose compute engine default service account <> public repository :  0.6256178021430969
json <> file :  0.6337155699729919
json <> sensitive information :  0.5642948150634766
json <> public repository :  0.6030157804489136
click create <> file :  0.6732158064842224
click create <> sensitive information :  0.5933160185813904
click create <> public repository :  0.613262414932251
next screen choose compute engine default service account <> hr strategies :  0.735663652420044
next screen choose compute engine default service account <> strategic recruiting :  0.7068179249763489
next screen choose compute engine default service account <> retention processes :  0.6723988056182861
next screen choose compute engine default service account <> organization :  0.6880542039871216
json <> hr str

folder <> public repository :  0.5724555850028992
terraform code <> file :  0.5431674718856812
terraform code <> sensitive information :  0.48530128598213196
terraform code <> public repository :  0.49056318402290344
json file <> i :  0.1
json file <> kinds of apps i :  0.6904010772705078
computer <> i :  0.1
computer <> kinds of apps i :  0.6487200260162354
file <> i :  0.1
file <> kinds of apps i :  0.6387707591056824
folder <> i :  0.1
folder <> kinds of apps i :  0.5817874073982239
terraform code <> i :  0.1
terraform code <> kinds of apps i :  0.663314163684845
json file <> mind :  0.5787428617477417
json file <> many approaches :  0.6468366980552673
json file <> frameworks :  0.6042593717575073
json file <> same pattern :  0.7009438276290894
json file <> single page applications :  0.7379574775695801
computer <> mind :  0.6252466440200806
computer <> many approaches :  0.6245806813240051
computer <> frameworks :  0.5740070343017578
computer <> same pattern :  0.6634291410446167
c

terraform code <> single page applications :  0.6266655325889587
terraform code <> spa :  0.6204925179481506
json file <> next thing :  0.7433861494064331
json file <> necessary credentials from gcp :  0.7887442708015442
computer <> next thing :  0.7239716053009033
computer <> necessary credentials from gcp :  0.7596523761749268
file <> next thing :  0.7131883502006531
file <> necessary credentials from gcp :  0.7576415538787842
folder <> next thing :  0.6244426369667053
folder <> necessary credentials from gcp :  0.6610692143440247
terraform code <> next thing :  0.6690155267715454
terraform code <> necessary credentials from gcp :  0.750929594039917
json file <> gcp console menu :  0.7466418147087097
json file <> select apis :  0.7403733134269714
json file <> services :  0.6902857422828674
json file <> credentials :  0.738647997379303
computer <> gcp console menu :  0.7505035400390625
computer <> select apis :  0.7229674458503723
computer <> services :  0.685687780380249
computer <> 

json file <> message :  0.5936295390129089
json file <> right audience :  0.5453494191169739
json file <> visibility :  0.5643597841262817
json file <> sales :  0.5660220980644226
json file <> profit :  0.564082682132721
json file <> offer :  0.5485097765922546
json file <> today :  0.5239198803901672
json file <> much vying :  0.5703470706939697
json file <> attention from social media :  0.6096683740615845
json file <> engine optimization :  0.5244591236114502
json file <> click advertising :  0.5759921073913574
json file <> hair :  0.5671049356460571
computer <> message :  0.5425970554351807
computer <> right audience :  0.55072420835495
computer <> visibility :  0.5507621765136719
computer <> sales :  0.5508946180343628
computer <> profit :  0.5505022406578064
computer <> offer :  0.5182901620864868
computer <> today :  0.5450959801673889
computer <> much vying :  0.5111145973205566
computer <> attention from social media :  0.5802263617515564
computer <> engine optimization :  0.5

file <> lot :  0.5984790921211243
file <> terraform :  0.5832033753395081
file <> compute instances :  0.5822255611419678
file <> firewall rules :  0.6032566428184509
sensitive information <> lot :  0.5363341569900513
sensitive information <> terraform :  0.5017414689064026
sensitive information <> compute instances :  0.5304741263389587
sensitive information <> firewall rules :  0.5490624904632568
public repository <> lot :  0.5612996220588684
public repository <> terraform :  0.5212568640708923
public repository <> compute instances :  0.5800338387489319
public repository <> firewall rules :  0.6040897965431213
file <> i :  0.1
file <> kinds of apps i :  0.5984356999397278
sensitive information <> i :  0.1
sensitive information <> kinds of apps i :  0.5446292161941528
public repository <> i :  0.1
public repository <> kinds of apps i :  0.5462234616279602
file <> mind :  0.48256149888038635
file <> many approaches :  0.47546714544296265
file <> frameworks :  0.4842069149017334
file <

sensitive information <> today :  0.40315917134284973
sensitive information <> much vying :  0.42777538299560547
sensitive information <> attention from social media :  0.42989683151245117
sensitive information <> engine optimization :  0.40896111726760864
sensitive information <> click advertising :  0.4292507767677307
sensitive information <> hair :  0.414825439453125
public repository <> message :  0.4890401363372803
public repository <> right audience :  0.5303635597229004
public repository <> visibility :  0.5036186575889587
public repository <> sales :  0.5004031658172607
public repository <> profit :  0.49091455340385437
public repository <> offer :  0.4233284294605255
public repository <> today :  0.412577360868454
public repository <> much vying :  0.4174301326274872
public repository <> attention from social media :  0.4483817517757416
public repository <> engine optimization :  0.47346624732017517
public repository <> click advertising :  0.4888310730457306
public repository

mind <> lot :  0.5943441987037659
mind <> terraform :  0.5788816809654236
mind <> compute instances :  0.5701910853385925
mind <> firewall rules :  0.6008445620536804
many approaches <> lot :  0.6443005800247192
many approaches <> terraform :  0.6508517861366272
many approaches <> compute instances :  0.6823780536651611
many approaches <> firewall rules :  0.6810498833656311
frameworks <> lot :  0.6249940991401672
frameworks <> terraform :  0.6350744962692261
frameworks <> compute instances :  0.6657631993293762
frameworks <> firewall rules :  0.6757914423942566
same pattern <> lot :  0.6295137405395508
same pattern <> terraform :  0.7089815139770508
same pattern <> compute instances :  0.711917519569397
same pattern <> firewall rules :  0.7159842252731323
single page applications <> lot :  0.6566972136497498
single page applications <> terraform :  0.7393638491630554
single page applications <> compute instances :  0.7692185640335083
single page applications <> firewall rules :  0.776

same pattern <> company :  0.594109058380127
single page applications <> smart hr insights into different areas :  0.7161063551902771
single page applications <> business :  0.6565380692481995
single page applications <> employee engagement :  0.7287498712539673
single page applications <> company :  0.6194726824760437
mind <> right strategies :  0.5224336385726929
mind <> market :  0.5576660633087158
mind <> business :  0.5883864760398865
mind <> science :  0.5348294377326965
many approaches <> right strategies :  0.6504793763160706
many approaches <> market :  0.6383441686630249
many approaches <> business :  0.648594319820404
many approaches <> science :  0.6030674576759338
frameworks <> right strategies :  0.6344305276870728
frameworks <> market :  0.6102892756462097
frameworks <> business :  0.6213403344154358
frameworks <> science :  0.564495325088501
same pattern <> right strategies :  0.5958402156829834
same pattern <> market :  0.6269161105155945
same pattern <> business :  0.

javascript ecosystem <> gcp :  0.6680334210395813
likely aware <> simple network with terraform :  0.6960723996162415
likely aware <> gcp :  0.6660622954368591
dependencies <> simple network with terraform :  0.7267945408821106
dependencies <> gcp :  0.7027420997619629
javascript ecosystem <> lot :  0.6627459526062012
javascript ecosystem <> terraform :  0.7458078265190125
javascript ecosystem <> compute instances :  0.7658852338790894
javascript ecosystem <> firewall rules :  0.7857034206390381
likely aware <> lot :  0.6585394740104675
likely aware <> terraform :  0.7060492634773254
likely aware <> compute instances :  0.6918301582336426
likely aware <> firewall rules :  0.7023818492889404
dependencies <> lot :  0.6260372400283813
dependencies <> terraform :  0.7332088351249695
dependencies <> compute instances :  0.7860472798347473
dependencies <> firewall rules :  0.7988981008529663
javascript ecosystem <> i :  0.1
javascript ecosystem <> kinds of apps i :  0.7542393803596497
likely

javascript ecosystem <> truth :  0.6964231729507446
javascript ecosystem <> point in business :  0.7518359422683716
javascript ecosystem <> next level :  0.7351155877113342
likely aware <> truth :  0.6962394118309021
likely aware <> point in business :  0.6927191615104675
likely aware <> next level :  0.7114551067352295
dependencies <> truth :  0.6316392421722412
dependencies <> point in business :  0.6623599529266357
dependencies <> next level :  0.6473667621612549
javascript ecosystem <> fray :  0.7028566002845764
likely aware <> fray :  0.6509527564048767
dependencies <> fray :  0.6703645586967468
javascript ecosystem <> entrepreneurs :  0.6155650615692139
javascript ecosystem <> businesses :  0.7482112646102905
javascript ecosystem <> businesses :  0.7482112646102905
likely aware <> entrepreneurs :  0.5810925960540771
likely aware <> businesses :  0.671427309513092
likely aware <> businesses :  0.671427309513092
dependencies <> entrepreneurs :  0.6160057187080383
dependencies <> bu

core libraries <> sales :  0.6372727155685425
major version <> clear need for increased visibility :  0.684818685054779
major version <> sales :  0.6485822796821594
time <> clear need for increased visibility :  0.6563330292701721
time <> sales :  0.610580325126648
core libraries <> order :  0.5644799470901489
core libraries <> visibility :  0.6913478374481201
core libraries <> businesses :  0.6901203989982605
core libraries <> money :  0.640025794506073
major version <> order :  0.5334067940711975
major version <> visibility :  0.7011159658432007
major version <> businesses :  0.6432238221168518
major version <> money :  0.6151018142700195
time <> order :  0.58860182762146
time <> visibility :  0.6672790050506592
time <> businesses :  0.6474569439888
time <> money :  0.6309429407119751
core libraries <> smart hr insights into different areas :  0.7170323133468628
core libraries <> business :  0.6393463611602783
core libraries <> employee engagement :  0.7066076397895813
core libraries

imagine <> mind :  0.1621035784482956
imagine <> many approaches :  0.21320250630378723
imagine <> frameworks :  0.17963145673274994
imagine <> same pattern :  0.3424552083015442
imagine <> single page applications :  0.27048859000205994
imagine <> spa :  0.3363107144832611
library <> mind :  0.4933425188064575
library <> many approaches :  0.531493604183197
library <> frameworks :  0.4768495261669159
library <> same pattern :  0.5969998240470886
library <> single page applications :  0.6343523263931274
library <> spa :  0.5696827173233032
x <> mind :  0.1
x <> many approaches :  0.1
x <> frameworks :  0.1
x <> same pattern :  0.1
x <> single page applications :  0.1
x <> spa :  0.1
peer dependency on react <> mind :  0.5180091857910156
peer dependency on react <> many approaches :  0.5863626599311829
peer dependency on react <> frameworks :  0.5205228924751282
peer dependency on react <> same pattern :  0.6459567546844482
peer dependency on react <> single page applications :  0.65345

imagine <> message :  0.27188119292259216
imagine <> right audience :  0.23159511387348175
imagine <> visibility :  0.24055859446525574
imagine <> sales :  0.24452444911003113
imagine <> profit :  0.2117990404367447
imagine <> offer :  0.2754887640476227
imagine <> today :  0.21590875089168549
imagine <> much vying :  0.18997684121131897
imagine <> attention from social media :  0.24183064699172974
imagine <> engine optimization :  0.17147594690322876
imagine <> click advertising :  0.19789743423461914
imagine <> hair :  0.2553837299346924
library <> message :  0.5878098011016846
library <> right audience :  0.5653436183929443
library <> visibility :  0.5763449668884277
library <> sales :  0.5620135068893433
library <> profit :  0.5863183736801147
library <> offer :  0.5164884328842163
library <> today :  0.49015671014785767
library <> much vying :  0.5618780851364136
library <> attention from social media :  0.535476565361023
library <> engine optimization :  0.5348166823387146
librar

team <> necessary credentials from gcp :  0.7740368247032166
organization <> next thing :  0.7056639194488525
organization <> necessary credentials from gcp :  0.7543191909790039
application <> gcp console menu :  0.6964617371559143
application <> select apis :  0.7057783603668213
application <> services :  0.6659026145935059
application <> credentials :  0.6540307402610779
large codebase <> gcp console menu :  0.6946471929550171
large codebase <> select apis :  0.702961802482605
large codebase <> services :  0.6570026874542236
large codebase <> credentials :  0.6089881062507629
lot of discipline <> gcp console menu :  0.5855889320373535
lot of discipline <> select apis :  0.60960853099823
lot of discipline <> services :  0.5687373280525208
lot of discipline <> credentials :  0.5591097474098206
dependencies <> gcp console menu :  0.5752217173576355
dependencies <> select apis :  0.626754641532898
dependencies <> services :  0.5893502235412598
dependencies <> credentials :  0.5658131837

application <> right strategies :  0.6473776698112488
application <> market :  0.6863020062446594
application <> business :  0.7212247848510742
application <> science :  0.6188004612922668
large codebase <> right strategies :  0.6656766533851624
large codebase <> market :  0.7115263342857361
large codebase <> business :  0.7226066589355469
large codebase <> science :  0.6136094927787781
lot of discipline <> right strategies :  0.6429985761642456
lot of discipline <> market :  0.649507462978363
lot of discipline <> business :  0.6575412154197693
lot of discipline <> science :  0.6129046082496643
dependencies <> right strategies :  0.6151230931282043
dependencies <> market :  0.6125267744064331
dependencies <> business :  0.6326407194137573
dependencies <> science :  0.575533390045166
date <> right strategies :  0.5743285417556763
date <> market :  0.6130316853523254
date <> business :  0.5888562202453613
date <> science :  0.5354923605918884
top priority <> right strategies :  0.6791001

organization <> customer hand :  0.627500593662262
organization <> supply chain demands :  0.6457867622375488
organization <> right marketing strategies :  0.6311925053596497
organization <> business s growth :  0.6450015902519226
simple network with terraform <> lot :  0.7033495306968689
simple network with terraform <> terraform :  0.8302325010299683
simple network with terraform <> compute instances :  0.7817032337188721
simple network with terraform <> firewall rules :  0.7962830662727356
gcp <> lot :  0.635649561882019
gcp <> terraform :  0.7179625034332275
gcp <> compute instances :  0.7085735201835632
gcp <> firewall rules :  0.7305143475532532
simple network with terraform <> i :  0.1
simple network with terraform <> kinds of apps i :  0.7054475545883179
gcp <> i :  0.1
gcp <> kinds of apps i :  0.6512348651885986
simple network with terraform <> mind :  0.5512979030609131
simple network with terraform <> many approaches :  0.6269159913063049
simple network with terraform <> fr

lot <> mind :  0.5486211180686951
lot <> many approaches :  0.5777662396430969
lot <> frameworks :  0.5486137270927429
lot <> same pattern :  0.5667722821235657
lot <> single page applications :  0.5930479764938354
lot <> spa :  0.4970422685146332
terraform <> mind :  0.5111656188964844
terraform <> many approaches :  0.5851883888244629
terraform <> frameworks :  0.5598104596138
terraform <> same pattern :  0.650879979133606
terraform <> single page applications :  0.6664873361587524
terraform <> spa :  0.6349496841430664
compute instances <> mind :  0.502916693687439
compute instances <> many approaches :  0.612530529499054
compute instances <> frameworks :  0.5864436626434326
compute instances <> same pattern :  0.6523317098617554
compute instances <> single page applications :  0.697273850440979
compute instances <> spa :  0.6502001285552979
firewall rules <> mind :  0.5387094020843506
firewall rules <> many approaches :  0.6059420704841614
firewall rules <> frameworks :  0.59073692

compute instances <> much vying :  0.5474939942359924
compute instances <> attention from social media :  0.6163706183433533
compute instances <> engine optimization :  0.5586349368095398
compute instances <> click advertising :  0.5618466734886169
compute instances <> hair :  0.5736733675003052
firewall rules <> message :  0.5994914770126343
firewall rules <> right audience :  0.5830695033073425
firewall rules <> visibility :  0.6152640581130981
firewall rules <> sales :  0.6108180284500122
firewall rules <> profit :  0.5643664002418518
firewall rules <> offer :  0.5491023659706116
firewall rules <> today :  0.5477012395858765
firewall rules <> much vying :  0.5619986057281494
firewall rules <> attention from social media :  0.6223071217536926
firewall rules <> engine optimization :  0.6039102673530579
firewall rules <> click advertising :  0.6043628454208374
firewall rules <> hair :  0.5903322696685791
lot <> truth :  0.637623131275177
lot <> point in business :  0.64303058385849
lot

kinds of apps i <> hair :  0.6074716448783875
i <> truth :  0.1
i <> point in business :  0.1
i <> next level :  0.1
kinds of apps i <> truth :  0.7039738297462463
kinds of apps i <> point in business :  0.7322826981544495
kinds of apps i <> next level :  0.7000707983970642
i <> fray :  0.1
kinds of apps i <> fray :  0.5985550284385681
i <> entrepreneurs :  0.1
i <> businesses :  0.1
i <> businesses :  0.1
kinds of apps i <> entrepreneurs :  0.6907397508621216
kinds of apps i <> businesses :  0.7218548655509949
kinds of apps i <> businesses :  0.7218548655509949
i <> result :  0.1
i <> day :  0.1
i <> day operations :  0.1
i <> company :  0.1
i <> customer hand :  0.1
i <> supply chain demands :  0.1
i <> right marketing strategies :  0.1
i <> business s growth :  0.1
kinds of apps i <> result :  0.4615304172039032
kinds of apps i <> day :  0.523348331451416
kinds of apps i <> day operations :  0.5798562169075012
kinds of apps i <> company :  0.5906825065612793
kinds of apps i <> custo

mind <> smart hr insights into different areas :  0.6034970879554749
mind <> business :  0.54839688539505
mind <> employee engagement :  0.5683553814888
mind <> company :  0.4083476662635803
many approaches <> smart hr insights into different areas :  0.660747230052948
many approaches <> business :  0.5287131667137146
many approaches <> employee engagement :  0.5945852994918823
many approaches <> company :  0.4206082820892334
frameworks <> smart hr insights into different areas :  0.6340945959091187
frameworks <> business :  0.5024874806404114
frameworks <> employee engagement :  0.5552177429199219
frameworks <> company :  0.38954848051071167
same pattern <> smart hr insights into different areas :  0.6082408428192139
same pattern <> business :  0.5771689414978027
same pattern <> employee engagement :  0.6200603246688843
same pattern <> company :  0.5349828004837036
single page applications <> smart hr insights into different areas :  0.6802826523780823
single page applications <> busi

next thing <> json file :  0.7433861494064331
next thing <> computer :  0.7239716053009033
next thing <> file :  0.7131883502006531
next thing <> folder :  0.6244426369667053
next thing <> terraform code :  0.6690155267715454
necessary credentials from gcp <> json file :  0.7887442708015442
necessary credentials from gcp <> computer :  0.7596523761749268
necessary credentials from gcp <> file :  0.7576415538787842
necessary credentials from gcp <> folder :  0.6610692143440247
necessary credentials from gcp <> terraform code :  0.750929594039917
next thing <> file :  0.603550374507904
next thing <> sensitive information :  0.5739662051200867
next thing <> public repository :  0.5995507836341858
necessary credentials from gcp <> file :  0.6281841397285461
necessary credentials from gcp <> sensitive information :  0.5922148823738098
necessary credentials from gcp <> public repository :  0.6270090341567993
next thing <> hr strategies :  0.749718964099884
next thing <> strategic recruiting 

gcp console menu <> clear need for increased visibility :  0.6646220088005066
gcp console menu <> sales :  0.6468845009803772
select apis <> clear need for increased visibility :  0.6825307011604309
select apis <> sales :  0.7035717368125916
services <> clear need for increased visibility :  0.6167879700660706
services <> sales :  0.6645218133926392
credentials <> clear need for increased visibility :  0.7481719255447388
credentials <> sales :  0.7862505912780762
gcp console menu <> order :  0.46573224663734436
gcp console menu <> visibility :  0.6370716094970703
gcp console menu <> businesses :  0.598605215549469
gcp console menu <> money :  0.5637924671173096
select apis <> order :  0.5325543880462646
select apis <> visibility :  0.6492180228233337
select apis <> businesses :  0.6451219320297241
select apis <> money :  0.5967558026313782
services <> order :  0.4812304377555847
services <> visibility :  0.6110538840293884
services <> businesses :  0.6323404908180237
services <> money 

create credentials button <> order :  0.5636461973190308
create credentials button <> visibility :  0.738841712474823
create credentials button <> businesses :  0.6911457180976868
create credentials button <> money :  0.6514800786972046
choose service account key <> order :  0.5165901780128479
choose service account key <> visibility :  0.7212422490119934
choose service account key <> businesses :  0.6908522248268127
choose service account key <> money :  0.6515825986862183
create credentials button <> smart hr insights into different areas :  0.7193871140480042
create credentials button <> business :  0.6580007076263428
create credentials button <> employee engagement :  0.7573258280754089
create credentials button <> company :  0.6562977433204651
choose service account key <> smart hr insights into different areas :  0.6775028705596924
choose service account key <> business :  0.6252575516700745
choose service account key <> employee engagement :  0.7367228865623474
choose service ac

next screen choose compute engine default service account <> offer :  0.5611177682876587
next screen choose compute engine default service account <> today :  0.5562582612037659
next screen choose compute engine default service account <> much vying :  0.5212832093238831
next screen choose compute engine default service account <> attention from social media :  0.5669277906417847
next screen choose compute engine default service account <> engine optimization :  0.5518949627876282
next screen choose compute engine default service account <> click advertising :  0.6072045564651489
next screen choose compute engine default service account <> hair :  0.5579010844230652
json <> message :  0.5521507263183594
json <> right audience :  0.517788827419281
json <> visibility :  0.5395387411117554
json <> sales :  0.5449028611183167
json <> profit :  0.5095419883728027
json <> offer :  0.5212995409965515
json <> today :  0.5076096057891846
json <> much vying :  0.511626660823822
json <> attention

json file <> hair :  0.5671049356460571
computer <> message :  0.5425970554351807
computer <> right audience :  0.55072420835495
computer <> visibility :  0.5507621765136719
computer <> sales :  0.5508946180343628
computer <> profit :  0.5505022406578064
computer <> offer :  0.5182901620864868
computer <> today :  0.5450959801673889
computer <> much vying :  0.5111145973205566
computer <> attention from social media :  0.5802263617515564
computer <> engine optimization :  0.5147152543067932
computer <> click advertising :  0.5553428530693054
computer <> hair :  0.5480452179908752
file <> message :  0.5836738348007202
file <> right audience :  0.5316373705863953
file <> visibility :  0.5449681282043457
file <> sales :  0.5508052110671997
file <> profit :  0.5652665495872498
file <> offer :  0.5568909645080566
file <> today :  0.5238483548164368
file <> much vying :  0.5765305757522583
file <> attention from social media :  0.6005072593688965
file <> engine optimization :  0.525738000869

public repository <> hair :  0.43058109283447266
file <> truth :  0.5750097036361694
file <> point in business :  0.5461763739585876
file <> next level :  0.5255388021469116
sensitive information <> truth :  0.5361959934234619
sensitive information <> point in business :  0.5264109373092651
sensitive information <> next level :  0.5157243013381958
public repository <> truth :  0.5117297172546387
public repository <> point in business :  0.5653414726257324
public repository <> next level :  0.5698559880256653
file <> fray :  0.5237378478050232
sensitive information <> fray :  0.47511160373687744
public repository <> fray :  0.5269913077354431
file <> entrepreneurs :  0.5068826675415039
file <> businesses :  0.5779948830604553
file <> businesses :  0.5779948830604553
sensitive information <> entrepreneurs :  0.4167223274707794
sensitive information <> businesses :  0.5563185214996338
sensitive information <> businesses :  0.5563185214996338
public repository <> entrepreneurs :  0.4079476

hr strategies <> result :  0.508587658405304
hr strategies <> day :  0.5438027381896973
hr strategies <> day operations :  0.6170976161956787
hr strategies <> company :  0.6072874665260315
hr strategies <> customer hand :  0.5778586864471436
hr strategies <> supply chain demands :  0.606802761554718
hr strategies <> right marketing strategies :  0.7008188962936401
hr strategies <> business s growth :  0.63453209400177
strategic recruiting <> result :  0.4966619610786438
strategic recruiting <> day :  0.5892693996429443
strategic recruiting <> day operations :  0.675225555896759
strategic recruiting <> company :  0.6694546341896057
strategic recruiting <> customer hand :  0.6933043599128723
strategic recruiting <> supply chain demands :  0.706113874912262
strategic recruiting <> right marketing strategies :  0.7719807624816895
strategic recruiting <> business s growth :  0.6917837262153625
retention processes <> result :  0.5227707624435425
retention processes <> day :  0.58553260564804

order <> truth :  0.5536348819732666
order <> point in business :  0.6310405731201172
order <> next level :  0.6227957606315613
visibility <> truth :  0.6858922839164734
visibility <> point in business :  0.6917903423309326
visibility <> next level :  0.6695544123649597
businesses <> truth :  0.6555250287055969
businesses <> point in business :  0.7038989663124084
businesses <> next level :  0.6687332391738892
money <> truth :  0.6158102750778198
money <> point in business :  0.6924826502799988
money <> next level :  0.6847992539405823
order <> fray :  0.4884242117404938
visibility <> fray :  0.6564586162567139
businesses <> fray :  0.62158203125
money <> fray :  0.5712242126464844
order <> entrepreneurs :  0.4456438720226288
order <> businesses :  0.6076720356941223
order <> businesses :  0.6076720356941223
visibility <> entrepreneurs :  0.6782413125038147
visibility <> businesses :  0.7484115958213806
visibility <> businesses :  0.7484115958213806
businesses <> entrepreneurs :  0.668

businesses <> fray :  0.5707688331604004
backs of social media <> fray :  0.6161086559295654
businesses <> entrepreneurs :  0.8263787031173706
businesses <> businesses :  0.712884247303009
businesses <> businesses :  0.712884247303009
backs of social media <> entrepreneurs :  0.6659939289093018
backs of social media <> businesses :  0.6729409098625183
backs of social media <> businesses :  0.6729409098625183
businesses <> result :  0.38400381803512573
businesses <> day :  0.39625823497772217
businesses <> day operations :  0.47274941205978394
businesses <> company :  0.50888991355896
businesses <> customer hand :  0.47540947794914246
businesses <> supply chain demands :  0.4736862778663635
businesses <> right marketing strategies :  0.508621096611023
businesses <> business s growth :  0.5138648748397827
backs of social media <> result :  0.46309134364128113
backs of social media <> day :  0.476673424243927
backs of social media <> day operations :  0.53926020860672
backs of social medi

hair <> day operations :  0.5673902630805969
hair <> company :  0.6100374460220337
hair <> customer hand :  0.6192629933357239
hair <> supply chain demands :  0.6093231439590454
hair <> right marketing strategies :  0.6344782114028931
hair <> business s growth :  0.6841169595718384
truth <> fray :  0.5897005796432495
point in business <> fray :  0.6186079382896423
next level <> fray :  0.6092283725738525
truth <> entrepreneurs :  0.6059399843215942
truth <> businesses :  0.6558205485343933
truth <> businesses :  0.6558205485343933
point in business <> entrepreneurs :  0.5488950610160828
point in business <> businesses :  0.7040399312973022
point in business <> businesses :  0.7040399312973022
next level <> entrepreneurs :  0.5297229290008545
next level <> businesses :  0.6597060561180115
next level <> businesses :  0.6597060561180115
truth <> result :  0.5212628841400146
truth <> day :  0.4799610674381256
truth <> day operations :  0.5263763666152954
truth <> company :  0.5333083868026

In [17]:
score_df = pd.DataFrame({'Sent1': text1_list,
                        'Sent2': text2_list,
                        'KP_Similarity': kp_dist_list,
                        'Cosine_Similarity': cs_dist_list})
score_df = score_df[score_df['Sent1']!=score_df['Sent2']]
print(score_df.shape)
#score_df = score_df[score_df['KP_Similarity']<1]

(621, 4)


In [18]:
score_df['cs_kp_diff'] = score_df['Cosine_Similarity']-score_df['KP_Similarity']
score_df = score_df[score_df['KP_Similarity']>0]
score_df.sort_values(by='cs_kp_diff', ascending=False)

Unnamed: 0,Sent1,Sent2,KP_Similarity,Cosine_Similarity,cs_kp_diff
472,"and with that in mind, i tried many approaches...",how do you get your message to the right audie...,0.649831,0.823917,0.174086
325,"if you work in the javascript ecosystem, you a...",how do you get your message to the right audie...,0.661606,0.822899,0.161294
623,how do you get your message to the right audie...,as a result of dealing with the day to day ope...,0.711773,0.870922,0.159149
300,"and with that in mind, i tried many approaches...",how do you get your message to the right audie...,0.625579,0.768445,0.142866
64,you can do a lot more with terraform such as s...,how do you get your message to the right audie...,0.622307,0.762091,0.139784
435,you can do a lot more with terraform such as s...,how do you get your message to the right audie...,0.622307,0.762091,0.139784
394,maintaining an application with a large codeba...,how do you get your message to the right audie...,0.671347,0.806298,0.134951
547,this will download a json file to your compute...,how do you get your message to the right audie...,0.609668,0.743621,0.133953
219,this will download a json file to your compute...,how do you get your message to the right audie...,0.609668,0.743621,0.133953
351,when one of your core libraries releases a maj...,"if you are feeling stuck, join the fray",0.645090,0.777972,0.132883


In [19]:
idx = 472
text1 = score_df['Sent1'][idx]
text2 = score_df['Sent2'][idx]
getKPBasedSimilarity(text1,text2)

and with that in mind, i tried many approaches and frameworks for implementing the same pattern: single page applications (spa)

how do you get your message to the right audience and do it effectively? how do you boost visibility and increase sales while sustaining a profit with a converting offer? today, with so much vying for our attention from social media, to search engine optimization, blogging and pay per click advertising, it is easy to see why most are ready to pull their hair out

['mind', 'many approaches', 'frameworks', 'same pattern', 'single page applications', 'spa']

['message', 'right audience', 'visibility', 'sales', 'profit', 'offer', 'today', 'much vying', 'attention from social media', 'engine optimization', 'click advertising', 'hair']

mind <> message :  0.5401946306228638
mind <> right audience :  0.5094702243804932
mind <> visibility :  0.5330876111984253
mind <> sales :  0.5251703858375549
mind <> profit :  0.528603732585907
mind <> offer :  0.4492456614971161


0.6498307585716248

In [None]:
replaceContractions("I didn't do that")