In [1]:
import pandas as pd
import pickle
import numpy as np
import json 
from IPython.display import clear_output
import requests
from pytorch_pretrained_bert import BertTokenizer, BertConfig, BertModel
from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertPreTrainingHeads
from bert_utils import *
import time
from scipy import spatial

import nltk
import string,itertools
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

import unicodedata
from more_itertools import locate
from functools import reduce
import pandas as pd
import re

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import math
from itertools import chain

In [2]:
# Contraction lookup table: the JSON file holds a top-level 'contractions'
# object that replaceContractions() indexes by word.
# The context manager closes the file handle, which the bare open() call leaked.
with open('contractions.txt','rb') as contractions_file:
    contractions = json.load(contractions_file)
contractions = contractions['contractions']
    
def replaceContractions(text):
    """Expand contractions in ``text`` using the module-level ``contractions`` table.

    The text is split on single spaces. Every piece that is a key of
    ``contractions`` is replaced by its mapped value; all other pieces are
    kept unchanged. Matching is exact -- no lower-casing is done here.

    Returns the pieces joined by single spaces with surrounding whitespace
    stripped.
    """
    expanded = [contractions[word] if word in contractions else word
                for word in text.split(' ')]
    return ' '.join(expanded).strip()

def stripText(text):
    """Normalise ``text`` for key-phrase extraction.

    Steps, in order:
      1. lower-case and expand contractions (``replaceContractions``);
      2. replace alphanumeric words (digits directly followed by letters, or
         letters directly followed by digits) with a space;
      3. replace hyphens and apostrophes with a space;
      4. collapse every whitespace run to a single space and strip the ends.

    Returns the cleaned string.
    """
    text = replaceContractions(text.lower())
    # [a-z] rather than [A-z]: the latter also matches '[', '\\', ']', '^', '_'
    # and '`', which sit between 'Z' and 'a' in ASCII. The text is already
    # lower-cased, so upper-case letters cannot occur here.
    text = re.sub(r'(\d+[a-z]+)|([a-z]+\d+)', ' ', text) #remove alphanumeric words
    text = re.sub(r"[-']", ' ', text)
    # Collapse whitespace last, so the replacements above cannot leave double
    # spaces behind (callers split the result on single spaces).
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [3]:
#load test data
import json
import os

# os.listdir returns entries in arbitrary order; sorting keeps the sentence
# order (and therefore every downstream row index) stable between runs.
test_file_list = sorted(os.listdir('./test_data/'))
master_text = ''

for file in test_file_list:
    # Each file is expected to hold a single JSON string (it is concatenated
    # to master_text below). The context manager closes the handle.
    with open(os.path.join('test_data', file),'rb') as test_data_file:
        curr_file = json.load(test_data_file)
    master_text = master_text+' '+curr_file

# Naive sentence split on '.', dropping fragments of 5 characters or fewer
# (length is measured before stripping), then normalise each sentence.
sent_bucket = master_text.split('.')
sent_bucket = [ele.strip() for ele in sent_bucket if len(ele)>5]
sent_bucket = [stripText(ele) for ele in sent_bucket]

In [4]:
def getregexChunks(text, grammar):
    """Chunk ``text`` with an NLTK regexp grammar and return flat CoNLL-style tags.

    The text is sentence-split, word-tokenised and POS-tagged with NLTK, each
    tagged sentence is parsed with ``RegexpParser(grammar)``, and the
    resulting trees are flattened with ``tree2conlltags``.

    Returns a list of ``(word, pos_tag, chunk_tag, running_index)`` tuples,
    where ``running_index`` counts tokens across all sentences from 0.
    """
    parser = nltk.chunk.regexp.RegexpParser(grammar)
    tokenized_sents = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]

    conll_tags = []
    for tagged_sent in nltk.pos_tag_sents(tokenized_sents):
        conll_tags.extend(nltk.chunk.tree2conlltags(parser.parse(tagged_sent)))

    return [(tag[0], tag[1], tag[2], position) for position, tag in enumerate(conll_tags)]

def getCandidatePhrases(text, pos_search_pattern_list=[r"""base: {(<JJ.*>*<NN.*>+<IN>)?<JJ>*<NN.*>+}""",
                                           r"""nounverb:{<NN.*>+<VB.*>+}""",
                                           r"""verbnoun:{<VB.*>+<NN.*>+}"""]):
                                       #r""" nounnoun:{<NN.+>+<.+>{1,2}<NN.+>+}"""]):
                                       #r"""baseverb: {(<JJ.+>+<IN>)?<JJ>*<VB.*>+}"""]):
    """Extract candidate key-phrases from ``text`` and locate them.

    ``text`` is normalised with ``stripText`` and chunked once per grammar in
    ``pos_search_pattern_list``; the chunk lists are concatenated in pattern
    order. Each maximal run of tokens whose chunk tag is not ``'O'`` becomes
    one lower-cased phrase. Leading and trailing stop words are trimmed,
    empty results are discarded, and the remainder goes through
    ``filterCandidatePhrases`` and ``getPhraseListLocations``.

    Returns ``(candidate_phrases, candidate_locs)`` exactly as produced by
    ``getPhraseListLocations``.
    """
    text = stripText(text)

    all_chunks = list(itertools.chain.from_iterable(
        getregexChunks(text, pattern) for pattern in pos_search_pattern_list))

    # Group consecutive tokens by "inside a chunk" vs "outside any chunk".
    candidate_phrases = []
    for inside_chunk, members in itertools.groupby(all_chunks, key=lambda item: item[2] != 'O'):
        if inside_chunk:
            candidate_phrases.append(' '.join(item[0] for item in members).lower())

    trimmed = (stripStopWordsFromText(phrase, stop_words) for phrase in candidate_phrases)
    filtered_candidates = [phrase for phrase in trimmed if len(phrase)>0]

    surviving_phrases = filterCandidatePhrases(text,filtered_candidates)
    return getPhraseListLocations(text, surviving_phrases)
    
def lambda_unpack(f):
    """Wrap ``f`` so it can be called with one iterable of its positional arguments."""
    def call_with_unpacked(args):
        return f(*args)
    return call_with_unpacked

def getWordLevelFeats(sent,token_feat_dict,tokenizer):
    """Build one feature vector per space-separated word of ``sent``.

    Each word is split into sub-tokens by ``tokenizer.tokenize``. The word
    vector is the element-wise sum of the ``token_feat_dict`` entries of its
    sub-tokens (a single sub-token is used as-is).

    Parameters:
        sent: sentence string; words are taken from ``sent.split(' ')``.
        token_feat_dict: mapping from sub-token string to its feature vector.
        tokenizer: object exposing ``tokenize(word) -> list of sub-tokens``.

    Returns:
        A list of numpy arrays, one per word and in word order. A word for
        which the tokenizer yields no sub-tokens gets an empty array.
    """
    word_feat_list = []
    for word in sent.split(' '):
        tokenized_word = tokenizer.tokenize(word)
        if not tokenized_word:
            word_feat_list.append(np.array([]))
            continue
        # Index with the sub-token string: the previous code used the whole
        # list as the key, which is unhashable.
        word_vec = np.array(token_feat_dict[tokenized_word[0]])
        for tok in tokenized_word[1:]:
            # Sum the vectors. `list += ndarray` extended the list with the
            # array's scalars instead of adding vectors.
            word_vec = word_vec + np.array(token_feat_dict[tok])
        word_feat_list.append(word_vec)

    return word_feat_list

def getWordFeatsFromBertTokenFeats(sent_tokens,bert_tokens,bert_token_feats):
    """Merge BERT word-piece features into one vector per word.

    Walks ``sent_tokens`` and ``bert_tokens`` in parallel:
      * if the current BERT token equals the word, its feature vector is
        used unchanged;
      * otherwise the word was split into pieces: the current token's vector
        is summed with the vectors of all directly following continuation
        pieces (tokens starting with ``'##'``).

    Parameters:
        sent_tokens: list of words of the sentence.
        bert_tokens: list of BERT tokens for the same sentence.
        bert_token_feats: per-BERT-token features; each element must support
            ``.detach().numpy()``.

    Returns:
        A list of numpy arrays, one per entry of ``sent_tokens``.

    Raises:
        IndexError: if ``bert_tokens`` runs out before ``sent_tokens`` does.
    """
    bert_ctr = 0
    last_bert_idx = len(bert_tokens)-1
    word_feat_list = []

    for word in sent_tokens:
        # np.array(...) copies, so the in-place sums below never write into
        # the caller's feature storage.
        word_vec = np.array(bert_token_feats[bert_ctr].detach().numpy())
        if bert_tokens[bert_ctr] == word:#word not further tokenized, use the same feature vector
            bert_ctr+=1
        else:
            while bert_ctr<last_bert_idx:
                bert_ctr+=1
                # Continuation pieces carry the '##' prefix. A plain
                # `'#' in token` test also swallowed a literal '#' token and
                # shifted every following word by one position.
                if not bert_tokens[bert_ctr].startswith('##'):
                    break
                word_vec+=np.array(bert_token_feats[bert_ctr].detach().numpy())
        word_feat_list.append(word_vec)
    assert len(sent_tokens)==len(word_feat_list)
    return word_feat_list

def getPOSPhrases(sent_tokens,candidate_pos_tags = ['NN','NNPS','NNS','NNP','VBG','VBN','VBP','VBZ','JJ','JJR','JJS'],
                  tag_pairs = ['NN','VB','JJ']):
    """Bucket the tokens of a sentence into nouns, verbs and adjectives.

    Parameters:
        sent_tokens: list of word strings; tagged with ``nltk.pos_tag``.
        candidate_pos_tags: POS tags a token must have to be kept. The
            argument is now honoured; it used to be overwritten by a
            hard-coded copy of the default, so custom values were ignored.
        tag_pairs: accepted for backward compatibility; not used.

    Only tokens longer than two characters are kept.

    Returns:
        dict with keys ``'noun_tags'``, ``'verb_tags'`` and ``'adj_tags'``
        (in that order), each a list of ``(token, position)`` tuples where
        ``position`` is the token's index in ``sent_tokens``.
    """
    sent_tokens_pos= nltk.pos_tag(sent_tokens)
    kept_tokens = [(word, tag, position)
                   for position, (word, tag) in enumerate(sent_tokens_pos)
                   if tag in candidate_pos_tags and len(word)>2]

    # Substring tests group the tag variants: 'NN' covers NN/NNS/NNP/NNPS, etc.
    noun_tags = [(tok,ctr) for tok,pos,ctr in kept_tokens if 'NN' in pos]
    verb_tags = [(tok,ctr) for tok,pos,ctr in kept_tokens if 'VB' in pos]
    adj_tags = [(tok,ctr) for tok,pos,ctr in kept_tokens if 'JJ' in pos]

    return dict(zip(['noun_tags','verb_tags','adj_tags'],[noun_tags,verb_tags,adj_tags]))

def getPOSSetsForSent(text_sent_tokens):
    """Group same-category tokens at consecutive positions into position sets.

    For every category returned by ``getPOSPhrases`` the token positions are
    scanned in order. Positions that differ by exactly 1 from their successor
    are collected into a run; a position that is not followed by its
    neighbour closes the current run and is added as a singleton unless an
    already-collected group contains it.

    Returns a dict with the same keys as ``getPOSPhrases``; each value is a
    list of position lists (each de-duplicated through ``set``).
    """
    pos_sent = getPOSPhrases(text_sent_tokens)
    merged_pos_list = []

    for tagged_tokens in pos_sent.values():
        positions = [entry[1] for entry in tagged_tokens]
        last_idx = len(positions)-1

        open_run = []
        groups = []
        for idx, position in enumerate(positions):
            followed_by_neighbour = idx<last_idx and positions[idx+1]-position==1
            if followed_by_neighbour:
                open_run.extend((position, positions[idx+1]))
                continue
            # The run (possibly empty) ends here; empty runs are filtered below.
            groups.append(open_run)
            open_run = []
            if not any(position in group for group in groups):
                groups.append([position])

        merged_pos_list.append([list(set(group)) for group in groups if len(group)>0])
    return dict(zip(pos_sent.keys(),merged_pos_list))

def posSetFeats(text_sent_tokens,token_bert_feats):
    """Compute one feature vector and one text span per POS position set.

    ``text_sent_tokens`` and ``token_bert_feats`` must have the same length
    (asserted). For every position set from ``getPOSSetsForSent``, the feature
    is the sum of the per-token features at those positions and the text is
    the tokens at those positions joined by spaces.

    Returns ``(feats_by_pos, words_by_pos)``: a dict keyed like the
    ``getPOSSetsForSent`` result mapping to lists of feature vectors, and a
    list (same key order) of lists of phrase strings.
    """
    assert len(text_sent_tokens)==len(token_bert_feats)

    sent_pos_sets = getPOSSetsForSent(text_sent_tokens)
    pos_wise_feats = []
    pos_words = []
    for position_sets in sent_pos_sets.values():
        set_feats = []
        set_words = []
        for pos_set in position_sets:
            if len(pos_set)>1:
                #merge all the tags in current list
                set_feats.append(sum(token_bert_feats[token] for token in pos_set))
                set_words.append(' '.join(text_sent_tokens[token] for token in pos_set).strip())
            else:
                only_position = pos_set[0]
                set_feats.append(token_bert_feats[only_position])
                set_words.append(text_sent_tokens[only_position])
        pos_wise_feats.append(set_feats)
        pos_words.append(set_words)

    return dict(zip(sent_pos_sets.keys(),pos_wise_feats)),pos_words

###################### Key-phrase based cosine similarity ######################
def getKeyPhraseFeatures(kp_list, kp_loc_idx,text_feats, text_tokens):
    """Return one feature vector per key-phrase.

    Parameters:
        kp_list: key-phrase strings (words separated by single spaces).
        kp_loc_idx: per-phrase token locations, parallel to ``kp_list``.
            Each entry is a space-separated string of indices such as
            ``"12 13"`` (the format ``getPhraseListLocations`` returns); a
            sequence of indices is accepted as well.
        text_feats: per-word feature vectors of the sentence.
        text_tokens: the sentence words, parallel to ``text_feats``.

    Returns:
        A list with, for each phrase, the feature of its single word or the
        sum of the features of its words (looked up via ``getTokenFeature``).
    """
    key_phrase_feats = []
    for ele,loc_list in zip(kp_list,kp_loc_idx):
        words = ele.split(' ')
        # Split the location string into whole indices. Indexing the string
        # directly (loc_list[0]) returned only the first character, so a
        # location such as "12" was read as 1.
        positions = loc_list.split(' ') if isinstance(loc_list, str) else list(loc_list)
        if len(words)==1:
            idx_val = int(positions[0])
            key_phrase_feats.append(getTokenFeature(ele,idx_val,text_feats,text_tokens))
        else:
            curr_feature_vec = [getTokenFeature(tok,int(tok_idx),text_feats,text_tokens)
                                for tok,tok_idx in zip(words,positions)]
            key_phrase_feats.append(sum(curr_feature_vec))
    return key_phrase_feats
            
def getTokenFeature(token, token_idx, text_feats, text_tokens):
    """Look up the feature vector of ``token``.

    ``token_idx`` is a hint for where the token sits in ``text_tokens``:
      1. if ``text_tokens[token_idx]`` is the token, return
         ``text_feats[token_idx]``;
      2. otherwise return the feature of the first occurrence of the token
         anywhere in ``text_tokens``;
      3. if the token does not occur at all, return a default vector filled
         with 0.01, with the same length as ``text_feats[0]``.

    A hint that lies outside ``text_tokens`` is treated as a miss (case 2/3)
    instead of raising ``IndexError``; this happens when the location comes
    from a tokenizer that produced more tokens than ``text_tokens`` holds.
    """
    try:
        found_at_hint = text_tokens[token_idx]==token
    except IndexError:
        found_at_hint = False

    if found_at_hint:
        return text_feats[token_idx]
    if token in text_tokens:
        return text_feats[text_tokens.index(token)]
    # Token not found anywhere: return the default feature vector.
    return np.full(len(text_feats[0]),0.01)

## ----------------- Methods borrowed from BERT tokenizer -----------------

def tokenize(text, never_split = [], do_lower_case = True):
    """Split ``text`` into basic tokens (whitespace and punctuation split).

    The text is cleaned with ``_clean_text`` and split on whitespace. When
    ``do_lower_case`` is true, every token not listed in ``never_split`` is
    lower-cased and stripped of accents. Each token is then split around
    punctuation, and the pieces are re-split on whitespace.

    Returns the list of resulting tokens.
    """
    pieces = []
    for raw_token in whitespace_tokenize(_clean_text(text)):
        should_normalise = do_lower_case and raw_token not in never_split
        if should_normalise:
            raw_token = _run_strip_accents(raw_token.lower())
        pieces.extend(_run_split_on_punc(raw_token))

    return whitespace_tokenize(" ".join(pieces))


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False

def _run_split_on_punc(text):
    """Split ``text`` so that every punctuation character is its own piece.

    Runs of non-punctuation characters stay together. Returns the list of
    pieces in their original order.
    """
    pieces = []
    word_in_progress = False
    for char in text:
        if _is_punctuation(char):
            pieces.append(char)
            word_in_progress = False
        elif word_in_progress:
            pieces[-1] += char
        else:
            pieces.append(char)
            word_in_progress = True

    return pieces

def _clean_text(text):
    """Drop invalid characters from ``text`` and normalise its whitespace.

    NUL (0), U+FFFD and control characters are removed; every whitespace
    character is replaced by a plain space.
    """
    kept = []
    for char in text:
        if ord(char) in (0, 0xfffd) or _is_control(char):
            continue
        kept.append(" " if _is_whitespace(char) else char)
    return "".join(kept)

def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False

def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False

def whitespace_tokenize(text):
    """Strip ``text`` and split it on whitespace; blank text yields an empty list."""
    stripped = text.strip()
    return stripped.split() if stripped else []

def _run_strip_accents(text):
    """Strips accents from a piece of text."""
    text = unicodedata.normalize("NFD", text)
    output = []
    for char in text:
        cat = unicodedata.category(char)
        if cat == "Mn":
            continue
        output.append(char)
    return "".join(output)

def getKPBasedSimilarity(text1, text2,  bert_layer = -1):
    """Best key-phrase-to-key-phrase cosine similarity between two texts.

    BERT features for both texts are obtained with
    ``getBERTFeatures(model, text, attn_head_idx=bert_layer)`` (``model`` is a
    notebook global), merged to word level, and summed per candidate
    key-phrase. Every phrase of ``text1`` is compared with every phrase of
    ``text2``:
      * if either phrase is shorter than 3 characters the pair scores 0.1,
        otherwise 1 - cosine distance of the phrase vectors;
      * pairs of two single-word phrases are penalised by 0.05.

    Returns the highest pair score, or 0 when no pair scores above 0.
    """
    token_feats_1, _, bert_tokens_1 = getBERTFeatures(model, text1, attn_head_idx=bert_layer)
    token_feats_2, _, bert_tokens_2 = getBERTFeatures(model, text2, attn_head_idx=bert_layer)

    words_1 = tokenize(text1)
    words_2 = tokenize(text2)

    word_feats_1 = getWordFeatsFromBertTokenFeats(words_1, bert_tokens_1, token_feats_1)
    word_feats_2 = getWordFeatsFromBertTokenFeats(words_2, bert_tokens_2, token_feats_2)

    #get candidate key-phrases for both sentences
    phrases_1, phrase_locs_1 = getCandidatePhrases(text1)
    phrases_2, phrase_locs_2 = getCandidatePhrases(text2)

    phrase_feats_1 = getKeyPhraseFeatures(phrases_1, phrase_locs_1, word_feats_1, words_1)
    phrase_feats_2 = getKeyPhraseFeatures(phrases_2, phrase_locs_2, word_feats_2, words_2)

    best_score = 0
    all_pairs = itertools.product(zip(phrases_1, phrase_feats_1), zip(phrases_2, phrase_feats_2))
    for (phrase_1, vec_1), (phrase_2, vec_2) in all_pairs:
        either_too_short = len(phrase_1)<3 or len(phrase_2)<3
        score = 0.1 if either_too_short else 1-spatial.distance.cosine(vec_1, vec_2)
        both_single_word = len(phrase_1.split(' '))==1 and len(phrase_2.split(' '))==1
        if both_single_word:
            #penalize by 5 points?
            score = score-0.05
        if score>best_score:
            best_score = score

    return best_score

def getPOSBasedSimilarity(text1, text2,  bert_layer = -1):
    """Best noun-group cosine similarity between two texts.

    BERT features are obtained with
    ``getBERTFeatures(model, text, attn_head_idx=bert_layer)`` (``model`` is a
    notebook global), merged to word level, and grouped per POS category by
    ``posSetFeats``. Only the first category (nouns) is compared: every noun
    group of ``text1`` against every noun group of ``text2``.

    Parameters:
        text1, text2: the two texts to compare.
        bert_layer: forwarded as ``attn_head_idx``. The body previously read
            a global named ``layer`` instead, so this argument was ignored
            and the result depended on whichever cell last assigned ``layer``.

    Returns:
        The highest 1 - cosine distance over all group pairs, or 0 when one
        of the texts has no noun group or no pair scores above 0.
    """
    token_feats_1,final_feats1,text1_bert_tokenized = getBERTFeatures(model, text1, attn_head_idx=bert_layer)
    token_feats_2,final_feats2,text2_bert_tokenized = getBERTFeatures(model, text2, attn_head_idx=bert_layer)

    text1_sent_tokens = tokenize(text1)
    text2_sent_tokens = tokenize(text2)

    merged_feats_text1 = getWordFeatsFromBertTokenFeats(text1_sent_tokens,text1_bert_tokenized,token_feats_1)
    merged_feats_text2 = getWordFeatsFromBertTokenFeats(text2_sent_tokens,text2_bert_tokenized,token_feats_2)

    sent1_pos_feats,sent1_pos = posSetFeats(text1_sent_tokens,merged_feats_text1)
    sent2_pos_feats,sent2_pos = posSetFeats(text2_sent_tokens,merged_feats_text2)

    #get pos_tag wise cosine similarity
    pos_idx = 0 #0-noun, 1-verb, 2 - adj
    curr_pos = list(sent1_pos_feats.keys())[pos_idx]

    sent1_curr_pos_feats = sent1_pos_feats[curr_pos]
    sent2_curr_pos_feats = sent2_pos_feats[curr_pos]

    curr_max = 0
    for feats1 in sent1_curr_pos_feats:
        for feats2 in sent2_curr_pos_feats:
            curr_sim = 1-spatial.distance.cosine(feats1,feats2)
            if curr_sim>curr_max:
                curr_max = curr_sim
    return curr_max

def getCosineSimilarity(text1, text2,  bert_layer = -1):
    """Sentence-level cosine similarity between two texts.

    Calls ``getBERTFeatures(model, text, attn_head_idx=bert_layer)`` for each
    text (``model`` is a notebook global) and compares the second element of
    each result.

    Parameters:
        text1, text2: the two texts to compare.
        bert_layer: forwarded as ``attn_head_idx``. The body previously read
            a global named ``layer`` instead, so this argument was ignored.

    Returns:
        1 - cosine distance between the two sentence-level vectors.
    """
    token_feats_1,final_feats1,text1_bert_tokenized = getBERTFeatures(model, text1, attn_head_idx=bert_layer)
    token_feats_2,final_feats2,text2_bert_tokenized = getBERTFeatures(model, text2, attn_head_idx=bert_layer)

    return 1-spatial.distance.cosine(final_feats1,final_feats2)

In [5]:
def getKPBasedSimilarity_loop(tup1,tup2, text1, text2, bert_layer = -1):
    """Key-phrase similarity for two texts whose BERT features are precomputed.

    ``tup1`` / ``tup2`` are 3-tuples ``(token_feats, final_feats,
    bert_tokens)`` for ``text1`` / ``text2``; only the first and third
    elements are used. ``bert_layer`` is accepted but not used.

    Scoring per phrase pair: 0.1 if either phrase is shorter than 3
    characters, otherwise 1 - cosine distance; minus 0.05 when both phrases
    are single words.

    Returns the highest pair score, or 0 when no pair scores above 0.
    """
    token_feats_1, _, bert_tokens_1 = tup1
    token_feats_2, _, bert_tokens_2 = tup2

    words_1 = tokenize(text1)
    words_2 = tokenize(text2)

    word_feats_1 = getWordFeatsFromBertTokenFeats(words_1, bert_tokens_1, token_feats_1)
    word_feats_2 = getWordFeatsFromBertTokenFeats(words_2, bert_tokens_2, token_feats_2)

    #get candidate key-phrases for both sentences
    phrases_1, phrase_locs_1 = getCandidatePhrases(text1)
    phrases_2, phrase_locs_2 = getCandidatePhrases(text2)

    phrase_feats_1 = getKeyPhraseFeatures(phrases_1, phrase_locs_1, word_feats_1, words_1)
    phrase_feats_2 = getKeyPhraseFeatures(phrases_2, phrase_locs_2, word_feats_2, words_2)

    best_score = 0
    all_pairs = itertools.product(zip(phrases_1, phrase_feats_1), zip(phrases_2, phrase_feats_2))
    for (phrase_1, vec_1), (phrase_2, vec_2) in all_pairs:
        either_too_short = len(phrase_1)<3 or len(phrase_2)<3
        score = 0.1 if either_too_short else 1-spatial.distance.cosine(vec_1, vec_2)
        both_single_word = len(phrase_1.split(' '))==1 and len(phrase_2.split(' '))==1
        if both_single_word:
            #penalize by 5 points?
            score = score-0.05
        if score>best_score:
            best_score = score

    return best_score

def getPOSBasedSimilarity_loop(tup1, tup2,  text1, text2, bert_layer = -1):
    """Noun-group similarity for two texts whose BERT features are precomputed.

    ``tup1`` / ``tup2`` are 3-tuples ``(token_feats, final_feats,
    bert_tokens)`` for ``text1`` / ``text2``; only the first and third
    elements are used. ``bert_layer`` is accepted but not used.

    Only the first POS category returned by ``posSetFeats`` (nouns) is
    compared, every group of ``text1`` against every group of ``text2``.

    Returns the highest 1 - cosine distance, or 0 when one of the texts has
    no noun group or no pair scores above 0.
    """
    token_feats_1, _, bert_tokens_1 = tup1
    token_feats_2, _, bert_tokens_2 = tup2

    words_1 = tokenize(text1)
    words_2 = tokenize(text2)

    word_feats_1 = getWordFeatsFromBertTokenFeats(words_1, bert_tokens_1, token_feats_1)
    word_feats_2 = getWordFeatsFromBertTokenFeats(words_2, bert_tokens_2, token_feats_2)

    pos_feats_1, _ = posSetFeats(words_1, word_feats_1)
    pos_feats_2, _ = posSetFeats(words_2, word_feats_2)

    #get pos_tag wise cosine similarity
    pos_idx = 0 #0-noun, 1-verb, 2 - adj
    category = list(pos_feats_1.keys())[pos_idx]

    best_score = 0
    for vec_1 in pos_feats_1[category]:
        for vec_2 in pos_feats_2[category]:
            similarity = 1-spatial.distance.cosine(vec_1, vec_2)
            if similarity>best_score:
                best_score = similarity

    return best_score

def getCosineSimilarity_loop(tup1, tup2,  bert_layer = -1):
    """Sentence-level cosine similarity from precomputed feature tuples.

    ``tup1`` / ``tup2`` are 3-tuples whose second element is the sentence
    vector; the other elements and ``bert_layer`` are not used.

    Returns 1 - cosine distance between the two sentence vectors.
    """
    _, sentence_vec_1, _ = tup1
    _, sentence_vec_2, _ = tup2

    cosine_distance = spatial.distance.cosine(sentence_vec_1, sentence_vec_2)
    return 1-cosine_distance

###-------------------------------------- New functions --------------------------------------###

def removeStopwords(text):
    """Return ``text`` without the space-separated tokens found in the global ``stop_words``."""
    kept_tokens = filter(lambda tok: tok not in stop_words, text.split(' '))
    return ' '.join(kept_tokens)

def getStartEndPOSList(text,candidate_phrases_list):
    """Find the character span of every candidate phrase in ``text``.

    Phrases are matched literally. When the same phrase appears several
    times in ``candidate_phrases_list``, its n-th appearance is mapped to the
    n-th occurrence in ``text``; if the list repeats the phrase more often
    than the text does, the extra appearances map to the last occurrence.

    Parameters:
        text: the string to search.
        candidate_phrases_list: phrases to locate, in order.

    Returns:
        ``(start_pos_list, end_pos_list)``: two lists parallel to
        ``candidate_phrases_list`` with the start offset and the exclusive
        end offset of each phrase.

    Raises:
        IndexError: if a phrase does not occur in ``text`` at all.
    """
    start_pos_list = []
    end_pos_list = []
    seen_so_far = {}  # phrase -> number of earlier appearances in the candidate list
    for candidate in candidate_phrases_list:
        # re.escape: phrases are plain text, not patterns. Unescaped, a phrase
        # such as "c++" raises re.error and "(" or "." match the wrong thing.
        start_pos = [match.start() for match in re.finditer(re.escape(candidate), text)]
        if not start_pos:
            raise IndexError('candidate phrase %r not found in text' % (candidate,))

        nth_appearance = seen_so_far.get(candidate, 0)
        seen_so_far[candidate] = nth_appearance+1

        # Clamp to the last occurrence; this generalises the former
        # single-occurrence special case and avoids an IndexError when the
        # candidate list repeats a phrase more often than the text contains it.
        occurrence = min(nth_appearance, len(start_pos)-1)
        start_pos_list.append(start_pos[occurrence])
        end_pos_list.append(start_pos[occurrence]+len(candidate))
    return start_pos_list, end_pos_list

def filterCandidatePhrases(text, candidate_phrases_list):
    """Drop nested candidate phrases and merge phrases separated only by stop words.

    Two passes are made over the character spans of the candidates:
      * containment -- when one candidate's span in ``text`` contains
        another's, the shorter phrase string is marked for dropping (on equal
        length, the later one);
      * adjacency -- spans are also computed on stop-word-free copies of the
        text and phrases; when phrase ``i`` starts exactly one character
        after phrase ``k`` ends there, both are marked for dropping and the
        slice of ``text`` covering both original spans is added instead.

    NOTE: ``candidate_phrases_list`` is modified in place (merged phrases are
    appended, dropped ones removed) and the same list object is returned.
    """
    drop_list = []
    merge_list = []
    merge_list_start = []
    merge_list_end = []

    # Stop-word-free copies, used only for the adjacency test below.
    filtered_sent = removeStopwords(text)
    filtered_phrase_list = [removeStopwords(phrase) for phrase in candidate_phrases_list]

    # Character spans in the original text and in the stop-word-free text.
    start_pos_list, end_pos_list = getStartEndPOSList(text,candidate_phrases_list)
    filtered_start_pos_list, filtered_end_pos_list = getStartEndPOSList(filtered_sent,filtered_phrase_list)
    assert len(filtered_start_pos_list)==len(filtered_phrase_list)

    for i in range(len(start_pos_list)):
        curr_start,curr_end,ctr = start_pos_list[i],end_pos_list[i],i

        # Containment pass: compare candidate i with every later candidate j.
        for j in range(i+1, len(start_pos_list)):
            lookup_start, lookup_end, lookup_ctr = start_pos_list[j], end_pos_list[j], j
            # Identical spans are left alone.
            if curr_start==lookup_start and curr_end==lookup_end:
                continue
            # One span fully contains the other: drop the shorter phrase.
            if (curr_start<=lookup_start and curr_end>=lookup_end) or (lookup_start<=curr_start and lookup_end>=curr_end):
                if len(candidate_phrases_list[i])<len(candidate_phrases_list[j]):
                    drop_list.append(candidate_phrases_list[i])
                else:
                    drop_list.append(candidate_phrases_list[j])

        # Adjacency pass: in the stop-word-free text, phrase i begins exactly
        # one character after phrase k ends.
        for k in range(len(start_pos_list)):
            if filtered_start_pos_list[i]-filtered_end_pos_list[k]==1:
                merge_list.append([candidate_phrases_list[i],candidate_phrases_list[k]])
                drop_list.append(candidate_phrases_list[i])
                drop_list.append(candidate_phrases_list[k])
                # The merged span covers both phrases in the ORIGINAL text,
                # including whatever lies between them.
                merge_list_start.append(min(start_pos_list[i],start_pos_list[k]))
                merge_list_end.append(max(end_pos_list[i],end_pos_list[k]))

    # Append the merged phrases, sliced from the original text.
    for ctr in range(len(merge_list)):
        candidate_phrases_list.append(text[merge_list_start[ctr]:merge_list_end[ctr]])
        
    # Remove one list entry (the first match) per drop_list entry. A set
    # difference is deliberately not used: it would remove every duplicate of
    # a phrase and lose the list order.
    for ele in drop_list:
        if ele in candidate_phrases_list:
            candidate_phrases_list.remove(ele)
    return candidate_phrases_list

def stripStopWordsFromText(sent, stop_words):
    """Trim leading and trailing stop words from ``sent``.

    ``sent`` is split on single spaces; tokens found in ``stop_words`` are
    removed from both ends until a non-stop-word is reached. Stop words in
    the middle are kept.

    Returns the remaining tokens joined by spaces (stripped); an empty string
    if every token is a stop word.
    """
    tokens = sent.split(' ')
    total = len(tokens)

    leading = 0
    while leading < total and tokens[leading] in stop_words:
        leading += 1

    trailing = 0
    while trailing < total and tokens[total-1-trailing] in stop_words:
        trailing += 1

    return ' '.join(tokens[leading:total-trailing]).strip()

def find_sub_list(sl,l):
    """Find every occurrence of the list ``sl`` as a contiguous slice of ``l``.

    Returns a list with one entry per occurrence; each entry is the list of
    consecutive indices of ``l`` that the occurrence covers.
    """
    width = len(sl)
    index_runs = []
    for start, item in enumerate(l):
        if item == sl[0] and l[start:start+width] == sl:
            index_runs.append(list(range(start, start+width)))

    return index_runs

def getPhraseListLocations(text, candidate_phrases):
    """Locate every candidate phrase in the token sequence of ``text``.

    ``text`` and each phrase are tokenised with NLTK and every occurrence of
    a phrase's token sequence in the text's tokens is recorded.

    Resolution rules per phrase:
      * exactly one occurrence: it is used as-is;
      * several occurrences and the phrase is listed in
        ``candidate_phrases`` exactly that many times: each listing gets one
        occurrence, in order;
      * otherwise: occurrences that lie entirely inside the first occurrence
        of another phrase with a different token count are discarded (they
        are part of that longer phrase); the rest are kept;
      * no occurrence at all: the phrase is skipped.

    Returns:
        ``(processed_phrase_list, str_loc_list)``: the located phrases and,
        parallel to them, their token indices as space-separated strings
        (for example ``"4 5 6"``).
    """
    token_sent_list = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]
    token_list = list(chain(*token_sent_list))

    phrase_idx_list = []
    for phrase in candidate_phrases:
        phrase_tokens = nltk.word_tokenize(phrase)
        # A phrase that tokenises to nothing cannot be located.
        phrase_idx = find_sub_list(phrase_tokens,token_list) if phrase_tokens else []
        phrase_idx_list.append(phrase_idx)

    processed_phrase_list = []
    processed_idx_list = []
    for phrase, loc_idx in zip(candidate_phrases,phrase_idx_list):
        if not loc_idx:
            # Not found in the text tokens; indexing loc_idx[0] below would
            # raise IndexError, so skip the phrase.
            continue
        if len(loc_idx)==1:
            processed_phrase_list.append(phrase)
            processed_idx_list.append(loc_idx[0])
            continue
        if phrase in processed_phrase_list:
            # Every occurrence was handled when the phrase was first seen.
            continue

        #count number of times the phrase has occurred in the list
        kp_occ_ctr = candidate_phrases.count(phrase)
        if kp_occ_ctr == len(loc_idx):
            #append current key-phrase `kp_occ_ctr` times into the lists
            processed_phrase_list+=[phrase]*kp_occ_ctr
            processed_idx_list+=loc_idx
            continue

        # Some occurrences belong to another (longer) key-phrase: drop them.
        # NOTE(review): only the first occurrence of each other phrase is
        # checked, as before -- confirm whether all occurrences should be.
        idx_drop_list = []
        for lookup_loc in phrase_idx_list:
            if not lookup_loc:
                continue
            if lookup_loc!=loc_idx and len(lookup_loc[0])!=len(loc_idx[0]):
                # The loop used to run over range(len(curr)); `curr` was never
                # defined, so reaching this branch raised NameError.
                for occurrence in loc_idx:
                    if set(occurrence) <= set(lookup_loc[0]):
                        idx_drop_list.append(occurrence)
        for to_insert_loc in loc_idx:
            if to_insert_loc not in idx_drop_list:
                processed_phrase_list.append(phrase)
                processed_idx_list.append(to_insert_loc)

    str_loc_list = [' '.join(str(tok) for tok in ele) for ele in processed_idx_list]

    return processed_phrase_list,str_loc_list


In [6]:
# experiment_id, experiment_name
# 01daapwr6w051q9wwqy99jsgfy - Generic
# 01daaqy88qzb19jqz5prjfr76y - Engineering
# 01daaqyn9gbebc92aywnxedp0c - HR
# 01daatanxnrqa35e6004hb7mbn - Marketing
# 01daatbc3ak1qwc5nyc5ahv2xz - Product
# 01dadp74wfv607knpcb6vvxgtg - AI
# 01daayheky5f4e02qvrjptftxv - Ether Engineering

In [7]:
# NOTE(review): these are absolute paths on one machine; the notebook will not
# run elsewhere until they are made configurable.
config_path = '/Users/venkat/Documents/mlflow/mlflow_bert/bert_mlflow_pyfunc/artifacts/bert_config.json'
mind_path = '/Users/venkat/Documents/mlflow/mlflow_bert/bert_mlflow_pyfunc/artifacts/mind.pkl'
model_path = '/Users/venkat/Documents/mlflow/mlflow_bert_deploy/mind-01daayheky5f4e02qvrjptftxv/artifacts/model.bin'

config = BertConfig.from_json_file(config_path)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load artifacts from a trusted source. The context manager closes the
# handle, which the bare open() call leaked.
with open(mind_path,'rb') as mind_file:
    mind_dict = pickle.load(mind_file)

# `BertForPreTrainingCustom` and `torch` are not imported by name in this
# notebook; presumably both come from `from bert_utils import *` -- confirm.
model = BertForPreTrainingCustom(config)
state_dict = torch.load(model_path,map_location='cpu')

model.load_state_dict(state_dict)
model.eval()
print()




In [None]:
#calculate pair-wise cosine similarity
#extract features from all layers
# For each of the 12 values of `layer`, run getBERTFeatures on every sentence
# of sent_bucket. feats_all_layers[layer][k] is the getBERTFeatures result for
# sentence k (unpacked later as a 3-tuple: token features, sentence features,
# BERT tokens).
# NOTE: `layer` and `text` are notebook globals after this cell, and
# getPOSBasedSimilarity / getCosineSimilarity read the global `layer`.
feats_all_layers = [] #list of lists with each element being features from all layers

for layer in range(12):
    # text1_list / text2_list are reset here but not filled in this cell.
    text1_list = []
    text2_list = []
    sent_feat_list = []
    for text in sent_bucket:
        sent_feat_list.append(getBERTFeatures(model, text, attn_head_idx=layer))
    feats_all_layers.append(sent_feat_list)

In [None]:
# Score every unordered sentence pair (i < j) once per layer.
# kp_dist_all_layers[layer] / cosine_dist_all_layers[layer] hold the
# key-phrase and sentence-level similarities in pair order.
kp_dist_all_layers = []
cosine_dist_all_layers = []

for sent_feat_list in feats_all_layers:

    kp_dist_list = []
    cs_dist_list = []
    # Rebuilt on every layer with identical content; the copies left after the
    # last iteration are used as the Sent1/Sent2 columns of score_df.
    text1_list = []
    text2_list = []

    for i in range(len(sent_bucket)):
        sent1 = sent_bucket[i]
        for j in range(i+1,len(sent_bucket)):
            sent2 = sent_bucket[j]
            text1_list.append(sent1)
            text2_list.append(sent2)
            # Precomputed getBERTFeatures results for the two sentences.
            tup1 = sent_feat_list[i]
            tup2 = sent_feat_list[j]
            #pos_dist_list.append(getPOSBasedSimilarity_loop(tup1, tup2, sent1, sent2))
            kp_dist_list.append(getKPBasedSimilarity_loop(tup1, tup2,sent1, sent2))
            cs_dist_list.append(getCosineSimilarity_loop(tup1, tup2))
    # Emits one blank line per finished layer.
    print()
    kp_dist_all_layers.append(kp_dist_list)
    cosine_dist_all_layers.append(cs_dist_list)

In [None]:
#put all metrics into single dataframe
score_df = pd.DataFrame({'Sent1': text1_list,
                        'Sent2': text2_list,
                        'KP_Similarity_layer0': kp_dist_all_layers[0],
                         'KP_Similarity_layer1':kp_dist_all_layers[1],
                         'KP_Similarity_layer2':kp_dist_all_layers[2],
                         'KP_Similarity_layer3':kp_dist_all_layers[3],
                         'KP_Similarity_layer4':kp_dist_all_layers[4],
                         'KP_Similarity_layer5':kp_dist_all_layers[5],
                         'KP_Similarity_layer6':kp_dist_all_layers[6],
                         'KP_Similarity_layer7':kp_dist_all_layers[7],
                         'KP_Similarity_layer8':kp_dist_all_layers[8],
                         'KP_Similarity_layer9':kp_dist_all_layers[9],
                         'KP_Similarity_layer10':kp_dist_all_layers[10],
                         'KP_Similarity_layer11':kp_dist_all_layers[11],
                        'Cos_Similarity_layer0': cosine_dist_all_layers[0],
                        'Cos_Similarity_layer1': cosine_dist_all_layers[1],
                        'Cos_Similarity_layer2': cosine_dist_all_layers[2],
                        'Cos_Similarity_layer3': cosine_dist_all_layers[3],
                        'Cos_Similarity_layer4': cosine_dist_all_layers[4],
                        'Cos_Similarity_layer5': cosine_dist_all_layers[5],
                        'Cos_Similarity_layer6': cosine_dist_all_layers[6],
                        'Cos_Similarity_layer7': cosine_dist_all_layers[7],
                        'Cos_Similarity_layer8': cosine_dist_all_layers[8],
                        'Cos_Similarity_layer9': cosine_dist_all_layers[9],
                        'Cos_Similarity_layer10': cosine_dist_all_layers[10],
                        'Cos_Similarity_layer11': cosine_dist_all_layers[11]})

In [None]:
# Per-layer delta: sentence-level cosine similarity minus key-phrase similarity.
for i in range(12):
    feat_name = 'kp_cs_diff_layer'+str(i)
    cosine_scores = score_df['Cos_Similarity_layer'+str(i)]
    keyphrase_scores = score_df['KP_Similarity_layer'+str(i)]
    score_df[feat_name] = cosine_scores-keyphrase_scores

In [None]:
# ss_df = score_df[['Sent1','Sent2','KP_Similarity_layer11','Cos_Similarity_layer11','kp_cs_diff_layer11']].reset_index()
# ss_df = ss_df.sort_values(by='kp_cs_diff_layer11', ascending=False)

# plt.plot(t, ss_df['kp_cs_diff_layer11'], 'g', label='delta') # plotting t, a separately 
# plt.plot(t, ss_df['KP_Similarity_layer11'], 'r', label='keyphrase') # plotting t, b separately 
# plt.plot(t, ss_df['Cos_Similarity_layer11'], 'b', label='cosine') # plotting t, c separately 
# plt.legend(loc='upper left')
# plt.savefig('/Users/venkat/Downloads/Viz1.png',dpi = 300)
# plt.show()

In [None]:
# One figure per layer: delta, key-phrase and cosine similarity for every
# sentence pair with a positive key-phrase score, ordered by descending delta.
# Titles and file names are 1-based, column names are 0-based.
for i in range(1,13):
    print(i-1)
    kp_layer = 'KP_Similarity_layer'+str(i-1)
    cs_layer = 'Cos_Similarity_layer'+str(i-1)
    diff_layer = 'kp_cs_diff_layer'+str(i-1)
    
    curr_df = score_df[['Sent1','Sent2',kp_layer,cs_layer,diff_layer]]
    curr_df = curr_df.sort_values(by=diff_layer, ascending=False)
    curr_df = curr_df[curr_df[kp_layer]>0]
    # x axis: evenly spaced positions, one per remaining pair. It is computed
    # per layer because the filter above changes the row count.
    t = np.linspace(0, 2*math.pi, len(curr_df))
    
    plt.plot(t, curr_df[diff_layer], 'g', label='delta') # plotting t, a separately 
    plt.plot(t, curr_df[kp_layer], 'r', label='keyphrase') # plotting t, b separately 
    plt.plot(t, curr_df[cs_layer], 'b', label='cosine') # plotting t, c separately 
    # The title is passed directly instead of being stored in `layer`: that
    # rebound the global `layer` (an int read by other cells) to a string.
    plt.title('Layer-'+str(i))
    # Without a legend the label= arguments above are never displayed.
    plt.legend(loc='upper left')
    save_name = '/Users/venkat/Downloads/Viz_KP>0_'+str(i)+'.png'
    plt.savefig(save_name,dpi = 300)
    plt.show()

In [None]:
# Inspect a single layer. `layer` is 1-based here (12 selects the *_layer11
# columns). NOTE: this rebinds the notebook-global `layer`.
layer = 12
kp_layer = 'KP_Similarity_layer'+str(layer-1)
cs_layer = 'Cos_Similarity_layer'+str(layer-1)
diff_layer = 'kp_cs_diff_layer'+str(layer-1)

# Keep the pair text plus the three score columns, order by descending delta,
# and keep only pairs with a positive key-phrase similarity.
curr_df = score_df[['Sent1','Sent2',kp_layer,cs_layer,diff_layer]]
curr_df = curr_df.sort_values(by=diff_layer, ascending=False)
curr_df = curr_df[curr_df[kp_layer]>0]

# Last expression of the cell: displays the frame (sorted again; the result
# is not assigned, so curr_df itself is unchanged).
curr_df.sort_values(by=diff_layer, ascending=False)

In [None]:
# Spot-check one sentence pair. `key` is an index LABEL of curr_df (a row of
# score_df), not a position in the sorted frame.
key = 575
inspected_pair = curr_df.loc[key]
print(inspected_pair['Sent1'])
print()
print(inspected_pair['Sent2'])
print()
print(getKPBasedSimilarity(inspected_pair['Sent1'],inspected_pair['Sent2'],-1))