In [6]:
%matplotlib inline
from nltk.cluster.kmeans import KMeansClusterer
import nltk
import argparse
import os
import sys
import csv
import random
import logging
import pickle
import re
from tqdm import tqdm, trange, tqdm_notebook
from math import ceil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from itertools import combinations, product
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from pytorch_pretrained_bert import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                     OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME)
from pytorch_pretrained_bert.modeling_openai import OpenAIGPTPreTrainedModel,OpenAIGPTDoubleHeadsModel,OpenAIGPTConfig,OpenAIGPTModel,OpenAIGPTLMHead

from scipy.spatial.distance import cosine, cityblock
import iso8601
from datetime import datetime
import json
# Configure logging once for the notebook: timestamped records at INFO level and above.
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
# Logger used by the cells below.
logger = logging.getLogger(__name__)
# Initial device. NOTE(review): the configuration cell further down assigns `device` again,
# so this value only matters if that cell has not been run.
device = "cpu"

In [None]:
class OpenAIGPTLMHead_custom(nn.Module):
    """Language-model head whose output projection shares the token-embedding matrix.

    Args:
        model_embeddings_weights: 2-D embedding parameter; its first dimension gives the
            number of output logits and its second the input feature size.
        config: object exposing `n_embd`, `vocab_size` and `predict_special_tokens`.
    """

    def __init__(self, model_embeddings_weights, config):
        super().__init__()
        self.n_embd = config.n_embd
        self.vocab_size = config.vocab_size
        self.predict_special_tokens = config.predict_special_tokens
        weight_shape = model_embeddings_weights.shape
        n_logits, n_features = weight_shape[0], weight_shape[1]
        self.decoder = nn.Linear(n_features, n_logits, bias=False)
        # Called with the default flag, so `predict_special_tokens` ends up True here
        # regardless of the value read from `config` just above.
        self.set_embeddings_weights(model_embeddings_weights)

    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
        """Tie the decoder weight to `model_embeddings_weights` and store the special-token flag."""
        self.predict_special_tokens = predict_special_tokens
        self.decoder.weight = model_embeddings_weights  # Tied weights

    def forward(self, hidden_state):
        """Project `hidden_state` onto the embedding matrix.

        When `predict_special_tokens` is false, only the first `vocab_size` logits are kept.
        """
        logits = self.decoder(hidden_state)
        if self.predict_special_tokens:
            return logits
        return logits[..., :self.vocab_size]

class OpenAIGPTMultipleChoiceHead_custom(nn.Module):
    """Classifier head variant that returns the selected hidden state instead of a logit.

    The dropout and linear layers are still created (and initialised) so the module keeps
    the same parameters as the stock head, but `forward` does not apply them.
    """

    def __init__(self, config):
        super().__init__()
        self.n_embd = config.n_embd
        # To reproduce the noise_shape parameter of TF implementation
        self.dropout = nn.Dropout2d(config.resid_pdrop)
        self.linear = nn.Linear(config.n_embd, 1)

        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.normal_(self.linear.bias, 0)

    def forward(self, hidden_states, mc_token_ids):
        """Pick, for every (batch, choice) pair, the hidden vector at position `mc_token_ids`.

        Args:
            hidden_states: tensor indexed as (bsz, num_choices, seq_length, hidden_size).
            mc_token_ids: integer tensor indexed as (bsz, num_choices).

        Returns:
            Tensor of shape (bsz, num_choices, hidden_size).
        """
        hidden_size = hidden_states.size(-1)
        # (bsz, num_choices) -> (bsz, num_choices, 1, hidden_size) so it can index dim 2
        gather_index = mc_token_ids[..., None, None].expand(-1, -1, -1, hidden_size)
        selected = hidden_states.gather(2, gather_index)
        return selected.squeeze(2)

class OpenAIGPTDoubleHeadsModel_custom(OpenAIGPTPreTrainedModel):
    """
    OpenAI GPT transformer wrapper used in this notebook as a feature extractor.

    The constructor creates the same sub-modules as a double-heads model (a transformer, a
    language-modeling head tied to the token embeddings and a multiple-choice head), but
    `forward` calls neither head: it returns the integer 0 in place of language-modeling
    logits, together with the last element of the hidden states returned by the transformer.

    OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
    Special tokens need to be trained during the fine-tuning if you use them.
    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
    The embeddings are ordered as follows in the token embeddings matrix:
        [0,                                                         ----------------------
         ...                                                        -> word embeddings
         config.vocab_size - 1,                                     ______________________
         config.vocab_size,
         ...                                                        -> special embeddings
         config.vocab_size + config.n_special - 1]                  ______________________
    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
        total_tokens_embeddings = config.vocab_size + config.n_special
    You should use the associated indices to index the embeddings.
    Params:
        `config`: an OpenAIGPTConfig class instance with the configuration to build a new model
    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
            indices selected in the range [0, total_tokens_embeddings[
        `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices]. Accepted for interface
            compatibility; `forward` does not read it.
        `lm_labels`, `mc_labels`: accepted for interface compatibility; `forward` does not read them
            and no loss is computed.
        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids, passed to the transformer.
        `position_ids`: an optional torch.LongTensor with the same shape as input_ids, passed to the transformer.
        `head_mask`: an optional torch.Tensor passed to the transformer to mask attention heads.
    Outputs:
        a tuple `(0, hidden_states)` where `hidden_states` is the last element of the transformer output
        (presumably the final layer's activations, shaped
        [batch_size, num_choices, sequence_length, hidden_size] -- TODO confirm against the installed
        pytorch_pretrained_bert version).
    Example usage:
    ```python
    # Already been converted into BPE token ids
    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
    mc_token_ids = torch.LongTensor([[2, 1]]) # (bsz, number of choice)
    config = OpenAIGPTConfig()
    model = OpenAIGPTDoubleHeadsModel_custom(config)
    _, hidden_states = model(input_ids, mc_token_ids)
    ```
    """

    def __init__(self, config):
        super(OpenAIGPTDoubleHeadsModel_custom, self).__init__(config)
        self.transformer = OpenAIGPTModel(config)
        # Both heads are kept as sub-modules even though forward() does not call them.
        self.lm_head = OpenAIGPTLMHead_custom(self.transformer.tokens_embed.weight, config)
        self.multiple_choice_head = OpenAIGPTMultipleChoiceHead_custom(config)
        self.apply(self.init_weights)

    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
        """ Update input and output embeddings with new embedding matrix
            Make sure we are sharing the embeddings
        """
        # The config flags are intentionally left untouched here (the assignment was disabled);
        # only the LM head receives `predict_special_tokens`.
        #self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
        self.transformer.set_num_special_tokens(num_special_tokens)
        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)

    def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None,
                position_ids=None, head_mask=None):
        """Run the transformer and return `(0, hidden_states[-1])`; see the class docstring."""
        hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
        # When attentions are requested the transformer returns a pair; keep only the hidden states.
        if self.transformer.output_attentions:
            all_attentions, hidden_states = hidden_states

        # Keep only the last entry of the transformer output (index chooses the layer).
        hidden_states = hidden_states[-1] #layer #
        # The LM head, multiple-choice head and loss computation are disabled in this variant;
        # a constant placeholder is returned where the LM logits would be.
        lm_logits = 0
        # Callers select the token position themselves from the returned hidden states.
        return lm_logits, hidden_states #token #

In [None]:
def accuracy(out, labels):
    """Return the number (not the fraction) of rows of `out` whose arg-max equals `labels`."""
    return np.sum(np.argmax(out, axis=1) == labels)

def listRightIndex(alist, value):
    """Return the index of the last occurrence of `value` in `alist`.

    Raises:
        ValueError: if `value` does not occur in `alist`.
    """
    distance_from_end = alist[::-1].index(value)
    return len(alist) - 1 - distance_from_end


def _write_segment(input_ids, mc_token_ids, row, tokens, input_len, start_token, clf_token):
    """Store `[start_token] + tokens + [clf_token]` in row `row` and record where `clf_token` sits."""
    # Clamp so the framed segment never exceeds the row width (this used to raise a
    # numpy broadcasting error when no terminator was found inside the window).
    segment = [start_token] + tokens[:max(input_len - 2, 0)] + [clf_token]
    input_ids[row, 0, :len(segment)] = segment
    mc_token_ids[row, 0] = len(segment) - 1


def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
    """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)

        Only the story is encoded. Each dataset becomes a pair of tensors
        `(input_ids, mc_token_ids)` of shapes `(n_batch, 1, input_len)` and `(n_batch, 1)`,
        where `n_batch = ceil(len(first_story) / cap_length)`:

        * `n_batch == 1`: one row `[start_token] + story[:cap_length] + [clf_token]`.
        * `n_batch > 1`: the story is cut into consecutive segments; each segment ends at the
          last occurrence of the story's final token (used as the sentence terminator) inside a
          window of at most `cap_length` tokens, or at the next occurrence when the window
          holds none. Every segment is framed by `start_token` / `clf_token` and zero-padded.

        `mc_token_ids` holds the position of `clf_token` in each row.

        Fixes relative to the previous version: the remaining tail of the story (everything after
        the last full window) is now written to the next free row instead of being dropped and
        leaving an all-zero row, and over-long segments are truncated to fit `input_len`.

        Limitations kept from the original: the number of rows is derived from the first story
        only, at most `n_batch` rows are written (any further text is dropped), and
        `delimiter_token` as well as the continuations/labels are accepted but unused.
    """

    tensor_datasets = []
    for dataset in encoded_datasets:
        n_batch = ceil(len(dataset[0][0])/cap_length)
        input_ids = np.zeros((n_batch, 1, input_len), dtype=np.int64)
        mc_token_ids = np.zeros((n_batch, 1), dtype=np.int64)
        i = 0
        init_pos = 0
        end_pos = cap_length
        for story, cont1, cont2, mc_label in dataset:
            if n_batch == 0:
                continue
            if n_batch == 1:
                _write_segment(input_ids, mc_token_ids, i, story[:cap_length],
                               input_len, start_token, clf_token)
                i += 1
                continue
            while i != n_batch and end_pos < len(story):
                window = story[init_pos:end_pos]
                try:
                    # last terminator inside the current window
                    end_pos = init_pos + len(window) - 1 - window[::-1].index(story[-1])
                except ValueError:
                    # no terminator in the window: extend to the next one
                    end_pos = init_pos + story[init_pos:].index(story[-1])
                _write_segment(input_ids, mc_token_ids, i, story[init_pos:end_pos+1],
                               input_len, start_token, clf_token)
                i += 1
                init_pos = end_pos + 1
                end_pos = min(init_pos + cap_length - 1, len(story))
            # The loop stops once the remaining text fits in one window; encode that tail
            # when a row is still free.
            if i != n_batch and init_pos < len(story):
                _write_segment(input_ids, mc_token_ids, i, story[init_pos:],
                               input_len, start_token, clf_token)
                i += 1
                init_pos = len(story)
        all_inputs = (input_ids, mc_token_ids)
        tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
    return tensor_datasets

def load_rocstories_dataset(dataset_path):
    """Read a CSV file and return a list of tuples (story, 1st continuation, 2nd continuation, label).

    The first row is skipped as a header. For every other row the story is columns 0-3 joined
    with '.', the continuations are columns 4 and 5, and the label is the last column as int.
    """
    stories = []
    with open(dataset_path, encoding='utf_8') as handle:
        rows = csv.reader(handle)
        next(rows)  # skip the first line
        for row in tqdm(rows):
            story = '.'.join(row[0:4])
            stories.append((story, row[4], row[5], int(row[-1])))
    return stories

def tokenize_and_encode(obj):
    """Recursively encode a nested object.

    Strings are tokenized with the notebook-level `tokenizer` and converted to ids, integers
    are returned unchanged, and any other iterable becomes a list of its encoded items.
    """
    if isinstance(obj, str):
        tokens = tokenizer.tokenize(obj)
        return tokenizer.convert_tokens_to_ids(tokens)
    if isinstance(obj, int):
        return obj
    return [tokenize_and_encode(item) for item in obj]

In [7]:
def getNSPScore(sample_text):
    """Return the first soft-max probability of the model's sequence-relationship output.

    `sample_text` must contain a '[SEP]' token: tokens before it get segment id 0, the
    separator and everything after it get segment id 1.

    NOTE(review): this expects `model1` to return four values, whereas the model class defined
    in this notebook returns two -- presumably written for a BERT-style model; confirm before use.
    NOTE(review): `Softmax()` is built without an explicit `dim`.
    """
    softmax = torch.nn.Softmax()

    tokens = tokenizer.tokenize(sample_text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    sep_position = tokens.index('[SEP]')
    segment_ids = [0] * sep_position + [1] * (len(tokens) - sep_position)

    token_batch = torch.tensor([token_ids])
    segment_batch = torch.tensor([segment_ids])

    pred_score, seq_rel, seq_out, pool_out = model1(token_batch, segment_batch)
    probabilities = softmax(seq_rel).detach().numpy()
    return probabilities[0][0]  # returns probability of being next sentence

def getSentMatchScore(sent1, sent2, nsp_dampening_factor = 0.7):
    """Score how well two sentences match by blending feature similarity with an NSP score.

    The cosine similarity of the two `getBERTFeatures` vectors is combined with the mean of
    the two `getNSPScore` values (one per sentence order), the latter scaled by
    `nsp_dampening_factor`. When the word counts differ by more than twice the shorter
    sentence the blend is 0.4 / 0.6 in favour of the NSP score; otherwise it is a plain mean.
    """
    feats1 = getBERTFeatures(model1, sent1, attn_head_idx)
    feats2 = getBERTFeatures(model1, sent2, attn_head_idx)
    similarity = 1 - cosine(feats1, feats2)

    forward_nsp = getNSPScore(sent1 + ' [SEP] ' + sent2)
    backward_nsp = getNSPScore(sent2 + ' [SEP] ' + sent1)
    nsp_score = np.mean([forward_nsp, backward_nsp]) * nsp_dampening_factor

    n_words1 = len(sent1.split(' '))
    n_words2 = len(sent2.split(' '))
    if abs(n_words1 - n_words2) > 2 * min(n_words1, n_words2):
        # give more weight to nsp if the sentences are of largely varying lengths
        return 0.4 * similarity + 0.6 * nsp_score
    return np.mean([similarity, nsp_score])
    

def getSentMatchScore_wfeature(sent1, sent2, sent1_feats, sent2_feats, nsp_dampening_factor = 0.7):
    """Same blend as `getSentMatchScore`, but with pre-computed feature vectors.

    Only the `sent1 [SEP] sent2` order is scored with `getNSPScore`; the result is scaled by
    `nsp_dampening_factor` and combined with the cosine similarity of the supplied features.
    """
    similarity = 1 - cosine(sent1_feats, sent2_feats)
    nsp_score = getNSPScore(sent1 + ' [SEP] ' + sent2) * nsp_dampening_factor

    n_words1 = len(sent1.split(' '))
    n_words2 = len(sent2.split(' '))
    if abs(n_words1 - n_words2) > 2 * min(n_words1, n_words2):
        # give more weight to nsp if the sentences are of largely varying lengths
        return 0.4 * similarity + 0.6 * nsp_score
    return np.mean([similarity, nsp_score])

def getSentMatchScore_wfeature_cosine(sent1, sent2, sent1_feats, sent2_feats, nsp_dampening_factor = 0.7):
    """Return the cosine similarity of the two feature vectors.

    `sent1`, `sent2` and `nsp_dampening_factor` are accepted only to mirror the signature of
    `getSentMatchScore_wfeature`; they are not used.
    """
    return 1 - cosine(sent1_feats, sent2_feats)
    

def getBERTFeatures(model, text, attn_head_idx = -1):
    """Return mean-pooled features of `text` taken from one entry of the model's sequence output.

    The text is tokenized with the notebook-level `tokenizer` and cut to at most 200 tokens.
    `attn_head_idx` indexes the third value returned by `model`; that entry is mean-pooled by
    `getPooledFeatures` and returned as a list.

    NOTE(review): `model` must return four values -- confirm the model passed in does.
    """
    tokens = tokenizer.tokenize(text)[:200]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    token_batch = torch.tensor([token_ids])
    _, _, seq_out, pool_out = model(token_batch)
    pooled = getPooledFeatures(seq_out[attn_head_idx])
    return list(pooled.T)

def getPooledFeatures(np_array):
    """Average a 3-D tensor over its second dimension and return a NumPy row vector.

    The tensor is reshaped to `(shape[1], shape[2])`, so its first dimension must be 1.
    The result has shape `(1, shape[2])`.
    """
    n_rows, n_features = np_array.shape[1], np_array.shape[2]
    matrix = np_array.reshape(n_rows, n_features).detach().numpy()
    return np.mean(matrix, axis=0).reshape(1, -1)

def replaceContractions(text):
    """Replace every space-separated word found in the global `contractions` mapping.

    Each output word is preceded by a single space, so the result starts with a space.
    """
    pieces = []
    for word in text.split(' '):
        replacement = contractions[word] if word in contractions else word
        pieces.append(' ' + replacement)
    return ''.join(pieces)

def cleanText(text):
    """Normalise raw text for sentence processing.

    Steps, in order:
      1. drop literal backslash-n sequences and any remaining backslashes;
      2. replace lowercase URLs / domain names that are followed by whitespace with ' __url__ ';
      3. turn every non-word character (punctuation, tabs, newlines, ...) into a space;
      4. collapse runs of spaces into one.

    The regular expressions are written as raw strings so their backslash escapes no longer
    trigger Python's invalid-escape-sequence warning; the compiled patterns are unchanged.
    The former trailing removal of tab/newline characters was dropped because step 3 has
    already replaced them with spaces.
    """
    text = text.replace('\\n', '')
    text = text.replace('\\', '')
    text = re.sub(r'(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?\s',
                  ' __url__ ', text)  # remove urls
    text = re.sub(r'\W', ' ', text)
    text = re.sub(' +', ' ', text)
    return text

def formatTime(tz_time, datetime_object=False):
    """Convert an ISO 8601 timestamp to UTC.

    Args:
        tz_time: ISO 8601 date/time string parsed with `iso8601.parse_date`.
        datetime_object: when true, return a naive `datetime` holding the UTC time instead of
            a string.

    Returns:
        The UTC time formatted as "%Y-%m-%d %H:%M:%S:%f", or the corresponding naive
        `datetime` when `datetime_object` is true.

    The datetime branch used to re-parse the formatted string with `datetime.fromisoformat`,
    which rejects the ':' placed before the microseconds and therefore always raised
    ValueError; the datetime is now returned directly.
    """
    parsed = iso8601.parse_date(tz_time)
    utc_time = datetime.utcfromtimestamp(parsed.timestamp())

    if datetime_object:
        return utc_time
    return utc_time.strftime("%Y-%m-%d %H:%M:%S:%f")


In [None]:
## Defining constants over here
seed = 42
model_name = 'openai-gpt'
output_dir = '../models/gpt/'
train_batch_size = 1
n_valid = 374

# Apply the seed so stochastic steps (layout, community detection, weight init) are repeatable.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Use the GPU when one is present, otherwise fall back to the CPU instead of failing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

special_tokens = ['_start_', '_delimiter_', '_classify_']
tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, special_tokens=special_tokens)
# NOTE(review): the special-token ids come from the `model_name` tokenizer above, while the
# tokenizer used afterwards is the one loaded from `output_dir` -- confirm both agree.
special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)

# Load the fine-tuned model and its tokenizer, then switch to inference mode.
model1 = OpenAIGPTDoubleHeadsModel_custom.from_pretrained(output_dir)
tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
model1.to(device)
model1.eval()
logger.info("Encoding dataset...")

def feature_extractor(model1,text):
    """Return a feature tensor for `text` taken from the model's hidden states.

    The text is encoded with `tokenize_and_encode`, cut into rows by `pre_process_datasets`
    (rows of at most `n_positions // 2 - 2` story tokens) and each row is run through
    `model1` without gradients. For every row the hidden state at the last sequence position
    is kept; a single row is returned as is, several rows are summed.

    NOTE(review): the last position is used rather than the `mc_token_ids` position, so for
    zero-padded rows the vector is taken at a padding position -- confirm this is intended.

    Args:
        model1: model returning `(lm_output, hidden_states)` and exposing `config.n_positions`.
        text: the string to embed.

    Returns:
        A torch tensor on `device`; presumably shaped (1, 1, hidden_size) -- TODO confirm.

    Raises:
        ValueError: if `text` produces no tokens (previously an unbound-variable error).
    """
    trn_dt = ([text,'','',0],)
    datasets = (trn_dt,)
    encoded_datasets = tokenize_and_encode(datasets)
    story_tokens = encoded_datasets[0][0][0]

    max_length = model1.config.n_positions//2 - 2
    # +2 for the start and classify tokens; never beyond the model's maximum input size
    input_length = min(len(story_tokens) + 2, model1.config.n_positions)
    n_batches = ceil(len(story_tokens)/max_length)

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_data = TensorDataset(*tensor_datasets[0])
    train_dataloader = DataLoader(train_data, batch_size=1)

    final_clf = []
    for batch in train_dataloader:
        input_ids, mc_token_ids = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            _, clf_text_feature = model1(input_ids, mc_token_ids)
            final_clf.append(clf_text_feature[:, :, -1])

    if not final_clf:
        raise ValueError("feature_extractor received text that produced no tokens")
    if n_batches > 1:
        return torch.sum(torch.stack(final_clf), 0)
    return final_clf[-1]


# parse csv medium dump data

In [8]:
## load it as csv.
import pandas

df = pandas.read_csv('../data/raw/se_medium_.csv', index_col=False, header=0)
# Keep the first 100 documents individually (used later to link sentences back to documents).
texts_org = list(df['Data'][:100])
# The preprocessing cell below reads `texts`, which was previously only defined in
# commented-out lines; build it from the same 100 documents so a fresh kernel can run.
# NOTE(review): confirm this matches the text the clusters were originally computed on.
texts = ' '.join(texts_org)

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
import sys, pickle
sys.path.append('../')
sys.path.append('../../../')
import text_preprocessing.preprocess as tp


# Split the corpus into sentences, then normalise their length:
#   * more than 250 words -> cut into two halves
#   * 6 words or fewer    -> dropped
mod_texts_unfiltered = tp.preprocess(texts, stop_words=False, remove_punct=False)
mod_texts = []

for sent in mod_texts_unfiltered:
    words = sent.split(' ')
    n_words = len(words)
    if n_words > 250:
        half = round(n_words/2)
        mod_texts.append(' '.join(words[:half]))
        mod_texts.append(' '.join(words[half:]))
    elif n_words > 6:
        mod_texts.append(sent)

print(len(mod_texts))

# One feature tensor per retained sentence, keyed by its position in `mod_texts`.
fv = {}
for index, sent in enumerate(mod_texts):
    fv[index] = feature_extractor(model1,sent)
    print (index)

In [None]:
import networkx as nx

def build_graph(doc_list):
    """Return an undirected graph with one node per item of `doc_list` (nodes 0..len-1).

    Any exception raised while adding the nodes is printed and the graph built so far is
    returned.
    """
    graph = nx.Graph()
    try:
        node_ids = range(len(doc_list))
        graph.add_nodes_from(node_ids)
    except Exception as err:
        print(err)
    return graph


# Sentence graph: node i corresponds to mod_texts[i].
tg = build_graph(mod_texts)

In [None]:
attn_head_idx = -1

node_edge = []

n_sentences = len(mod_texts)

# Move every feature tensor to host memory once and flatten it to 1-D: this avoids
# repeating the device transfer for each pair, and scipy's `cosine` expects 1-D vectors.
flat_fv = {index: fv[index].detach().cpu().numpy().ravel() for index in range(n_sentences)}

# Connect every unordered pair of sentences once, weighted by cosine similarity.
for index1 in range(n_sentences):
    print (index1)
    for index2 in range(index1 + 1, n_sentences):
        score = 1 - cosine(flat_fv[index1], flat_fv[index2])
        tg.add_edge(index1,index2,weight=score)

In [None]:
# Choose how many of the strongest edges of `tg` to keep before community detection.
# For each candidate fraction `v` the graph is pruned to its top-weighted edges, partitioned
# with `community.best_partition`, and the resulting modularity is compared with the best so far.
import math
import community
max_mod = 0   # best modularity accepted so far
mod_v = 0     # fraction `v` that produced `max_mod`
for v in [0.15, 0.10, 0.05, 0.01]:
    flag = False
    # Each fraction is evaluated five times (presumably because the partition can differ
    # between runs -- confirm).
    for count in range(5):   
        # Collect (node, node, weight) triples and order them from strongest to weakest.
        temp_nodes = []
        for nodea,nodeb, weight in tg.edges.data():
            temp_nodes.append((nodea,nodeb,weight['weight']))
        temp_nodes = sorted(temp_nodes, key=lambda kv: kv[2], reverse=True)

        # Keep the top ceil(len * v) + 1 edges.
        temp_nodes = temp_nodes[:math.ceil(len(temp_nodes)*v)+1]

        # The pruned graph is unweighted: only the endpoints are added.
        com_graph = nx.Graph()
        for nodea,nodeb, weight in temp_nodes:
            com_graph.add_edge(nodea,nodeb)

        partition = community.best_partition(com_graph)

        mod = community.modularity(partition, com_graph)
        # Accept an improvement only while it stays below 0.4, or if nothing was accepted yet.
        if mod > max_mod and (mod < 0.4 or max_mod==0):
            max_mod=mod
            mod_v = v
        print ("The pruning value 'v' and modularity is: ", v, mod)
#         if mod > 0.3:
#             flag=True
#             print ("Modularity reached 3. The pruning value 'v' is: ", v)
#             break
        # Zero modularity: rebuild once with the fixed fraction 0.15 and stop the whole search.
        # NOTE(review): `max_mod` / `mod_v` are not updated here, so the loop after the search
        # re-prunes with whatever `mod_v` currently holds (0 if no fraction was accepted).
        if mod==0:
            temp_nodes = []
            print ("Modularity reached 0. The pruning value 'v' is: ", 0.15)
            for nodea,nodeb, weight in tg.edges.data():
                temp_nodes.append((nodea,nodeb,weight['weight']))
            temp_nodes = sorted(temp_nodes, key=lambda kv: kv[2], reverse=True)

            temp_nodes = temp_nodes[:math.ceil(len(temp_nodes)*0.15)+1]

            com_graph = nx.Graph()
            for nodea,nodeb, weight in temp_nodes:
                com_graph.add_edge(nodea,nodeb)

            partition = community.best_partition(com_graph)

            mod = community.modularity(partition, com_graph)
            flag=True
            break
    if flag:
        print()
        break

# Rebuild the pruned graph with the selected fraction and re-partition, up to five times,
# stopping as soon as the modularity reaches the best value seen during the search.
# `com_graph`, `partition` and `mod` left by this loop are what the following cells use.
for count in range(5):
    temp_nodes = []
    for nodea,nodeb, weight in tg.edges.data():
        temp_nodes.append((nodea,nodeb,weight['weight']))
    temp_nodes = sorted(temp_nodes, key=lambda kv: kv[2], reverse=True)
    #print (len(temp_nodes), mod_v)
    temp_nodes = temp_nodes[:math.ceil(len(temp_nodes)*mod_v)+1]
    #print (len(temp_nodes))
    com_graph = nx.Graph()
    for nodea,nodeb, weight in temp_nodes:
        com_graph.add_edge(nodea,nodeb)

    partition = community.best_partition(com_graph)
    mod = community.modularity(partition, com_graph)
    #print (mod)
    if mod>=max_mod:
        break
print ("The final modularity is ", mod)

In [None]:
import community
import matplotlib.pyplot as plt

# Colour every node of the pruned graph by its community id (`partition` must still be the
# dict returned by `community.best_partition`).
values = [partition.get(node) for node in com_graph.nodes()]
plt.rcParams['figure.figsize']= [16, 10]
measure_name = "Louviin Algorithm Community Structure"
pos = nx.spring_layout(com_graph, k=0.2, iterations=20)
# The colormap size must be an integer (a float is rejected when the map is resampled);
# use one colour per four nodes, with at least one colour.
n_colors = max(1, len(com_graph.nodes()) // 4)
nodes_plot=nx.draw_networkx_nodes(com_graph, pos, node_size=140, label=True, cmap=plt.get_cmap('magma', n_colors),node_color=values, alpha=0.95)
edges_plot=nx.draw_networkx_edges(com_graph, pos, edge_color='r', alpha=0.1)
plt.title(measure_name, fontsize=22, fontname='Arial')
plt.colorbar(nodes_plot)
plt.axis('off')
plt.show()

In [None]:
# Display the modularity of the current partition on the pruned graph
# (requires `partition` to still be the dict returned by best_partition).
community.modularity(partition, com_graph)

In [None]:
# Turn the node -> community dict into a list of (node, community) pairs ordered by community.
# The guard makes the cell safe to re-run: once converted, the list is left untouched instead
# of failing on the missing `.items()`.
if isinstance(partition, dict):
    partition = sorted(partition.items(), key=lambda kv: kv[1], reverse=False)

In [None]:
# Print the sentences grouped by community; `partition` is the sorted (node, community) list.
current = 0
print ("--------------cluster " + str(0) + "------------ \n ")
for word, cluster in partition:
    if cluster == current:
        print (mod_texts[word] + "\n\n\n\n\n")
        continue
    # first sentence of a new community: emit the header before the text
    print ("--------------cluster " + str(cluster) + "------------ \n ")
    print (mod_texts[word])
    current = cluster

In [None]:
# to backlink to the documents: split every original document on its own, applying the same
# length rules as for the corpus (halve sentences above 250 words, drop those with 6 or fewer)

import sys, pickle
sys.path.append('../')
sys.path.append('../../../')
import text_preprocessing.preprocess as tp

doc_split = []
for t in texts_org:
    mod_texts_unfiltered_new = tp.preprocess(t, stop_words=False, remove_punct=False)
    mod_texts_new = []

    for sent in mod_texts_unfiltered_new:
        words = sent.split(' ')
        n_words = len(words)
        if n_words > 250:
            half = round(n_words/2)
            mod_texts_new.append(' '.join(words[:half]))
            mod_texts_new.append(' '.join(words[half:]))
        elif n_words > 6:
            mod_texts_new.append(sent)
    doc_split.append(mod_texts_new)


In [None]:
# Map every sentence to the index of the last document that contains it, built once instead
# of scanning all documents for each clustered sentence.
sentence_to_doc = {}
for index, doc_s in enumerate(doc_split):
    for sentence in doc_s:
        sentence_to_doc[sentence] = index

# Print, per community, the source document index of each sentence. Sentences that belong to
# no document are reported as -1 everywhere (previously 0 in one branch, which was
# indistinguishable from document 0).
current = 0
print ("--------------cluster " + str(0) + "------------ \n ")
for word, cluster in partition:
    if cluster != current:
        print ("--------------cluster " + str(cluster) + "------------ \n ")
        current = cluster
    print (sentence_to_doc.get(mod_texts[word], -1))

# without pruning (doesn't work w/ cosine similarity as weights)

In [None]:
# Copy every weighted edge of `tg` into a fresh graph (nodes without edges are not copied)
# and run community detection on it without any pruning.
com_graph_full = nx.Graph()
for source, target, attributes in tg.edges.data():
    com_graph_full.add_edge(source, target, weight=attributes['weight'])

partition = community.best_partition(com_graph_full)
mod = community.modularity(partition, com_graph_full)
print (mod)

In [None]:
import community
import matplotlib.pyplot as plt

# Colour every node of the unpruned graph by its community id (`partition` must still be the
# dict returned by `community.best_partition`).
values = [partition.get(node) for node in com_graph_full.nodes()]
plt.rcParams['figure.figsize']= [16, 10]
measure_name = "Louviin Algorithm Community Structure"
pos = nx.spring_layout(com_graph_full, k=0.2, iterations=20)
# The colormap size must be an integer (a float is rejected when the map is resampled);
# use one colour per four nodes, with at least one colour.
n_colors = max(1, len(com_graph_full.nodes()) // 4)
nodes_plot=nx.draw_networkx_nodes(com_graph_full, pos, node_size=140, label=True, cmap=plt.get_cmap('magma', n_colors),node_color=values, alpha=0.95)
edges_plot=nx.draw_networkx_edges(com_graph_full, pos, edge_color='r', alpha=0.1)
plt.title(measure_name, fontsize=22, fontname='Arial')
plt.colorbar(nodes_plot)
plt.axis('off')
plt.show()

In [None]:
# Turn the node -> community dict into a list of (node, community) pairs ordered by community.
# The guard makes the cell safe to re-run: once converted, the list is left untouched instead
# of failing on the missing `.items()`.
if isinstance(partition, dict):
    partition = sorted(partition.items(), key=lambda kv: kv[1], reverse=False)

In [None]:
# Print the sentences grouped by community; `partition` is the sorted (node, community) list.
current = 0
print ("--------------cluster " + str(0) + "------------ \n ")
for word, cluster in partition:
    if cluster == current:
        print (mod_texts[word] + "\n\n\n\n\n")
        continue
    # first sentence of a new community: emit the header before the text
    print ("--------------cluster " + str(cluster) + "------------ \n ")
    print (mod_texts[word])
    current = cluster

In [None]:
# Same computation as the earlier unpruned-graph cell: rebuild the weighted copy of `tg`
# and recompute the partition, which turns `partition` back into a dict.
com_graph_full = nx.Graph()
for source, target, attributes in tg.edges.data():
    com_graph_full.add_edge(source, target, weight=attributes['weight'])

partition = community.best_partition(com_graph_full)
mod = community.modularity(partition, com_graph_full)
print (mod)

# old approach (NA)

In [None]:
import sys, pickle
sys.path.append('../')
import text_preprocessing.preprocess as tp

# NOTE: unpickling executes code stored in the file -- only load pickles from a trusted source.
with open('../data/engg_text_06May19_recency.pkl','rb') as fp:
    file = pickle.load(fp)

# This cell replaces `texts`, `mod_texts` and `fv` from the sections above.
texts = '. '.join(file)

mod_texts_unfiltered = tp.preprocess(texts, stop_words=False, remove_punct=True)
mod_texts = []

# Only the first 500 sentences are considered; sentences above 250 words are halved and
# sentences with 6 words or fewer are dropped.
for sent in mod_texts_unfiltered[:500]:
    words = sent.split(' ')
    n_words = len(words)
    if n_words > 250:
        half = round(n_words/2)
        mod_texts.append(' '.join(words[:half]))
        mod_texts.append(' '.join(words[half:]))
    elif n_words > 6:
        mod_texts.append(sent)

print(len(mod_texts))

fv = {}
for index, sent in enumerate(mod_texts):
    fv[index] = getBERTFeatures(model1, sent, attn_head_idx=-1)
    print (index)

In [None]:
import networkx as nx

# `build_graph` is already defined in the graph-construction cell earlier in the notebook;
# the identical second definition that used to live here only shadowed it and was removed.
# Start a fresh sentence graph for the sentences of this section.
tg = build_graph(mod_texts)

In [None]:
# Debug output: score and print every ordered pair of sentences whose texts differ.
for (index1, sent1), (index2, sent2) in product(enumerate(mod_texts), repeat=2):
    if sent1 == sent2:
        continue
    score = getSentMatchScore_wfeature(sent1, sent2, fv[index1], fv[index2])
    print ("sentence 1 \n \n", sent1)
    print ("\n\nsentence 2 \n \n", sent2)
    print ("\n\n Score -> ", score, "\n\n\n\n\n")

In [None]:
from tqdm import tqdm
import time
attn_head_idx = -1
start = time.time()
end = 0
node_edge = []
runs = len(mod_texts)*(len(mod_texts)-1)

# Progress markers, kept at notebook level so they can be inspected if the loop aborts.
sent1counter = 0
sent2counter = 0
counter = 0
try:
    i=0
    # Add an unweighted edge for every ordered pair of distinct sentences scoring above 0.8.
    for index1, sent1 in enumerate(mod_texts):
        sent1counter = index1
        print (index1, time.time()-start)
        for index2, sent2 in enumerate(mod_texts):
            counter += 1
            sent2counter = index2
            if index1 == index2:
                continue
            score = getSentMatchScore_wfeature(sent1, sent2, fv[index1], fv[index2])
            if score > 0.8:
                tg.add_edge(index1,index2)
except RuntimeError as e:
    # Report the error together with the pair being processed when it happened.
    print (e)
    print (sent1counter, sent2counter)
    

In [None]:
from collections import Counter

# NOTE(review): `scores` is not defined by any cell visible above -- it has to be populated
# before this cell can run.
# Count how often each score (rounded to one decimal) occurs; a single Counter pass replaces
# calling `list.count` once per element.
scores_round = [round(i,1) for i in scores]
scores_freq = dict(Counter(scores_round))
scores_round = set(scores_freq.keys())
scores_round

In [None]:
# Display the (rounded score, count) pairs from the highest score to the lowest.
sorted(scores_freq.items(), key=lambda kv: kv[0], reverse=True)

In [None]:
# Index of the outer-loop sentence last reached by the edge-building cell.
sent1counter

In [None]:
# Number of ordered sentence pairs, len(mod_texts) * (len(mod_texts) - 1).
print (runs)

In [None]:
# Display the edges currently stored in the sentence graph.
tg.edges

In [None]:
import community
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches

# Community detection on the sentence graph: node -> community id.
# NOTE(review): no seed is passed here or to spring_layout below, so the
# partition and the layout may differ between runs — confirm whether
# reproducibility matters for this analysis.
partition = community.best_partition(tg)

# Colour each node by its community id, in tg.nodes() order.
values = [partition.get(node) for node in tg.nodes()]

plt.rcParams['figure.figsize']= [16, 10]
measure_name = "Louvain Algorithm Community Structure"
pos = nx.spring_layout(tg, k=0.2, iterations=20)

# The colormap size (lut) must be an integer; the previous float value
# (len / 4) is not a valid number of colours. Keep at least one colour.
n_colors = max(1, len(tg.nodes()) // 4)
nodes_plot=nx.draw_networkx_nodes(tg, pos, node_size=140, label=True, cmap=plt.get_cmap('magma', n_colors),node_color=values, alpha=0.95)
edges_plot=nx.draw_networkx_edges(tg, pos, edge_color='r', alpha=0.1)
plt.title(measure_name, fontsize=22, fontname='Arial')
plt.colorbar(nodes_plot)
plt.axis('off')
plt.show()

In [None]:
# Turn the node -> community mapping into (node, community) pairs ordered by
# community id. dict(...) accepts both the original mapping and the list of
# pairs this cell produces, so re-running the cell no longer fails on
# `list.items()`.
partition = sorted(dict(partition).items(), key=lambda kv: kv[1], reverse=False)


In [None]:
partition

In [None]:
# `partition` is iterated as (node, community) pairs; a banner is printed each
# time the community id differs from the previous one. The banner for
# community 0 is always printed up front.
current = 0
print ("--------------cluster " + str(0) + "------------ \n ")
for word, cluster in partition:
    if cluster!=current:
        print ("--------------cluster " + str(cluster) + "------------ \n ")
        current=cluster
    # `word` is a node id, i.e. a position in mod_texts.
    print (mod_texts[word])

In [None]:

# One list of node ids per community id 0-7. Nodes in any other community are
# silently ignored (only eight buckets exist).
cluster0, cluster1, cluster2, cluster3 = [], [], [], []
cluster4, cluster5, cluster6, cluster7 = [], [], [], []
# Nodes of community 0 are additionally recorded here with the constant 2.
cluster_dict = {}

_cluster_buckets = {
    0: cluster0, 1: cluster1, 2: cluster2, 3: cluster3,
    4: cluster4, 5: cluster5, 6: cluster6, 7: cluster7,
}
for sent, cluster in partition:
    bucket = _cluster_buckets.get(cluster)
    if bucket is None:
        continue
    bucket.append(sent)
    if cluster == 0:
        cluster_dict[sent] = 2

In [None]:
# Dump the eight cluster buckets in order: a banner line, then every member
# sentence (bucket entries are positions in mod_texts).
for cluster_no, members in enumerate((cluster0, cluster1, cluster2, cluster3,
                                      cluster4, cluster5, cluster6, cluster7)):
    print ("------------Cluster " + str(cluster_no) + "--------------- \n\n")
    for sent in members:
        print (mod_texts[sent] + "\n\n")

## Proof of concept on a Slack export (JSON)

In [None]:
import sys

sys.path.append('../')
from text_preprocessing import preprocess

from graphrank.graphrank import GraphRank

from graphrank.utils import GraphUtils
import networkx as nx
import json as js
import keyphrase_extraction as kp
import community
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches

In [None]:
import json

def getslacktext(location):
    """Load a JSON file and return its parsed content unchanged.

    In this notebook it is used on a Slack channel dump; no filtering or
    splitting of messages happens here.

    Args:
        location: path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (a list of message dicts
        for the export used below — presumably; verify against the data).
    """
    # JSON text is UTF-8; being explicit avoids depending on the platform's
    # default encoding when messages contain non-ASCII characters.
    with open(location, encoding='utf-8') as f:
        meeting = json.load(f)

    return meeting

text = getslacktext('engineering_6thMay2019.json')
len(text)

In [None]:
def getusermsg(text, userid = 'U9GLH098C'): #Venkat
    """Collect the longer sentences written by one user.

    Args:
        text: iterable of message dicts; each must carry a ``'type'`` key, and
            human messages carry ``'user'`` and ``'text'``.
        userid: id of the author whose messages are kept.

    Returns:
        List of preprocessed sentences with more than six space-separated
        tokens, taken from non-bot messages of ``userid`` with non-empty text.
    """
    collected = []
    for msg in text:
        # Only plain messages that have an author and did not come from a bot.
        if msg['type'] != 'message' or 'bot_id' in msg or 'user' not in msg:
            continue
        if msg['user'] != userid or msg['text'] == '':
            continue
        sentences = preprocess.preprocess(msg['text'], remove_punct=True,word_tokenize=False ,stop_words=False)
        if len(sentences) == 0:
            continue
        for sentence in sentences:
            if len(sentence.split(' ')) > 6:
                collected.append(sentence)
    return collected

def getusermsg_alluser(text): 
    """Collect the longer sentences of every human author, with their authors.

    Args:
        text: iterable of message dicts; each must carry a ``'type'`` key, and
            human messages carry ``'user'`` and ``'text'``.

    Returns:
        ``(usermsg, user)`` where ``usermsg`` is the list of preprocessed
        sentences with more than six space-separated tokens and ``user`` maps
        each sentence's position in ``usermsg`` to the id of its author.
    """
    usermsg = []
    user = {}
    for msg in text:
        # Only plain messages that have an author and did not come from a bot.
        if msg['type'] != 'message' or 'bot_id' in msg or 'user' not in msg:
            continue
        if msg['text'] == '':
            continue
        sentences = preprocess.preprocess(msg['text'], remove_punct=True,word_tokenize=False ,stop_words=False)
        if len(sentences) == 0:
            continue
        for sentence in sentences:
            if len(sentence.split(' ')) > 6:
                # The next free position in usermsg is the key for the author map.
                user[len(usermsg)] = msg['user']
                usermsg.append(sentence)
    return usermsg, user

#mod_texts = getusermsg(text,userid = 'U4QK2H8RL')[:20]
mod_texts, user_list = getusermsg_alluser(text)

In [None]:
mod_texts = mod_texts[:100]

In [None]:

# Report how many Slack sentences are about to be encoded.
print(len(mod_texts))
# Feature cache: position in mod_texts -> the value getBERTFeatures returns for that sentence.
# This replaces any fv built by an earlier cell.
fv = {}

for index, sent in enumerate(mod_texts):
    # attn_head_idx=-1 is forwarded unchanged; presumably it selects the last
    # layer/head inside getBERTFeatures — TODO confirm against its definition.
    fv[index] = getBERTFeatures(model1, sent, attn_head_idx=-1)
    # Progress trace: one line per encoded sentence.
    print (index)

In [None]:
import networkx as nx

def build_graph(doc_list):
    """Create a graph holding one edge-less node per document.

    NOTE(review): an identical ``build_graph`` is already defined in an
    earlier cell of this notebook; this definition silently replaces it.

    Args:
        doc_list: any sized collection of documents (only its length is used).

    Returns:
        The ``networkx.Graph`` with nodes ``0 .. len(doc_list) - 1``. If sizing
        ``doc_list`` or adding the nodes raises, the error is printed and the
        (possibly empty) graph is still returned.
    """
    graph = nx.Graph()
    try:
        node_count = len(doc_list)
        graph.add_nodes_from(range(node_count))
    except Exception as err:
        # Best-effort: report the problem but hand back whatever was built.
        print(err)
    return graph


tg = build_graph(mod_texts)

In [None]:
# Connect every pair of Slack sentences with a positive match score by an edge
# weighted with that score. All ordered pairs are visited, so each unordered
# pair is scored twice.
from tqdm import tqdm
import time
attn_head_idx = -1
# Wall-clock reference for the per-row progress print below.
start = time.time()
end = 0
node_edge = []
# Number of ordered pairs (index1 != index2) the double loop scores.
runs = len(mod_texts)*(len(mod_texts)-1)

# Last positions reached; printed if the loop aborts with a RuntimeError.
sent1counter = 0
sent2counter = 0
# Number of ordered pairs that produced an edge (score > 0).
counter = 0
try:
    i=0
    for index1, sent1 in enumerate(mod_texts):
        sent1counter=index1
        # Progress trace: outer index and seconds elapsed since `start`.
        print (index1, time.time()-start)
        for index2, sent2 in enumerate(mod_texts):
            sent2counter=index2
            if index1!=index2:
                # Fix: the second author must be the author of sent2. Passing
                # user_list[index1] twice made every pair look like it was
                # written by one and the same user.
                score = getSentMatchScore_wfeature_test(sent1, sent2, fv[index1], fv[index2], user_list[index1], user_list[index2])
                if score>0:
                    counter+=1
                    tg.add_edge(index1,index2,weight=score)
except RuntimeError as e:
    # Report the error and where the loops stopped; edges added so far stay in tg.
    print (e)
    print (sent1counter, sent2counter)
    

In [None]:
sorted(nx.betweenness_centrality(tg).items(), key=lambda kv: kv[1], reverse=True)

In [None]:
#tg.remove_node(1)

In [None]:
mod_texts[19]

In [None]:
print (len(mod_texts)*(len(mod_texts)-1))
(len(mod_texts)*(len(mod_texts)-1)) - counter

In [None]:
import community
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches

# Community detection on the Slack sentence graph: node -> community id.
# NOTE(review): no seed is passed here or to spring_layout below, so the
# partition and the layout may differ between runs — confirm whether
# reproducibility matters for this analysis.
partition = community.best_partition(tg)

# Colour each node by its community id, in tg.nodes() order.
values = [partition.get(node) for node in tg.nodes()]

plt.rcParams['figure.figsize']= [16, 10]
measure_name = "Louvain Algorithm Community Structure"
pos = nx.spring_layout(tg, k=0.2, iterations=20)

# The colormap size (lut) must be an integer; the previous float value
# (len / 4) is not a valid number of colours. Keep at least one colour.
n_colors = max(1, len(tg.nodes()) // 4)
nodes_plot=nx.draw_networkx_nodes(tg, pos, node_size=140, label=True, cmap=plt.get_cmap('magma', n_colors),node_color=values, alpha=0.95)
edges_plot=nx.draw_networkx_edges(tg, pos, edge_color='r', alpha=0.1)
plt.title(measure_name, fontsize=22, fontname='Arial')
plt.colorbar(nodes_plot)
plt.axis('off')
plt.show()

In [None]:
# Turn the node -> community mapping into (node, community) pairs ordered by
# community id. dict(...) accepts both the original mapping and the list of
# pairs this cell produces, so re-running the cell no longer fails on
# `list.items()`.
partition = sorted(dict(partition).items(), key=lambda kv: kv[1], reverse=False)


In [None]:
partition

In [None]:

# One list of node ids per community id 0-7. Nodes in any other community are
# silently ignored (only eight buckets exist).
cluster0, cluster1, cluster2, cluster3 = [], [], [], []
cluster4, cluster5, cluster6, cluster7 = [], [], [], []
# Nodes of community 0 are additionally recorded here with the constant 2.
cluster_dict = {}

_cluster_buckets = {
    0: cluster0, 1: cluster1, 2: cluster2, 3: cluster3,
    4: cluster4, 5: cluster5, 6: cluster6, 7: cluster7,
}
for sent, cluster in partition:
    bucket = _cluster_buckets.get(cluster)
    if bucket is None:
        continue
    bucket.append(sent)
    if cluster == 0:
        cluster_dict[sent] = 2

In [None]:
# Dump the eight cluster buckets in order: a banner line, then every member
# sentence (bucket entries are positions in mod_texts).
for cluster_no, members in enumerate((cluster0, cluster1, cluster2, cluster3,
                                      cluster4, cluster5, cluster6, cluster7)):
    print ("------------Cluster " + str(cluster_no) + "--------------- \n\n")
    for sent in members:
        print (mod_texts[sent] + "\n\n")

In [None]:
# Distinct authors of cluster 0, in first-seen order.
# (Naming is off by one: cluster1_user belongs to cluster 0, cluster2_user to cluster 1.)
cluster1_user = []
print ("------------Cluster 0--------------- \n\n")
for sent in cluster0:
    if user_list[sent] in cluster1_user:
        continue
    print (user_list[sent] + "\n\n")
    cluster1_user.append(user_list[sent])

# Distinct authors of cluster 1, in first-seen order.
cluster2_user = []
print ("------------Cluster 1--------------- \n\n")
for sent in cluster1:
    if user_list[sent] in cluster2_user:
        continue
    print (user_list[sent] + "\n\n")
    cluster2_user.append(user_list[sent])

# Authors that appear in both clusters.
print ("------ Cluster 0 - Cluster 1---------")
for i in cluster1_user:
    if i not in cluster2_user:
        continue
    print (i)

In [None]:
part ={}
for sent, cluster in partition:
    part[sent] = cluster

print (part[19],part[1],part[8],part[6],part[9])

In [None]:
for i in [19,1,8]:
    print (mod_texts[i])

In [None]:
# Debug dump: print the match score for every ordered pair of sentences,
# reusing the cached features in fv instead of re-encoding.
for index1, sent1 in enumerate(mod_texts):
    for index2, sent2 in enumerate(mod_texts):
        # NOTE(review): this compares sentence *text*, so two different positions
        # holding identical text are skipped as well; the edge-building cells
        # compare positions (index1 != index2) instead — confirm which is intended.
        if sent1!=sent2:
            score = getSentMatchScore_wfeature(sent1, sent2, fv[index1], fv[index2])
            print ("sentence 1 \n \n", sent1)
            print ("\n\nsentence 2 \n \n", sent2)
            print ("\n\n Score -> ", score, "\n\n\n\n\n")

In [None]:
# Load the pickled reference data; later cells read file['feature_vector'] and
# file['sentence'] from it.
# NOTE(review): absolute, machine-specific path — the cell only runs where this
# exact file exists.
# SECURITY: pickle.load can execute arbitrary code from the file; only load
# pickles from a trusted source.
with open('/home/ether/domain_mind/engineering/se_minds_new.pkl', 'rb') as f:
    file = pickle.load(f)
file

In [None]:
len(list(file['feature_vector'])[0])

In [None]:
# Assign every sentence to the single reference entry it matches best:
# sent_com maps reference index -> list of sentences whose top score is there.

# The reference columns do not change inside the loops; materialise them once
# instead of rebuilding both lists for every (sentence, reference) pair.
ref_vectors = list(file['feature_vector'])
ref_sentences = list(file['sentence'])

temp_score = []
sent_com = {}
max_index = 0
max_score = 0
# Ten buckets are pre-seeded so that empty ones still show up when printed.
for i in range(10):
    sent_com[i] = []

for index, sent in enumerate(mod_texts[:]):
    temp_score = []
    for index2, vec in enumerate(ref_vectors):
        # NOTE(review): this is the only call that passes model1 as a fifth
        # argument; every other call uses four — confirm against the signature
        # of getSentMatchScore_wfeature.
        temp_score.append(getSentMatchScore_wfeature(sent, ref_sentences[index2], fv[index], vec, model1))
    if not temp_score:
        # No reference entries: nothing to assign (max() would raise on an empty list).
        continue
    max_score = max(temp_score)
    max_index = temp_score.index(max_score)
    # setdefault: the best index may be >= 10 when there are more than ten
    # reference entries, which used to raise KeyError.
    sent_com.setdefault(max_index, []).append(sent)
    #print (max_index)
    #break
#for i in mod_texts[:10]:
        

In [None]:
for cluster, sent in sent_com.items():
    print ("--------------community " + str(cluster) + "------------")
    for sentence in sent:
        print (sentence)

In [None]:
list(file['sentence'])[9]

In [None]:
text ="The only thing I can think of is that I had both the production and staging builds installed at that time vs now."
#text = "It is a very good time to re consider game of thrones story line"

## Select the top 3 clusters, instead of only the max-score cluster, when calculating which clusters a sentence belongs to


In [None]:
# For every sentence keep the indices of its three best-scoring reference
# entries: sent_com maps sentence position -> [best, second, third].

# The reference columns do not change inside the loops; materialise them once
# instead of rebuilding both lists for every (sentence, reference) pair.
ref_vectors = list(file['feature_vector'])
ref_sentences = list(file['sentence'])

sent_com = {}
max_index = 0
max_score = 0
# Pre-seed 100 entries; positions beyond len(mod_texts) keep an empty list.
for i in range(100):
    sent_com[i] = []

for index, sent in enumerate(mod_texts[:]):
    temp_score = {}
    for index2, vec in enumerate(ref_vectors):
        temp_score[index2] = getSentMatchScore_wfeature(sent, ref_sentences[index2], fv[index], vec)
    # Highest score first; the dict keeps that order.
    temp_score = dict(sorted(temp_score.items(), key = lambda kv : kv[1], reverse=True))
    max_indexes = list(temp_score)[:3]
    print (max_indexes)
    sent_com[index] = max_indexes
    #print (max_index)
    #break
#for i in mod_texts[:10]:
        

In [None]:
# Report sentence pairs whose best-matching reference indices are the same set.
for i in sent_com.keys():
    for j in sent_com.keys():
        if i!=j:
            # Pre-seeded entries that never received indices are empty lists;
            # they would all "match" each other and have no sentence to print.
            if not sent_com[i] or not sent_com[j]:
                continue
            # Sort BOTH sides: the lists are ordered by score, so comparing a
            # sorted list with an unsorted one missed pairs that share the same
            # indices in a different score order.
            if sorted(sent_com[i])==sorted(sent_com[j]):
                print ("-------similar sentence--------")
                print (mod_texts[i])
                print ("\n")
                print (mod_texts[j])
            

## Select the top 2 clusters, instead of only the max-score cluster, when calculating which clusters a sentence belongs to


In [None]:
# For every sentence keep the indices of its two best-scoring reference
# entries: sent_com maps sentence position -> [best, second].

# The reference columns do not change inside the loops; materialise them once
# instead of rebuilding both lists for every (sentence, reference) pair.
ref_vectors = list(file['feature_vector'])
ref_sentences = list(file['sentence'])

sent_com = {}
max_index = 0
max_score = 0
# Pre-seed 100 entries; positions beyond len(mod_texts) keep an empty list.
for i in range(100):
    sent_com[i] = []

for index, sent in enumerate(mod_texts[:]):
    temp_score = {}
    for index2, vec in enumerate(ref_vectors):
        temp_score[index2] = getSentMatchScore_wfeature(sent, ref_sentences[index2], fv[index], vec)
    # Highest score first; the dict keeps that order.
    temp_score = dict(sorted(temp_score.items(), key = lambda kv : kv[1], reverse=True))
    max_indexes = list(temp_score)[:2]
    sent_com[index] = max_indexes
    #print (max_index)
    #break
#for i in mod_texts[:10]:
        

In [None]:
# Report sentence pairs whose best-matching reference indices are the same set.
for i in sent_com.keys():
    for j in sent_com.keys():
        if i!=j:
            # Pre-seeded entries that never received indices are empty lists;
            # they would all "match" each other and have no sentence to print.
            if not sent_com[i] or not sent_com[j]:
                continue
            # Sort BOTH sides: the lists are ordered by score, so comparing a
            # sorted list with an unsorted one missed pairs that share the same
            # indices in a different score order.
            if sorted(sent_com[i])==sorted(sent_com[j]):
                print ("-------similar sentence--------")
                print (mod_texts[i])
                print ("\n")
                print (mod_texts[j])
            

In [None]:
paragraph = "I have a couple S3 buckets. One for my static home page, one for holding images and one for holding the application version. As far as I know, ELB automatically creates the one for managing the application versions."

paragraph = preprocess.preprocess(paragraph, stop_words=False, remove_punct=False)

In [None]:
sent_vec = {}
for index, sentence in enumerate(paragraph):
    sent_vec[index] = getBERTFeatures(model1, sentence, attn_head_idx=-1)

In [None]:
score = {}
for index, sent in enumerate(paragraph):
    temp_score = {}
    for index2, vec in enumerate(list(file['feature_vector'])):
        temp_score[index2] = getSentMatchScore_wfeature(sent, list(file['sentence'])[index2], sent_vec[index], vec)
    score[index] = max(temp_score.values())

In [None]:
def get_norm_value(score):
    """Min-max normalise the values of ``score`` into the range [0, 1].

    Args:
        score: mapping whose values are numeric scores; the keys are ignored.

    Returns:
        Dict mapping each value's position (0, 1, ...) in ``score.values()``
        order to ``(value - min) / (max - min)``. When all values are equal,
        every position maps to 1.0. An empty ``score`` gives an empty dict.
    """
    values = list(score.values())
    if not values:
        # min()/max() would raise on an empty sequence.
        return {}
    min_score = min(values)
    max_score = max(values)
    if min_score == max_score:
        # Degenerate range: return the same position-keyed dict shape as the
        # general case (this branch used to return a list) and avoid dividing
        # by zero.
        return {index: 1.0 for index in range(len(values))}
    return {index: (s - min_score) / (max_score - min_score)
            for index, s in enumerate(values)}

def get_weighted_norm_value(score):
    """Scale the values of ``score`` so that they sum to 1.

    Args:
        score: mapping whose values are numeric scores; the keys are ignored.

    Returns:
        Dict mapping each value's position (0, 1, ...) in ``score.values()``
        order to ``value / sum(values)``. When the values sum to zero the
        weights fall back to a uniform ``1 / len(score)`` instead of dividing
        by zero. An empty ``score`` gives an empty dict.
    """
    values = list(score.values())
    tot_score = sum(values)
    if tot_score == 0:
        # No meaningful proportions exist; weight every entry equally.
        return {index: 1.0 / len(values) for index in range(len(values))}
    return {index: s / tot_score for index, s in enumerate(values)}

#weighted_score = list(get_weighted_norm_value(score).values())

In [None]:
# Scale each sentence vector by its normalised score.
# Fix 1: `paragraph` is no longer reset to [] here — that made the loop below
#        iterate over nothing, so no vector was ever weighted.
# Fix 2: `weighted_score` was never assigned (its only assignment is commented
#        out in the previous cell); derive it from `score` here.
# NOTE(review): sent_vec is updated in place, so running this cell twice
# applies the weights twice.
weighted_score = list(get_weighted_norm_value(score).values())
paragraph_vec = np.zeros((768,), dtype=float)
for index, sent in enumerate(paragraph):
    sent_vec[index] = list(np.array(sent_vec[index])*weighted_score[index])
    
#paragraph_vec = np.sum(np.array(sent_vec.values()))

In [None]:
for s in sent_vec.values():
    print (len(np.array(s)))
    paragraph_vec = np.add(paragraph_vec,np.array(s))
    break

In [None]:
paragraph_vec = np.zeros((768,1), dtype=float)
paragraph3_vec = np.array(list(file['feature_vector'])[0])
(paragraph_vec + paragraph3_vec).shape

In [None]:
np.array(list(file['feature_vector'])[0]).shape

In [None]:
def get_paragraph_vec(paragraph, file):
    """Build one vector for a paragraph as a score-weighted sum of its sentence vectors.

    Each sentence is encoded with ``getBERTFeatures``, scored against every
    reference entry in ``file`` (keeping its best score), and the best scores
    are normalised with ``get_weighted_norm_value`` into per-sentence weights.

    Args:
        paragraph: sequence of sentences (it is iterated twice, so it must not
            be a one-shot iterator).
        file: mapping with ``'feature_vector'`` and ``'sentence'`` entries of
            equal length holding the reference vectors and their sentences.

    Returns:
        The accumulated numpy array; its shape is also printed.

    Raises:
        ValueError: if ``file['feature_vector']`` is empty (no score to take
            the maximum of).
    """
    # Loop-invariant: materialise the reference columns once.
    ref_vectors = list(file['feature_vector'])
    ref_sentences = list(file['sentence'])

    sent_vec = {}
    for index, sentence in enumerate(paragraph):
        # NOTE(review): attn_head_idx=-2 here, while the other cells of this
        # notebook encode with -1 — confirm the difference is intended.
        sent_vec[index] = getBERTFeatures(model1, sentence, attn_head_idx=-2)

    # Best match score of each sentence against the reference set.
    score = {}
    for index, sent in enumerate(paragraph):
        score[index] = max(
            getSentMatchScore_wfeature(sent, ref_sentences[index2], sent_vec[index], vec)
            for index2, vec in enumerate(ref_vectors)
        )

    weighted_score = list(get_weighted_norm_value(score).values())

    # Fix: the argument used to be overwritten with `paragraph = []` at this
    # point, so the weighting loop never ran and the result was a plain,
    # unweighted sum. The weights are now applied while accumulating.
    # NOTE(review): the (768, 1) accumulator broadcasts against each sentence
    # vector; if getBERTFeatures returns shape (768,), the sum becomes
    # (768, 768) — confirm the feature shape.
    paragraph_vec = np.zeros((768,1), dtype=float)
    for index in sent_vec:
        paragraph_vec = np.add(paragraph_vec, np.array(sent_vec[index]) * weighted_score[index])

    print (paragraph_vec.shape)
    return paragraph_vec
    #paragraph_vec = np.sum(np.array(sent_vec.values()))

In [None]:
paragraph = "I have a couple S3 buckets. One for my static home page, one for holding images and one for holding the application version. As far as I know, ELB automatically creates the one for managing the application versions."
paragraph2 = "Having a development environment and a production running at the same time is easy, but it’s expensive. It doubles it, in fact. Therefore, I usually destroy the dev environment as soon as I’m done with it."
paragraph4 = " I actually got a couple of S3 buckets. One for my the home page, one for images and one for the application version.  I know that, ELB will automatically create the one to managing the application versions. "
paragraph3 = "You find out that Harvard Business Review looked into how personality traits factor into group dynamics. While each person should have a functional role within the group, they also have a less obvious psychological role. You already know what functional roles you need to hire for — a product manager, two engineers, an analyst, and a designer. While a functional role is of course important, HBR found that the psychological role someone has to play is just as important to a team’s viability and productivity. Through their research, HBR came up with five different personality traits that are imperative to group success."
paragraph = preprocess.preprocess(paragraph, stop_words=False, remove_punct=False)
paragraph2 = preprocess.preprocess(paragraph2, stop_words=False, remove_punct=False)
paragraph3 = preprocess.preprocess(paragraph3, stop_words=False, remove_punct=False)
paragraph4 = preprocess.preprocess(paragraph4, stop_words=False, remove_punct=False)

In [None]:
paragraph_fv = get_paragraph_vec(paragraph, file)
paragraph2_fv = get_paragraph_vec(paragraph2, file)
paragraph3_fv = get_paragraph_vec(paragraph3, file)
paragraph4_fv = get_paragraph_vec(paragraph4, file)

In [None]:
from scipy.spatial.distance import cosine

#cosine(list(paragraph_fv), list(paragraph2_fv))
1-cosine(paragraph_fv, paragraph3_fv)

In [None]:
paragraph2_fv.shape