In [4]:
def euclideanDistance(sample1,sample2):
    """Return the Euclidean (L2) distance between two 1-D samples.

    Both arguments may be any indexable sequence of real numbers (NumPy
    vectors, lists, tuples, ...). The number of coordinates compared is
    the length of ``sample1``.

    Args:
        sample1: First vector; its length determines how many coordinates
            are compared.
        sample2: Second vector; must provide at least as many elements as
            ``sample1``. Extra trailing elements are ignored.

    Returns:
        The distance as a Python ``float`` (``0.0`` for empty samples).

    Raises:
        IndexError: If ``sample2`` has fewer elements than ``sample1``.
    """
    import math

    # Each difference is converted to a Python float before squaring so the
    # accumulation happens in double precision regardless of the input dtype.
    squared_sum = math.fsum(
        float(value - sample2[i]) ** 2 for i, value in enumerate(sample1)
    )
    return math.sqrt(squared_sum)



def kMeans(features, input,k):
    """Pick representative sentence indices by clustering their features.

    The per-sentence feature arrays are flattened to one vector each and
    clustered with scikit-learn's ``KMeans``. For every cluster centre the
    sentence whose flattened vector is closest (Euclidean distance) is
    selected.

    Args:
        features: Array whose first axis indexes sentences; all remaining
            axes are flattened into a single feature vector per sentence.
        input: Unused; kept so existing callers keep working.
        k: Number of clusters to fit.

    Returns:
        Sorted list of unique sentence indices (Python ints). It can hold
        fewer than ``k`` entries when several centres share the same
        nearest sentence.
    """
    from sklearn.cluster import KMeans

    # One row per sentence; -1 flattens whatever trailing axes are present.
    flat_features = features.reshape((features.shape[0], -1))

    model = KMeans(n_clusters=k)
    model.fit(flat_features)

    nearest = set()  # a set so a sentence chosen by two centres appears once
    for center in model.cluster_centers_:
        distances = [
            euclideanDistance(center, sentence) for sentence in flat_features
        ]
        # min() returns the first minimum, i.e. the lowest index on ties.
        nearest.add(min(range(len(distances)), key=distances.__getitem__))

    return sorted(nearest)



def BERTFeatureExtraction(input, max_token_size):
    """Split text into sentences and embed each one with ``bert-base-uncased``.

    The text is split on ``"."``; pieces that are empty or whitespace-only
    are discarded. Each sentence is tokenized, wrapped in ``[CLS]``/``[SEP]``,
    padded with ``[PAD]`` (or truncated, with a printed warning) to exactly
    ``max_token_size`` tokens and passed through the BERT model. The last
    hidden state of every sentence is collected.

    Args:
        input: The text to process, as a single string.
        max_token_size: Fixed token length per sentence, including the
            ``[CLS]`` and ``[SEP]`` markers.

    Returns:
        A tuple ``(features, sentences)``. ``features`` is a NumPy array of
        shape ``(num_sentences, max_token_size, hidden_size)``. ``sentences``
        is the list of sentence strings without their trailing dot; leading
        whitespace from the split is preserved.

    Raises:
        ValueError: If ``max_token_size`` is smaller than 2, which leaves no
            room for the ``[CLS]`` and ``[SEP]`` markers.
    """
    import numpy as np
    import torch
    from transformers import BertModel, BertTokenizer

    if max_token_size < 2:
        raise ValueError("max_token_size must be at least 2 to hold [CLS] and [SEP]")

    # Filtering empty pieces (instead of blindly dropping the last one) keeps
    # a final sentence that has no trailing dot and skips gaps such as "..".
    sentences = [piece for piece in input.split(".") if piece.strip()]
    print("Number of sentences in the input text:", len(sentences))

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()

    features = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)

            if len(tokens) + 2 > max_token_size:
                print("Warning max_token_size is small. Change the parameter! Maybe be there is no dot in the input text.")
                # Truncate so every sentence yields the same fixed-size output.
                tokens = tokens[:max_token_size - 2]

            padded_tokens = ['[CLS]'] + tokens + ['[SEP]']
            padded_tokens += ['[PAD]'] * (max_token_size - len(padded_tokens))

            token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
            attn_mask = [0 if tok == '[PAD]' else 1 for tok in padded_tokens]

            token_ids = torch.tensor(token_ids).unsqueeze(0)
            attn_mask = torch.tensor(attn_mask).unsqueeze(0)

            # Index instead of tuple-unpacking: works for both the legacy
            # tuple return value and the newer ModelOutput object.
            outputs = model(token_ids, attention_mask=attn_mask)
            hidden_repr = outputs[0]

            # Drop the batch dimension: (1, tokens, hidden) -> (tokens, hidden)
            features.append(hidden_repr[0].detach().numpy())

    if not features:
        # No sentences found: return an empty array with the expected rank.
        empty = np.empty((0, max_token_size, model.config.hidden_size), dtype=np.float32)
        return empty, sentences

    return np.stack(features), sentences


#MAIN************************************************************************

# Demo text to summarise; it is replaced below by the list of its sentences.
input = (
    "The subject of the first sentence is animals. "
    "Protecting animals is a very important topic. "
    "For instance, dog is a type of animal. "
    "Also, another topic is natural language processing. "
    "This topic is covered by the computer science but electrical "
    "and electronics engineers can do it too."
)

# Fixed token length per sentence and number of clusters to fit.
max_token_size, k = 50, 2

# Text in -> per-sentence feature array plus the list of sentence strings.
features, input = BERTFeatureExtraction(input, max_token_size)

# Indices of the sentences selected by the clustering step.
idx = kMeans(features, input, k)

# Re-attach the dot that was removed when the text was split into sentences.
summary = [f"{input[position]}." for position in idx]
for sentence in summary:
    print(sentence)

Number of sentences in the input text: 5
The subject of the first sentence is animals.
 This topic is covered by the computer science but electrical and electronics engineers can do it too.
