<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/BERT_Application_STS_Quora_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q sentence-transformers

[K     |████████████████████████████████| 71kB 2.8MB/s 
[K     |████████████████████████████████| 778kB 13.0MB/s 
[K     |████████████████████████████████| 1.1MB 34.8MB/s 
[K     |████████████████████████████████| 890kB 39.1MB/s 
[K     |████████████████████████████████| 3.0MB 44.9MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import urllib.request
import zipfile
import os
import requests
from torch import Tensor, device
from typing import Tuple, List
from tqdm import tqdm
import sys
import importlib
import os
import torch
import numpy as np
import queue
from sentence_transformers import SentenceTransformer

In [3]:

# Download target: the parent directory of the resolved '/content/drive' path
folder_path = os.path.dirname(os.path.realpath('/content/drive/'))
print('Beginning download of datasets')

server = "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/"
datasets = [
    'AllNLI.zip',
    'stsbenchmark.zip',
    'wikipedia-sections-triplets.zip',
    'STS2017.en-de.txt.gz',
    'TED2013-en-de.txt.gz',
    'xnli-en-de.txt.gz',
]

for dataset in datasets:
    print("Download", dataset)
    url = server + dataset
    dataset_path = os.path.join(folder_path, dataset)
    urllib.request.urlretrieve(url, dataset_path)

    # Only zip archives are unpacked; every other file is kept exactly as downloaded
    if not dataset.endswith('.zip'):
        continue

    print("Extract", dataset)
    with zipfile.ZipFile(dataset_path, "r") as zip_ref:
        zip_ref.extractall(folder_path)
    # The archive itself is deleted once its contents have been extracted
    os.remove(dataset_path)


print("All datasets downloaded and extracted")


Beginning download of datasets
Download AllNLI.zip
Extract AllNLI.zip
Download stsbenchmark.zip
Extract stsbenchmark.zip
Download wikipedia-sections-triplets.zip
Extract wikipedia-sections-triplets.zip
Download STS2017.en-de.txt.gz
Download TED2013-en-de.txt.gz
Download xnli-en-de.txt.gz
All datasets downloaded and extracted


In [8]:

def pytorch_cos_sim(a: Tensor, b: Tensor):
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    This function can be used as a faster replacement for 1-scipy.spatial.distance.cdist(a,b)

    :param a: Tensor with one vector per row. A 1-D input is treated as a single row.
        Non-tensor inputs (e.g. lists of floats or numpy arrays) are converted with torch.as_tensor.
    :param b: Same as ``a``; its vectors must have the same dimension as those of ``a``.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j]). A zero-norm row yields a similarity
        of 0 instead of NaN.
    """
    if not isinstance(a, Tensor):
        a = torch.as_tensor(a)

    if not isinstance(b, Tensor):
        b = torch.as_tensor(b)

    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    # normalize() clamps the norm with a small eps, so all-zero rows do not divide by zero
    a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
    b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
    return torch.mm(a_norm, b_norm.transpose(0, 1))



def paraphrase_mining(model,
                      sentences: List[str],
                      show_progress_bar=False,
                      batch_size=32,
                      query_chunk_size: int = 5000,
                      corpus_chunk_size: int = 100000,
                      max_pairs: int = 500000,
                      top_k: int = 100):
    """
    Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
    other sentences and returns a list with the pairs that have the highest cosine similarity score.

    :param model: SentenceTransformer model for embedding computation
    :param sentences: A list of strings (texts or sentences)
    :param show_progress_bar: Plotting of a progress bar
    :param batch_size: Number of texts that are encoded simultaneously by the model
    :param query_chunk_size: Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time).
    :param corpus_chunk_size: Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time).
    :param max_pairs: Maximal number of text pairs returned.
    :param top_k: For each sentence, we retrieve up to top_k other sentences
    :return: Returns a list of triplets with the format [score, id1, id2], sorted by decreasing score
    """

    top_k += 1  # A sentence has the highest similarity to itself. Increase +1 as we are interested in distinct pairs

    # Compute embedding for the sentences
    embeddings = model.encode(sentences, show_progress_bar=show_progress_bar, batch_size=batch_size,
                              convert_to_tensor=True)

    # Mine for duplicates. The priority queue pops the lowest score first, which lets us
    # evict the weakest candidate once max_pairs candidates have been collected.
    pairs = queue.PriorityQueue()
    min_score = -1
    num_added = 0

    for corpus_start_idx in range(0, len(embeddings), corpus_chunk_size):
        corpus_end_idx = min(corpus_start_idx + corpus_chunk_size, len(embeddings))
        for query_start_idx in range(0, len(embeddings), query_chunk_size):
            query_end_idx = min(query_start_idx + query_chunk_size, len(embeddings))

            # Rows are queries, columns are corpus entries of the current chunk
            cos_scores = pytorch_cos_sim(embeddings[query_start_idx:query_end_idx],
                                         embeddings[corpus_start_idx:corpus_end_idx]).cpu().numpy()
            cos_scores = np.nan_to_num(cos_scores)

            # argpartition works along the last (corpus) axis, so kth must be bounded by the
            # number of columns - not by the number of query rows.
            kth = min(cos_scores.shape[1] - 1, top_k)
            cos_score_argpartition = np.argpartition(-cos_scores, kth)

            for query_itr in range(len(cos_scores)):
                for corpus_itr in cos_score_argpartition[query_itr][0:top_k]:
                    i = query_start_idx + query_itr
                    j = corpus_start_idx + corpus_itr

                    if i != j and cos_scores[query_itr][corpus_itr] > min_score:
                        pairs.put((cos_scores[query_itr][corpus_itr], i, j))
                        num_added += 1

                        if num_added >= max_pairs:
                            # Drop the currently weakest pair and raise the admission threshold
                            entry = pairs.get()
                            min_score = entry[0]

    # Get the pairs
    added_pairs = set()  # Used for duplicate detection: (i, j) and (j, i) are the same pair
    pairs_list = []
    while not pairs.empty():
        score, i, j = pairs.get()
        sorted_i, sorted_j = sorted([i, j])

        if sorted_i != sorted_j and (sorted_i, sorted_j) not in added_pairs:
            added_pairs.add((sorted_i, sorted_j))
            pairs_list.append([score, i, j])

    # Highest scores first
    pairs_list = sorted(pairs_list, key=lambda x: x[0], reverse=True)
    return pairs_list


def information_retrieval(*args, **kwargs):
    """Deprecated alias kept for older callers: forwards every argument to semantic_search."""
    search_results = semantic_search(*args, **kwargs)
    return search_results


def semantic_search(query_embeddings: Tensor,
                      corpus_embeddings: Tensor,
                      query_chunk_size: int = 100,
                      corpus_chunk_size: int = 100000,
                      top_k: int = 10):
    """
    This function performs a cosine similarity search between a list of query embeddings  and a list of corpus embeddings.
    It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries.

    :param query_embeddings: A 2 dimensional tensor with the query embeddings. A numpy array, a list of tensors or a single 1 dimensional embedding is accepted as well.
    :param corpus_embeddings: A 2 dimensional tensor with the corpus embeddings. A numpy array, a list of tensors or a single 1 dimensional embedding is accepted as well.
    :param query_chunk_size: Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory.
    :param corpus_chunk_size: Scans the corpus 100k entries at a time. Increasing that value increases the speed, but requires more memory.
    :param top_k: Retrieve top k matching entries. Up to top_k candidates are collected per corpus chunk; the merged list is cut back to top_k per query.
    :return: One list per query, sorted by decreasing cosine similarity score. Entries are dictionaries with the keys 'corpus_id' and 'score'
    """

    if isinstance(query_embeddings, (np.ndarray, np.generic)):
        query_embeddings = torch.from_numpy(query_embeddings)
    elif isinstance(query_embeddings, list):
        query_embeddings = torch.stack(query_embeddings)

    if len(query_embeddings.shape) == 1:
        query_embeddings = query_embeddings.unsqueeze(0)

    if isinstance(corpus_embeddings, (np.ndarray, np.generic)):
        corpus_embeddings = torch.from_numpy(corpus_embeddings)
    elif isinstance(corpus_embeddings, list):
        corpus_embeddings = torch.stack(corpus_embeddings)

    # A single corpus embedding is treated as a corpus with one entry
    if len(corpus_embeddings.shape) == 1:
        corpus_embeddings = corpus_embeddings.unsqueeze(0)

    # torch.mm needs both operands on the same device
    if query_embeddings.device != corpus_embeddings.device:
        query_embeddings = query_embeddings.to(corpus_embeddings.device)

    #Normalize scores, so that the dot-product is equivalent to cosine similarity
    query_embeddings = query_embeddings / query_embeddings.norm(dim=1)[:, None]
    corpus_embeddings = corpus_embeddings / corpus_embeddings.norm(dim=1)[:, None]

    queries_result_list = [[] for _ in range(len(query_embeddings))]

    for query_start_idx in range(0, len(query_embeddings), query_chunk_size):
        query_end_idx = min(query_start_idx + query_chunk_size, len(query_embeddings))

        # Iterate over chunks of the corpus
        for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
            corpus_end_idx = min(corpus_start_idx + corpus_chunk_size, len(corpus_embeddings))

            # Compute cosine similarities; NaN (from zero-norm vectors) is mapped to 0
            cos_scores = torch.mm(query_embeddings[query_start_idx:query_end_idx], corpus_embeddings[corpus_start_idx:corpus_end_idx].transpose(0, 1)).cpu().numpy()
            cos_scores = np.nan_to_num(cos_scores)

            # Partial sort scores: the first top_k columns hold the best entries of this chunk (unordered)
            cos_score_argpartition = np.argpartition(-cos_scores, min(top_k, len(cos_scores[0])-1))[:, 0:top_k]

            for query_itr in range(len(cos_scores)):
                for sub_corpus_id in cos_score_argpartition[query_itr]:
                    # Translate chunk-local indices back to global ids
                    corpus_id = corpus_start_idx + sub_corpus_id
                    query_id = query_start_idx + query_itr
                    score = cos_scores[query_itr][sub_corpus_id]
                    queries_result_list[query_id].append({'corpus_id': corpus_id, 'score': score})

    #Sort and strip to top_k results
    for idx in range(len(queries_result_list)):
        queries_result_list[idx] = sorted(queries_result_list[idx], key=lambda x: x['score'], reverse=True)
        queries_result_list[idx] = queries_result_list[idx][0:top_k]

    return queries_result_list


def http_get(url, path):
    """
    Downloads a URL to a given path on disc.

    The response body is streamed into ``path + "_part"`` and renamed to ``path`` only after
    the whole body has been written, so ``path`` never holds a partially downloaded file.

    :param url: URL to download
    :param path: Destination file path
    :raises requests.HTTPError: raised by ``raise_for_status`` for error responses. Any other
        non-200 status is reported on stderr and nothing is written.
    """
    download_filepath = path+"_part"

    # The context manager releases the streamed connection even if the download fails
    with requests.get(url, stream=True) as req:
        if req.status_code != 200:
            print("Exception when trying to download {}. Response {}".format(url, req.status_code), file=sys.stderr)
            req.raise_for_status()
            return

        content_length = req.headers.get('Content-Length')
        total = int(content_length) if content_length is not None else None

        # Both the file and the progress bar are closed on success and on error
        with open(download_filepath, "wb") as file_binary, \
                tqdm(unit="B", total=total, unit_scale=True) as progress:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk: # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    file_binary.write(chunk)

    os.rename(download_filepath, path)


def batch_to_device(batch, target_device: device):
    """
    Moves a pytorch batch to the given device (CPU/GPU).

    Every value of every feature dict in batch['features'] is replaced in place by its
    ``.to(target_device)`` result; batch['labels'] is moved as well.

    :return: Tuple (features, labels)
    """
    features = batch['features']
    for sentence_features in features:
        for feature_name in sentence_features:
            moved = sentence_features[feature_name].to(target_device)
            sentence_features[feature_name] = moved

    return features, batch['labels'].to(target_device)


def fullname(o):
    """
    Builds the full name (package_name.class_name) of the class of the given object.
    Intended for loading the correct classes from JSON files. Classes that live in the
    builtins module are reported by their bare class name.
    """
    klass = o.__class__
    module = klass.__module__
    builtins_module = str.__class__.__module__

    if module is not None and module != builtins_module:
        return module + '.' + klass.__name__
    return klass.__name__  # Avoid reporting __builtin__

def import_from_string(dotted_path):
    """
    Resolve a dotted path such as "package.module.Name": import everything before the
    last dot as a module and return the attribute/class named after it.
    Raises ImportError if the path contains no dot or the module lacks that attribute.
    """
    try:
        pieces = dotted_path.rsplit('.', 1)
        module_path, class_name = pieces
    except ValueError:
        raise ImportError("%s doesn't look like a module path" % dotted_path)

    module = importlib.import_module(module_path)

    try:
        target = getattr(module, class_name)
    except AttributeError:
        raise ImportError('Module "%s" does not define a "%s" attribute/class' % (module_path, class_name))
    return target

In [None]:
"""
This script contains an example how to perform semantic search with PyTorch.

As dataset, we use the Quora Duplicate Questions dataset, which contains about 500k questions:
https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs

Questions are embedded and PyTorch is used for semantic similarity search.
"""
import os
import csv
import pickle
import time

model_name = 'distilbert-base-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)

url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 100000


# The cache file name encodes the model and the corpus size, so different settings do not collide
embedding_cache_path = 'quora-embeddings-{}-size-{}.pkl'.format(model_name.replace('/', '_'), max_corpus_size)


#Check if embedding cache path exists
if not os.path.exists(embedding_cache_path):
    # Check if the dataset exists. If not, download and extract
    # Download dataset if needed
    if not os.path.exists(dataset_path):
        print("Download dataset")
        http_get(url, dataset_path)

    # Get all unique sentences from the file (stop as soon as max_corpus_size is reached)
    corpus_sentences = set()
    with open(dataset_path, encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            corpus_sentences.add(row['question1'])
            if len(corpus_sentences) >= max_corpus_size:
                break

            corpus_sentences.add(row['question2'])
            if len(corpus_sentences) >= max_corpus_size:
                break

    corpus_sentences = list(corpus_sentences)
    print("Encode the corpus. This might take a while")
    corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_tensor=True)

    # Sentences and embeddings are stored together so that their indices stay aligned
    print("Store file on disc")
    with open(embedding_cache_path, "wb") as fOut:
        pickle.dump({'sentences': corpus_sentences, 'embeddings': corpus_embeddings}, fOut)
else:
    print("Load pre-computed embeddings from disc")
    # NOTE(security): pickle.load can execute arbitrary code - only load cache files created by this notebook
    with open(embedding_cache_path, "rb") as fIn:
        cache_data = pickle.load(fIn)
        corpus_sentences = cache_data['sentences'][0:max_corpus_size]
        corpus_embeddings = cache_data['embeddings'][0:max_corpus_size]

###############################
print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))

# Interactive query loop. It ends on an empty line, so the cells below can still run
# without having to interrupt the kernel.
print("Submit an empty line to stop asking questions.")
while True:
    try:
        inp_question = input("Please enter a question: ")
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the prompt was interrupted
        break

    if not inp_question.strip():
        break

    start_time = time.time()
    question_embedding = model.encode(inp_question, convert_to_tensor=True)
    hits = semantic_search(question_embedding, corpus_embeddings)
    end_time = time.time()
    hits = hits[0]  #Get the hits for the first query

    print("Input question:", inp_question)
    print("Results (after {:.3f} seconds):".format(end_time-start_time))
    for hit in hits[0:5]:
        print("\t{:.3f}\t{}".format(hit['score'], corpus_sentences[hit['corpus_id']]))

    print("\n\n========\n")

100%|██████████| 245M/245M [00:19<00:00, 12.7MB/s]


Download dataset


100%|██████████| 58.2M/58.2M [00:01<00:00, 57.5MB/s]


Encode the corpus. This might take a while


HBox(children=(FloatProgress(value=0.0, description='Batches', max=12500.0, style=ProgressStyle(description_wi…


Store file on disc
Corpus loaded with 100000 sentences / embeddings
Please enter a question: Whats the best python course?
Input question: Whats the best python course?
Results (after 0.306 seconds):
	0.945	What are the best python online courses for beginners?
	0.889	What are the best learning sites for Python?
	0.878	What are best site for learning python?
	0.868	How do I learn Python systematically?
	0.867	Where should I start at to learn about how to do Python?



Please enter a question: O que é SQL?
Input question: O que é SQL?
Results (after 0.329 seconds):
	0.667	How do I learn SQL?
	0.666	What is quoro?
	0.661	Does Quora stand for QUestion OR Answer?
	0.659	What doe Quora mean?
	0.659	What Quora mean?



Please enter a question: Itau
Input question: Itau
Results (after 0.280 seconds):
	0.549	Can you start a sentence with "it"?
	0.542	What are some things new employees should know going into their first day at Itron?
	0.534	What's the difference between IT and CSE?
	0.528	Whic

In [None]:
from sentence_transformers import SentenceTransformer
# Load the pretrained 'bert-base-nli-mean-tokens' SentenceTransformer.
# NOTE(review): this rebinds `model`, replacing the Quora ranking model loaded in the cell above.
model = SentenceTransformer('bert-base-nli-mean-tokens')


100%|██████████| 405M/405M [00:27<00:00, 14.8MB/s]


In [None]:
import requests
from torch import Tensor, device
from typing import Tuple, List
from tqdm import tqdm
import sys
import importlib
import os
import torch
import numpy as np
import queue

def pytorch_cos_sim(a: Tensor, b: Tensor):
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    This function can be used as a faster replacement for 1-scipy.spatial.distance.cdist(a,b)

    :param a: Tensor with one vector per row. A 1-D input is treated as a single row.
        Non-tensor inputs (e.g. lists of floats or numpy arrays) are converted with torch.as_tensor.
    :param b: Same as ``a``; its vectors must have the same dimension as those of ``a``.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j]). A zero-norm row yields a similarity
        of 0 instead of NaN.
    """
    if not isinstance(a, Tensor):
        a = torch.as_tensor(a)

    if not isinstance(b, Tensor):
        b = torch.as_tensor(b)

    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    # normalize() clamps the norm with a small eps, so all-zero rows do not divide by zero
    a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
    b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
    return torch.mm(a_norm, b_norm.transpose(0, 1))



def paraphrase_mining(model,
                      sentences: List[str],
                      show_progress_bar=False,
                      batch_size=32,
                      query_chunk_size: int = 5000,
                      corpus_chunk_size: int = 100000,
                      max_pairs: int = 500000,
                      top_k: int = 100):
    """
    Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all
    other sentences and returns a list with the pairs that have the highest cosine similarity score.

    :param model: SentenceTransformer model for embedding computation
    :param sentences: A list of strings (texts or sentences)
    :param show_progress_bar: Plotting of a progress bar
    :param batch_size: Number of texts that are encoded simultaneously by the model
    :param query_chunk_size: Search for most similar pairs for #query_chunk_size at the same time. Decrease, to lower memory footprint (increases run-time).
    :param corpus_chunk_size: Compare a sentence simultaneously against #corpus_chunk_size other sentences. Decrease, to lower memory footprint (increases run-time).
    :param max_pairs: Maximal number of text pairs returned.
    :param top_k: For each sentence, we retrieve up to top_k other sentences
    :return: Returns a list of triplets with the format [score, id1, id2], sorted by decreasing score
    """

    top_k += 1  # A sentence has the highest similarity to itself. Increase +1 as we are interested in distinct pairs

    # Compute embedding for the sentences
    embeddings = model.encode(sentences, show_progress_bar=show_progress_bar, batch_size=batch_size,
                              convert_to_tensor=True)

    # Mine for duplicates. The priority queue pops the lowest score first, which lets us
    # evict the weakest candidate once max_pairs candidates have been collected.
    pairs = queue.PriorityQueue()
    min_score = -1
    num_added = 0

    for corpus_start_idx in range(0, len(embeddings), corpus_chunk_size):
        corpus_end_idx = min(corpus_start_idx + corpus_chunk_size, len(embeddings))
        for query_start_idx in range(0, len(embeddings), query_chunk_size):
            query_end_idx = min(query_start_idx + query_chunk_size, len(embeddings))

            # Rows are queries, columns are corpus entries of the current chunk
            cos_scores = pytorch_cos_sim(embeddings[query_start_idx:query_end_idx],
                                         embeddings[corpus_start_idx:corpus_end_idx]).cpu().numpy()
            cos_scores = np.nan_to_num(cos_scores)

            # argpartition works along the last (corpus) axis, so kth must be bounded by the
            # number of columns - not by the number of query rows.
            kth = min(cos_scores.shape[1] - 1, top_k)
            cos_score_argpartition = np.argpartition(-cos_scores, kth)

            for query_itr in range(len(cos_scores)):
                for corpus_itr in cos_score_argpartition[query_itr][0:top_k]:
                    i = query_start_idx + query_itr
                    j = corpus_start_idx + corpus_itr

                    if i != j and cos_scores[query_itr][corpus_itr] > min_score:
                        pairs.put((cos_scores[query_itr][corpus_itr], i, j))
                        num_added += 1

                        if num_added >= max_pairs:
                            # Drop the currently weakest pair and raise the admission threshold
                            entry = pairs.get()
                            min_score = entry[0]

    # Get the pairs
    added_pairs = set()  # Used for duplicate detection: (i, j) and (j, i) are the same pair
    pairs_list = []
    while not pairs.empty():
        score, i, j = pairs.get()
        sorted_i, sorted_j = sorted([i, j])

        if sorted_i != sorted_j and (sorted_i, sorted_j) not in added_pairs:
            added_pairs.add((sorted_i, sorted_j))
            pairs_list.append([score, i, j])

    # Highest scores first
    pairs_list = sorted(pairs_list, key=lambda x: x[0], reverse=True)
    return pairs_list


def information_retrieval(*args, **kwargs):
    """Deprecated alias kept for older callers: forwards every argument to semantic_search."""
    search_results = semantic_search(*args, **kwargs)
    return search_results


def semantic_search(query_embeddings: Tensor,
                      corpus_embeddings: Tensor,
                      query_chunk_size: int = 100,
                      corpus_chunk_size: int = 100000,
                      top_k: int = 10):
    """
    This function performs a cosine similarity search between a list of query embeddings  and a list of corpus embeddings.
    It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries.

    :param query_embeddings: A 2 dimensional tensor with the query embeddings. A numpy array, a list of tensors or a single 1 dimensional embedding is accepted as well.
    :param corpus_embeddings: A 2 dimensional tensor with the corpus embeddings. A numpy array, a list of tensors or a single 1 dimensional embedding is accepted as well.
    :param query_chunk_size: Process 100 queries simultaneously. Increasing that value increases the speed, but requires more memory.
    :param corpus_chunk_size: Scans the corpus 100k entries at a time. Increasing that value increases the speed, but requires more memory.
    :param top_k: Retrieve top k matching entries. Up to top_k candidates are collected per corpus chunk; the merged list is cut back to top_k per query.
    :return: One list per query, sorted by decreasing cosine similarity score. Entries are dictionaries with the keys 'corpus_id' and 'score'
    """

    if isinstance(query_embeddings, (np.ndarray, np.generic)):
        query_embeddings = torch.from_numpy(query_embeddings)
    elif isinstance(query_embeddings, list):
        query_embeddings = torch.stack(query_embeddings)

    if len(query_embeddings.shape) == 1:
        query_embeddings = query_embeddings.unsqueeze(0)

    if isinstance(corpus_embeddings, (np.ndarray, np.generic)):
        corpus_embeddings = torch.from_numpy(corpus_embeddings)
    elif isinstance(corpus_embeddings, list):
        corpus_embeddings = torch.stack(corpus_embeddings)

    # A single corpus embedding is treated as a corpus with one entry
    if len(corpus_embeddings.shape) == 1:
        corpus_embeddings = corpus_embeddings.unsqueeze(0)

    # torch.mm needs both operands on the same device
    if query_embeddings.device != corpus_embeddings.device:
        query_embeddings = query_embeddings.to(corpus_embeddings.device)

    #Normalize scores, so that the dot-product is equivalent to cosine similarity
    query_embeddings = query_embeddings / query_embeddings.norm(dim=1)[:, None]
    corpus_embeddings = corpus_embeddings / corpus_embeddings.norm(dim=1)[:, None]

    queries_result_list = [[] for _ in range(len(query_embeddings))]

    for query_start_idx in range(0, len(query_embeddings), query_chunk_size):
        query_end_idx = min(query_start_idx + query_chunk_size, len(query_embeddings))

        # Iterate over chunks of the corpus
        for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
            corpus_end_idx = min(corpus_start_idx + corpus_chunk_size, len(corpus_embeddings))

            # Compute cosine similarities; NaN (from zero-norm vectors) is mapped to 0
            cos_scores = torch.mm(query_embeddings[query_start_idx:query_end_idx], corpus_embeddings[corpus_start_idx:corpus_end_idx].transpose(0, 1)).cpu().numpy()
            cos_scores = np.nan_to_num(cos_scores)

            # Partial sort scores: the first top_k columns hold the best entries of this chunk (unordered)
            cos_score_argpartition = np.argpartition(-cos_scores, min(top_k, len(cos_scores[0])-1))[:, 0:top_k]

            for query_itr in range(len(cos_scores)):
                for sub_corpus_id in cos_score_argpartition[query_itr]:
                    # Translate chunk-local indices back to global ids
                    corpus_id = corpus_start_idx + sub_corpus_id
                    query_id = query_start_idx + query_itr
                    score = cos_scores[query_itr][sub_corpus_id]
                    queries_result_list[query_id].append({'corpus_id': corpus_id, 'score': score})

    #Sort and strip to top_k results
    for idx in range(len(queries_result_list)):
        queries_result_list[idx] = sorted(queries_result_list[idx], key=lambda x: x['score'], reverse=True)
        queries_result_list[idx] = queries_result_list[idx][0:top_k]

    return queries_result_list


def http_get(url, path):
    """
    Downloads a URL to a given path on disc.

    The response body is streamed into ``path + "_part"`` and renamed to ``path`` only after
    the whole body has been written, so ``path`` never holds a partially downloaded file.

    :param url: URL to download
    :param path: Destination file path
    :raises requests.HTTPError: raised by ``raise_for_status`` for error responses. Any other
        non-200 status is reported on stderr and nothing is written.
    """
    download_filepath = path+"_part"

    # The context manager releases the streamed connection even if the download fails
    with requests.get(url, stream=True) as req:
        if req.status_code != 200:
            print("Exception when trying to download {}. Response {}".format(url, req.status_code), file=sys.stderr)
            req.raise_for_status()
            return

        content_length = req.headers.get('Content-Length')
        total = int(content_length) if content_length is not None else None

        # Both the file and the progress bar are closed on success and on error
        with open(download_filepath, "wb") as file_binary, \
                tqdm(unit="B", total=total, unit_scale=True) as progress:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk: # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    file_binary.write(chunk)

    os.rename(download_filepath, path)


def batch_to_device(batch, target_device: device):
    """
    Moves a pytorch batch to the given device (CPU/GPU).

    Every value of every feature dict in batch['features'] is replaced in place by its
    ``.to(target_device)`` result; batch['labels'] is moved as well.

    :return: Tuple (features, labels)
    """
    features = batch['features']
    for sentence_features in features:
        for feature_name in sentence_features:
            moved = sentence_features[feature_name].to(target_device)
            sentence_features[feature_name] = moved

    return features, batch['labels'].to(target_device)


def fullname(o):
    """
    Builds the full name (package_name.class_name) of the class of the given object.
    Intended for loading the correct classes from JSON files. Classes that live in the
    builtins module are reported by their bare class name.
    """
    klass = o.__class__
    module = klass.__module__
    builtins_module = str.__class__.__module__

    if module is not None and module != builtins_module:
        return module + '.' + klass.__name__
    return klass.__name__  # Avoid reporting __builtin__

def import_from_string(dotted_path):
    """
    Resolve a dotted path such as "package.module.Name": import everything before the
    last dot as a module and return the attribute/class named after it.
    Raises ImportError if the path contains no dot or the module lacks that attribute.
    """
    try:
        pieces = dotted_path.rsplit('.', 1)
        module_path, class_name = pieces
    except ValueError:
        raise ImportError("%s doesn't look like a module path" % dotted_path)

    module = importlib.import_module(module_path)

    try:
        target = getattr(module, class_name)
    except AttributeError:
        raise ImportError('Module "%s" does not define a "%s" attribute/class' % (module_path, class_name))
    return target

In [None]:
# Model used to embed the example corpus below.
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

# Corpus with example sentences
corpus = ['O Die é o cara das bases',
          'O Die é também é o cara do hadoop',
          'O Paulo trabalha junto com o Die',
          'O Alex é o gestor da equipe',
          'O Alex faz doutorado na USP',
          'O Paulo faz doutorado na Unicamp',
          'O Die estuda na unicamp, faz física, ewwwwwwwwwwwwww!',
          'O Alex, Die e Paulo, estão em novo projeto',
          'O projeto não é trivial, mas o Die está com muitas boas ideias',
          'O Alex sugeriu uma solução do COTA',
          'O Paulo quer implementar o COTA']

# One embedding per corpus sentence, in corpus order (presumably - verify against encode()).
# NOTE(review): the next cell rebinds `embedder`, `corpus` and `corpus_embeddings` using a different model.
corpus_embeddings = embedder.encode(corpus)

In [None]:
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Corpus with example sentences
corpus = ['O Die é o cara das bases',
          'O Die é também é o cara do hadoop',
          'O Paulo trabalha junto com o Die',
          'O Alex é o gestor da equipe',
          'O Alex faz doutorado na USP',
          'O Paulo faz doutorado na Unicamp',
          'O Die estuda na unicamp, faz física, ewwwwwwwwwwwwww!',
          'O Alex, Die e Paulo, estão em novo projeto',
          'O projeto não é trivial, mas o Die está com muitas boas ideias',
          'O Alex sugeriu uma solução do COTA',
          'O Paulo quer implementar o COTA']

corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['Quem estuda na USP.', 'O que o Alex é.', 'O que o Paulo faz.']


# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity.
# top_k is capped by the corpus size, because argpartition cannot select more entries than exist.
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    cos_scores = pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    # np.argpartition needs a host-side array; the embeddings may live on the GPU
    cos_scores = cos_scores.cpu().numpy()

    #We use np.argpartition, to only partially sort the top_k results
    top_results = np.argpartition(-cos_scores, range(top_k))[0:top_k]

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop {} most similar sentences in corpus:".format(top_k))

    for idx in top_results[0:top_k]:
        print(corpus[idx].strip(), "(Score: %.4f)" % (cos_scores[idx]))






Query: Quem estuda na USP.

Top 5 most similar sentences in corpus:
O Alex faz doutorado na USP (Score: 0.7248)
O Paulo quer implementar o COTA (Score: 0.6507)
O Alex sugeriu uma solução do COTA (Score: 0.5770)
O Paulo faz doutorado na Unicamp (Score: 0.5087)
O Alex é o gestor da equipe (Score: 0.4835)




Query: O que o Alex é.

Top 5 most similar sentences in corpus:
O Alex é o gestor da equipe (Score: 0.8278)
O Alex faz doutorado na USP (Score: 0.7685)
O Alex sugeriu uma solução do COTA (Score: 0.7520)
O Alex, Die e Paulo, estão em novo projeto (Score: 0.6739)
O Paulo quer implementar o COTA (Score: 0.4930)




Query: O que o Paulo faz.

Top 5 most similar sentences in corpus:
O Paulo faz doutorado na Unicamp (Score: 0.8410)
O Paulo quer implementar o COTA (Score: 0.8379)
O Paulo trabalha junto com o Die (Score: 0.7115)
O Alex, Die e Paulo, estão em novo projeto (Score: 0.5069)
O Die é o cara das bases (Score: 0.4950)


In [None]:
from sklearn.cluster import KMeans

embedder = SentenceTransformer('bert-base-nli-mean-tokens')

# `corpus` is the list of example sentences defined in the previous cell
corpus_embeddings = embedder.encode(corpus)

# Perform kmean clustering. The fixed random_state makes the cluster assignment
# reproducible across runs (KMeans initialisation is random otherwise).
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters, random_state=42)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

# Group the sentences by the cluster id assigned to them
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
['O Paulo faz doutorado na Unicamp', 'O Paulo quer implementar o COTA']

Cluster  2
['O Die é o cara das bases', 'O Die é também é o cara do hadoop']

Cluster  3
['O Alex é o gestor da equipe', 'O Alex faz doutorado na USP', 'O Alex sugeriu uma solução do COTA']

Cluster  4
['O Die estuda na unicamp, faz física, ewwwwwwwwwwwwww!']

Cluster  5
['O Paulo trabalha junto com o Die', 'O Alex, Die e Paulo, estão em novo projeto', 'O projeto não é trivial, mas o Die está com muitas boas ideias']

