In [1]:
import logging
import numpy as np
from sentence_transformers import SentenceTransformer
from weighted_bert.models import WeightedAverage, WeightedRemoval

# Configure the root logger at INFO so the INFO-level messages emitted by
# sentence_transformers and weighted_bert appear under the cells below.
logging.basicConfig(level="INFO")

# IPython autoreload, mode 2: modules are re-imported before each cell runs,
# so edits to local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Example usage: document embeddings with WeightedAverage and WeightedRemoval

In [2]:
# Checkpoint identifiers: the first is handed to the WeightedAverage /
# WeightedRemoval weighters, the second to the SentenceTransformer encoder.
weighting_checkpoint = "savasy/bert-base-turkish-ner-cased"
embedding_checkpoint = "sentence-transformers/distiluse-base-multilingual-cased-v1"

# Toy corpus: a list of documents, each document being a list of Turkish
# sentences. "Bu bir metin." appears in both documents.
documents = [
    # Document 0: three sentences.
    [
        "Tesla'nın otomobilleri insan hayatlarını riske atıyor olabilir.",
        "Türkiye ve Kore arasında gerçekleşen voleybol müsabakasını Türkiye Milli Takımı kazandı.",
        "Bu bir metin.",
    ],
    # Document 1: two sentences.
    [
        "Mustafa Kemal Atatürk 19 Mayıs 1919'da Samsun'a ayak bastı.",
        "Bu bir metin.",
    ],
]

In [3]:
# Initialize models
# Sentence encoder, loaded from `embedding_checkpoint`.
embedding_model = SentenceTransformer(embedding_checkpoint)
# Both weighters are constructed from the same `weighting_checkpoint`; each
# logs the checkpoint name once when it is built.
# NOTE(review): presumably the NER checkpoint supplies per-sentence weights
# inside weighted_bert — confirm against the library.
weighter_a = WeightedAverage(weighting_checkpoint)
weighter_r = WeightedRemoval(weighting_checkpoint)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/distiluse-base-multilingual-cased-v1
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
INFO:weighted_bert.models:	savasy/bert-base-turkish-ner-cased
INFO:weighted_bert.models:	savasy/bert-base-turkish-ner-cased


In [4]:
# Calculate embeddings
# Encode the sentences of each document with its own encoder call, giving one
# entry per document.
collection_sentence_embeddings = list(map(embedding_model.encode, documents))

# WeightedAverage: one call per (document, sentence embeddings) pair; the
# per-document results are gathered into a single array.
embeddings_a = np.array(
    list(
        map(
            weighter_a.get_document_embedding,
            documents,
            collection_sentence_embeddings,
        )
    )
)

# WeightedRemoval: the whole collection is passed in a single call.
embeddings_r = weighter_r.get_document_embeddings(
    documents, collection_sentence_embeddings
)

Batches: 100%|██████████| 1/1 [00:00<00:00, 50.04it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 94.74it/s]
INFO:weighted_bert.models:	Calculating first singular vector...
INFO:weighted_bert.models:	Calculating corrected embeddings...


In [5]:
# Shape of the WeightedAverage result; the recorded run shows (2, 512) for
# the 2 input documents.
embeddings_a.shape

(2, 512)

In [6]:
# Shape of the WeightedRemoval result; the recorded run shows (2, 512) for
# the 2 input documents.
embeddings_r.shape

(2, 512)