In [1]:
import logging
import numpy as np
from sentence_transformers import SentenceTransformer
from weighted_bert.models import WeightedAverage, WeightedRemoval

# Root logger at DEBUG so the library's own log lines appear in the cell outputs below.
# This also lets through DEBUG messages from third-party loggers (e.g. urllib3).
logging.basicConfig(level="DEBUG")

# Reload imported modules before each cell executes, so edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Example use

## Using a HuggingFace model for NER

In [2]:
# Hugging Face checkpoint names: the first is passed to the weighting models,
# the second to SentenceTransformer to produce sentence embeddings.
weighting_checkpoint = "savasy/bert-base-turkish-ner-cased"
embedding_checkpoint = "emrecan/bert-base-turkish-cased-mean-nli-stsb-tr"

# Toy corpus: a list of documents, each document being a list of Turkish sentences.
documents = [
[
    "Tesla'nın otomobilleri insan hayatlarını riske atıyor olabilir.",
    "Türkiye ve Kore arasında gerçekleşen voleybol müsabakasını Türkiye Milli Takımı kazandı.",
    "Bu bir metin.",
],
[
    "Mustafa Kemal Atatürk 19 Mayıs 1919'da Samsun'a ayak bastı.",
    "Bu bir metin.",
],
]

In [3]:
# Initialize models
# One sentence encoder plus two weighting models; both weighting models are
# built from the same weighting checkpoint.
embedding_model = SentenceTransformer(embedding_checkpoint)
weighter_a = WeightedAverage(weighting_checkpoint)
weighter_r = WeightedRemoval(weighting_checkpoint)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: emrecan/bert-base-turkish-cased-mean-nli-stsb-tr
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/models/emrecan/bert-base-turkish-cased-mean-nli-stsb-tr HTTP/1.1" 200 1605
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /emrecan/bert-base-turkish-cased-mean-nli-stsb-tr/resolve/c4d66371214a20c0c91a39c83351ddc24f398800/.gitattributes HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /emrecan/bert-base-turkish-cased-mean-nli-stsb-tr/resolve/c4d66371214a20c0c91a39c83351ddc24f398800/README.md HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.c

In [4]:
# Calculate embeddings
# Sentence embeddings are computed per document (one encode() call per document).
collection_sentence_embeddings = [embedding_model.encode(doc) for doc in documents]
# WeightedAverage is called one document at a time; the per-document results
# are then stacked into a single array.
embeddings_a = np.array([weighter_a.get_document_embedding(doc, sentence_emb)
                for doc, sentence_emb in zip(documents, collection_sentence_embeddings)])
# WeightedRemoval is given the whole collection in a single call.
# NOTE(review): presumably its correction step needs all documents at once -- confirm against weighted_bert.
embeddings_r = weighter_r.get_document_embeddings(documents, collection_sentence_embeddings)

Batches: 100%|██████████| 1/1 [00:00<00:00, 35.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 52.27it/s]
DEBUG:weighted_bert.models:Entity count list for doc: [1, 3, 0]
DEBUG:weighted_bert.models:Entity count list for doc: [2, 0]
DEBUG:weighted_bert.models:Entity count list for doc: [1, 3, 0]
DEBUG:weighted_bert.models:Entity count list for doc: [2, 0]
INFO:weighted_bert.models:	Calculating first singular vector...
INFO:weighted_bert.models:	Calculating corrected embeddings...


In [5]:
# Sanity check: both results should have one row per document.
embeddings_a.shape, embeddings_r.shape

((2, 768), (2, 768))

## Using a rule-based entity detector

In [6]:
import re 
from typing import List, Any

def detect(sentence: str) -> List[dict]:
    """Find occurrences of a fixed set of entity names in ``sentence``.

    Matching is case-insensitive and is performed on the original sentence,
    so the reported offsets index into ``sentence`` itself. Lower-casing the
    sentence first is not safe for that purpose: it can change the string's
    length (e.g. ``"İ".lower()`` is two code points), which shifts the offset
    of every match that follows such a character.

    Args:
        sentence: Text to scan.

    Returns:
        One ``{"text", "start", "end"}`` dict per match, grouped by entity in
        ``entity_list`` order and, within one entity, in order of appearance.
        ``text`` is the lower-case entity name; ``start`` and ``end`` are the
        match's character offsets in ``sentence``. An empty list is returned
        when nothing matches.
    """
    entity_list = ["tesla", "atatürk", "türkiye"]

    sentence_entities = []
    for ent in entity_list:
        # re.escape keeps the entity literal even if a name ever contains a
        # regex metacharacter.
        for match in re.finditer(re.escape(ent), sentence, flags=re.IGNORECASE):
            sentence_entities.append(
                {"text": ent, "start": match.start(), "end": match.end()}
            )

    return sentence_entities

def entity_detector(document: List[str]) -> List[List[Any]]:
    """Run ``detect`` on every sentence of a document.

    Returns a list parallel to ``document``: element ``i`` holds whatever
    ``detect`` returned for sentence ``i``.
    """
    per_sentence_entities = []
    for sentence in document:
        per_sentence_entities.append(detect(sentence))
    return per_sentence_entities


In [9]:
# Initialize models
# Same two weighting models as in the checkpoint-based example, but driven by
# the custom `entity_detector` callable instead of a model checkpoint.
weighter_a = WeightedAverage(entity_detector=entity_detector)
weighter_r = WeightedRemoval(entity_detector=entity_detector)

INFO:weighted_bert.models:Entity detector function you have provided is being used, not initializing HuggingFace model.
INFO:weighted_bert.models:Entity detector function you have provided is being used, not initializing HuggingFace model.


In [10]:
# Calculate embeddings
# Sentence embeddings are recomputed with the `embedding_model` created earlier
# (one encode() call per document).
collection_sentence_embeddings = [embedding_model.encode(doc) for doc in documents]
# WeightedAverage is called one document at a time; the per-document results
# are then stacked into a single array.
embeddings_a = np.array([weighter_a.get_document_embedding(doc, sentence_emb)
                for doc, sentence_emb in zip(documents, collection_sentence_embeddings)])
# WeightedRemoval is given the whole collection in a single call.
embeddings_r = weighter_r.get_document_embeddings(documents, collection_sentence_embeddings)

Batches: 100%|██████████| 1/1 [00:00<00:00, 50.35it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 76.67it/s]
DEBUG:weighted_bert.models:Entity count list for doc: [1, 2, 0]
DEBUG:weighted_bert.models:Entity count list for doc: [1, 0]
DEBUG:weighted_bert.models:Entity count list for doc: [1, 2, 0]
DEBUG:weighted_bert.models:Entity count list for doc: [1, 0]
INFO:weighted_bert.models:	Calculating first singular vector...
INFO:weighted_bert.models:	Calculating corrected embeddings...


In [12]:
# Sanity check: both results should have one row per document.
embeddings_a.shape, embeddings_r.shape

((2, 768), (2, 768))