In [1]:
import logging
import numpy as np
from sentence_transformers import SentenceTransformer
from weighted_bert.models import WeightedAverage, WeightedRemoval
from weighted_bert.data import InputExample

logging.basicConfig(level="DEBUG")
logging.getLogger("urllib3").setLevel(logging.WARNING)

%load_ext autoreload
%autoreload 2

# Example use

## Using a HuggingFace model for NER

In [2]:
weighting_checkpoint = "savasy/bert-base-turkish-ner-cased"
embedding_checkpoint = "emrecan/bert-base-turkish-cased-mean-nli-stsb-tr"

documents = [
[
    "Tesla'nın otomobilleri insan hayatlarını riske atıyor olabilir.",
    "Türkiye ve Kore arasında gerçekleşen voleybol müsabakasını Türkiye Milli Takımı kazandı.",
    "Bu bir metin.",
],
[
    "Mustafa Kemal Atatürk 19 Mayıs 1919'da Samsun'a ayak bastı.",
    "Bu bir metin.",
],
]

In [3]:
# Initialize models
embedding_model = SentenceTransformer(embedding_checkpoint)
weighter_a = WeightedAverage(weighting_checkpoint)
weighter_r = WeightedRemoval(weighting_checkpoint)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: emrecan/bert-base-turkish-cased-mean-nli-stsb-tr
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
INFO:weighted_bert.models:	savasy/bert-base-turkish-ner-cased
INFO:weighted_bert.models:	savasy/bert-base-turkish-ner-cased


In [4]:
# Calculate embeddings
input_examples = [InputExample(doc, embedding_model.encode(doc)) for doc in documents]
embeds_a = weighter_a.fit_transform(input_examples)
embeds_r = weighter_r.fit_transform(input_examples)

Batches: 100%|██████████| 1/1 [00:00<00:00, 49.90it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 80.33it/s]
DEBUG:weighted_bert.models:Entity count list for doc: [1, 3, 0]
DEBUG:weighted_bert.models:Entity count list for doc: [2, 0]
DEBUG:weighted_bert.models:Entity count list for doc: [1, 3, 0]
DEBUG:weighted_bert.models:Entity count list for doc: [2, 0]


In [5]:
weighter_a.embeddings_.shape, weighter_r.embeddings_.shape

((2, 768), (2, 768))

## Using a rule based entity detector

In [6]:
import re
from typing import List, Any

def detect(sentence: str):
    sentence_entites = [] 
    entity_list = ['tesla', "atatürk", "türkiye"]

    for ent in entity_list:
        matches = re.finditer(ent, sentence.lower())
        indexes = [(match.start(), match.end()) for match in matches]
        if indexes:
            for start, end in indexes:
                sentence_entites.append({"text": ent, "start": start, "end": end})
    
    return sentence_entites

def entity_detector(document: List[str]) -> List[List[Any]]:
    return [detect(sentence) for sentence in document]


In [7]:
# Initialize models
embedding_model = SentenceTransformer(embedding_checkpoint)
weighter_a = WeightedAverage(weighting_checkpoint)
weighter_r = WeightedRemoval(weighting_checkpoint)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: emrecan/bert-base-turkish-cased-mean-nli-stsb-tr
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda
INFO:weighted_bert.models:	savasy/bert-base-turkish-ner-cased
INFO:weighted_bert.models:	savasy/bert-base-turkish-ner-cased


In [8]:
# Calculate embeddings
input_examples = [InputExample(doc, embedding_model.encode(doc)) for doc in documents]
embeds_a = weighter_a.fit_transform(input_examples)
embeds_r = weighter_r.fit_transform(input_examples)

Batches: 100%|██████████| 1/1 [00:00<00:00, 67.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 87.90it/s]
DEBUG:weighted_bert.models:Entity count list for doc: [1, 3, 0]
DEBUG:weighted_bert.models:Entity count list for doc: [2, 0]
DEBUG:weighted_bert.models:Entity count list for doc: [1, 3, 0]
DEBUG:weighted_bert.models:Entity count list for doc: [2, 0]


In [9]:
embeds_a.shape, embeds_r.shape

((2, 768), (2, 768))