In [None]:
!pip install langchain optimum[onnxruntime] transformers

In [None]:
from datetime import datetime

from langchain_text_splitters import CharacterTextSplitter
import numpy as np
from optimum.onnxruntime import ORTModelForFeatureExtraction
import onnxruntime as onnxrt
from torch.utils.data import Dataset
from transformers import AutoTokenizer, pipeline

In [None]:
# Sample input document: several blank-line-separated paragraphs of English
# text about Bulgaria (including IPA, Cyrillic and bracketed reference
# markers). It is long enough to be split into multiple chunks below.
example = """
Bulgaria (/bʌlˈɡɛəriə, bʊl-/ ⓘ; Bulgarian: България, romanized: Bŭlgariya), officially the Republic of Bulgaria,[a] is a country in Southeast Europe. Located west of the Black Sea and south of the Danube river, Bulgaria is bordered by Greece and Turkey to the south, Serbia and North Macedonia to the west, and Romania to the north. It covers a territory of 110,994 square kilometres (42,855 sq mi) and is the 16th largest country in Europe. Sofia is the nation's capital and largest city; other major cities include Burgas, Plovdiv, and Varna.

One of the earliest societies in the lands of modern-day Bulgaria was the Neolithic Karanovo culture, which dates back to 6,500 BC. In the 6th to 3rd century BC the region was a battleground for ancient Thracians, Persians, Celts and Macedonians; stability came when the Roman Empire conquered the region in AD 45. After the Roman state splintered, tribal invasions in the region resumed. Around the 6th century, these territories were settled by the early Slavs. The Bulgars, led by Asparuh, attacked from the lands of Old Great Bulgaria and permanently invaded the Balkans in the late 7th century. They established the First Bulgarian Empire, victoriously recognised by treaty in 681 AD by the Byzantine Empire. It dominated most of the Balkans and significantly influenced Slavic cultures by developing the Cyrillic script. The First Bulgarian Empire lasted until the early 11th century, when Byzantine emperor Basil II conquered and dismantled it. A successful Bulgarian revolt in 1185 established a Second Bulgarian Empire, which reached its apex under Ivan Asen II (1218–1241). After numerous exhausting wars and feudal strife, the empire disintegrated and in 1396 fell under Ottoman rule for nearly five centuries.

The Russo-Turkish War of 1877–78 resulted in the formation of the third and current Bulgarian state. Many ethnic Bulgarians were left outside the new nation's borders, which stoked irredentist sentiments that led to several conflicts with its neighbours and alliances with Germany in both world wars. In 1946, Bulgaria came under the Soviet-led Eastern Bloc and became a socialist state. The ruling Communist Party gave up its monopoly on power after the revolutions of 1989 and allowed multiparty elections. Bulgaria then transitioned into a democracy and a market-based economy. Since adopting a democratic constitution in 1991, Bulgaria has been a unitary parliamentary republic composed of 28 provinces, with a high degree of political, administrative, and economic centralisation.

Bulgaria has an upper-middle-income economy, ranking 68th in the Human Development Index. Its market economy is part of the European Single Market and is largely based on services, followed by industry—especially machine building and mining—and agriculture. The country faces a demographic crisis; its population peaked at 9 million in 1989, and has since decreased to 6.4 million as of 2023. Bulgaria is a member of the European Union, NATO, and the Council of Europe. It is also a founding member of the OSCE and has taken a seat on the United Nations Security Council three times.

Etymology
The name Bulgaria is derived from the Bulgars, a tribe of Turkic origin that founded the First Bulgarian Empire. Their name is not completely understood and is difficult to trace it back earlier than the 4th century AD,[8] but it is possibly derived from the Proto-Turkic word bulģha ("to mix", "shake", "stir") and its derivative bulgak ("revolt", "disorder").[9] The meaning may be further extended to "rebel", "incite" or "produce a state of disorder", and so, in the derivative, the "disturbers".[10][11][12] Tribal groups in Inner Asia with phonologically close names were frequently described in similar terms, as the Buluoji, a component of the "Five Barbarian" groups, which during the 4th century were portrayed as both: a "mixed race" and "troublemakers".[13]
"""

In [None]:
# Checkpoint identifier handed to every `from_pretrained` call in this
# notebook (the tokenizer here, the ONNX model further down).
model_checkpoint = "Qdrant/multilingual-e5-large-onnx"

# Tokenizer for that checkpoint; reused by the text splitter and by the
# feature-extraction pipeline.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Splitter built from the model's tokenizer: chunks are capped at 512 with no
# overlap between consecutive chunks.
# NOTE(review): the 512 limit is presumably counted in tokenizer tokens and
# chosen to match the model's maximum input length — confirm, including
# whether special tokens are part of that count.
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=512, chunk_overlap=0
)
text_splits = text_splitter.split_text(example)

# Show every chunk followed by a separator line for visual inspection.
for text_split in text_splits:
    print(text_split, '==========', sep='\n')

Bulgaria (/bʌlˈɡɛəriə, bʊl-/ ⓘ; Bulgarian: България, romanized: Bŭlgariya), officially the Republic of Bulgaria,[a] is a country in Southeast Europe. Located west of the Black Sea and south of the Danube river, Bulgaria is bordered by Greece and Turkey to the south, Serbia and North Macedonia to the west, and Romania to the north. It covers a territory of 110,994 square kilometres (42,855 sq mi) and is the 16th largest country in Europe. Sofia is the nation's capital and largest city; other major cities include Burgas, Plovdiv, and Varna.

One of the earliest societies in the lands of modern-day Bulgaria was the Neolithic Karanovo culture, which dates back to 6,500 BC. In the 6th to 3rd century BC the region was a battleground for ancient Thracians, Persians, Celts and Macedonians; stability came when the Roman Empire conquered the region in AD 45. After the Roman state splintered, tribal invasions in the region resumed. Around the 6th century, these territories were settled by the ear

In [None]:
# How many chunks the splitter produced for the sample text.
print(len(text_splits))

3


In [None]:
import multiprocessing

from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer, pipeline

# ONNX Runtime session configuration used when loading the model below.
onnxrt_options = onnxrt.SessionOptions()

# Sequential execution mode, with the intra-op thread pool sized to the value
# returned by multiprocessing.cpu_count().
# NOTE(review): cpu_count() may exceed the CPUs actually available to this
# process (containers, CPU affinity) — confirm this is the intended sizing.
onnxrt_options.execution_mode = onnxrt.ExecutionMode.ORT_SEQUENTIAL
onnxrt_options.intra_op_num_threads = multiprocessing.cpu_count()

# Request the ORT_ENABLE_ALL graph optimization level and set the
# intra-op spinning session entry to '1'.
# NOTE(review): spinning presumably trades idle CPU usage for lower latency —
# confirm against the ONNX Runtime documentation.
onnxrt_options.graph_optimization_level = onnxrt.GraphOptimizationLevel.ORT_ENABLE_ALL
onnxrt_options.add_session_config_entry('session.intra_op.allow_spinning', '1')

# Load the checkpoint through Optimum's ONNX Runtime feature-extraction model
# class, using the session options above and only the CPU execution provider.
model = ORTModelForFeatureExtraction.from_pretrained(
    model_checkpoint,
    session_options=onnxrt_options,
    providers=['CPUExecutionProvider']
)

# Feature-extraction pipeline that pairs the ONNX model with its tokenizer.
onnx_extractor = pipeline("feature-extraction", model=model, tokenizer=tokenizer)

In [None]:
class EmbeddingDataset(Dataset):
    """Thin `Dataset` adapter over an in-memory, indexable collection.

    Items are handed back exactly as stored; nothing is copied or transformed.
    """

    def __init__(self, data_list):
        """Keep a reference to `data_list` as the backing store."""
        self.data_list = data_list

    def __getitem__(self, index):
        """Return the stored element at position `index`."""
        return self.data_list[index]

    def __len__(self):
        """Return the number of elements in the backing store."""
        return len(self.data_list)

In [None]:
# Wrap the text chunks in the Dataset adapter so they can be passed to the
# pipeline as a dataset.
dataset = EmbeddingDataset(text_splits)

In [None]:
# Wall-clock timestamp taken right before the embedding pass starts.
start_time = datetime.now()

# Each result yielded by the pipeline is merged element-wise into the
# accumulator, so it ends up holding one entry per element of every output.
raw_embeddings_list = []
for output in onnx_extractor(dataset, batch_size=100):
    raw_embeddings_list += output

end_time = datetime.now()

# Elapsed time for the whole pass, printed as a timedelta.
print(end_time - start_time)

0:00:18.077521


In [None]:
# Number of accumulated entries, then the length of the first vector inside
# the first entry (the embedding width).
print(len(raw_embeddings_list))
print(len(raw_embeddings_list[0][0]))

3
1024


In [None]:
# Keep only element 0 of every raw entry, giving one vector per chunk.
# NOTE(review): element 0 is presumably the first-token vector of each chunk;
# if the model is meant to be used with mean pooling over tokens, this
# selection should be revisited — confirm against the model card.
embeddings_list = [raw_embedding[0] for raw_embedding in raw_embeddings_list]

In [None]:
# Unweighted element-wise mean across chunks (axis 0), collapsing the
# per-chunk vectors into a single vector for the whole text.
average_unified_embedding = np.mean(embeddings_list, axis=0)

In [None]:
# Sanity check: width of the averaged vector and a peek at its first values.
print(len(average_unified_embedding))
print(average_unified_embedding[:5])

1024
[ 1.27175705  0.01042874 -1.14785713 -1.37164895  0.74253729]


In [None]:
# Confirm the container type returned by the averaging step.
print(type(average_unified_embedding))

<class 'numpy.ndarray'>


In [None]:
# Convert the NumPy array into a plain Python list via `.tolist()`.
average_unified_embedding_list = average_unified_embedding.tolist()

# Verify the converted type and preview the first few values.
print(type(average_unified_embedding_list))
print(average_unified_embedding_list[:5])

<class 'list'>
[1.2717570463816326, 0.010428741574287415, -1.147857129573822, -1.371648947397868, 0.7425372898578644]
