In [None]:
import os
from tqdm.notebook import tqdm
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

In [11]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

from uuid import uuid4

In [3]:
# Load every .txt file in data_dir as one LangChain Document each.
data_dir = "data"
# os.listdir returns entries in arbitrary order; sort so the chunk order (and
# therefore the FAISS row order and the saved chunk list) is reproducible.
txt_files = sorted(file for file in os.listdir(data_dir) if file.endswith(".txt"))
# Read as UTF-8 explicitly instead of relying on the platform default encoding.
docs = [TextLoader(f"{data_dir}/{file}", encoding="utf-8").load()[0] for file in txt_files]
assert docs, f"no .txt files found in {data_dir!r}"

In [4]:
# Cut every document into overlapping chunks (1024 characters, 128 overlap);
# split_documents walks the documents in order and keeps each chunk's metadata.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
)
all_splits = splitter.split_documents(docs)

In [5]:
# sanity check: total number of chunks produced by the splitter
len(all_splits)

221

Embed the chunks with a Hugging Face embedding model: [multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small)

In [6]:
# Embedding model used for both the chunks and the queries; cache_folder is
# set to cached_models/.
# NOTE(review): the E5 model card describes "query: " / "passage: " input
# prefixes — confirm whether they should be applied here.
model_name = "intfloat/multilingual-e5-small"
embed_model = HuggingFaceEmbeddings(
    model_name=model_name,
    cache_folder="cached_models/",
)

  warn(f"Failed to load image Python extension: {e}")


In [None]:
# generate embeddings with progress bar
# (disabled: the chunks are embedded when they are added to the vector store below)
# embeddings = [embed_model.embed_documents([chunk.page_content]) for chunk in tqdm(all_splits, desc="Embedding Chunks")]

In [7]:
# Embed a throwaway query once to discover the vector length, then create an
# empty flat L2 index of that dimensionality.
embedding_dim = len(embed_model.embed_query("sample query"))
index = faiss.IndexFlatL2(embedding_dim)

In [8]:
# Wrap the empty FAISS index in a LangChain vector store; the docstore and the
# index-to-docstore-id mapping both start out empty.
empty_docstore = InMemoryDocstore()
vector_store = FAISS(
    embedding_function=embed_model,
    index=index,
    docstore=empty_docstore,
    index_to_docstore_id={},
)

In [12]:
# one random UUID string per chunk, used as the chunk's id in the vector store
uuids = [str(uuid4()) for _chunk in all_splits]

In [13]:
# Add all chunks to the store under the pre-generated ids; the call's return
# value (the list of ids) is displayed as the cell output.
vector_store.add_documents(
    documents=all_splits,
    ids=uuids,
)

['d421e8dc-9663-4f24-a094-e319a3cf7813',
 'b6fb752a-1603-49e7-9e96-d0ec474b93b7',
 '9227460d-37d8-44ee-8f8a-9c390246aed2',
 '2bc0ae08-fc35-494d-ac50-94189f168124',
 'fd00767f-19c9-4691-8808-0a7efc70241a',
 'e4abfbe6-b37a-41ef-a92b-299278f5926a',
 '0761d493-841d-40cc-b388-67387b99b5ce',
 'befd5428-a075-4203-b262-16a4c7a71288',
 '9b56ee61-90c4-4a67-a210-f15efa6ba9cc',
 'b183a989-03bd-4e64-a847-295fa0c36c56',
 '4e27e995-1361-4f79-828b-0cd4f21a8163',
 '00b48f2c-ef09-4123-819a-7741b4c4614d',
 '5e887fac-dfc7-4de2-8ecc-0a4f4fd07311',
 '1d6c0dfc-8130-4fb1-91c4-797afb249981',
 'd2f60873-4a89-4a89-82bf-e29866c69fd8',
 '59e62d13-b923-4552-a528-d15c7a01cd0b',
 '407d3040-1346-4c87-a0dc-3ff667e409b3',
 '32f9351a-0b8f-437a-a29d-bad3a27db8a9',
 'f193277f-fe2d-4ad8-b8ca-4c3e17ae02dc',
 '3ce01b87-6b77-4ebe-ba9d-c520d97f7770',
 '9e067f20-9ce9-42bc-8abd-7ccfc9f35952',
 '49669887-1a54-4cbd-9425-06ea44630b46',
 '78659644-6394-4726-826f-45ea95f4e5da',
 'c1c9664d-e6d5-47c3-ab54-4309600bdbda',
 '03c4de3d-0d2f-

Query

In [15]:
# Fetch the 2 chunks closest to the question and print each one followed by
# its metadata.
query = "What is the core of quantum computing?"
results = vector_store.similarity_search(query, k=2)
for hit in results:
    print(f"* {hit.page_content} [{hit.metadata}]")

* 한국어

Tiếng Việt

Edit links

From Wikipedia, the free encyclopedia

Type of quantum computer

A topological quantum computer is a type of quantum computer. It utilizes anyons, a type of quasiparticle that occurs in two-dimensional systems. The anyons' world lines intertwine to form braids in a three-dimensional spacetime (one temporal and two spatial dimensions). The braids act as the logic gates of the computer. The primary advantage of using quantum braids over trapped quantum particles is in their stability. While small but cumulative perturbations can cause quantum states to decohere and introduce errors in traditional quantum computations, such perturbations do not alter the topological properties of the braids. This stability is akin to the difference between cutting and reattaching a string to form a different braid versus a ball (representing an ordinary quantum particle in four-dimensional spacetime) colliding with a wall. It was proposed by Russian-American physicist Alexei

In [16]:
# Fetch the 5 chunks closest to the query; print each one with its metadata,
# followed by a separator line so long chunks are easy to tell apart.
query = "the most important technological innovation in history"
results = vector_store.similarity_search(query, k=5)
separator = "=" * 10
for hit in results:
    print(f"* {hit.page_content} [{hit.metadata}]")
    print(separator, "end", separator)

* v t e Quantum information science General DiVincenzo's criteria NISQ era Quantum computing timeline Quantum information Quantum programming Quantum simulation Qubit physical vs. logical Quantum processors cloud-based Theorems Bell's Eastin–Knill Gleason's Gottesman–Knill Holevo's No-broadcasting No-cloning No-communication No-deleting No-hiding No-teleportation PBR Quantum speed limit Threshold Solovay–Kitaev Purification Quantum communication Classical capacity entanglement-assisted quantum capacity Entanglement distillation Entanglement swapping Monogamy of entanglement LOCC Quantum channel quantum network Quantum teleportation quantum gate teleportation Superdense coding Quantum cryptography Post-quantum cryptography Quantum coin flipping Quantum money Quantum key distribution BB84 SARG04 other protocols Quantum secret sharing Quantum algorithms Amplitude amplification Bernstein–Vazirani BHT Boson sampling Deutsch–Jozsa Grover's HHL Hidden subgroup Quantum annealing Quantum counti

In [None]:
# Persist the raw FAISS index to disk. Create the target directory first:
# write_index does not create missing parent directories.
os.makedirs("faiss_store", exist_ok=True)
faiss.write_index(index, "faiss_store/faiss_index.bin")

In [None]:
import json

# Save the chunk texts in the same order they were added to the index, so a
# row number in the FAISS index can be mapped back to its text.
# Make sure the output directory exists so this cell also works on its own.
os.makedirs("faiss_store", exist_ok=True)
with open("faiss_store/chunks.json", "w", encoding="utf-8") as f:
    json.dump([chunk.page_content for chunk in all_splits], f)