# Import Libraries

In [None]:
!pip install faiss-gpu
!pip install sentence_transformers
import faiss
import pickle
import pandas as pd
import os
import gc
import numpy as np
from sentence_transformers import SentenceTransformer
import subprocess

from IPython.display import FileLink, display



# Download Resources

In [None]:
# Download every parquet shard of the parsed-Wikipedia dataset into
# ./dpq-wiki-parsed, running one wget process per file in parallel.
os.makedirs('dpq-wiki-parsed', exist_ok=True)
os.chdir('dpq-wiki-parsed')

file_names = [
    'a.parquet',
    'b.parquet',
    'c.parquet',
    'chunk_index.parquet',
    'd.parquet',
    'e.parquet',
    'f.parquet',
    'g.parquet',
    'h.parquet',
    'i.parquet',
    'j.parquet',
    'k.parquet',
    'l.parquet',
    'm.parquet',
    'n.parquet',
    'number.parquet',
    'o.parquet',
    'p.parquet',
    'q.parquet',
    'r.parquet',
    's.parquet',
    't.parquet',
    'u.parquet',
    'v.parquet',
    'w.parquet',
    'wiki_index.parquet',
    'x.parquet',
    'y.parquet',
    'z.parquet'
]
repo_id = "dpquoc/wiki-parsed"

# Create a list to store subprocess.Popen objects
processes = []

for file_name in file_names:
    link = f'https://huggingface.co/datasets/{repo_id}/resolve/main/{file_name}'
    command = ["wget", link]

    # wget's own output is discarded; success is judged by the exit status below.
    processes.append(subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL))

# Wait for all processes to complete and remember which ones exited non-zero.
failed_downloads = []
for file_name, process in zip(file_names, processes):
    if process.wait() != 0:
        failed_downloads.append(file_name)

print("All downloads initiated.")
os.chdir('..')

# Raise only after the working directory has been restored, so a failure
# does not leave the notebook inside the download folder.
if failed_downloads:
    raise RuntimeError(f"wget failed for: {', '.join(failed_downloads)}")

In [None]:
os.makedirs('bge-small-en', exist_ok=True)
os.chdir('bge-small-en')

file_names = [
    'config.json',
    'config_sentence_transformers.json',
    'modules.json',
    'pytorch_model.bin',
    'sentence_bert_config.json',
    'special_tokens_map.json',
    'tokenizer.json',
    'tokenizer_config.json',
    'vocab.txt'
]

repo_id = "BAAI/bge-small-en-v1.5"

# Create a list to store subprocess.Popen objects
processes = []

for file_name in file_names:
    link = f'https://huggingface.co/{repo_id}/resolve/main/{file_name}'
    command = ["wget", link]

    # Redirect output to /dev/null (Linux) or NUL (Windows)
    processes.append(subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL))

# Wait for all processes to complete
for process in processes:
    process.wait()

print("All downloads initiated.")

folder_name = '1_Pooling'
folder_path = os.path.join(os.getcwd(), folder_name)
os.makedirs(folder_path, exist_ok=True)
os.chdir(folder_path)
!wget https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/1_Pooling/config.json
os.chdir('..')
os.chdir('..')

# Create Embeddings

In [None]:
# Hugging Face Hub helpers: `login` authenticates this session and `HfApi`
# is the client used further down to upload the generated files.
from huggingface_hub import login
from huggingface_hub import HfApi
# Called without arguments; presumably prompts for an access token — confirm.
login()

In [None]:
# Folder that holds the downloaded parquet shards.
base_path = '/content/dpq-wiki-parsed/'

# Shards to embed: one per letter 'a'..'z', with the 'number' shard listed
# right after 'n' (the two index parquet files are not part of this list).
file_names = [
    f'{stem}.parquet'
    for stem in [*'abcdefghijklmn', 'number', *'opqrstuvwxyz']
]

# Sentence-embedding model loaded from the local model folder onto the first GPU.
model_embedding = SentenceTransformer('/content/bge-small-en', device="cuda:0")

# Output location used by the flat-index cells below.
faiss_index_path = "/content/wikipedia_embeddings.index"

In [None]:
def chunk_split(text, chunk_size=100, stride=90):
    """Split ``text`` into overlapping windows of whitespace-separated words.

    A window starts every ``stride`` words and holds up to ``chunk_size``
    words, each window being re-joined with single spaces. Windows that
    start near the end of the text are shorter, and such a tail window can
    lie entirely inside the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per window.
        stride: Distance, in words, between consecutive window starts.

    Returns:
        A list of window strings; empty when ``text`` contains no words.
    """
    tokens = text.split()
    # Slicing clamps at the end of the list, so windows that would run past
    # the last word simply come out shorter.
    return [
        ' '.join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), stride)
    ]

In [None]:
# GET ALL TEXT CHUNKS
# Gather the text chunks of every shard into one flat list.
all_chunks = []

for file_name in file_names:
    # Load the shard from the download folder
    shard = pd.read_parquet(os.path.join(base_path, file_name))
    print(f"Processing file_name: {file_name} ......")

    # chunk_split yields one list of chunks per document; flatten as we go
    for doc_chunks in shard.text.apply(chunk_split):
        all_chunks.extend(doc_chunks)

    # Release the shard before reading the next one
    del shard
    gc.collect()

print(f"The final shape of the array 'all_chunks' is: {len(all_chunks)}")


In [None]:
# For every shard: chunk its text, embed the chunks, save the vectors as a
# .npy file, upload that file to the Hub and delete the local copy.

# One Hub client is enough for all uploads.
api = HfApi()

for file_name in file_names:
    # Construct the full path
    full_path = os.path.join(base_path, file_name)
    real_name, _ = os.path.splitext(file_name)

    # Read the file into a DataFrame
    df = pd.read_parquet(full_path)
    print(f"Processing file_name: {file_name} ......")

    # chunk_split returns a list per document; flatten into one list of chunks
    chunks = [chunk for doc_chunks in df.text.apply(chunk_split) for chunk in doc_chunks]
    del df

    embeddings = model_embedding.encode(chunks,
                                        batch_size=64,
                                        convert_to_tensor=False,
                                        convert_to_numpy=True,
                                        normalize_embeddings=True)

    del chunks # free some memory

    file_name_hug = real_name + ".npy"
    # Save, upload and remove through the same absolute path so the cell
    # does not depend on the current working directory being /content.
    local_path = os.path.join("/content", file_name_hug)
    np.save(local_path, embeddings)
    print(file_name_hug, embeddings.shape)

    del embeddings

    # NOTE(review): path_in_repo keeps the original ".parquet" name although
    # the payload is a .npy array. It is left unchanged because the download
    # cell in the FINAL section requests these names from this repository.
    api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=file_name,
        repo_id="dpquoc/np-chunks",
        repo_type="dataset",
    )
    os.remove(local_path)
    gc.collect()

In [None]:
# Scratch check of the embedding matrix shape.
# NOTE(review): no earlier visible cell assigns `document_embeddings` — confirm where it comes from.
document_embeddings.shape

In [None]:
# Build a flat L2 index over `document_embeddings` and write it to `faiss_index_path`.
# NOTE(review): no earlier visible cell assigns `document_embeddings` — confirm where it comes from.
document_embeddings = np.array(document_embeddings)
# The index dimension is taken from the second axis of the matrix.
index = faiss.IndexFlatL2(document_embeddings.shape[1])
index.add(document_embeddings)
faiss.write_index(index, faiss_index_path)
print(f"Faiss Index Successfully Saved to '{faiss_index_path}'")

In [None]:
# Same flat-index build as the previous cell, starting from `embeddings`.
# NOTE(review): the embedding loop above deletes `embeddings` at the end of
# every iteration, so this cell only works with leftover kernel state — confirm.
document_embeddings = []
document_embeddings.extend(embeddings)

document_embeddings = np.array(document_embeddings)
# The index dimension is taken from the second axis of the matrix.
index = faiss.IndexFlatL2(document_embeddings.shape[1])
index.add(document_embeddings)
faiss.write_index(index, faiss_index_path)
print(f"Faiss Index Successfully Saved to '{faiss_index_path}'")

In [None]:
# Force a garbage-collection pass to release memory from the previous cells.
import gc
gc.collect()

In [None]:
# Rebind to an empty list so the previous embedding matrix is no longer referenced here.
document_embeddings = []

In [None]:
# Load one array per entry of `file_names` and concatenate them into `all_data`.
# NOTE(review): the names are passed to np.load as-is, relative to the current
# directory; the list defined above holds ".parquet" names — confirm these
# files really are NumPy arrays.
arrays = []

for file_name in file_names:
    array = np.load(file_name)
    arrays.append(array)

all_data = np.concatenate(arrays)
# Drop the intermediate references once the concatenated copy exists.
del arrays , array


In [None]:
# Train an IVF-PQ index on a random 20% sample of `all_data`, add every
# vector to it and save the result.

# Assuming all_data is your final numpy array
# Sample 20% of the data for training. The row *indices* are permuted rather
# than the rows themselves, so no shuffled copy of the whole matrix is built
# just to keep a fifth of it.
n_rows = all_data.shape[0]
n_train = int(0.2 * n_rows)
train_data = all_data[np.random.permutation(n_rows)[:n_train]]

# Dimension of the vectors, read from the data (384 for the embeddings produced above)
d = all_data.shape[1]
print(d)

# Number of inverted lists (coarse centroids) of the IVF index
k = 256

# Create the quantizer
quantizer = faiss.IndexFlatL2(d)

# Create the index: 96 sub-quantizers x 8 bits each = 96 bytes per encoded vector
index = faiss.IndexIVFPQ(quantizer, d, k, 96, 8)

# Train the index
index.train(train_data)

# Add all the vectors to the index
index.add(all_data)
faiss.write_index(index, 'chunk_index.faiss')

In [None]:
# Write the current `index` object to a second file name.
faiss.write_index(index, 'quantized_index.faiss')

In [None]:
# Report how many vectors the index currently holds.
n = index.ntotal

print(f"The index contains {n} vectors.")

# FINAL

In [None]:
import subprocess
import pandas as pd
import os
import gc
import numpy as np
import faiss
from huggingface_hub import login, HfApi


In [None]:
# Authenticate with the Hugging Face Hub; the recorded output below reports a token with write permission.
login()


Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Download the per-shard embedding arrays from the Hub into ../tmp, one wget
# process per file, then switch to the sibling "working" directory.
os.chdir("..")
# exist_ok lets the cell be re-run without failing on the existing folder.
os.makedirs("tmp", exist_ok=True)
os.chdir("tmp")

# Names as stored in the repository; entries ending in ".parquet" are
# loaded with np.load further down as well.
file_names = [
    "a.npy",
    "b.parquet",
    "c.parquet",
    "d_e_f.npy",
    "g.parquet",
    "h.parquet",
    "i.parquet",
    "j.parquet",
    "k.parquet",
    "l.parquet",
    "m.parquet",
    "n.parquet",
    "number.parquet",
    "o.parquet",
    "p.parquet",
    "q.parquet",
    "r.parquet",
    "s.parquet",
    "t.parquet",
    "u.parquet",
    "v.parquet",
    "w.parquet",
    "x.parquet",
    "y.parquet",
    "z.parquet"
]
repo_id = "dpquoc/np-chunks"

# Create a list to store subprocess.Popen objects
processes = []

for file_name in file_names:
    link = f'https://huggingface.co/datasets/{repo_id}/resolve/main/{file_name}'
    command = ["wget", link]

    # wget's own output is discarded; success is judged by the exit status below.
    processes.append(subprocess.Popen(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL))

# Wait for all processes to complete and remember which ones exited non-zero.
failed_downloads = []
for file_name, process in zip(file_names, processes):
    if process.wait() != 0:
        failed_downloads.append(file_name)

print("All downloads initiated.")

os.chdir("..")
os.chdir("working")

# Raise only after the directory changes above, so the notebook ends up in
# the same place whether or not a download failed.
if failed_downloads:
    raise RuntimeError(f"wget failed for: {', '.join(failed_downloads)}")


All downloads initiated.


In [None]:

# Embedding files to load from /content/tmp/ in the cells below (same entries
# as the download list). Every entry is read with np.load, including the
# ones whose name ends in ".parquet".
file_names = [
    "a.npy",
    "b.parquet",
    "c.parquet",
    "d_e_f.npy",
    "g.parquet",
    "h.parquet",
    "i.parquet",
    "j.parquet",
    "k.parquet",
    "l.parquet",
    "m.parquet",
    "n.parquet",
    "number.parquet",
    "o.parquet",
    "p.parquet",
    "q.parquet",
    "r.parquet",
    "s.parquet",
    "t.parquet",
    "u.parquet",
    "v.parquet",
    "w.parquet",
    "x.parquet",
    "y.parquet",
    "z.parquet"
]

In [None]:
# Build the training set for the index: a random 10% of the rows of every
# embedding file, stacked into one matrix named `arrays`.
samples = []

for file_name in file_names:
    # Load the array from the file
    array = np.load("/content/tmp/" + file_name)

    # Get 10% of the rows randomly. Row indices are permuted instead of the
    # rows themselves, so only the sampled rows are copied and the full
    # array can be released right away.
    sample_size = int(0.1 * array.shape[0])
    sample = array[np.random.permutation(array.shape[0])[:sample_size]]
    del array

    # Empty samples contribute nothing to the stacked result
    if sample.size:
        samples.append(sample)
    del sample
    gc.collect()

# Stack once at the end instead of re-copying the growing matrix per file
arrays = np.vstack(samples) if samples else np.array([])
del samples

In [None]:
# Dimension of the vectors; set here so this cell does not rely on the
# index-creation cell below having been run first.
d = 384

# Make sure the training matrix is float32 with one vector per row;
# copy=False skips the copy when the data already has that dtype.
arrays = arrays.astype('float32', copy=False).reshape(-1, d)
gc.collect()

574

In [None]:
# Create an IVF-PQ index and train it on the sampled vectors in `arrays`.

# Dimension of the vectors
d = 384

# Number of inverted lists (coarse centroids) of the IVF index
k = 256

# Create the quantizer
quantizer = faiss.IndexFlatL2(d)

# Create the index
index = faiss.IndexIVFPQ(quantizer, d, k, 96, 8)  # 96 sub-quantizers x 8 bits each = 96 bytes per encoded vector

# Train the index
index.train(arrays)

In [None]:

# The training sample is no longer needed once the index has been trained.
del arrays

In [None]:
# Force a garbage-collection pass after dropping the training sample.
gc.collect()

0

In [None]:
# Save the index at this point, before the add-vectors cell below runs.
faiss.write_index(index, 'trained.index')

In [None]:
# Later, you can load the index from the file
index = faiss.read_index('trained.index')

In [None]:
# Drop a leftover `array` reference and collect garbage.
# NOTE(review): the sampling loop above already deletes `array` in every
# iteration, so this raises NameError unless `array` was rebound elsewhere — confirm.
del array
gc.collect()

1465

In [None]:
# Add the vectors of every embedding file to the index, one file at a
# time, then save the populated index.
for file_name in file_names:
    # Load the array from the file
    array = np.load("/content/tmp/" + file_name)
    # copy=False: no second copy of the shard when it is already float32
    array = array.astype('float32', copy=False)
    index.add(array)
    # Release the shard before loading the next one
    del array
    gc.collect()

faiss.write_index(index, 'chunk_faiss.index')

In [None]:
# Publish the populated index file to the Hub dataset repository, under
# the same name it has locally.
index_file = 'chunk_faiss.index'

api = HfApi()
api.upload_file(
    path_or_fileobj=index_file,
    path_in_repo=index_file,
    repo_id="dpquoc/wiki-faiss-index",
    repo_type="dataset",
)

In [None]:
# Number of vectors stored in the index (the recorded output below shows 37573660).
index.ntotal

37573660