In [None]:
# Goal 
    # - [ ]  Store the text in a vector store
    # - [ ]  Embed it
    # - [ ]  Create a chatbot over that content
    # - [ ]  Fine-tune a smaller open-source model on the extracted data

In [2]:
import os
import faiss
import numpy as np

In [4]:
# Show the kernel's current working directory (relative paths resolve against it).
os.getcwd()

'/home/linux-pc/gh/projects/NeuralNexus/New-Features/CustomLLM/custom-voice-bot/notebooks'

In [5]:
from sentence_transformers import SentenceTransformer
# Location of the exported prompt/response JSON, expressed relative to the
# notebook's working directory (the `notebooks/` folder) rather than as a
# machine-specific absolute path, so the notebook runs from any checkout.
RESPONSE_PATH = os.path.join(os.pardir, 'data', 'prompt_response', 'response1.json')
import json

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Read the response file as UTF-8 text and parse it into `data`;
# the context manager closes the handle once the block exits.
with open(RESPONSE_PATH, mode='r', encoding='utf-8') as response_file:
    data = json.loads(response_file.read())

In [7]:
# Keep the "text" entry of the loaded JSON under its own name; it is reused
# below for retrieval and for building the fine-tuning dataset.
# NOTE(review): presumably a list of strings — confirm against the JSON file.
texts = data["text"]

In [8]:
# Sentence-embedding model, used both for indexing and for querying
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every text in a single call, keeping the result as a tensor
embeddings = model.encode(
    data["text"],
    convert_to_tensor=True,
)

In [9]:
# Build a FAISS vector store from the embeddings

# Move the tensor to CPU and cast it to a float32 numpy array for FAISS
embeddings_np = embeddings.cpu().numpy().astype(np.float32)

# Embedding width = size of the second axis
d = embeddings_np.shape[1]

# Index that compares vectors by L2 distance, filled with all embeddings
index = faiss.IndexFlatL2(d)
index.add(embeddings_np)

# Persist the index to "texts.index" in the current working directory
faiss.write_index(index, "texts.index")


In [10]:
def search_similar(query, index, model, texts, k=3):
    """Return the texts whose embeddings are closest to ``query``.

    Args:
        query: Query string to embed and look up.
        index: Vector index exposing ``search(vectors, k)`` that returns
            ``(distances, indices)`` (a FAISS index in this notebook).
        model: Encoder exposing ``encode(list_of_str, convert_to_tensor=True)``.
        texts: Sequence of texts, positionally aligned with the vectors
            stored in ``index``.
        k: Maximum number of neighbours to return.

    Returns:
        list: Up to ``k`` entries of ``texts``, in the order the index
        returned them. Fewer than ``k`` entries are returned when the index
        holds fewer than ``k`` vectors.
    """
    query_embedding = (
        model.encode([query], convert_to_tensor=True).cpu().numpy().astype('float32')
    )
    _, indices = index.search(query_embedding, k)
    # FAISS pads the id row with -1 when fewer than k neighbours exist.
    # Without this filter, texts[-1] would silently return the last text
    # as if it were a real match.
    return [texts[i] for i in indices[0] if i >= 0]

In [11]:
# Smoke-test retrieval with a sample message
query = "Hey I miss you a lot"
results = search_similar(query=query, index=index, model=model, texts=texts)
print("Top relevant texts:", results)

Top relevant texts: ["I'm here", "Well I'm feeling ok so far ... slight headache but if you want to drive out of town we can .... Let's just see how I'm feeling when I get home! I'm here til noon today!", 'Love you both ..... I am fine']


In [None]:
# Fine-tuning a model on the text data:


In [13]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

In [14]:
# GPT-2 tokenizer; its end-of-sequence token doubles as the padding token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Put the texts into a Dataset with a single "text" column
dataset = Dataset.from_dict({"text": texts})

# Pretrained GPT-2 LM head model, with its token embeddings resized to the
# tokenizer's length
model_ft = GPT2LMHeadModel.from_pretrained("gpt2")
model_ft.resize_token_embeddings(len(tokenizer))

In [19]:
# Tokenize
def tokenize(example):
    """Tokenize ``example["text"]`` with the module-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"``, ``truncation=True``
    and ``max_length=64``, and its result is returned unchanged.
    """
    tokenizer_options = dict(padding="max_length", truncation=True, max_length=64)
    return tokenizer(example["text"], **tokenizer_options)


In [None]:

# Run `tokenize` over the dataset. This cell must be executed before the
# training cell, which reads `tokenized_dataset`.
tokenized_dataset = dataset.map(tokenize)

In [18]:

# Fine-tune
# Requires the earlier cells that define `model_ft`, `tokenizer` and
# `tokenized_dataset` to have been run first.
from transformers import DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./finetuned-gpt2",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_steps=5,
    save_steps=10,
)

# The tokenized dataset carries no `labels`, so the model would return no loss
# and Trainer would abort. With mlm=False the collator builds causal-LM labels
# from input_ids and excludes padding positions from the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model_ft,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

trainer.train()


NameError: name 'tokenized_dataset' is not defined