## Fine-tuning embeddings

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader

# Source corpus and chunking parameters. Sizes are measured in characters,
# because the splitter's length function is `len`.
file_path = "data/security_analysis_6th_edition.txt"
CHUNK_SIZE = 2500
CHUNK_OVERLAP = 250

# Read the whole text file into LangChain documents.
loader = TextLoader(file_path, encoding="utf-8")
documents = loader.load()

# Break the documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
chunks = text_splitter.split_documents(documents)

# Report how many chunks were produced.
print(len(chunks))

813


In [3]:
import uuid

# Give every chunk a unique string ID, stored in its metadata under "id".
# `id_set` records the IDs handed out so far so that a duplicate can be rejected.
id_set = set()

for document in chunks:
  # Named `doc_id` so the built-in `id()` is not shadowed in the notebook namespace.
  doc_id = str(uuid.uuid4())
  # Regenerate on collision. The retry is converted to `str` as well, so it is
  # compared against the string IDs in `id_set` and stored with the same type
  # as every other ID.
  while doc_id in id_set:
    doc_id = str(uuid.uuid4())
  id_set.add(doc_id)
  document.metadata["id"] = doc_id

In [4]:
# Shuffle the chunks in place before they are sliced into splits.
# The chunks are presumably still in document order at this point, so slicing
# them sequentially would give each split a different part of the book. The
# original note describes the risk as:
#   - training mostly on bonds and fixed-income concepts (the first part),
#   - validating on the middle sections,
#   - testing on equity analysis and indices (the last part),
# which would bias the model toward only some areas of financial analysis.
# NOTE(review): the original note quoted 76% / 12% / 12% proportions, but the
# slicing code uses 0.80 / 0.10 fractions with the remainder going to the test
# split, so those figures did not match the code.
import random

random.seed(42)  # Fixed seed so the shuffle order is reproducible
random.shuffle(chunks)  # Reorders `chunks` in place

In [5]:
# Split boundaries: the first 80% of the chunks go to training, the next 10%
# to evaluation, and everything that remains to testing.
n_chunks = len(chunks)
train_end = int(n_chunks * 0.80)
val_end = train_end + int(n_chunks * 0.10)

# Carve the three consecutive, non-overlapping slices in one assignment.
training_documents, eval_documents, test_documents = (
    chunks[:train_end],
    chunks[train_end:val_end],
    chunks[val_end:],
)

something

In [7]:
import os
import getpass
# Ask for each secret interactively (input is not echoed) and export it as an
# environment variable. The prompts run in the order listed here.
for env_var, prompt in (
  ("OPENAI_API_KEY", "Enter Your OpenAI API Key: "),
  ("RAGAS_APP_TOKEN", "Please enter your Ragas API key!"),
):
  os.environ[env_var] = getpass.getpass(prompt)

In [8]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

# Model names used for test-set generation, kept together so they are easy to change.
GENERATOR_MODEL = "gpt-4o-mini"
EMBEDDING_MODEL = "text-embedding-3-small"

# Wrap the LangChain chat model and embedding model in the Ragas adapters,
# then hand both to the test-set generator.
generator_llm = LangchainLLMWrapper(ChatOpenAI(model=GENERATOR_MODEL))
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
generator_embeddings = LangchainEmbeddingsWrapper(embeddings)
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
)

In [9]:
import copy

# Work on a deep copy so the ID prefix added below never leaks into
# `training_documents`.
td = copy.deepcopy(training_documents)
for document in td:
  # `doc_id` rather than `id`, so the built-in `id()` is not shadowed.
  doc_id = document.metadata["id"]
  # Prefix the text with "<id>###". The f-string gives the same result as `+`
  # for string IDs and also accepts an ID that is not a `str`, where `+`
  # concatenation would raise TypeError.
  document.page_content = f"{doc_id}###{document.page_content}"

# Request a test set of size 10 generated from the ID-tagged copies.
training_dataset = generator.generate_with_langchain_docs(
    td,
    testset_size=10
)

# Last expression of the cell, so the notebook displays its value.
training_dataset.to_pandas()

Applying HeadlinesExtractor:   0%|          | 0/329 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/650 [00:00<?, ?it/s]

unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to ap

Applying SummaryExtractor:   0%|          | 0/658 [00:00<?, ?it/s]

Property 'summary' already exists in node 'e373e9'. Skipping!
Property 'summary' already exists in node 'e61348'. Skipping!
Property 'summary' already exists in node '49cd04'. Skipping!
Property 'summary' already exists in node '1e0713'. Skipping!
Property 'summary' already exists in node 'af7b82'. Skipping!
Property 'summary' already exists in node '290224'. Skipping!
Property 'summary' already exists in node '294b0a'. Skipping!
Property 'summary' already exists in node 'a13e85'. Skipping!
Property 'summary' already exists in node '97cd48'. Skipping!
Property 'summary' already exists in node '2eeb7c'. Skipping!
Property 'summary' already exists in node 'e7ba7e'. Skipping!
Property 'summary' already exists in node '703de3'. Skipping!
Property 'summary' already exists in node 'ab6162'. Skipping!
Property 'summary' already exists in node 'd0985f'. Skipping!
Property 'summary' already exists in node '28ff77'. Skipping!
Property 'summary' already exists in node 'e200e7'. Skipping!
Property

Applying CustomNodeFilter: 0it [00:00, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/658 [00:00<?, ?it/s]

Property 'summary_embedding' already exists in node '294b0a'. Skipping!
Property 'summary_embedding' already exists in node 'a13e85'. Skipping!
Property 'summary_embedding' already exists in node 'e61348'. Skipping!
Property 'summary_embedding' already exists in node '49cd04'. Skipping!
Property 'summary_embedding' already exists in node 'af7b82'. Skipping!
Property 'summary_embedding' already exists in node 'e373e9'. Skipping!
Property 'summary_embedding' already exists in node '1e0713'. Skipping!
Property 'summary_embedding' already exists in node 'ab6162'. Skipping!
Property 'summary_embedding' already exists in node '2eeb7c'. Skipping!
Property 'summary_embedding' already exists in node 'e7ba7e'. Skipping!
Property 'summary_embedding' already exists in node '97cd48'. Skipping!
Property 'summary_embedding' already exists in node '290224'. Skipping!
Property 'summary_embedding' already exists in node '28ff77'. Skipping!
Property 'summary_embedding' already exists in node 'd0985f'. Sk

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/2 [00:00<?, ?it/s]

Generating Samples: 0it [00:00, ?it/s]

In [10]:
# Display the generated dataset via its `to_pandas()` conversion
# (the same expression that ends the generation cell).
training_dataset.to_pandas()