In [1]:
import os
from dotenv import load_dotenv
from llama_index.core import Document
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.core import Settings
import chromadb
import pandas as pd
import re
load_dotenv()
llm = Gemini(api_key=os.environ["GOOGLE_API_KEY"],model="models/gemini-pro")
embed_model = GeminiEmbedding(api_key=os.environ["GOOGLE_API_KEY"],model_name="models/embedding-001")
Settings.llm = llm
Settings.embed_model = embed_model


# Load data from PDF
from llama_index.core import SimpleDirectoryReader

  from tqdm.autonotebook import tqdm


In [2]:
# Step 2: Load CSV
csv_file = "processed_file.csv"  # Replace with your CSV file
df = pd.read_csv(csv_file)

# Step 3: Chunk the Data (Row-wise)
documents = [
    Document(text=row["content"], metadata={"ArticleID": row["Article_Number"]})
    for _, row in df.iterrows()
]

In [3]:
em=embed_model.get_text_embedding("hello world")

In [4]:
len(em)

768

In [5]:
documents[:10]

[Document(id_='8ce91dc0-7e29-4db6-b0b0-8a95b3e34a3c', embedding=None, metadata={'ArticleID': 'Article 1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text=' Name and territory of the Union (1) India, that is Bharat, shall be a Union of States (2) The States and the territories thereof shall be as specified in the First Schedule (3) The territory of India shall comprise The territories of the States; the Union territories specified in the First Schedule; and such other territories as may be acquired', mimetype=None, path=None, url=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}'),
 Document(id_='c38aa4be-0bd7-4a54-a3d8-f38604fedea0', embedding=None, metadata={'ArticleID': 'Article 2'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, met

In [6]:
print(len(documents))

455


In [7]:
# Create a client and a new collection
client = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = client.get_or_create_collection("constitution")

# Create a vector store
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Create a storage context
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [12]:
pinecone_client = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
pinecone_index = pinecone_client.Index("indianconstitution")
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store
)

In [13]:
# Create an index from the documents and save it to the disk.
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context,show_progress=True,
)

Parsing nodes: 100%|██████████| 455/455 [00:00<00:00, 978.88it/s] 
Generating embeddings: 100%|██████████| 459/459 [03:44<00:00,  2.04it/s]
Upserted vectors: 100%|██████████| 459/459 [00:06<00:00, 72.81it/s]


In [14]:
import os
from dotenv import load_dotenv
from llama_index.llms.gemini import Gemini
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings
from llama_index.core import PromptTemplate
from llama_index.core.prompts.prompt_type import PromptType
import chromadb
from pinecone import Pinecone
from llama_index.vector_stores.pinecone import PineconeVectorStore

In [16]:
load_dotenv()
llm = Gemini(api_key=os.environ["GOOGLE_API_KEY"],model="models/gemini-1.5-pro-002")
embed_model = GeminiEmbedding(api_key=os.environ["GOOGLE_API_KEY"],model_name="models/embedding-001")

Settings.llm = llm
Settings.embed_model = embed_model

# # Load from disk
# load_client = chromadb.PersistentClient(path="./chroma_db")

# # Fetch the collection
# chroma_collection = load_client.get_collection("constitution")

# # Fetch the vector store
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

pinecone_client = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
pinecone_index = pinecone_client.Index("indianconstitution")
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)


# Get the index from the vector store
index = VectorStoreIndex.from_vector_store(
    vector_store
)

In [17]:
pinecone_client.list_collections()

[]

In [18]:
DEFAULT_TEXT_QA_PROMPT_TMPL = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
    )

DEFAULT_TEXT_QA_PROMPT = PromptTemplate(
        DEFAULT_TEXT_QA_PROMPT_TMPL, 
        prompt_type=PromptType.QUESTION_ANSWER
        
    )

query_engine = index.as_query_engine(
    )

In [19]:
query = "can India be called bharat and by which article"
response = query_engine.query(query)
print(response)
print("\n")

India can also be called Bharat, as stated in Article 1 of the Constitution of India.





In [20]:
res = llm.complete(query)
print(res)

India can be called Bharat.  Article 1 of the Indian Constitution states: "India, that is Bharat, shall be a Union of States."  This means both names are officially recognized and hold equal status.



In [21]:
import pandas as pd

df = pd.read_json("hf://datasets/nisaar/Constitution_of_India/constitution_train.jsonl", lines=True)

In [23]:
# import metrics
from ragas.metrics import (
    Faithfulness,
    AnswerRelevancy,
    ContextPrecision,
    ContextRecall,
)

# init metrics with evaluator LLM
from ragas.llms import LlamaIndexLLMWrapper

evaluator_llm = LlamaIndexLLMWrapper(Gemini(api_key=os.environ["GOOGLE_API_KEY"],model="models/gemini-1.5-pro-002"))
metrics = [
    Faithfulness(llm=evaluator_llm),
    AnswerRelevancy(llm=evaluator_llm),
    ContextPrecision(llm=evaluator_llm),
    ContextRecall(llm=evaluator_llm),
]

Upserted vectors:   0%|          | 0/459 [15:36<?, ?it/s]


In [24]:
# convert to HF dataset

ds_dict = df.to_dict()
ds_dict['ground_truth'] = ds_dict.pop('answer')
ds_dict.keys()

dict_keys(['question', 'ground_truth'])

In [25]:
from ragas import EvaluationDataset

# Convert DataFrame to EvaluationDataset
evaluation_dataset = EvaluationDataset.from_pandas(df)

In [26]:
from ragas.integrations.llama_index import evaluate

result = evaluate(
    query_engine=query_engine,
    metrics=metrics,
    dataset=evaluation_dataset,
    llm=evaluator_llm,
    embeddings=GeminiEmbedding(api_key=os.environ["GOOGLE_API_KEY"],model_name="models/embedding-001")
)

Running Query Engine:   0%|          | 0/933 [00:00<?, ?it/s]Exception raised in Job[0]: ValidationError(2 validation errors for QueryStartEvent
query.str
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
query.QueryBundle
  Input should be a dictionary or an instance of QueryBundle [type=dataclass_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.10/v/dataclass_type)
Exception raised in Job[1]: ValidationError(2 validation errors for QueryStartEvent
query.str
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type
query.QueryBundle
  Input should be a dictionary or an instance of QueryBundle [type=dataclass_type, input_value=None, input_type=NoneType]
    For further information visit https://

AttributeError: 'float' object has no attribute 'response'

In [None]:
# Read the CSV file without header
input_file = "../data/Constitution(updated).csv"
df_no_header = pd.read_csv(input_file, header=None)

# Define the new header
new_header = ["Article"]

# Add the header to the DataFrame
df_no_header.columns = new_header

# Save the updated DataFrame to a new CSV file
output_file_with_header = "../data/Constitution(updated).csv"
df_no_header.to_csv(output_file_with_header, index=False)

print(f"Updated CSV with header saved to {output_file_with_header}")

In [None]:
df

In [None]:
df

In [None]:
import pandas as pd
output_path = '../data/processed_file.csv'
file_path = '../data/Constitution(updated).csv'

df = pd.read_csv(file_path, names=['Content'])

# Extract the article number and the rest of the string into separate columns
df['Article Number'] = df['Content'].str.extract(r'Article (\d+)')
df['Content'] = df['Content'].str.replace(r'Article \d+', '', regex=True).str.strip()

# Save the result back to a new CSV
df.to_csv(output_path, index=False)

print(df)

In [None]:
import pandas as pd
output_path = '../data/processed_file.csv'
file_path = '../data/Constitution(updated).csv'
# Load the CSV into a DataFrame
# Replace 'your_file.csv' with the path to your CSV file
df = pd.read_csv(file_path, names=['Content'])

# Extract the article number (number + optional letters) and the rest of the string into separate columns
df['Article Identifier'] = df['Content'].str.extract(r'Article (\d+\w*)')
df['Content'] = df['Content'].str.replace(r'Article \d+\w*\.', '', regex=True).str.strip()

# Save the result back to a new CSV
df.to_csv(file_path, index=False)

print(df)

In [None]:
file_path = '../data/Constitution(updated).csv'
df = pd.read_csv(file_path)

In [None]:
df

In [None]:
df['Articles'].str.extract(r'(Article \d+\w*)')