In [None]:
# Install the notebook's dependencies: LangChain (core + community), the Gemini
# client libraries, FAISS, python-dotenv and tiktoken.

!pip install langchain langchain-google-genai google-generativeai faiss-cpu python-dotenv tiktoken langchain-community


In [None]:
# 2 - Implement a RAG system for extracting information from multiple Excel sheets using an LLM, LangChain, word embeddings, an Excel-sheet prompt, and other tools if necessary. If possible, display the extracted information in a table format.
import pandas as pd
import glob
import os
def merge_csv_files(input_folder):
    """Read every ``*.csv`` file directly inside ``input_folder`` into one DataFrame.

    Files are read in sorted path order so that the row order of the merged
    frame is reproducible between runs. Lines that pandas cannot parse are
    skipped (``on_bad_lines='skip'``).

    Args:
        input_folder: Directory to search for CSV files (not recursive).

    Returns:
        A pandas DataFrame with the rows of all files stacked on top of each
        other and a fresh ``0..n-1`` index.

    Raises:
        ValueError: If ``input_folder`` contains no ``*.csv`` files.
    """
    # glob yields paths in a filesystem-dependent order; sort for determinism.
    csv_files = sorted(glob.glob(os.path.join(input_folder, "*.csv")))

    # pd.concat([]) fails with an opaque "No objects to concatenate" message,
    # so report the real problem (and the folder) up front instead.
    if not csv_files:
        raise ValueError(f"No CSV files found in '{input_folder}'; nothing to merge.")

    dataframes = [pd.read_csv(path, on_bad_lines='skip') for path in csv_files]

    return pd.concat(dataframes, ignore_index=True)

# Merge every CSV file found in '/content/' into a single DataFrame
# (presumably the Colab runtime's upload directory -- adjust if the files live elsewhere).
merged_df = merge_csv_files('/content/')

In [None]:
# --- Load the Gemini API key and configure the client ---
# The key is taken from the GOOGLE_API_KEY environment variable when it is set;
# otherwise it is requested interactively with hidden input. This keeps the
# secret out of this cell's source and output.
import os
from getpass import getpass

from google.colab import auth
# Uncomment to authenticate the Colab user; `auth` is otherwise unused here.
#auth.authenticate_user()

import google.generativeai as genai

google_api_key = (
    os.environ.get("GOOGLE_API_KEY") or getpass("Enter your Gemini API key: ")
).strip()

# Fail fast with a clear message instead of a confusing API error later on.
if not google_api_key:
    raise ValueError("A Gemini API key is required: set GOOGLE_API_KEY or enter it when prompted.")

genai.configure(api_key=google_api_key)

In [None]:
# --- 3. Split Documents into Chunks ---
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

print("Splitting documents into chunks...")

# Convert each DataFrame row into one Document whose text is the row's cell
# values joined by single spaces. All columns are used; restrict the columns
# here if only some of them contain relevant text.
documents = []
for index, row in merged_df.iterrows():
    # Drop missing cells first: astype(str) would otherwise turn every NaN into
    # the literal text "nan", which ends up in the chunks and their embeddings.
    cell_values = row.dropna().astype(str)
    if cell_values.empty:
        # A row with no data at all would only produce an empty document.
        continue
    document_content = " ".join(cell_values)
    documents.append(Document(page_content=document_content))


# Chunks are at most 100 characters (measured with len) and consecutive chunks
# of the same document share up to 20 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)
texts = text_splitter.split_documents(documents)

# Later cells index into `texts`, so stop here if nothing was produced.
if not texts:
    raise ValueError("Document splitting resulted in no chunks.")
print(f"Split into {len(texts)} chunks.")

In [None]:
# vector embeddings using sentence-transformers

!pip install -U sentence-transformers
from langchain_community.embeddings import SentenceTransformerEmbeddings

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# embed the documents
embeddings = embedding_function.embed_documents([text.page_content for text in texts])

print(f"Found {len(embeddings)} documents")
print(f"Here's a sample of the first document: {embeddings[0][:5]}...")

In [None]:
from langchain_community.embeddings import SentenceTransformerEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

# Embed the query with the very same model object that embedded the chunks.
# Similarities are only meaningful when both sides use the same model, and
# reusing the object avoids loading "all-MiniLM-L6-v2" a second time.
embed_input = embedding_function

input_keyword = input("Enter the keyword to search").strip()
# An empty query would still be embedded and ranked, giving arbitrary results.
if not input_keyword:
    raise ValueError("The search keyword must not be empty.")

# 1. Embed the query
query_embedding = embed_input.embed_query(input_keyword)

# 2. Calculate similarity
# cosine_similarity expects 2D inputs, so wrap the single query vector in a list.
query_embedding_2d = [query_embedding]

# Row 0 holds the similarity of the query to every chunk embedding.
similarities = cosine_similarity(query_embedding_2d, embeddings)[0]

# 3. Rank and retrieve
# Never ask for more results than there are chunks, so the header printed
# below matches the number of results actually shown.
top_n = min(5, len(similarities))  # adjust 5 to change the number of results
# argsort is ascending: take the last top_n indices and reverse them so the
# most similar chunk comes first.
top_indices = similarities.argsort()[-top_n:][::-1]

print(f"\nTop {top_n} most similar document chunks for the keyword '{input_keyword}':")
for index in top_indices:
    print(f"Similarity: {similarities[index]:.4f}")
    print(texts[index].page_content)
    print("-" * 20)

In [None]:
# Generate the final answer with Gemini, grounded in the retrieved chunks.
from google.colab import auth
# Uncomment to authenticate the Colab user; `auth` is otherwise unused here.
#auth.authenticate_user()

import google.generativeai as genai

# Use the key held in `google_api_key` (set in the API-key cell) rather than a
# second hardcoded literal, so the key lives in exactly one place.
genai.configure(api_key=google_api_key)

# To discover other model names, iterate genai.list_models() and keep the
# entries whose supported_generation_methods include 'generateContent'.
model = genai.GenerativeModel('gemini-2.0-flash')

# Text of the highest-ranked chunks, in the order given by `top_indices`.
top_texts = [texts[index].page_content for index in top_indices]

# Combine the top texts into a single context string
context = " ".join(top_texts)

# The prompt asks the model to answer from the supplied context and to say so
# when the context does not contain the answer.
prompt = f"""Based on the following context, answer the question: '{input_keyword}'

Context:
{context}

If the information is not available in the context, please state that you cannot answer based on the provided information.
"""

# Send the prompt to the Gemini model and get the response
response = model.generate_content(prompt)

print("\nGemini Response based on context:")
try:
    print(response.text)
except ValueError:
    # `response.text` raises ValueError when the response carries no text part
    # (for example when the request was blocked); show the feedback instead of
    # ending the cell with a traceback.
    print(f"Gemini returned no text. Prompt feedback: {response.prompt_feedback}")