<a href="https://colab.research.google.com/github/fahmidjobbi/ChatterBot/blob/main/LLM_LangChain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook imports into the runtime.
# NOTE(review): `langchain` appears twice in the list and no version is pinned.
!pip install pinecone-client openai langchain-chroma langchain langchain-openai chromadb pypdf langchain-community pandas xmltodict langchain

In [None]:
# Pin the OpenAI SDK to 0.28, overriding the unpinned `openai` installed above.
# NOTE(review): presumably required because a later cell calls the legacy
# `openai.Embedding.create` interface — confirm before changing this pin.
!pip install openai==0.28

In [None]:
import xmltodict
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter


def load_xml_to_dataframe(xml_file):
    """Load an RSS-style XML product feed into a pandas DataFrame.

    The file is parsed with ``xmltodict`` and the entries found under
    ``rss -> channel -> item`` become the rows of the frame.

    Args:
        xml_file: Path to the XML file; it is read as UTF-8 text.

    Returns:
        pandas.DataFrame: One row per ``<item>`` with missing values replaced
        by empty strings. A feed without any ``<item>`` yields an empty frame.

    Raises:
        KeyError: If the parsed document has no ``rss`` or ``channel`` key.
    """
    with open(xml_file, 'r', encoding='utf-8') as file:
        xml_data = xmltodict.parse(file.read())

    # An empty <channel/> parses to None, so fall back to an empty mapping
    channel = xml_data['rss']['channel'] or {}  # Adjust based on your XML structure
    items = channel.get('item')

    # xmltodict only produces a list when an element repeats: a feed with a
    # single <item> yields a bare dict, which DataFrame() would reject.
    if items is None:
        items = []
    elif not isinstance(items, list):
        items = [items]

    # Return a new frame instead of mutating in place; '' avoids NaN downstream
    return pd.DataFrame(items).fillna('')

def split_text_data(df, chunk_size=1000, chunk_overlap=200):
    """Build one labeled text blob per product and split it into chunks.

    Every row is rendered as ``"<label>: <value>\\n"`` lines in a fixed field
    order, stored in ``df['combined_text']`` (the input frame is modified),
    and then split with ``RecursiveCharacterTextSplitter``.

    Args:
        df: Product DataFrame. Columns missing from the frame are rendered
            with an empty value instead of raising ``KeyError``; missing
            cells are rendered as empty strings.
        chunk_size: Maximum chunk size passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        list[str]: The chunks of all products, in row order.
    """
    # (label written into the text, source column in the feed)
    labeled_columns = [
        ('id', 'g:id'),
        ('title', 'title'),
        ('description', 'description'),
        ('link', 'link'),
        ('image_link', 'g:image_link'),
        ('availability', 'g:availability'),
        ('price', 'g:price'),
        ('sale_price', 'g:sale_price'),
        ('shipping_type', 'g:shipping_type'),
        ('brand', 'g:brand'),
        ('condition', 'g:condition'),
        ('google_product_category', 'g:google_product_category'),
        ('category_name', 'g:category_name'),
        ('category_code', 'g:category_code'),
        ('inventory', 'g:inventory'),
        ('tags', 'g:tags'),  # Add more fields as needed
    ]

    combined = pd.Series('', index=df.index, dtype=object)
    for label, column in labeled_columns:
        if column in df.columns:
            raw = df[column]
            # Blank out missing cells and stringify so concatenation cannot fail
            values = raw.astype(object).where(raw.notna(), '').astype(str)
        else:
            # Optional feed attributes may be absent from every item
            values = ''
        combined = combined + f'{label}: ' + values + '\n'
    df['combined_text'] = combined

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,  # Overlap preserves context across chunks
        separators=["\n\n", "\n", " ", ""],  # Tried in this order
    )

    # Split each product's combined text and flatten the results
    all_splits = []
    for text in df['combined_text']:
        all_splits.extend(text_splitter.split_text(text))

    return all_splits

# Parse the product feed into a DataFrame
xml_file_path = '/content/new-product-feed-tag.xml'  # Location of the XML feed
df = load_xml_to_dataframe(xml_file_path)

# Turn every product into labeled text chunks
text_chunks = split_text_data(df)

# Print each chunk with its length as a visual sanity check
divider = "-" * 50
for chunk_number, chunk_text in enumerate(text_chunks, start=1):
    print(f"Chunk {chunk_number}: {chunk_text}")
    print(f"Length of chunk {chunk_number}: {len(chunk_text)}")
    print(divider)

In [None]:
import openai
# Embedding helper built on the legacy (pre-1.0) OpenAI SDK interface
def create_embeddings(text_chunks, model="text-embedding-3-large", batch_size=100):
    """Create one embedding per text chunk with the OpenAI embeddings API.

    The API key is never hard-coded: a key already set on ``openai.api_key``
    is left untouched, otherwise ``OPENAI_API_KEY`` from the environment is
    used when present.

    Args:
        text_chunks: Iterable of strings to embed.
        model: Name of the embedding model to request.
        batch_size: Number of chunks sent per API request.

    Returns:
        list: Embedding vectors in the same order as ``text_chunks``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    import os  # local import: this cell only imports `openai`

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Do not clobber a key the caller configured; only fill in a missing one
    if not getattr(openai, "api_key", None):
        env_key = os.environ.get("OPENAI_API_KEY")
        if env_key:
            openai.api_key = env_key

    chunks = list(text_chunks)
    embeddings = []
    # The endpoint accepts a list of inputs, so send batches rather than
    # paying one HTTP round trip per chunk.
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        response = openai.Embedding.create(model=model, input=batch)
        # Each returned row carries the index of its input; sort to be safe
        rows = sorted(response['data'], key=lambda row: row['index'])
        embeddings.extend(row['embedding'] for row in rows)

    return embeddings

# Embed every chunk produced by the splitter
embeddings = create_embeddings(text_chunks)

# Preview the first 10 components of each vector as a quick sanity check
rule = "-" * 50
for position, vector in enumerate(embeddings, start=1):
    print(f"Embedding {position}: {vector[:10]}...")
    print(rule)


In [None]:
import os
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings

import shutil  # used below to discard a half-built store

# Where the product vector store lives on disk (use a new path to force a rebuild)
persist_directory = "/content/persist_directory_new_2"


# Embedding model backing the store. The key comes from the environment so it
# is never written into the notebook; a missing variable fails fast here.
embedding_model = OpenAIEmbeddings(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model="text-embedding-3-large"  # Specify the model here
)

# A missing or empty directory holds no usable store. Checking for contents,
# not mere existence, keeps an aborted earlier run from being mistaken for a
# finished database.
store_missing = not os.path.isdir(persist_directory) or not os.listdir(persist_directory)

if store_missing:
    # Build the documents first so a failure here leaves nothing on disk
    documents = [Document(page_content=chunk, metadata={"source": f"chunk_{i}"}) for i, chunk in enumerate(text_chunks)]

    if not os.path.isdir(persist_directory):
        os.makedirs(persist_directory, mode=0o777)  # effective permissions are still limited by the umask
        print(f"Directory {persist_directory} created with full permissions.")
    print("Vector database not found... Creating a new one.")

    try:
        # Create the vector database and persist it to the specified directory
        vectordb = Chroma.from_documents(
            documents=documents,
            embedding=embedding_model,
            persist_directory=persist_directory
        )
    except Exception:
        # Drop the partial store so the next run rebuilds instead of loading it
        shutil.rmtree(persist_directory, ignore_errors=True)
        raise

    # No explicit `persist()` call is made; `from_documents` received the directory
    print("Vector database created and persisted.")
else:
    print(f"Persist directory {persist_directory} already exists. Loading existing vector database.")

    # Load the existing vector database
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

# 'vectordb' is now available to later cells, whether newly built or loaded.


In [None]:
import shutil
from google.colab import files

import os  # needed for the existence check below

# Directory holding the persisted vector store to export
persist_directory = "/content/persist_directory_match"  # Make sure this matches your persist directory

if os.path.isdir(persist_directory):
    # make_archive appends ".zip" to the base name and returns the final path,
    # so the download below always targets the file that was just written.
    archive_path = shutil.make_archive(persist_directory, 'zip', persist_directory)

    # Download the .zip file
    files.download(archive_path)
else:
    # On a top-to-bottom run this directory may not have been created yet;
    # report it instead of failing inside make_archive.
    print(f"Nothing to export: {persist_directory} does not exist yet. Build the vector database first, then re-run this cell.")


In [None]:
# Requires the vector database 'vectordb' created or loaded by an earlier cell

# Retriever returning the 3 stored chunks most similar to a query
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Test retrieval with a sample query
query = "çiçek"  # Replace with a test query relevant to your documents

# `invoke` is the supported retriever entry point; the older
# `get_relevant_documents` method is deprecated.
retrieved_docs = retriever.invoke(query)

# Display the retrieved documents
for doc in retrieved_docs:
    print(f"Document: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print("\n" + "="*50 + "\n")


In [None]:
import os
from langchain_chroma import Chroma
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma as ChromaVectorStore

import shutil  # used below to discard a half-built store

# Where the topic vector store lives on disk
persist_directory = "/content/persist_directory_match"

# Embedding model backing the store. The key comes from the environment so it
# is never written into the notebook; a missing variable fails fast here.
embedding_model = OpenAIEmbeddings(
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model="text-embedding-3-large"  # Specify the model here
)

# Topics that product types are matched against
topics_list = ['Çiçek', 'Hediye', 'Yenilebilir Çiçek']

# A missing or empty directory holds no usable store. Checking for contents,
# not mere existence, keeps an aborted earlier run from being mistaken for a
# finished database.
store_missing = not os.path.isdir(persist_directory) or not os.listdir(persist_directory)

if store_missing:
    # One document per topic. A dedicated loop name is used so the notebook's
    # `text_chunks` (the product chunks) is not overwritten by this cell.
    documents = [Document(page_content=topic, metadata={"source": f"chunk_{i}"}) for i, topic in enumerate(topics_list)]

    if not os.path.isdir(persist_directory):
        os.makedirs(persist_directory, mode=0o777)  # effective permissions are still limited by the umask
        print(f"Directory {persist_directory} created with full permissions.")
    print("Vector database not found... Creating a new one.")

    try:
        # Create the vector database and persist it
        vectordb = Chroma.from_documents(
            documents=documents,
            embedding=embedding_model,
            persist_directory=persist_directory
        )
    except Exception:
        # Drop the partial store so the next run rebuilds instead of loading it
        shutil.rmtree(persist_directory, ignore_errors=True)
        raise

    print("Vector database created and persisted.")
else:
    print(f"Persist directory {persist_directory} already exists. Loading existing vector database.")
    # Load with the same Chroma class that creates the store, so both branches
    # go through one implementation.
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)




In [None]:
import time

def match_product_type_to_topic(product_type):
    """Return the stored topic most similar to ``product_type``.

    Uses the notebook-level ``embedding_model`` to embed the text and the
    notebook-level ``vectordb`` to look up the single nearest entry; both must
    be defined before this function is called. The elapsed time is printed.

    Args:
        product_type: Text describing the product type to match.

    Returns:
        str: ``page_content`` of the closest match, or the string
        ``"No matching topic found."`` when the search returns nothing.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way a wall-clock difference can.
    started = time.perf_counter()

    # Embed the product type, then fetch the single nearest stored entry
    product_embedding = embedding_model.embed_query(product_type)
    results = vectordb.similarity_search_by_vector(product_embedding, k=1)

    matched_topic = results[0].page_content if results else "No matching topic found."

    elapsed_time = time.perf_counter() - started
    print(f"Time taken for processing: {elapsed_time:.4f} seconds")

    return matched_topic

# Demo: look up the closest topic for one sample product type
product_type = "çiçekler"
matched_topic = match_product_type_to_topic(product_type)
summary_line = f"The best matching topic for '{product_type}' is: {matched_topic}"
print(summary_line)
