In [2]:
import pandas as pd

In [3]:
def extract_questions_to_txt(jsonl_path, output_txt_path):
    # Read the .jsonl file into a pandas DataFrame
    df = pd.read_json(jsonl_path, lines=True)
    
    # Extract the 'question' column
    questions = df['question']
    
    # Write the questions to a .txt file, each question on a new line
    with open(output_txt_path, 'w') as f:
        for question in questions:
            f.write(question + '\n')

In [4]:
extract_questions_to_txt('dev.jsonl', 'boolean_questions.txt')

 RAG 

In [5]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from langchain_community.document_loaders import TextLoader #load the document
from langchain_text_splitters import RecursiveCharacterTextSplitter #for creating chunks from the loaded document
from langchain_openai import OpenAIEmbeddings #for converting chunks into embeddings
from langchain_chroma import Chroma #database for stroring the embeddings

In [6]:
from dotenv import load_dotenv
load_dotenv()

True

In [7]:
import os
dir = os.getcwd()
db_dir = os.path.join(dir,"chroma_db")
print(db_dir)

/Users/chapter318/Library/CloudStorage/OneDrive-GeorgiaInstituteofTechnology/F24/8803_AI4DM/Proj2/FlaskApp_Template/chroma_db


Create Vector DB

In [8]:
#Read the text content from the .txt file and load it as langchain document
loader = TextLoader('boolean_questions.txt')
document = loader.load()

In [9]:
#Split the document into chunks using text splitters 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(document)

print("Document chunk info:\n")
print(f"Number of document chunks: {len(chunks)}")
print(f"Sample chunk: \n{chunks[3].page_content}\n")


Document chunk info:

Number of document chunks: 181
Sample chunk: 
can you wear short sleeve shirt with asu jacket
has wisconsin ever been in the little league world series
does damon and elena get together in season 3
is there a player in the nfl missing a hand
is the other boleyn girl part of a series
is there a group called the five heartbeats
is mount everest a part of the himalayas
can an emt-basic start an iv
has no 1 court at wimbledon got a roof
has anyone come back from 3-0 in the nba finals
do radio waves travel at the speed of light
did anyone from the 1980 us hockey team play in the nhl
do all triangles have at least two acute angles
is baylor and mary hardin baylor the same school
can you get the death penalty as a minor
did indian football team qualified for fifa 2018
are t rex and tyrannosaurus rex the same
is the old panama canal still in use
do you need a pal to possess ammunition
do blue and pink cotton candy taste the same
did to kill a mockingbird win an academy aw

In [10]:
#create embeddings using openAI embeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)
#store the embeddings and chunks into Chroma DB
Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_dir)

<langchain_chroma.vectorstores.Chroma at 0x1281e6990>

### Retrieve and generate

In [11]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.prompts import ChatPromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
#setting up the DB for retrieval
embeddings_used = OpenAIEmbeddings(model="text-embedding-3-small")
vectorDB = Chroma(persist_directory=db_dir,embedding_function=embeddings_used)

  embeddings_used = OpenAIEmbeddings(model="text-embedding-3-small")
  vectorDB = Chroma(persist_directory=db_dir,embedding_function=embeddings_used)


In [13]:
#setting up Retriver
retriever = vectorDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [14]:
def getRetriever(dir):
    """
    dir is the directory of the vector DB
    """
    embeddings_used = OpenAIEmbeddings(model="text-embedding-3-small")
    vectorDB = Chroma(persist_directory=dir,embedding_function=embeddings_used)
    retriever = vectorDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    return retriever

In [15]:
headers = {"Authorization": "Bearer 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"}

In [16]:
from huggingface_hub import InferenceClient

def textGeneration_langChain_RAG(user_type, retrieverDir, api_key):
    """
    user_type: The type of user (e.g., children, adults, etc.).
    retrieverDir: Directory of the vector DB with relevant boolean questions.
    api_key: Your Hugging Face API key for authentication.
    """

    # Initialize the Inference Client
    client = InferenceClient(api_key=api_key)

    # Retrieve relevant boolean questions from Chroma DB using user_type
    retriever = getRetriever(retrieverDir)
    
    # Modify the query to include user_type in the context
    query = f"Get some boolean questions for a {user_type}."
    retrieved_docs = retriever.get_relevant_documents(query)
    
    # Extract content from retrieved documents
    context = "\n".join(doc.page_content for doc in retrieved_docs)

    # Create a clear and direct system prompt for generating a boolean question
    system_prompt = (
        "Based on the following context, generate one complete boolean question that a {user_type} would ask:\n"
        "{context}\n\n"
        "Make sure it's only one grammatically correct question and can be answered with yes/no."
    )

    # Prepare the final prompt to send to the Hugging Face API
    final_prompt = system_prompt.format(user_type=user_type, context=context)

    # Prepare the messages for the chat API
    messages = [
        {"role": "user", "content": final_prompt}
    ]

    # Stream the response from the Hugging Face Inference Client
    stream = client.chat.completions.create(
        model="HuggingFaceTB/SmolLM2-1.7B-Instruct", 
        messages=messages, 
        max_tokens=500,
        stream=True
    )

    # Collect and print the output from the stream
    full_response = ""
    for chunk in stream:
        full_response += chunk.choices[0].delta.content

    return full_response.strip()  # Return the final response witho


In [20]:
# Example usage
api_key = "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"  # Replace with your actual Hugging Face API key
output = textGeneration_langChain_RAG(
    user_type="mom",  # Specify the user type
    retrieverDir=db_dir,  # Your Chroma DB directory
    api_key=api_key  # Your Hugging Face API key
)
print(output)

Can a car run on anything other than gasoline?
