## Data Cleaning

In [1]:
import pandas as pd

In [18]:
df = pd.read_csv("hf://datasets/ShehryarAzhar/stories/stories_dataset.csv")
df.head(10)

Unnamed: 0,id,title,story
0,1,Write an Adventure Story,"Far across the sea, on the distant shores of A..."
1,2,Write an Adventure Story,"Far across the sea, on the distant shores of A..."
2,3,Write an Adventure Story,High atop the snow-capped peaks of Mount Evere...
3,4,Write an Adventure Story,"In the sprawling metropolis of New Arcadia, wh..."
4,5,Write an Adventure Story,"In the vast expanse of the Wildlands, where un..."
5,6,Write an Adventure Story,In the shadow of the towering peaks of Mount V...
6,7,Write an Adventure Story,"In the heart of the sprawling desert, where th..."
7,8,Write an Adventure Story,"In the mist-shrouded realm of Avaloria, where ..."
8,9,Write an Adventure Story,"In the rugged mountains of the North, where sn..."
9,10,Write an Adventure Story,"In the heart of the untamed jungle, where the ..."


In [4]:
def write_stories_to_txt(df, output_file):
    # Open the output file in write mode
    with open(output_file, 'w') as f:
        # Group the dataframe by 'title' (story type)
        grouped = df.groupby('title')
        
        # Loop through each story type and its corresponding stories
        for story_type, stories in grouped:
            # Write the story type as a heading
            f.write(f"{story_type}\n")
            f.write("=" * len(story_type) + "\n")  # Adding a separator line
            
            # Loop through all stories under this story type and write them
            for story in stories['story']:
                f.write(f"{story}\n\n")  # Add a new line after each story
            
            # Add a couple of blank lines between different story types
            f.write("\n\n")

    print(f"Stories have been written to {output_file}")

In [5]:
write_stories_to_txt(df, "stories.txt")

Stories have been written to stories.txt


## RAG

In [6]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from langchain_community.document_loaders import TextLoader #load the document
from langchain_text_splitters import RecursiveCharacterTextSplitter #for creating chunks from the loaded document
from langchain_openai import OpenAIEmbeddings #for converting chunks into embeddings
from langchain_chroma import Chroma #database for stroring the embeddings

In [7]:
from dotenv import load_dotenv
load_dotenv()

True

In [8]:
import os
dir = os.getcwd()
db_dir = os.path.join(dir,"chroma_db")
print(db_dir)

/Users/caochunqin/Desktop/githomework/milestone2_pt2/chroma_db


### Create vector DB

In [19]:
#Read the text content from the .txt file and load it as langchain document
loader = TextLoader('stories.txt')
document = loader.load()

In [20]:
#Split the document into chunks using text splitters 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(document)

print("Document chunk info:\n")
print(f"Number of document chunks: {len(chunks)}")
print(f"Sample chunk: \n{chunks[3].page_content}\n")

Document chunk info:

Number of document chunks: 3990
Sample chunk: 
In the bustling city of Bustleton, where the traffic never seemed to move and the pigeons had perfected synchronized flying, lived a man named Bob. Bob had a peculiar talent—he could never seem to find his socks.  Every morning, Bob would rummage through his dresser drawers in search of a matching pair of socks, only to emerge with one polka-dotted sock and one striped sock. No matter how many times he bought new socks or organized his drawers, the socks seemed to vanish into thin air.  One day, in a fit of frustration, Bob decided to take matters into his own hands. He set up a surveillance camera in his bedroom to catch the elusive sock thief in action. But when he reviewed the footage the next morning, he discovered the culprit—his mischievous pet cat, Whiskers, who had been hoarding Bob's socks under the bed.  With a bemused smile, Bob realized that his sock-stealing cat was just another quirky aspect of life in B

In [11]:
#create embeddings using openAI embeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)

In [12]:
#store the embeddings and chunks into Chroma DB
Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_dir)

<langchain_chroma.vectorstores.Chroma at 0x1068b8e00>

### Retrieve and generate

In [13]:
#setting up the DB for retrieval
embeddings_used = OpenAIEmbeddings(model="text-embedding-3-small")
vectorDB = Chroma(persist_directory=db_dir,embedding_function=embeddings_used)

In [14]:
#setting up Retriver
retriever = vectorDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [15]:
def getRetriever(dir):
    """
    dir is the directory of the vector DB
    """
    embeddings_used = OpenAIEmbeddings(model="text-embedding-3-small")
    vectorDB = Chroma(persist_directory=dir,embedding_function=embeddings_used)
    retriever = vectorDB.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    return retriever

In [16]:
def textGeneration_langChain_RAG(msg,type,retrieverDir):
    """
    msg is the scenario for the story from the pic (hugging face model output);
    type is the genre of the story- Horror, Fantasy, Adventure, Comedy, Mystery, Romance
    retriever is the vector DB with relevant stories from txt version of 
        stories dataset from Hugging face - https://huggingface.co/datasets/ShehryarAzhar/stories
    """
    llm = ChatOpenAI(
            model="gpt-4o",
            temperature=0.2,
            max_tokens=200,
            timeout=None,
            max_retries=2
        )

    system_prompt = (
        "You are an expert short {story_type} story teller. " 
        "Use the following pieces of retrieved context to generate {story_type} story. "
        "Use a simple narrative structure to generate {story_type} story based on the given scenario"
        "keep the story to less than 200 words."
        "\n\n"
        "{context}"
    )

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{scenario_lang}"),
        ]
    )

    rag_chain = prompt | llm | StrOutputParser()

    retriever = getRetriever(retrieverDir)

    out_message = rag_chain.invoke({
            "story_type" : type,
            "context":retriever,
            "scenario_lang" : msg,
        })
    
    return out_message

In [17]:
scenario = "bookshelves with many different colored books on them in a library" #example output from huggingface model
story = textGeneration_langChain_RAG(scenario,"Horror", db_dir)
print(story)

In the heart of the town stood an ancient library, its towering bookshelves filled with volumes of every color imaginable. Each book seemed to pulse with a life of its own, their spines glowing faintly in the dim light.

One stormy evening, a curious young woman named Elara found herself drawn to the library. As she wandered through the aisles, she noticed a peculiar pattern: the books seemed to be arranged not by author or genre, but by the color of their spines. Intrigued, she reached for a vibrant crimson book, its cover cold to the touch.

The moment she opened it, the room darkened, and a chilling wind swept through the library. The words on the pages began to swirl and rise, forming a ghostly figure before her. It was the spirit of a long-forgotten librarian, trapped within the pages for centuries.

"You have awakened me," the specter whispered, its voice echoing through the aisles. "Now, you
