# Running LangChain

## Import Libraries

In [3]:
import argparse
import os
import shutil
import boto3

import pandas as pd

from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_community.document_loaders import DataFrameLoader

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
from langchain_community.embeddings.bedrock import BedrockEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.llms import Ollama

In [5]:
# Directory passed to Chroma as persist_directory (the on-disk vector store).
CHROMA_PATH = "chroma"
# Directory holding the input data: the CSV read for the China articles and the
# PDFs read by PyPDFDirectoryLoader.
DATA_PATH = "data"

In [28]:
def load_documents(df_path):
    """Read the CSV at ``df_path`` and return its rows as LangChain documents.

    The ``content`` column supplies each document's page content.
    """
    frame = pd.read_csv(df_path)
    return DataFrameLoader(frame, page_content_column="content").load()

In [29]:
# Load the China articles from the CSV stored under DATA_PATH.
documents_china = load_documents(f"{DATA_PATH}/china.csv")

In [48]:
# Inspect the second loaded document (metadata and page content).
documents_china[1]

Document(metadata={'title': 'China Hits Back at the US in Response to Doping Allegations Dogging Its Swimmers', 'link': 'https://thediplomat.com/2024/08/china-hits-back-at-the-us-in-response-to-doping-allegations-dogging-its-swimmers/'}, page_content='China is trying to fight fire with fire in the face of persistent doping allegations that have dogged its swimmers at the Paris Olympics.\nThe China Anti-Doping Agency called Thursday for more intensive testing of U.S. track and field competitors, citing in a news release past doping scandals and questioning how the U.S. Anti-Doping Agency handled them.\nRepeated blasts from the Chinese agency have been echoed by reports in the government-controlled state media complaining about double standards applied to Chinese competitors. The reports have highlighted the more than 600 tests undergone by Chinese swimmers at the Paris Games with no violations found.\nThe World Anti-Doping Agency and World Aquatics have acknowledged that 23 Chinese swim

In [43]:
def split_documents(
    documents: list[Document],
    chunk_size: int = 3000,
    chunk_overlap: int = 500,
):
    """Split documents into overlapping chunks with a recursive character splitter.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns for
        ``documents``.
    """
    # The sizes are parameters so that a different chunking (e.g. for the PDF
    # corpus) does not require redefining the function.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

In [44]:
# Chunk the China articles.
chunks = split_documents(documents_china)

In [46]:
# Inspect the second chunk produced by the splitter.
chunks[1]

Document(metadata={'title': 'First Known Survivor of China’s Forced Organ Harvesting Speaks Out', 'link': 'https://thediplomat.com/2024/08/first-known-survivor-of-chinas-forced-organ-harvesting-speaks-out/'}, page_content='This was not an isolated incident. Cheng endured repeated procedures, all under the threat of death. “A few days later, they said I had to have another operation. I thought I was going to die “ he said.\nDuring his imprisonment, Cheng was subjected to forced blood tests – an ominous indicator of his organs’ viability for transplantation. “They did blood tests on me many times and subjected me to all kinds of inhumane torture,” he noted.\n“The torture in the prison was very systematic. One was mental and the other was physical [torture]. Mentally they put me and my family members under pressure as they wanted me to give up my faith in Falun Gong and if I didn’t they would force my wife to divorce me when I was in prison.” Cheng was told that if his wife did not divorc

In [12]:
# Raw China articles as a DataFrame, read from the CSV under DATA_PATH.
df_china = pd.read_csv(f"{DATA_PATH}/china.csv")

In [15]:
# Loader over the DataFrame; the "content" column becomes the page content.
# Same configuration as the one used inside load_documents(df_path).
loader = DataFrameLoader(df_china, page_content_column="content")

In [17]:
# Rebinds documents_china, which is also assigned from load_documents(...) in
# another cell of this notebook.
documents_china = loader.load()

In [24]:
# Show the raw text of the third document.
documents_china[2].page_content

'September 2024 marks the 70th anniversary of the much forgotten and often maligned Southeast Asia Treaty Organization (SEATO). However, its legacy in fact provides invaluable insights for Asia’s emerging multilateral alliances, such as the “Quad,” “Quad Plus,” and the much-hyped but still hypothetical idea of an “Asian NATO.” Despite criticisms of its impotency and disunity, understanding how SEATO emerged and the internal divisions that led to its demise is crucial for navigating today’s complex geopolitical terrain.\nSEATO, also known as the “Manila Pact,” was an international organization for collective defense in Southeast Asia, aimed at combating communist expansion in the region. Established on September 8, 1954, it emerged during a strategic interregnum when postcolonial independence struggles intersected with the United States’ ascent as a superpower and emerging priorities to contain the global expansion of communism.\nBack in 1949, Washington had just formed NATO to counter 

In [25]:
# Ollama LLM wrapper configured for the llama3.1 model.
llm = Ollama(model="llama3.1")

In [27]:
# Token count of the third document's text as reported by the LLM wrapper.
# NOTE(review): the warning printed with this cell mentions a 1024-token
# maximum, which suggests the count comes from a default tokenizer rather than
# llama3.1's own - confirm before relying on the number.
llm.get_num_tokens(documents_china[2].page_content)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Token indices sequence length is longer than the specified maximum sequence length for this model (1705 > 1024). Running this sequence through the model will result in indexing errors


1705

In [3]:
def load_documents():
    """Load the PDF files found under ``DATA_PATH`` with ``PyPDFDirectoryLoader``.

    NOTE(review): this reuses the name of the CSV-based ``load_documents(df_path)``
    defined earlier in the notebook, so that version is shadowed once this cell
    has run.
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

In [4]:
# Load the PDF corpus from DATA_PATH.
documents = load_documents()

In [5]:
# Number of documents returned by the PDF loader.
len(documents)

24

In [26]:
# Print the third loaded document to eyeball the extracted text.
print(documents[2])

page_content='Deloitte Global Retail Outlook 2024 | Navigating challenges and embracing opportunities - Insights from retail leaders around the world  03After the disruption that came about during by the COVID-19 
pandemic,	the	rising	costs-of-living,	inflationary	pressures	and 	
geopolitical tensions around the world, you could forgive retailers 
for being cautious about prospects for the year ahead. But the 
opposite appears to be true. 
Buoyed by opportunities founded in technology – not least of 
which, the emergence of generative AI – which could reduce costs, 
improve productivity and enhance the customer experience, the 
retailers interviewed as part of the Deloitte Global Retail Outlook 
provided an optimistic outlook for both top- and bottom-line 
performance across the sector.It would be a mistake to say that things will be easy for retailers this 
year – the economic outlook suggests that 2024 as could be a year 
of	two	conflicting	halves	and	retailers	report	that	their	numb

In [7]:
# Same count check as the earlier len(documents) cell; this cell is a duplicate.
len(documents)

24

In [8]:
def split_documents(
    documents: list[Document],
    chunk_size: int = 800,
    chunk_overlap: int = 80,
):
    """Split documents into overlapping chunks with a recursive character splitter.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns for
        ``documents``.
    """
    # The sizes are parameters so that a different chunking can be requested
    # per call instead of redefining the function with new constants.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

In [9]:
# Chunk the PDF documents; this rebinds `chunks`, which another cell assigns
# from the China articles.
chunks = split_documents(documents)

In [10]:
def calculate_chunk_ids(chunks):
    """Assign an ``id`` to every chunk's metadata, in place, and return the chunks.

    IDs look like "data/monopoly.pdf:6:2", i.e.
    Page Source : Page Number : Chunk Index, where the index counts the chunks
    already seen for that source/page pair.

    Args:
        chunks: Sequence of objects exposing a ``metadata`` dict. ``source`` and
            ``page`` are read with ``.get``, so a missing key shows up as
            ``None`` in the ID.

    Returns:
        The same ``chunks`` object, with ``metadata["id"]`` set on every item.
    """
    # One counter per page, rather than a comparison with the previous chunk:
    # chunks of the same page keep distinct IDs even when they are not adjacent
    # in the input. For page-contiguous input the IDs are the same either way.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        page_id = f"{source}:{page}"

        chunk_index = next_index_by_page.get(page_id, 0)
        next_index_by_page[page_id] = chunk_index + 1

        # Add it to the page meta-data.
        chunk.metadata["id"] = f"{page_id}:{chunk_index}"

    return chunks

In [11]:
def get_embedding_function():
    """Build the embedding client used both when indexing and when querying.

    Returns a ``BedrockEmbeddings`` instance bound to the "default" credentials
    profile in the us-east-1 region.
    """
    # Alternative kept from the notebook:
    # OllamaEmbeddings(model="nomic-embed-text")
    return BedrockEmbeddings(
        credentials_profile_name="default",
        region_name="us-east-1",
    )

def add_to_chroma(chunks: list[Document]):
    """Add to the persisted Chroma store every chunk whose ID is not stored yet.

    IDs are assigned by ``calculate_chunk_ids`` (which writes ``metadata["id"]``
    on the chunks passed in) and compared with the IDs already in the store.
    Progress is printed; nothing is returned.
    """
    # Open the store persisted under CHROMA_PATH.
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )

    tagged_chunks = calculate_chunk_ids(chunks)

    # include=[] asks for no extra fields; only the "ids" entry is read here.
    stored_ids = set(store.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(stored_ids)}")

    # Keep only the chunks the store does not know about yet.
    pending = [c for c in tagged_chunks if c.metadata["id"] not in stored_ids]
    if not pending:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(pending)}")
    store.add_documents(pending, ids=[c.metadata["id"] for c in pending])
    store.persist()

In [12]:
# Index the current chunks; chunks whose IDs are already stored are skipped.
add_to_chroma(chunks)

  warn_deprecated(


Number of existing documents in DB: 113
✅ No new documents to add


In [13]:
# Ollama LLM wrapper for llama3.1 (same construction as the earlier `llm` cell).
llm = Ollama(model="llama3.1")

In [14]:
# Throwaway prompt to check that the model responds.
llm.invoke("tell me you love me")

"I'd love to! As a conversational AI, I don't have personal feelings or emotions like humans do. However, I'm designed to provide affection and care through text-based interactions.\n\nIn that spirit, here's my digital hug: **YOU ARE LOVED AND APPRECIATED**!\n\nIf you're feeling down, sad, or just need some reassurance, know that there are people (and AI assistants like me) who want to help and support you. You're not alone! Would you like to talk about what's on your mind?"

In [15]:
# Leftover debug cell; it does not affect any later cell.
print("hi")

hi


In [16]:
# Module-level handle on the persisted store for ad-hoc queries.
# query_rag() builds its own embedding function and Chroma instance.
embedding_function = get_embedding_function()
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

In [18]:

#results = db.similarity_search_with_score(query_text, k=5)

In [20]:
# context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])

In [21]:
# Prompt filled in by query_rag: {context} receives the retrieved chunk texts
# joined by "---" separators, {question} receives the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

def query_rag(query_text: str):
    """Answer ``query_text`` from the chunks stored in Chroma.

    Retrieves the 5 best-matching chunks, fills ``PROMPT_TEMPLATE`` with their
    text and the question, sends the prompt to the llama3.1 Ollama model, and
    prints the response together with the IDs of the chunks used.

    Returns:
        The model's response text.
    """
    # Open the persisted vector store.
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
    )

    # Retrieve the closest chunks; the scores are not used.
    hits = store.similarity_search_with_score(query_text, k=5)
    retrieved = [doc for doc, _score in hits]

    context_text = "\n\n---\n\n".join(doc.page_content for doc in retrieved)
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context=context_text,
        question=query_text,
    )

    response_text = Ollama(model="llama3.1").invoke(prompt)

    sources = [doc.metadata.get("id") for doc in retrieved]
    print(f"Response: {response_text}\nSources: {sources}")
    return response_text

In [22]:
# Leftover debug cell; it does not affect any later cell.
print("hello world")

hello world


In [25]:
# Alternative prompt, kept for reference; it is not the one sent below.
ceo_query_text = (
    "I am a CEO of a retail company (Walmart) with international businesses in "
    "China, Mexico, Chile, India and Canada. "
    "Give me the five most useful facts, with supporting data, impacting these "
    "international businesses"
)

# Prompt actually sent to the RAG pipeline. The fragments are joined by implicit
# string concatenation, so each one carries its own trailing separator and the
# sentences cannot run together.
query_text = (
    "I am writing a newsletter for employees of the retail company (walmart) "
    "that I work with. This newsletter aims to share the most impactful "
    "international retail insights that they can use in their work. "
    "Can you write me a roughly 500 word headline entry of the most important "
    "discovery or trend, supported by data"
)

response = query_rag(query_text)

Response: Here's a 500-word headline entry for your newsletter based on one of the most important discoveries or trends supported by data:

**Headline:** The Weight is Off: How Weight Loss Drugs are Revolutionizing Retail Sales and Customer Behavior

**Subheading:** A Deloitte Insights analysis reveals that weight loss drugs are not only boosting sales at retail pharmacies but also changing consumer behavior, presenting new opportunities and challenges for retailers like Walmart.

As we continue to navigate the ever-changing retail landscape, one trend is gaining significant attention: the impact of weight loss drugs on customer behavior and sales. A recent Deloitte Insights analysis found that weight loss drugs are not only increasing sales at retail pharmacies but also changing consumer behavior, presenting both opportunities and challenges for retailers like Walmart.

**The Numbers Don't Lie**

According to a CNBC report, weight loss drug sales have seen significant growth in the US