# Langchain Intro

In [1]:
# Built-in library
import re
import json
from typing import Any, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
from pathlib import Path
import pandas as pd
import polars as pl
from pprint import pprint
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
import os
from dotenv import load_dotenv, find_dotenv

from langchain.llms import OpenAI

### Using A Basic OpenAI Langchain LLM

- Without using a chain.
- Very simplistic.

<br>

[![image.png](https://i.postimg.cc/cLyVwqrs/image.png)](https://postimg.cc/Mv5PJF74)


In [3]:
_ = load_dotenv(find_dotenv())  # read local .env file
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_GPT_MODEL: str = "gpt-3.5-turbo-instruct"
TEMPERATURE: float = 0.5

In [4]:
# Create LLM
llm: OpenAI = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    model=OPENAI_GPT_MODEL,
    temperature=TEMPERATURE,
)
prompt: str = "Write a very short poem."
result = llm(prompt=prompt)

print(result)

## Updated Logic

- Add Chains (from langchain)

<br>

[![image.png](https://i.postimg.cc/RVLp2WgH/image.png)](https://postimg.cc/0brCwyRk)

In [5]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

### LLM Chain

[![image.png](https://i.postimg.cc/05VBzdbB/image.png)](https://postimg.cc/0rJf3SGY)

In [6]:
# Used for text completion LLMs
prompt_template: str = """Write a very short {language} function that will {task}"""
code_prompt: PromptTemplate = PromptTemplate(
    input_variables=["language", "task"], template=prompt_template
)
code_chain: LLMChain = LLMChain(llm=llm, prompt=code_prompt)

# Hardcoded inputs!
language, task = ("Python", "generate a list of numbers")

result = code_chain(inputs={"language": language, "task": task})
print(result)

In [7]:
print(result.get("text"))

## Connecting Chains Together

- Chains in series (sequential chain)

<br>

[![image.png](https://i.postimg.cc/8Cdgm9wV/image.png)](https://postimg.cc/GBpfcMZM)

In [8]:
from langchain.chains import SequentialChain

# Used for text completion LLMs
code_prompt_template: str = (
    """Write a very short {language} function that will {task}"""
)
unittest_prompt_template: str = (
    """Write a unit test for the following {language} code: \n{code}"""
)

# Propmt(s)
code_prompt: PromptTemplate = PromptTemplate(
    input_variables=["language", "task"], template=code_prompt_template
)
unittest_prompt: PromptTemplate = PromptTemplate(
    input_variables=["language", "code"], template=unittest_prompt_template
)

# Chain(s)
code_chain: LLMChain = LLMChain(
    llm=llm,
    prompt=code_prompt,
    output_key="code",  # from text to code
)
unittest_chain: LLMChain = LLMChain(
    llm=llm,
    prompt=unittest_prompt,
    output_key="test",  # from text to test
)
# Feed the output of code_chain into unittest_chain
final_chain: SequentialChain = SequentialChain(
    chains=[code_chain, unittest_chain],
    input_variables=["language", "task"],
    output_variables=["code", "test"],
)
# Hardcoded inputs!
language, task = ("Python", "generate a list of numbers")

result = final_chain(inputs={"language": language, "task": task})
print(result)

In [9]:
# Output of the unittest_chain (key="test")
print(result.get("test"))

<hr><br>

# Simple Terminal Chatbot

<br>

### Chat Model

- It's optimized for conversations.
- It has the following inputs:
  - **System Message:** A system message is a message generated by the system or platform hosting the conversation. It is not authored by a human user or the AI model.
  
  - **Human Message:** Any message created by a human user that is directed towards the LLM for processing or interaction.

  - **AI/Assistant Message:** The output generated by the LLM in response to a human message or internal processing.
  
[![image.png](https://i.postimg.cc/FRgxXmJ6/image.png)](https://postimg.cc/YG0FQ5RN)

<br>

### ChatPrompt Template

[![image.png](https://i.postimg.cc/s2nTM32Z/image.png)](https://postimg.cc/4K96MkVJ)

In [10]:
OPENAI_CHAT_MODEL: str = "gpt-3.5-turbo"

In [11]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import (
    HumanMessagePromptTemplate,
    AIMessagePromptTemplate,
    SystemMessagePromptTemplate,
    ChatPromptTemplate,
)

In [12]:
chat_llm = ChatOpenAI(
    model=OPENAI_CHAT_MODEL,
    temperature=TEMPERATURE,
)

# Used for chat completion LLMs
prompt = ChatPromptTemplate(
    input_variables=["content"],
    messages=[HumanMessagePromptTemplate.from_template("{content}")],
)

chain: LLMChain = LLMChain(llm=chat_llm, prompt=prompt)

In [13]:
content: str = "Hello, who are you?"

result: dict[str, Any] = chain({"content": content})
print(result)

### Add Memory

- Memory allows LLMs to access and consider previous conversations or inputs, leading to more coherent and contextually relevant responses.

<br>

[![image.png](https://i.postimg.cc/MGPkWySn/image.png)](https://postimg.cc/8F6X4fsG)

<br><br>

#### Message PlaceHolder

- It's used to track the conversations.
- It contains the contents of the chat. i.e. the human and the AI messages.

[![image.png](https://i.postimg.cc/3wPBCRjL/image.png)](https://postimg.cc/p9JzPP68)


In [14]:
from langchain.memory import ConversationBufferMemory
from langchain.prompts import MessagesPlaceholder

In [15]:
chat_llm = ChatOpenAI(
    model=OPENAI_CHAT_MODEL,
    temperature=TEMPERATURE,
)

memory = ConversationBufferMemory(
    memory_key="messages",  # key for storing the chat history
    return_messages=True,  # returns the promp classes. e.g. HumanMsg, AIMsg, etc
)

# Used for chat completion LLMs
prompt = ChatPromptTemplate(
    input_variables=["content", "messages"],
    messages=[
        HumanMessagePromptTemplate.from_template("{content}"),
        MessagesPlaceholder(
            variable_name="messages"
        ),  # used to track the conversations
    ],
)

chain: LLMChain = LLMChain(
    llm=chat_llm,
    prompt=prompt,
    memory=memory,  # Add memory (new!)
)

In [16]:
content: str = "Hello, who are you?"

result: dict[str, Any] = chain({"content": content})
print(result)

In [17]:
# Follow up question
content: str = "What is NLP?"

result: dict[str, Any] = chain({"content": content})
print(result)

In [18]:
# Follow up question
content: str = "How can I study it?"

result: dict[str, Any] = chain({"content": content})
print(result)

### Saving And Extending Conversations

- Add the ability for the chatbot to remember previous conversations.
    
<br>

[![image.png](https://i.postimg.cc/NGTgxztb/image.png)](https://postimg.cc/Dmvkv58b)

<br>

> We can use:
- a JSON file.

[![image.png](https://i.postimg.cc/B6FGMXrt/image.png)](https://postimg.cc/bDqMvNNj)

In [19]:
from langchain.memory import FileChatMessageHistory

In [20]:
chat_llm = ChatOpenAI(
    model=OPENAI_CHAT_MODEL,
    temperature=TEMPERATURE,
)

memory = ConversationBufferMemory(
    chat_memory=FileChatMessageHistory(file_path="chat_history.json"),  # New!
    memory_key="messages",  # key for storing the chat history
    return_messages=True,  # returns the promp classes. e.g. HumanMsg, AIMsg, etc
)

# Used for chat completion LLMs
prompt = ChatPromptTemplate(
    input_variables=["content", "messages"],
    messages=[
        HumanMessagePromptTemplate.from_template("{content}"),
        MessagesPlaceholder(
            variable_name="messages"
        ),  # used to track the conversations
    ],
)

chain: LLMChain = LLMChain(
    llm=chat_llm,
    prompt=prompt,
    memory=memory,  # Add memory
)

In [21]:
content: str = "Hello, what is 20+5?"
result: dict[str, Any] = chain({"content": content})

# Follow up question
content: str = "subtract 10 from the result"
result: dict[str, Any] = chain({"content": content})


print(result)

<hr><br>

## Loading Files With Document Loaders

[![image.png](https://i.postimg.cc/8CDcCFjL/image.png)](https://postimg.cc/TK7f73jP)

<br>

### Text Loader

[![image.png](https://i.postimg.cc/yNV6HwbP/image.png)](https://postimg.cc/rDP2GYc0)

In [22]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

In [23]:
fp: Path = Path("../../../data/facts.txt")
loader: TextLoader = TextLoader(file_path=fp)
docs: list[Any] = loader.load()

print(docs)

In [24]:
CHUNK_SIZE: int = 200
CHUNK_OVERlAP: int = 0

# Used to chunk the texts
text_splitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,  # 1. dets the rel. number of chars per chunk
    separator="\n",  # 2. each hunk is separated by this!
    chunk_overlap=CHUNK_OVERlAP,
)

# Load and split
docs: list[Any] = loader.load_and_split(text_splitter=text_splitter)

for d in docs:
    print(d.page_content)

### Embeddings

- In the field of Natural Language Processing (NLP), embeddings are a powerful technique for representing words, phrases, or sentences as dense numerical vectors. 
- These vectors capture semantic and syntactic relationships between words, allowing machines to process and understand text in a more meaningful way.

[![image.png](https://i.postimg.cc/02BkcpJk/image.png)](https://postimg.cc/ppKMd5Z4)


In [25]:
from langchain.embeddings import OpenAIEmbeddings

In [26]:
emb_model = OpenAIEmbeddings()

emb_model.embed_query("It's good to be good")

[0.012109475210309029,
 -0.004843165632337332,
 0.012602847069501877,
 -0.04598977416753769,
 -0.029252594336867332,
 0.017324231564998627,
 -0.018560783937573433,
 -0.0182610135525465,
 -0.013439706526696682,
 -0.02647971920669079,
 0.02097143791615963,
 0.0024481250438839197,
 0.013577100820839405,
 0.0008399816579185426,
 -0.01267778966575861,
 -0.0029009031131863594,
 0.06260205060243607,
 -0.013164917007088661,
 0.00835609994828701,
 0.016237562522292137,
 -0.019435115158557892,
 -0.005617572460323572,
 -0.018573274835944176,
 -0.01779886707663536,
 -0.00858717318624258,
 0.0020905863493680954,
 -0.005258472636342049,
 -0.030351752415299416,
 0.018910516053438187,
 -0.015075952745974064,
 0.011191428638994694,
 0.004546517971903086,
 -0.015100933611392975,
 0.003890770021826029,
 -0.009255411103367805,
 -0.026329834014177322,
 -0.022495269775390625,
 -0.0037564977537840605,
 0.024531211704015732,
 0.012171926908195019,
 0.020446838811039925,
 -0.0050773611292243,
 0.00142781261820

### Vector Stores

- Qdrant

In [27]:
from langchain.vectorstores import Qdrant

In [28]:
url: str = "http://localhost:6333/"
qdrant = Qdrant.from_documents(
    docs,
    emb_model,
    url=url,
    prefer_grpc=False,
    collection_name="facts",
    force_recreate=False,
)

In [29]:
query: str = "What is an interesting fact about the English language"
found_docs = qdrant.similarity_search(query, k=2)

print(found_docs)

In [30]:
print(found_docs[0].page_content)

<br>

### Retrieval Chain

[![image.png](https://i.postimg.cc/X7SX9Rtj/image.png)](https://postimg.cc/qgwkT5kS)

In [31]:
from langchain.chains import RetrievalQA

In [32]:
# Retriever DB
retriever = qdrant.as_retriever()

chain = RetrievalQA.from_chain_type(
    llm=chat_llm,
    retriever=retriever,
    chain_type="stuff",  #  it feeds the LLM a prompt built from document list.
    return_source_documents=True,  # to easily detect hallucination
    input_key="query",
)

In [33]:
query: str = "What is an interesting fact about the English language"

# result: str = chain.run(query)
result: dict[str, Any] = chain({"query": query})
print(result)

### Using The Debug Option

```python
import langchain

langchain.debug = True
```

In [34]:
import langchain

langchain.debug = True


query: str = "What is an interesting fact about the English language"

result: dict[str, Any] = chain({"query": query})
print(result)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What is an interesting fact about the English language"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is an interesting fact about the English language",
  "context": "1. \"Dreamt\" is the only English word that ends with the letters \"mt.\"\n2. An ostrich's eye is bigger than its brain.\n3. Honey is the only natural food that is made without destroying any kind of life.\n\n1. \"Dreamt\" is the only English word that ends with the letters \"mt.\"\n2. An ostrich's eye is bigger than its brain.\n3. Honey is the only natural food that is made without destroying any kind of life.\n\n1. \"Dreamt\" is the only English word that ends with the 

In [35]:
type(qdrant)

langchain.vectorstores.qdrant.Qdrant