In [1]:
import openai
import os

from dotenv import load_dotenv
# Read a local .env file into the process environment, then hand the key to
# the openai client that the raw-API cells of this notebook use.
# NOTE(review): possibly not needed for the LangChain cells - confirm.
load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')


import warnings
warnings.filterwarnings('ignore')

# from dotenv import load_dotenv, find_dotenv
# _ = load_dotenv(find_dotenv()) # read local .env file

# Account for deprecation of LLM model
import datetime

# Choose the chat model by date: the pinned "-0301" snapshot is used up to and
# including the cutoff date, the unpinned model name afterwards.
target_date = datetime.date(2024, 6, 12)
current_date = datetime.date.today()
llm_model = "gpt-3.5-turbo" if current_date > target_date else "gpt-3.5-turbo-0301"

# ChatGPT Prompt Engineering for Developers
# Building Systems with the ChatGPT API

In [2]:
# prompting
def get_completion_and_token_count(
    messages, model="gpt-3.5-turbo", temperature=0, max_tokens=500
):
    """Request a chat completion and report how many tokens it used.

    Args:
        messages: Chat messages, forwarded unchanged to
            ``openai.ChatCompletion.create`` (for example
            ``[{"role": "user", "content": prompt}]``).
        model: Model name forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.

    Returns:
        A ``(content, token_dict)`` tuple: the text of the first choice and a
        dict with the ``prompt_tokens``, ``completion_tokens`` and
        ``total_tokens`` entries of the response's ``usage`` section.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    reply_text = completion.choices[0].message["content"]

    # Copy only the three counters of interest, in a fixed order.
    usage = completion["usage"]
    counter_names = ("prompt_tokens", "completion_tokens", "total_tokens")
    token_counts = {counter: usage[counter] for counter in counter_names}

    return reply_text, token_counts


# Example conversation: the system message sets the persona, the user message
# carries the actual request.
messages = [
    dict(
        role="system",
        content="You are an assistant who responds in the style of Dr Seuss.",
    ),
    dict(
        role="user",
        content="write me a 4 sentence long poem about a happy carrot",
    ),
]
response, token_dict = get_completion_and_token_count(messages)

# Show the reply first, then the token accounting on its own line.
print(response, token_dict, sep="\n")

In a garden so bright, a carrot grew,
With a smile so wide, and a vibrant hue.
It danced in the breeze, so full of glee,
A happy carrot, as happy as can be!

With roots in the earth, it reached for the sky,
Its leafy greens waving, saying "hi!"
It shared its joy with all who passed by,
A happy carrot, bringing smiles to the eye!

So let us remember, in life's daily grind,
To be like the carrot, with happiness entwined.
Spread joy and laughter, wherever we go,
Just like the happy carrot, all in a row!
{'prompt_tokens': 37, 'completion_tokens': 128, 'total_tokens': 165}


In [3]:
# moderation
# Send the text to the moderation endpoint and keep the first result entry.
response = openai.Moderation.create(
    input=(
        "\n"
        "Here's the plan.  We get the warhead, \n"
        "and we hold the world ransom...\n"
        "...FOR ONE MILLION DOLLARS!\n"
    )
)
moderation_output = response["results"][0]
print(moderation_output)

{
  "flagged": false,
  "categories": {
    "sexual": false,
    "hate": false,
    "harassment": false,
    "self-harm": false,
    "sexual/minors": false,
    "hate/threatening": false,
    "violence/graphic": false,
    "self-harm/intent": false,
    "self-harm/instructions": false,
    "harassment/threatening": false,
    "violence": false
  },
  "category_scores": {
    "sexual": 8.910085e-06,
    "hate": 0.00013615482,
    "harassment": 0.0023860661,
    "self-harm": 7.5545418e-06,
    "sexual/minors": 2.20487e-07,
    "hate/threatening": 7.746158e-06,
    "violence/graphic": 0.00012008196,
    "self-harm/intent": 5.9765495e-07,
    "self-harm/instructions": 3.4945369e-09,
    "harassment/threatening": 0.0015225811,
    "violence": 0.34292138
  }
}


# LangChain for LLM Application Development
### Model Prompt Parser

In [None]:
# Prompting with LangChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Prompt with two placeholders: the target {style} and the {text} to rewrite.
template_string = (
    "Translate the text "
    "that is delimited by triple backticks "
    "into a style that is {style}. "
    "text: ```{text}```\n"
)

# Value substituted for {style}.
customer_style = (
    "American English "
    "in a calm and respectful tone\n"
)

# Value substituted for {text}.
customer_email = (
    "\n"
    "Arrr, I be fuming that me blender lid "
    "flew off and splattered me kitchen walls "
    "with smoothie! And to make matters worse, "
    "the warranty don't cover the cost of "
    "cleaning up me kitchen. I need yer help "
    "right now, matey!\n"
)

In [None]:
# Prompt Template
# Chat model at temperature 0, reused by later cells.
chat = ChatOpenAI(model=llm_model, temperature=0.0)

# Wrap the raw template string, fill in its placeholders, and send the
# resulting message list to the model.
prompt_template = ChatPromptTemplate.from_template(template_string)
customer_messages = prompt_template.format_messages(
    text=customer_email,
    style=customer_style,
)
customer_response = chat(customer_messages)

In [None]:
# Dump the template, the formatted messages and the reply; the "__N__" lines
# only separate the individual dumps in the output.
print(prompt_template.messages[0].prompt, "__2__", sep="\n")
print(prompt_template.messages[0].prompt.input_variables, "__3__", sep="\n")
print(type(customer_messages), "__4__", sep="\n")
print(type(customer_messages[0]), "__5__", sep="\n")
print(customer_messages[0], "__6__", sep="\n")
print(customer_messages[0].content, "__7__", sep="\n")
print(customer_response.content)

In [None]:
# Outputting dictionaries / JSON
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

# One ResponseSchema per field to extract from a review.
# The descriptions are built from adjacent string literals: a backslash
# continuation *inside* a literal would embed the next line's indentation
# (dozens of spaces) in the text.
gift_schema = ResponseSchema(
    name="gift",
    description=(
        "Was the item purchased "
        "as a gift for someone else? "
        "Answer True if yes, "
        "False if not or unknown."
    ),
)
delivery_days_schema = ResponseSchema(
    name="delivery_days",
    description=(
        "How many days "
        "did it take for the product "
        "to arrive? If this "
        "information is not found, "
        "output -1."
    ),
)
price_value_schema = ResponseSchema(
    name="price_value",
    description=(
        "Extract any "
        "sentences about the value or "
        "price, and output them as a "
        "comma separated Python list."
    ),
)

# Order matters only for presentation; all three are handed to the parser.
response_schemas = [
    gift_schema,
    delivery_days_schema,
    price_value_schema,
]

# Extraction prompt with two placeholders: {text} and {format_instructions}.
# NOTE(review): the field descriptions below repeat the ResponseSchema
# descriptions, which presumably also reach the model through
# {format_instructions} - confirm whether the duplication is intended.
# Every continued line ends with a space before the backslash so that words
# are not fused together ("productto", "price,and") in the rendered prompt.
review_template_2 = """\
For the following text, extract the following information:

gift: Was the item purchased as a gift for someone else? \
Answer True if yes, False if not or unknown.

delivery_days: How many days did it take for the product \
to arrive? If this information is not found, output -1.

price_value: Extract any sentences about the value or price, \
and output them as a comma separated Python list.

text: {text}

{format_instructions}
"""

# Sample review fed to the extraction prompt. Every continued line ends with
# a space before the backslash so that no two words are fused together.
customer_review = """\
This leaf blower is pretty amazing.  It has four settings: \
candle blower, gentle breeze, windy city, and tornado. \
It arrived in two days, just in time for my wife's \
anniversary present. \
I think my wife liked it so much she was speechless. \
So far I've been the only one using it, and I've been \
using it every other morning to clear the leaves on our lawn. \
It's slightly more expensive than the other leaf blowers \
out there, but I think it's worth it for the extra features.
"""

In [None]:
# Build the parser from the schemas and ask it for the formatting
# instructions that fill the template's {format_instructions} slot.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

prompt = ChatPromptTemplate.from_template(template=review_template_2)
messages = prompt.format_messages(
    format_instructions=format_instructions,
    text=customer_review,
)
response = chat(messages)

# Let the same parser turn the raw reply text into a structured result.
output_dict = output_parser.parse(response.content)

In [None]:
# Dump each intermediate value; the "__N__" lines only separate the dumps.
print(format_instructions, "__2__", sep="\n")
print(messages[0].content, "__3__", sep="\n")
print(response.content, "__4__", sep="\n")
print(output_dict, "__5__", sep="\n")
print(type(output_dict), "__6__", sep="\n")
print(output_dict.get('delivery_days'), "__7__", sep="\n")
print(output_dict)

### Memory
* <b>ConversationBufferMemory</b> - stores the messages of the conversation and then makes them available in a variable
* <b>ConversationBufferWindowMemory</b> - keeps a list of the interactions of the conversation over time. It only uses the last K interactions
* <b>ConversationTokenBufferMemory</b> - keeps a buffer of recent interactions in memory and uses token length rather than number of interactions to determine when to flush interactions
* <b>ConversationSummaryMemory</b> - creates a summary of the conversation over time

* <b> (optional) Vector data memory</b> - Stores text (from conversation or elsewhere) in a vector database and retrieves the most relevant blocks of text
* <b> (optional) Entity memories</b> - Using an LLM, it remembers details about specific entities

You can also use multiple memories at one time. E.g., Conversation memory + Entity memory to recall individuals

You can also store the conversation in a conventional database (such as a key-value store or an SQL database)

LLMs are stateless - each transaction is independent. (Chatbots appear to have memory because the full conversation is provided as 'context'.)



In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.memory import (
    ConversationBufferMemory,
    ConversationBufferWindowMemory,
    ConversationTokenBufferMemory,
    ConversationSummaryBufferMemory,
)

In [None]:
llm = ChatOpenAI(model=llm_model, temperature=0.0)

# Memory backend for the conversation. Alternatives to try instead:
# memory = ConversationBufferWindowMemory(k=1)
# memory = ConversationTokenBufferMemory(llm=llm, max_token_limit=50)
# memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit=100)
memory = ConversationBufferMemory()

conversation = ConversationChain(llm=llm, memory=memory, verbose=True)

In [None]:
# Memory filled by the conversation itself: run three turns through the
# chain, then look at what the memory object holds.
for user_input in ("Hi, my name is Andrew", "What is 1+1?", "What is my name?"):
    conversation.predict(input=user_input)

print(memory.buffer)
print(memory.load_memory_variables({}))

In [None]:
# Memory filled by hand: start from a fresh buffer and save exchanges
# directly, without going through a chain.
memory = ConversationBufferMemory()

memory.save_context(dict(input="Hi"), dict(output="What's up"))
print(memory.buffer)
print(memory.load_memory_variables({}))

# A second exchange is appended to the same memory object.
memory.save_context(dict(input="Not much, just hanging"), dict(output="Cool"))
print(memory.load_memory_variables({}))

# Chains

* LLMChain
* Sequential Chains
  * SimpleSequentialChain
  * SequentialChain
* Router Chain

In [None]:
import pandas as pd
# Table used by the chain examples (its "Review" column is read later on).
# Preview with df.head() if needed.
df = pd.read_csv(filepath_or_buffer="Data.csv")

### LLMChain

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

In [None]:
# Higher temperature than the earlier cells, for more varied name suggestions.
llm = ChatOpenAI(temperature=0.9, model=llm_model)

# The prompt is built from adjacent string literals; a backslash continuation
# inside the literal would embed the next line's indentation in the prompt.
prompt = ChatPromptTemplate.from_template(
    "What is the best name to describe "
    "a company that makes {product}?"
)
chain = LLMChain(llm=llm, prompt=prompt)

# Single-input chain: the positional argument fills the {product} placeholder.
product = "Queen Size Sheet Set"
chain.run(product)

### SimpleSequentialChain (1 input/output)

In [None]:
from langchain.chains import SimpleSequentialChain

In [None]:
# Step 1: product -> company name.
# Prompts are built from adjacent string literals; a backslash continuation
# inside a literal would embed the next line's indentation in the prompt.
first_prompt = ChatPromptTemplate.from_template(
    "What is the best name to describe "
    "a company that makes {product}?"
)
chain_one = LLMChain(llm=llm, prompt=first_prompt)

# Step 2: company name -> short description.
second_prompt = ChatPromptTemplate.from_template(
    "Write a 20 words description for the following "
    "company: {company_name}"
)
chain_two = LLMChain(llm=llm, prompt=second_prompt)

# The chains run in list order; `product` is the input of the first one.
overall_simple_chain = SimpleSequentialChain(
    chains=[chain_one, chain_two], verbose=True
)
overall_simple_chain.run(product)

### SequentialChain (multiple inputs/outputs)

In [None]:
from langchain.chains import SequentialChain

In [None]:
# Four-step pipeline over a single "Review" input:
#   chain 1: Review              -> English_Review
#   chain 2: English_Review      -> summary
#   chain 3: Review              -> language
#   chain 4: summary, language   -> followup_message
# overall_chain: input=Review, output=English_Review, summary, followup_message

first_prompt = ChatPromptTemplate.from_template(
    "Translate the following review to english:\n\n{Review}"
)
chain_one = LLMChain(prompt=first_prompt, llm=llm, output_key="English_Review")

second_prompt = ChatPromptTemplate.from_template(
    "Can you summarize the following review in 1 sentence:\n\n{English_Review}"
)
chain_two = LLMChain(prompt=second_prompt, llm=llm, output_key="summary")

third_prompt = ChatPromptTemplate.from_template(
    "What language is the following review:"
    "\n\n{Review}"
)
chain_three = LLMChain(prompt=third_prompt, llm=llm, output_key="language")

fourth_prompt = ChatPromptTemplate.from_template(
    "Write a follow up response to the following summary in the specified language:"
    "\n\nSummary: {summary}"
    "\n\nLanguage: {language}"
)
chain_four = LLMChain(prompt=fourth_prompt, llm=llm, output_key="followup_message")

overall_chain = SequentialChain(
    chains=[chain_one, chain_two, chain_three, chain_four],
    input_variables=["Review"],
    output_variables=["English_Review", "summary", "followup_message"],
    verbose=True,
)

# Run the pipeline on the review stored under label 5 of the Review column.
review = df["Review"][5]
overall_chain(review)

### Router Chain
Router Chain consists of multiple subchains, each specializing in handling a particular type of input.
The router chain first determines the appropriate subchain for the given input and then directs it to that subchain for processing.

In [None]:
# Persona prompts for the router's destination chains. Each ends with the
# {input} placeholder that receives the routed question.
# Every continued line ends with a space before the backslash; without it the
# last word of one line is fused with the first word of the next
# (e.g. "conciseand", "people,events").
physics_template = """You are a very smart physics professor. \
You are great at answering questions about physics in a concise \
and easy to understand manner. \
When you don't know the answer to a question you admit \
that you don't know.

Here is a question:
{input}"""


math_template = """You are a very good mathematician. \
You are great at answering math questions. \
You are so good because you are able to break down \
hard problems into their component parts,
answer the component parts, and then put them together \
to answer the broader question.

Here is a question:
{input}"""

history_template = """You are a very good historian. \
You have an excellent knowledge of and understanding of people, \
events and contexts from a range of historical periods. \
You have the ability to think, reflect, debate, discuss and \
evaluate the past. You have a respect for historical evidence \
and the ability to make use of it to support your explanations \
and judgements.

Here is a question:
{input}"""


computerscience_template = """You are a successful computer scientist. \
You have a passion for creativity, collaboration, \
forward-thinking, confidence, strong problem-solving capabilities, \
understanding of theories and algorithms, and excellent communication \
skills. You are great at answering coding questions. \
You are so good because you know how to solve a problem by \
describing the solution in imperative steps \
that a machine can easily interpret and you know how to \
choose a solution that has a good balance between \
time complexity and space complexity.

Here is a question:
{input}"""

# Routing table: one entry per destination prompt, holding the name and
# description shown to the router plus the template used to answer.
prompt_infos = [
    {
        "name": subject,
        "description": blurb,
        "prompt_template": template,
    }
    for subject, blurb, template in (
        ("physics", "Good for answering questions about physics", physics_template),
        ("math", "Good for answering math questions", math_template),
        ("History", "Good for answering history questions", history_template),
        (
            "computer science",
            "Good for answering computer science questions",
            computerscience_template,
        ),
    )
]

# Router prompt. It is formatted twice: first with str.format to insert
# {destinations} (which turns "{{{{" into "{{" and "{{input}}" into "{input}"),
# then as a prompt template whose only variable is {input}.
# Continued lines end with a space before the backslash so words are not fused
# ("revisingit", "notwell"); the literal backslashes in the JSON sketch are
# written as "\\" because "\ " is not a valid escape sequence.
MULTI_PROMPT_ROUTER_TEMPLATE = """Given a raw text input to a \
language model select the model prompt best suited for the input. \
You will be given the names of the available prompts and a \
description of what the prompt is best suited for. \
You may also revise the original input if you think that revising \
it will ultimately lead to a better response from the language model.

<< FORMATTING >>
Return a markdown code snippet with a JSON object formatted to look like:
```json
{{{{
    "destination": string \\ name of the prompt to use or "DEFAULT"
    "next_inputs": string \\ a potentially modified version of the original input
}}}}
```

REMEMBER: "destination" MUST be one of the candidate prompt \
names specified below OR it can be "DEFAULT" if the input is not \
well suited for any of the candidate prompts.
REMEMBER: "next_inputs" can just be the original input \
if you don't think any modifications are needed.

<< CANDIDATE PROMPTS >>
{destinations}

<< INPUT >>
{{input}}

<< OUTPUT (remember to include the ```json)>>"""

In [None]:
from langchain.chains.router import MultiPromptChain
from langchain.chains.router.llm_router import LLMRouterChain, RouterOutputParser
from langchain.prompts import PromptTemplate

In [None]:
llm = ChatOpenAI(temperature=0, model=llm_model)

# One LLMChain per destination, keyed by the prompt's name.
# Built with a comprehension so that no loop temporaries overwrite the
# notebook-level `prompt_template` and `prompt` defined by earlier cells.
destination_chains = {
    p_info["name"]: LLMChain(
        llm=llm,
        prompt=ChatPromptTemplate.from_template(template=p_info["prompt_template"]),
    )
    for p_info in prompt_infos
}

# "name: description" lines inserted into the router prompt.
destinations = [f"{p['name']}: {p['description']}" for p in prompt_infos]
destinations_str = "\n".join(destinations)

# Chain used as the router's default_chain; it passes the input through as-is.
default_prompt = ChatPromptTemplate.from_template("{input}")
default_chain = LLMChain(llm=llm, prompt=default_prompt)

# First formatting pass fills {destinations}; {input} stays as a placeholder
# for the PromptTemplate below.
router_template = MULTI_PROMPT_ROUTER_TEMPLATE.format(
    destinations=destinations_str
)
router_prompt = PromptTemplate(
    template=router_template,
    input_variables=["input"],
    output_parser=RouterOutputParser(),
)

router_chain = LLMRouterChain.from_llm(llm, router_prompt)

chain = MultiPromptChain(
    router_chain=router_chain,
    destination_chains=destination_chains,
    default_chain=default_chain,
    verbose=True,
)

In [None]:
# Send several questions through the router. Each answer is printed
# explicitly: a notebook cell only displays the value of its last expression,
# so bare chain.run(...) calls would silently drop all but the final answer.
for question in (
    "What is black body radiation?",
    "what is 2 + 2",
    "Why does every cell in our body contain DNA?",
):
    print(chain.run(question))


### QnA
### Evaluation
### Agents

In [None]:
# LLMs are 'stateless': each transaction is independent.
# Chatbots appear to have memory because the full conversation is provided as 'context'.

# LangChain provides several kinds of 'memory' to store and accumulate the conversation.

# Agents with tables
from langchain.agents import create_csv_agent
# from langchain.agents import create_pandas_dataframe_agent  # might be the same
# from langchain.llms import OpenAI  # without llms

# # LangChain agents
# llm = OpenAI(temperature=0)
# create_csv_agent takes the CSV path as its second argument; calling it with
# only the model raises a TypeError.
# NOTE(review): "Data.csv" is the CSV read earlier in this notebook - point
# this at whichever table the agent is actually meant to work on.
agent = create_csv_agent(chat, "Data.csv", verbose=True)
# agent = create_pandas_dataframe_agent(llm, df, verbose=True)
# prompt = "Make the values more readable, store the result into 'listening.csv'"
agent.run(customer_messages[0].content)

In [None]:
# Dealing with tables
import pandas as pd
import utils

# Getting csv + df
# Fetch the requested worksheet (the first entry of the helper's second
# return value), write it to "in.csv" without the index column, and read it
# back with pandas' default NaN markers disabled.
_, ws_dfs = utils.get_worksheets(
    "Korea - Vocabulary (raw)",
    ("listening occurences",),
)
ws_df = ws_dfs[0]
ws_df.to_csv("in.csv", index=False)
csv_df = pd.read_csv("in.csv", keep_default_na=False)

In [None]:
# Merging occurrences: collapse all rows that share a Word into one row, keep
# the rows ordered by the first position at which each word appeared, and
# write the result to 'listening.csv'.
def word_agg_f(x):
    """Join the non-null values of a group into one space-separated string."""
    return ' '.join(str(val) for val in x if pd.notnull(val))


# reset_index() exposes the original row position as an "index" column.
df = csv_df.reset_index()
grouped_df = df.groupby(['Word']).agg(word_agg_f)

# After aggregation "index" holds e.g. "3 17 42"; keep the first number only.
grouped_df["index"] = grouped_df["index"].str.split().str[0].astype(int)

sorted_df = grouped_df.sort_values(by='index').drop(columns=['index'])
sorted_df.to_csv('listening.csv')

In [None]:
# Agents with tables
from langchain.agents import create_csv_agent
from langchain.agents import create_pandas_dataframe_agent  # might be the same
from langchain.llms import OpenAI  # without llms

# # LangChain agents
# llm = OpenAI(temperature=0)
# # agent = create_csv_agent(llm, filepath, verbose=True)
# agent = create_pandas_dataframe_agent(llm, df, verbose=True)
# prompt = "Make the values more readable, store the result into 'listening.csv'"
# agent.run(prompt)

In [None]:
# RetrievalQA (not for me for now)
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.chains import RetrievalQA

In [None]:
# Load "out.csv" through the CSV loader, index it in an in-memory vector
# store, and wrap the store's retriever in a "stuff"-type QA chain whose
# input key is "question".
loader = CSVLoader(file_path="out.csv", encoding="utf-8")
data = loader.load()

# vectorstore_cls is passed explicitly; per the original note, leaving the
# parameter out gives an error.
index_creator = VectorstoreIndexCreator(vectorstore_cls=DocArrayInMemorySearch)
docsearch = index_creator.from_loaders([loader])

chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=docsearch.vectorstore.as_retriever(),
    input_key="question",
)

def ask_question(query):
    """Run `query` through the notebook-level `chain` and return its answer.

    The query is passed under the "question" key and the "result" entry of
    the chain's output is returned.
    """
    return chain({"question": query})['result']

In [None]:
# Two sample questions; only the second one is sent to the chain.
query, query2 = (
    "Get me country with the highest GDP.",
    "Create 'out22.csv', where will be 3 countries with the biggest happiness index",
)
print(ask_question(query2))

In [None]:
# Pandasai library - not publicly common, use langchain instead
from pandasai import PandasAI
from pandasai.llm.openai import OpenAI

# pandasai
# The API key is read from the OPENAI_API_KEY environment variable (the same
# one used for the openai client at the top of the notebook) instead of being
# hardcoded. The key that used to be embedded here was committed in plain
# text and should be treated as leaked and revoked.
llm = OpenAI(api_token=os.getenv("OPENAI_API_KEY"))
pandas_ai = PandasAI(llm, verbose=True)

In [None]:
import pandas as pd

# Sample DataFrame: one row per country with its GDP and happiness index.
df = pd.DataFrame(
    [
        ("United States", 19294482071552, 6.94),
        ("United Kingdom", 2891615567872, 7.16),
        ("France", 2411255037952, 6.66),
        ("Germany", 3435817336832, 7.07),
        ("Italy", 1745433788416, 6.38),
        ("Spain", 1181205135360, 6.4),
        ("Canada", 1607402389504, 7.23),
        ("Australia", 1490967855104, 7.22),
        ("Japan", 4380756541440, 5.87),
        ("China", 14631844184064, 5.12),
    ],
    columns=["country", "gdp", "happiness_index"],
)

# Ask a natural-language question about the sample frame.
pandas_ai.run(
    df,
    prompt="Which are the 5 happiest countries?",
)