In [1]:
from llama_index.core.query_pipeline import (
    QueryPipeline as QP,
    Link,
    InputComponent,
)
from llama_index.core.query_engine.pandas import PandasInstructionParser
#from llama_index.llms.openai import OpenAI
from llama_index.core import PromptTemplate
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings
# Set the default LLM on llama_index's global Settings object to the
# Ollama-served "llama2" model. The pipeline further down is wired to a
# separately constructed `llm`, so this only affects components that fall
# back to the global default.
Settings.llm = Ollama(
    request_timeout=30.0,
    model="llama2",
)

In [2]:
import pandas as pd

# Read the Titanic training CSV that sits next to this notebook into the
# DataFrame `df`; the prompts and the output parser below both refer to it
# by that name.
df = pd.read_csv(filepath_or_buffer="./titanic_train.csv")

In [12]:
# Numbered rules given to the code-generating LLM. Each rule is terminated
# by a newline, so the resulting string ends with "\n".
instruction_str = "".join(
    rule + "\n"
    for rule in (
        "1. Convert the query to executable Python code using Pandas.",
        "2. The final line of code should be a Python expression that can be called with the `eval()` function.",
        "3. The code should represent a solution to the query.",
        "4. PRINT ONLY THE EXPRESSION.",
        "5. Do not quote the expression.",
    )
)

# Template for the first LLM call (text-to-pandas). Placeholders:
#   {df_str}          - preview of the dataframe
#   {instruction_str} - the numbered rules for the generated code
#   {query_str}       - the user's natural-language question
# Empty entries below produce the blank lines of the prompt; there is no
# trailing newline after "Expression:".
pandas_prompt_str = "\n".join(
    [
        "You are working with a pandas dataframe in Python.",
        "The name of the dataframe is `df`.",
        "This is the result of `print(df.head())`:",
        "{df_str}",
        "",
        "Follow these instructions:",
        "{instruction_str}",
        "Query: {query_str}",
        "",
        "Expression:",
    ]
)
# Template for the second LLM call (answer synthesis). Placeholders:
#   {query_str}           - the original question
#   {pandas_instructions} - the expression produced by the first LLM call
#   {pandas_output}       - the result of running that expression
# The final entry keeps its trailing space so the prompt ends in "Response: ".
response_synthesis_prompt_str = "\n".join(
    [
        "Given an input question, synthesize a response from the query results.",
        "Query: {query_str}",
        "",
        "Pandas Instructions (optional):",
        "{pandas_instructions}",
        "",
        "Pandas Output: {pandas_output}",
        "",
        "Response: ",
    ]
)

# LLM instance used for both LLM steps of the pipeline. This is a separate
# object (model "codellama") from the "llama2" default stored on Settings.llm.
llm = Ollama(model="codellama", request_timeout=30.0)

# Prompt for the answer-synthesis step; all of its variables are supplied
# through pipeline links at run time.
response_synthesis_prompt = PromptTemplate(response_synthesis_prompt_str)

# Parser for the first LLM's output, bound to `df`.
# NOTE(review): judging by the recorded runs, this executes the generated
# expression against `df` — i.e. it runs model-written code. Confirm the
# parser's sandboxing before pointing this at untrusted queries.
pandas_output_parser = PandasInstructionParser(df)

# Prompt for the text-to-pandas step. The rules and the first five rows of
# `df` are bound up front, leaving only {query_str} to be filled per run.
pandas_prompt = PromptTemplate(pandas_prompt_str).partial_format(
    df_str=df.head(5),
    instruction_str=instruction_str,
)

In [14]:
# Assemble the two-stage query pipeline:
#   input -> pandas_prompt -> llm1 -> pandas_output_parser
#   (input, llm1, pandas_output_parser) -> response_synthesis_prompt -> llm2
qp = QP(
    verbose=True,
    modules={
        # Entry point carrying the user's query_str.
        "input": InputComponent(),
        # Stage 1: turn the question into a pandas expression and run it.
        "pandas_prompt": pandas_prompt,
        "llm1": llm,
        "pandas_output_parser": pandas_output_parser,
        # Stage 2: phrase the computed result as an answer.
        "response_synthesis_prompt": response_synthesis_prompt,
        "llm2": llm,
    },
)

# Linear chain for stage 1.
qp.add_chain(["input", "pandas_prompt", "llm1", "pandas_output_parser"])

# Fan-in: each (source module, template variable) pair feeds one placeholder
# of the response synthesis prompt.
qp.add_links(
    [
        Link(src, "response_synthesis_prompt", dest_key=key)
        for src, key in (
            ("input", "query_str"),
            ("llm1", "pandas_instructions"),
            ("pandas_output_parser", "pandas_output"),
        )
    ]
)

# The filled-in synthesis prompt goes to the second LLM step.
qp.add_link("response_synthesis_prompt", "llm2")


In [15]:
# Run the full pipeline on a numeric question; `verbose=True` on the pipeline
# prints each module's input below.
response = qp.run(query_str="What is the correlation between survival and age?")

[1;3;38;2;155;135;227m> Running module input with input: 
query_str: What is the correlation between survival and age?

[0m[1;3;38;2;155;135;227m> Running module pandas_prompt with input: 
query_str: What is the correlation between survival and age?

[0m[1;3;38;2;155;135;227m> Running module llm1 with input: 
messages: You are working with a pandas dataframe in Python.
The name of the dataframe is `df`.
This is the result of `print(df.head())`:
   survived  pclass                                               name  ...

[0m[1;3;38;2;155;135;227m> Running module pandas_output_parser with input: 
input: assistant: df['survived'].corr(df['age'])

[0m[1;3;38;2;155;135;227m> Running module response_synthesis_prompt with input: 
query_str: What is the correlation between survival and age?
pandas_instructions: assistant: df['survived'].corr(df['age'])
pandas_output: -0.07722109457217755

[0m[1;3;38;2;155;135;227m> Running module llm2 with input: 
messages: Given an input question, 

In [16]:
# Print only the text content of the message on the pipeline response
# (print rather than a bare expression so newlines render as-is).
print(response.message.content)


The correlation between survival and age is negative. In other words, as someone gets older, their chances of survival decrease. The exact magnitude of this correlation can be seen in the output.


In [20]:
# Second run: a tabular question. Rebinds `response`, replacing the result of
# the previous run.
response = qp.run(query_str="Can you create a confusion matrix about survivors and their economic status?")

[1;3;38;2;155;135;227m> Running module input with input: 
query_str: Can you create a confusion matrix about survivors and their economic status?

[0m[1;3;38;2;155;135;227m> Running module pandas_prompt with input: 
query_str: Can you create a confusion matrix about survivors and their economic status?

[0m[1;3;38;2;155;135;227m> Running module llm1 with input: 
messages: You are working with a pandas dataframe in Python.
The name of the dataframe is `df`.
This is the result of `print(df.head())`:
   survived  pclass                                               name  ...

[0m[1;3;38;2;155;135;227m> Running module pandas_output_parser with input: 
input: assistant: `df.pivot_table(index='survived', columns='pclass', values='name', aggfunc=len)`

[0m[1;3;38;2;155;135;227m> Running module response_synthesis_prompt with input: 
query_str: Can you create a confusion matrix about survivors and their economic status?
pandas_instructions: assistant: `df.pivot_table(index='survived', 