In [1]:
import os

os.environ["OPENAI_API_KEY"] = ""
os.environ["LANGCHAIN_API_KEY"] = ""
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = ""

In [2]:
! pip install langchain_openai



In [3]:
import pprint
from typing import Any, Dict

import pandas as pd
from langchain.output_parsers import PandasDataFrameOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

In [4]:
model = ChatOpenAI(model_name="gpt-3.5-turbo")

In [5]:
# 출력 목적으로만 사용됩니다.
def format_parser_output(parser_output: Dict[str, Any]) -> None:
    for key in parser_output.keys():
        parser_output[key] = parser_output[key].to_dict()
    return pprint.PrettyPrinter(width=4, compact=True).pprint(parser_output)

In [6]:
import seaborn as sns

df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [7]:
parser = PandasDataFrameOutputParser(dataframe=df)

In [8]:
print(parser.get_format_instructions())

The output should be formatted as a string as the operation, followed by a colon, followed by the column or row to be queried on, followed by optional array parameters.
1. The column names are limited to the possible columns below.
2. Arrays must either be a comma-separated list of numbers formatted as [1,3,5], or it must be in range of numbers formatted as [0..4].
3. Remember that arrays are optional and not necessarily required.
4. If the column is not in the possible columns or the operation is not a valid Pandas DataFrame operation, return why it is invalid as a sentence starting with either "Invalid column" or "Invalid operation".

As an example, for the formats:
1. String "column:num_legs" is a well-formatted instance which gets the column num_legs, where num_legs is a possible column.
2. String "row:1" is a well-formatted instance which gets row 1.
3. String "column:num_legs[1,2]" is a well-formatted instance which gets the column num_legs for rows 1 and 2, where num_legs is a p

In [12]:
# 프롬프트 템플릿을 설정합니다.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],  # 입력 변수 설정
    partial_variables={
        "format_instructions": parser.get_format_instructions()
    },  # 부분 변수 설정
)

In [13]:
chain = prompt | model | parser

In [22]:
df_query = "데이터프레임에서 'age' 열의 데이터를 출력하세요."

parser_output = chain.invoke({"query": df_query})

In [23]:
parser_output

{'age': 0      22.0
 1      38.0
 2      26.0
 3      35.0
 4      35.0
        ... 
 886    27.0
 887    19.0
 888     NaN
 889    26.0
 890    32.0
 Name: age, Length: 891, dtype: float64}

In [25]:
df["age"].head().mean()

31.2

In [26]:
df_query = "Retrieve the average of the Ages from row 0 to 4."
parser_output = chain.invoke({"query": df_query})
print(parser_output)

{'mean': 31.2}


### 숙제

- 3등칸 승객중 40세 이상인 여성승객의 평균 나이

In [41]:
prompt = PromptTemplate(
    template="""
    사용자 질의에 대한 답변을 PandasDataFrameOutputParser가 파싱할 수 있는 유효한 DataFrame JSON 형식으로 제공하세요.
    {format_instructions}
    {query}
    """,
    input_variables=["query"],
    partial_variables={
        "format_instructions": parser.get_format_instructions()
    },
)

df_query = """
3등 칸 승객 중 40세 이상인 여성 승객의 평균 나이를 구해줘.
결과를 DataFrame JSON으로 반환해줘(칼럼 이름은 "age"로 해줘).
"""

parser_output = chain.invoke({"query": df_query})
print(parser_output)

{'mean': 26.0}
