In [1]:
from typing_extensions import TypedDict
from typing import List
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate

from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser
from langchain_groq import ChatGroq

In [2]:
def load_data(uploaded_file):
    """Read a CSV source into a pandas DataFrame.

    Args:
        uploaded_file: The CSV source, handed unchanged to ``pd.read_csv``
            (the notebook passes a path string).

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(uploaded_file)

# Load the tips dataset from the working directory; the bare `df` on the last
# line makes the notebook render the frame as the cell output.
df = load_data("tips.csv")
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
# Single chat-model client shared by every node of the graph.
# temperature=0 asks for the least random sampling the backend offers.
# NOTE(review): confirm that this model id is still served by Groq.
GROQ_LLM = ChatGroq(model="llama3-70b-8192", temperature=0)

In [4]:
# Take the second record (positional row 1) as the sample the agent will compare
# against the whole dataset. Columns are addressed by position; for the tips
# data loaded earlier in this notebook they resolve to tip, total_bill and day.
x = df.iloc[1][df.columns[1]]
y = df.iloc[1][df.columns[0]]
color = df.iloc[1][df.columns[4]]

# Echo the sample as "label: value" pairs (the last pair previously used a
# comma instead of a colon, printing "day, Sun").
print(f"{df.columns[1]}: {x}, {df.columns[0]}: {y}, {df.columns[4]}: {color}")

tip: 1.66, total_bill: 10.34, day, Sun


In [6]:
### State

class GraphState(TypedDict):
    """
    State shared by the nodes of the graph.

    Attributes:
        df: The pandas DataFrame the analysis is run against.
        input_data: Text describing the record to compare with ``df``.
        num_steps: Counter that every node increments by one.
        questions: Question strings produced by ``generate_questions``
            (a list, as returned by ``extract_questions``).
        answers: Answer text produced by ``questions_answering``.
        summary: Summary text produced by ``summarize_answers``.
    """
    df: pd.DataFrame
    input_data: str
    num_steps: int
    questions: List[str]
    answers: str
    summary: str

In [7]:
import re

def extract_questions(text):
    """Split raw model output into a list of individual questions.

    A leading preamble such as "Here are some questions:" is removed by
    cutting at the first colon, but only when no question mark occurs before
    that colon. If a question already appears before the colon, the colon is
    not a preamble separator and cutting there would silently discard the
    earlier questions, so the text is kept whole.

    Args:
        text: Raw text returned by the model.

    Returns:
        list[str]: The whitespace-stripped fragments that end with ``?``, in
        order of appearance; empty when there is no such fragment. List
        numbering such as ``1.`` is left in place.
    """
    head, colon, tail = text.partition(':')
    body = tail if colon and '?' not in head else text

    # Cut right after each '?' that is followed (possibly across whitespace)
    # by an uppercase letter or a digit, i.e. the start of the next question
    # or of the next numbered item.
    fragments = re.split(r'(?<=\?)\s*(?=[A-Z0-9])', body)

    # Anything that does not end in '?' (stray remarks, closing text) is dropped.
    return [frag.strip() for frag in fragments if frag.strip().endswith('?')]

# first node
def generate_questions(state):
    """Ask the model for questions relating the input record to the dataframe.

    Reads ``df``, ``input_data`` and ``num_steps`` from ``state``, sends them
    to ``GROQ_LLM`` and parses the reply with ``extract_questions``.

    Args:
        state: Graph state mapping; must contain ``df``, ``input_data`` and
            ``num_steps``.

    Returns:
        dict: Partial state update with ``questions`` (list of question
        strings) and ``num_steps`` incremented by one.
    """
    print("---GENERATING THE QUESTIONS---")
    df = state["df"]
    input_data = state["input_data"]
    num_steps = int(state["num_steps"]) + 1

    # GROQ_LLM is a chat model, so the system and user turns are expressed as
    # chat messages. Hand-written Llama 3 header tokens inside a plain string
    # prompt would reach the model as literal text within one user message
    # rather than as a real system instruction.
    # NOTE(review): `{df}` is filled with the frame's string form, which pandas
    # abbreviates for long frames; the model probably sees only a sample of the
    # rows. Confirm whether a fuller or statistical rendering is wanted.
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are a Data Analyst Agent that is an expert on making insightful questions.",
            ),
            (
                "human",
                "Given the following dataframe: {df}\n\n"
                "Write some questions that relate this values to the whole dataframe, "
                "no preamble or explanation:\n"
                "\n{input_data}\n",
            ),
        ]
    )

    questions_generator = prompt | GROQ_LLM | StrOutputParser()
    raw_questions = questions_generator.invoke({"df": df, "input_data": input_data})

    # Turn the free-text reply into a list of individual question strings.
    questions = extract_questions(raw_questions)

    return {"questions": questions, "num_steps": num_steps}

In [8]:
def questions_answering(state):
    """Have the model answer the generated questions about the input record.

    Reads ``df``, ``input_data``, ``questions`` and ``num_steps`` from
    ``state`` and sends them to ``GROQ_LLM``.

    Args:
        state: Graph state mapping; must contain ``df``, ``input_data``,
            ``questions`` and ``num_steps``.

    Returns:
        dict: Partial state update with ``answers`` (the model's reply text)
        and ``num_steps`` incremented by one.
    """
    print("---ANSWERING QUESTIONS---")
    df = state["df"]
    input_data = state["input_data"]
    questions = state["questions"]
    num_steps = int(state["num_steps"]) + 1

    # Show the questions one per line; interpolating the list directly would
    # put its Python repr (brackets, quotes, escapes) into the prompt.
    if isinstance(questions, (list, tuple)):
        questions_text = "\n".join(str(question) for question in questions)
    else:
        questions_text = str(questions)

    # GROQ_LLM is a chat model, so the system and user turns are expressed as
    # chat messages. Hand-written Llama 3 header tokens inside a plain string
    # prompt would reach the model as literal text within one user message
    # rather than as a real system instruction.
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are a Data Analyst expert that is able to find meaningful insights "
                "answering the user questions about data.",
            ),
            (
                "human",
                "DATAFRAME: {df}\n\n"
                "INPUT DATA: {input_data}\n\n"
                "QUESTIONS: {questions}\n\n"
                "Answer each questions giving a proper analysis and explanation on how "
                "the INPUT DATA compares to the whole DATAFRAME, giving a strong focues "
                "on the INPUT DATA.",
            ),
        ]
    )

    answers_generator = prompt | GROQ_LLM | StrOutputParser()
    answers = answers_generator.invoke(
        {"df": df, "input_data": input_data, "questions": questions_text}
    )

    return {"answers": answers, "num_steps": num_steps}

In [9]:
def summarize_answers(state):
    """Condense the answers into a single summary paragraph.

    Reads ``answers`` and ``num_steps`` from ``state`` and sends the answers
    to ``GROQ_LLM`` with an instruction to summarize them; the system message
    also asks for translation to Italian.

    Args:
        state: Graph state mapping; must contain ``answers`` and
            ``num_steps``.

    Returns:
        dict: Partial state update with ``summary`` (the model's reply text)
        and ``num_steps`` incremented by one.
    """
    answers = state["answers"]
    num_steps = int(state["num_steps"]) + 1

    # GROQ_LLM is a chat model, so the system and user turns are expressed as
    # chat messages. Hand-written Llama 3 header tokens inside a plain string
    # prompt would reach the model as literal text within one user message
    # rather than as a real system instruction.
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are a Data Analyst expert that is a master on summarize informations "
                "and translate it to italian language.",
            ),
            (
                "human",
                "ANSWERS: {answers}\n\n"
                "Summarize the whole answers in a very discorsive single paragraph "
                "without adding any preamble or introduction.\n"
                "Just answer with the summary.",
            ),
        ]
    )

    summary_generator = prompt | GROQ_LLM | StrOutputParser()
    summary = summary_generator.invoke({"answers": answers})

    return {"summary": summary, "num_steps": num_steps}

In [10]:
from langgraph.graph import StateGraph, END

# Pipeline definition: a state graph whose nodes exchange a GraphState.
workflow = StateGraph(GraphState)

# Register each processing step under the name used for routing.
workflow.add_node("generate_questions", generate_questions)
workflow.add_node("questions_answering", questions_answering)
workflow.add_node("summarize_answers", summarize_answers)

# Execution starts at question generation ...
workflow.set_entry_point("generate_questions")

# ... then flows linearly: questions -> answers -> summary -> END.
workflow.add_edge("generate_questions", "questions_answering")
workflow.add_edge("questions_answering", "summarize_answers")
workflow.add_edge("summarize_answers", END)

# Compile the definition into the runnable `app`.
app = workflow.compile()

In [11]:
# Build the initial state for the graph: the dataset, the sample record written
# out as a bullet list, and a step counter starting at zero.
# `x`, `y` and `color` are the sample values extracted in an earlier cell.
df = load_data("tips.csv")
INPUT_DATA = f"""
- {df.columns[1]}: {x}\n
- {df.columns[0]}: {y}\n
- {df.columns[4]}: {color}
"""
inputs = {"df": df, "input_data": INPUT_DATA, "num_steps": 0}

# Alternative: stream the run to print each node's output as it finishes.
# for output in app.stream(inputs):
#     for key, value in output.items():
#         print(f"Finished running\n {key}:{value}")

# Run the whole pipeline once and show only the final summary text.
res = app.invoke(inputs)
print(res["summary"])


---GENERATING THE QUESTIONS---
---ANSWERING QUESTIONS---
La media delle mance lasciate dalle donne di domenica è di circa 2,73, mentre c'è una moderata correlazione positiva tra il conto totale e l'importo della mancia. Il sabato è il giorno con la spesa media più alta, di circa 25,15. Le donne di domenica tendono a frequentare ristoranti in coppia, mentre i fumatori lasciano mance più basse rispetto ai non fumatori. Gli uomini che pranzano da soli hanno una spesa media di circa 20,51. C'è una differenza significativa nella spesa media tra sabato e domenica, con una spesa media più alta di sabato. Il 34,15% dei clienti sono donne che non fumano, mentre la mediana delle mance per la cena di giovedì è di circa 2,50.
