In [21]:
from typing import TypedDict, Annotated
from langchain_groq import ChatGroq
from langgraph.graph import StateGraph, END
from dotenv import load_dotenv
from langchain_community.cache import SQLiteCache
from langchain.globals import set_llm_cache
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
import json
import ast

load_dotenv("env")
set_llm_cache(SQLiteCache(database_path=".cache_langchain.db"))

llm_name = "llama-3.1-8b-instant"
llm_generator = ChatGroq(cache=True, temperature=0.0, model_name=llm_name)

# State
class State(TypedDict):
    input: str
    is_python_code: bool
    evaluation: dict
    unit_test_result: bool

# Nodes
def check_python_code(state: State) -> State:
    try:
        ast.parse(state["input"])
        return {"is_python_code": True}
    except SyntaxError:
        return {"is_python_code": False}

def evaluate_code(state: State) -> State:
    prompt = prompt_evaluate_code()
    result = llm_generator.invoke(prompt.format(input=state["input"]))
    try:
        evaluation = json.loads(result.content)
        return {"evaluation": evaluation}
    except json.JSONDecodeError:
        return {"evaluation": {"difficulty": "error", "explanation": "Failed to parse LLM output"}}

def unit_testing(state: State) -> State:
    expected_keys = ["difficulty", "explanation"]
    difficulty_values = ["easy", "medium", "hard"]
    
    evaluation = state["evaluation"]
    
    if all(key in evaluation for key in expected_keys) and evaluation["difficulty"] in difficulty_values:
        return {"unit_test_result": True}
    else:
        return {"unit_test_result": False}

# Edges
workflow = StateGraph(State)

workflow.add_node("check_python_code", check_python_code)
workflow.add_node("evaluate_code", evaluate_code)
workflow.add_node("unit_testing", unit_testing)

workflow.set_entry_point("check_python_code")
workflow.add_edge("check_python_code", "evaluate_code")
workflow.add_edge("evaluate_code", "unit_testing")

# Conditional edges
def decide_next_step(state: State):
    if not state.get("is_python_code", True):
        return END
    if "unit_test_result" in state:
        return END if state["unit_test_result"] else "evaluate_code"
    return "evaluate_code"

workflow.add_conditional_edges("check_python_code", decide_next_step)
workflow.add_conditional_edges("unit_testing", decide_next_step)

# Compile the graph
app = workflow.compile()

# Function that returns prompt
def prompt_evaluate_code() -> ChatPromptTemplate:
    system_prompt = """
    You are a highly capable language model specialized in evaluating Python code in terms of ease of modification and maintainability. Your task is to classify given Python code snippets as either "easy", "medium", or "hard" to update, based on the complexity, structure, and best practices present in the code. Please follow the rules below:
    - The input will always be a Python code snippet.
    - The output must be a JSON object containing a "difficulty" value and an "explanation".
    - The difficulty can be "easy", "medium", or "hard".
    - The explanation must provide the reasoning behind your classification in terms of maintainability and ease of modification.
    - The output format must be as follows:
    ```
    {{
      "difficulty": "[easy|medium|hard]",
      "explanation": "[Put the explanation here]"
    }}
    ```
    ### Criteria for Classification:
    1. **Easy**: The code follows good practices, has modular functions, meaningful variable names, and is overall simple to understand and modify.
    2. **Medium**: The code may have some issues, such as lacking comments or using slightly unclear variable names, making it somewhat challenging to modify. It may also use complex logic that can be understood with effort.
    3. **Hard**: The code has poor practices such as deeply nested loops, poorly named variables, lack of modularity, or complex interdependencies, making it difficult to modify.

    Only output the json. No explanations.
    """

    example_prompt = ChatPromptTemplate.from_messages(
        [
            ("human", "{input}"),
            ("ai", "{output}"),
        ]
    )

    examples = [
        {
            "input": """
def calculate_sum(numbers):
    "Calculate the sum of a list of numbers."
    return sum(numbers)
print(calculate_sum([1, 2, 3, 4, 5]))
""",
            "output": """
{{
  "difficulty": "easy",
  "explanation": "The code is simple and modular, with a well-named function and parameter. It follows good practices, such as documentation and minimal complexity."
}}
"""
        },
        {
            "input": """
def process_data(data):
    result = []
    for d in data:
        if d % 2 == 0:
            result.append(d * 2)
    return result
print(process_data([1, 2, 3, 4, 5]))
""",
            "output": """
{{
  "difficulty": "medium",
  "explanation": "The code lacks comments, and the purpose of processing is not immediately clear. Variable names are generic, and the loop has conditional logic that requires some thought to understand."
}}
"""
        },
        {
            "input": """
def func(a, b):
    r = []
    for i in range(len(a)):
        for j in range(len(b)):
            if a[i] == b[j]:
                r.append((i, j))
                for k in range(5):
                    r.append(k * (i + j))
    return r
print(func([1, 2, 3], [3, 2, 1]))
""",
            "output": """
{{
  "difficulty": "hard",
  "explanation": "The code has nested loops with poor variable naming, making it difficult to understand and maintain. Additionally, there is a deeply nested inner loop with unclear logic, leading to high cognitive load and making the code challenging to update."
}}
"""
        }
    ]

    few_shot_prompt = FewShotChatMessagePromptTemplate(
        example_prompt=example_prompt,
        examples=examples,
    )

    final_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            few_shot_prompt,
            ("human", "{input}"),
        ]
    )
    return final_prompt

# Execution
def evaluate_code_maintainability(code: str):
    initial_state = {"input": code}
    for output in app.stream(initial_state):
        print(output)
        if END in output:
            if not output[END].get("is_python_code", True):
                return "No Python code. Try again."
            return output[END].get("evaluation", "Failed to evaluate code.")

# Example usage
code_snippet = """
import os
"""

result = evaluate_code_maintainability(code_snippet)
print(json.dumps(result, indent=2))

# Test with non-Python input
non_python_input = "This is not Python code."
result = evaluate_code_maintainability(non_python_input)
print(result)

{'check_python_code': {'is_python_code': True}}
{'evaluate_code': {'evaluation': {'difficulty': 'hard', 'explanation': 'The code snippet appears to be incomplete, as it only imports the os module without any further functionality. This lack of context makes it difficult to assess the maintainability and ease of modification of the code.'}}}
{'unit_testing': {'unit_test_result': True}}
null
{'check_python_code': {'is_python_code': False}}
{'evaluate_code': {'evaluation': {'difficulty': 'error', 'explanation': 'Failed to parse LLM output'}}}
{'unit_testing': {'unit_test_result': False}}
None


## Langfuse

In [2]:
from typing import Annotated
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage
from typing_extensions import TypedDict
 
from langgraph.graph import StateGraph
from langgraph.graph.message import add_messages
 
class State(TypedDict):
    # Messages have the type "list". The `add_messages` function in the annotation defines how this state key should be updated
    # (in this case, it appends messages to the list, rather than overwriting them)
    messages: Annotated[list, add_messages]
 
graph_builder = StateGraph(State)
 
load_dotenv("env")
# set_llm_cache(SQLiteCache(database_path=".cache_langchain.db"))

llm_name = "llama-3.1-8b-instant"
llm_generator = ChatGroq(cache=False, temperature=0.0, model_name=llm_name)

 
# The chatbot node function takes the current State as input and returns an updated messages list. This is the basic pattern for all LangGraph node functions.
def chatbot(state: State):
    return {"messages": [llm_generator.invoke(state["messages"])]}
 
# Add a "chatbot" node. Nodes represent units of work. They are typically regular python functions.
graph_builder.add_node("chatbot", chatbot)
 
# Add an entry point. This tells our graph where to start its work each time we run it.
graph_builder.set_entry_point("chatbot")
 
# Set a finish point. This instructs the graph "any time this node is run, you can exit."
graph_builder.set_finish_point("chatbot")
 
# To be able to run our graph, call "compile()" on the graph builder. This creates a "CompiledGraph" we can use invoke on our state.
graph = graph_builder.compile()

In [3]:
from langfuse.callback import CallbackHandler
 
# Initialize Langfuse CallbackHandler for Langchain (tracing)
langfuse_handler = CallbackHandler()
 
for s in graph.stream({"messages": [HumanMessage(content = "What is Langfuse?")]},
                      config={"callbacks": [langfuse_handler]}):
    print(s)

{'chatbot': {'messages': [AIMessage(content='I couldn\'t find any information on "Langfuse." It\'s possible that it\'s a lesser-known or obscure term, or it could be a misspelling or variation of a different term. If you could provide more context or clarify what you\'re referring to, I\'d be happy to try and help you further.', response_metadata={'token_usage': {'completion_time': 0.086666667, 'completion_tokens': 65, 'prompt_time': 0.009526751, 'prompt_tokens': 40, 'queue_time': 0.951697046, 'total_time': 0.096193418, 'total_tokens': 105}, 'model_name': 'llama-3.1-8b-instant', 'system_fingerprint': 'fp_9cb648b966', 'finish_reason': 'stop', 'logprobs': None}, id='run-c55f192a-269b-4ffa-8650-5c0c0db7f438-0', usage_metadata={'input_tokens': 40, 'output_tokens': 65, 'total_tokens': 105})]}}


In [4]:
langfuse_handler

<langfuse.callback.langchain.LangchainCallbackHandler at 0x7f469d6db0d0>

In [5]:
from langfuse import Langfuse
langfuse = Langfuse()
traces = langfuse.fetch_traces()
traces

FetchTracesResponse(data=[TraceWithDetails(id='773784c8-2ee5-4056-9741-e8d93218ed44', timestamp=datetime.datetime(2024, 10, 8, 1, 25, 57, 569000, tzinfo=datetime.timezone.utc), name='LangGraph', input={'messages': [{'content': 'What is Langfuse?'}]}, output={'messages': [{'id': '9023e6e0-4d94-45ea-802e-9f47efd22df1', 'content': 'What is Langfuse?'}, {'id': 'run-c55f192a-269b-4ffa-8650-5c0c0db7f438-0', 'content': 'I couldn\'t find any information on "Langfuse." It\'s possible that it\'s a lesser-known or obscure term, or it could be a misspelling or variation of a different term. If you could provide more context or clarify what you\'re referring to, I\'d be happy to try and help you further.', 'tool_calls': [], 'usage_metadata': {'input_tokens': 40, 'total_tokens': 105, 'output_tokens': 65}, 'additional_kwargs': {}, 'response_metadata': {'logprobs': None, 'model_name': 'llama-3.1-8b-instant', 'token_usage': {'queue_time': 0.951697046, 'total_time': 0.096193418, 'prompt_time': 0.0095267

In [18]:
traces.data[0].dict()

{'id': '773784c8-2ee5-4056-9741-e8d93218ed44',
 'timestamp': datetime.datetime(2024, 10, 8, 1, 25, 57, 569000, tzinfo=datetime.timezone.utc),
 'name': 'LangGraph',
 'input': {'messages': [{'content': 'What is Langfuse?'}]},
 'output': {'messages': [{'id': '9023e6e0-4d94-45ea-802e-9f47efd22df1',
    'content': 'What is Langfuse?'},
   {'id': 'run-c55f192a-269b-4ffa-8650-5c0c0db7f438-0',
    'content': 'I couldn\'t find any information on "Langfuse." It\'s possible that it\'s a lesser-known or obscure term, or it could be a misspelling or variation of a different term. If you could provide more context or clarify what you\'re referring to, I\'d be happy to try and help you further.',
    'tool_calls': [],
    'usage_metadata': {'input_tokens': 40,
     'total_tokens': 105,
     'output_tokens': 65},
    'additional_kwargs': {},
    'response_metadata': {'logprobs': None,
     'model_name': 'llama-3.1-8b-instant',
     'token_usage': {'queue_time': 0.951697046,
      'total_time': 0.09619

In [6]:
langfuse.fetch_observations()

FetchObservationsResponse(data=[ObservationsView(id='c2e192fe-dda0-4403-948d-87a63780932c', trace_id='773784c8-2ee5-4056-9741-e8d93218ed44', type='SPAN', name='ChannelWrite<chatbot,messages>', start_time=datetime.datetime(2024, 10, 8, 1, 25, 59, 500000, tzinfo=datetime.timezone.utc), end_time=datetime.datetime(2024, 10, 8, 1, 25, 59, 502000, tzinfo=datetime.timezone.utc), completion_start_time=None, model=None, model_parameters=None, input={'messages': [{'id': 'run-c55f192a-269b-4ffa-8650-5c0c0db7f438-0', 'content': 'I couldn\'t find any information on "Langfuse." It\'s possible that it\'s a lesser-known or obscure term, or it could be a misspelling or variation of a different term. If you could provide more context or clarify what you\'re referring to, I\'d be happy to try and help you further.', 'tool_calls': [], 'usage_metadata': {'input_tokens': 40, 'total_tokens': 105, 'output_tokens': 65}, 'additional_kwargs': {}, 'response_metadata': {'logprobs': None, 'model_name': 'llama-3.1-8