In [6]:
from enum import Enum
from dotenv import  load_dotenv
load_dotenv()
from Models.models import LLMModel as LLMModel2
import pytest
from langchain.prompts import HumanMessagePromptTemplate
from langchain_core.messages import (HumanMessage)
from langchain.prompts import SystemMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field, validator
import json



In [7]:

# Severity levels that an Issue can be tagged with. The member values are the
# same three words ("Critical", "Medium", "Low") that the `severity` field
# description of Issue uses when explaining each level.
# NOTE(review): documented with `#` comments rather than a docstring on purpose;
# a docstring here may be picked up as a schema description by pydantic — confirm
# before adding one.
class Severity(Enum):
    CRITICAL = "Critical"
    MEDIUM = "Medium"
    LOW = "Low"

# One finding of the review. Instances are nested inside
# NotebookWiseFeedback.issues, and that model's JSON schema is what gets handed
# to the evaluator as `output_schema`.
# NOTE(review): the class docstring and the `description=` strings below are
# presumably emitted into that schema and read by the LLM, so treat them as
# prompt text, not as ordinary documentation — changing them changes behaviour.
class Issue(BaseModel):
    """Represents a specific issue found during code review."""

    # Every field uses `Field(...)`: the Ellipsis marks it as required.
    cell_position: int = Field(
        ..., description="The position of the cell where the issue was found."
    )
    what: str = Field(..., description="A brief description of the issue.")
    why: str = Field(..., description="Explanation of why this is an issue.")
    where: str = Field(
        ...,
        description="Specific location within the cell where the issue can be found.",
    )
    # Restricted to the Severity enum; the description spells out how each
    # level should be chosen.
    severity: Severity = Field(
        ...,
        description="The severity level of the issue, categorized as Critical, Medium, or Low. Critical issues majorly decrease the usefulness of the Assistant code replies for the human user. Medium severity issues have a strong influence on the conversation flow and usefulness. Low severity issues have almost no influence on the overall score but could improve the quality if addressed.",
    )
    fix: str = Field(
        ..., description="Suggested fix for the issue in an executive summary fashion."
    )


# Top-level structured output requested from the evaluator: its JSON schema is
# passed as `output_schema` when the LLMModel2 evaluator is built.
# NOTE(review): the docstring and `description=` strings are presumably part of
# that schema and therefore of the prompt; edit them with the same care as
# `main_prompt`.
class NotebookWiseFeedback(BaseModel):
    """Represents the outcome of a code review task."""

    # Declared first so that it precedes `issues` and `score` in the model; its
    # description asks the model to think before producing them.
    scratchpad: str = Field(
        ...,
        description="Place for you to think. Think before issues and score creation. Be concise. Analyze the text to achieve your goal. Always think before looking for issues!",
    )
    issues: list[Issue] = Field(
        ...,
        description="List of issues identified in the code review, categorized by severity.",
    )
    scoring_explanation: str = Field(
        ...,
        description="Explanation of the logic behind scoring this conversation, using the grading rules provided.",
    )
    # `int | None` with `Field(...)`: the key is required but may be null, which
    # matches the prompt's instruction that unnecessary code "must receive null".
    # The 1-5 range is stated only in the description; no numeric bounds are
    # declared on the field itself.
    score: int | None = Field(
        ...,
        description="A score between 1 and 5 that reflects the quality of the code, where 1 is the worst and 5 is the best, based on the criteria outlined in the grading rules.",
    )

In [8]:
# System prompt for the code-review evaluator. It is substituted into the
# "{main_prompt}" slot of `chat_template` below. Sections: IDENTITY (persona),
# GOALS (review only the Assistant's code in a notebook conversation), RULES
# (what to review and the evaluation criteria), the 1-5 grading rubric, and a
# REFOCUS section that also covers the "null" score for unnecessary code.
# NOTE(review): the text contains a few typos (e.g. "extensive extensive",
# "still needs", a missing full stop before "Do not mention"). They are left
# untouched here because every character of this string is sent to the model.
main_prompt = """# IDENTITY

You are an AI named Codia. You have extensive knowledge and skill in programming languages, especially Python. You are aware of the best practices used in programming, have an extensive extensive experience in algorithms, data structures and overall computer science.

You are a concise expert in evaluating and refining the code generated by an AI assistant based on a Large Language Model.

# GOALS

Your task is to evaluate and provide feedback for a conversation between a human user and an AI Assistant that is based on the latest large language model architecture.
Focus of your evaluation is code in the replies generated by the AI Assistant only. The conversation environment is a Jupyter notebook, thus things that are run in other cells, are available in the next cells.

# RULES

Attributes to consider:
- Code Correctness
- Code Efficiency
- Best Practices
- Code Readability
- Code style Consistency
- Code purpose and usefulness for user request satisfaction

**1. Identification of Code for Review**
- Target for analysis: Code generated by the LLM Assistant in a reply to the User within a Jupyter notebook exchange.
- Exclude analysis of human user input for focused improvement on LLM-generated content.
- Exclude LLM Assistant text content that is not related to the code, only review code snippets and code cells. Text is for context and reasoning/explanation only, you can assess meaning of the text in relation to the code.
- Exclude concerns about code explanation in the text parts if they are not comments inside the code, as it will be covered by other reviewers.

**2. Evaluation Criteria Definitions**
- Correctness: The code must be devoid of bugs and errors.
- Efficiency: The code must be optimized for maximum performance.
- Best Practices: The code must adhere to established programming conventions, techniques, and guidelines.
- Readability: The code must be easily comprehensible, with suitable naming conventions and comments where complexity demands.
- Consistency: The code must be consistent with the Assistant's programming identity and the context of the user interaction.
- Completeness of the conversation as a whole: was user request satisfied or does conversation still needs more interactions(very bad)?

**3. Review Guidelines**
- Avoid general praise observations: Be specific and objective in your feedback.
- Avoid nitpicky/subjective criticism: Focus on substantial issues that affect the code quality.

# Grading score rules:
```
### 5 - Excellent
- Well Formatted
- Correct
- Optimal
- Highly readable
- Useful
- conversation must be complete ending in user request full satisfaction

### 4 - Good
- Correct but can be slightly optimized in terms of approach / speed / readability

### 3 - Acceptable
- The code is correct but can be significantly improved.
- The code is not readable.

### 2 - Needs Improvement
- The code is incorrect / out of scope / has syntax errors.
- Looks like it’s copied from ChatGPT - robotic, no personality, inhuman.

### 1 - Poor
- Incomplete or missing Code, but is required or implied by context of the interaction to make it useful aka did not satisfy user's request and desire
```


# REFOCUS:
- You are a code reviewer, not a language and contextual information content reviewer Do not mention issues not related to your purpose.
- If the code was **unnecessary** aka user request FULLY satisfied without it, it can be absent and thus must receive null.
- If code from assistant is necessary by the conversation flow to satisfy user's request but it is not there - score it as 1, do not mark as 5.
- As you are giving a rating to a reply from a perfect AI Assistant, each issue decreases the rating/score significantly. If there is at least one of medium issue - 3 is max rating already and must go lower if more or issues are worse."""


# Two-slot chat prompt: a system message filled from `main_prompt`, followed by
# whatever message list the caller supplies under "messages". The system slot
# is bound right away with `.partial`, leaving "messages" as the only input
# still to be provided at call time.
chat_template = ChatPromptTemplate.from_messages(
    [
        ("system", "{main_prompt}"),
        MessagesPlaceholder(variable_name="messages"),
    ]
).partial(main_prompt=main_prompt)

# Conversation notebook to evaluate; change this path to review another one.
NOTEBOOK_PATH = 'Data/new_chirag/algorithm_problems_using_python__178__06_03_2024_15_46_55_1.ipynb'

# The raw notebook file is read as text and later embedded verbatim in the
# message sent to the evaluator. The encoding is given explicitly: without it,
# open() falls back to the platform locale (e.g. cp1252 on Windows), which can
# raise UnicodeDecodeError or garble non-ASCII characters in the conversation.
with open(NOTEBOOK_PATH, 'r', encoding='utf-8') as file:
    convo = file.read()



In [9]:

# Which backend/model the evaluator talks to.
model = "gpt-4o"
provider = "openai_api"

# Build the evaluator around the chat prompt and ask for output shaped like
# NotebookWiseFeedback (passed as a JSON schema).
# NOTE(review): the meaning of "retry", "params" and `try_to_parse` is defined
# by LLMModel2, which is not visible in this notebook.
evaluator = LLMModel2(
    provider=provider,
    model=model,
    output_schema=NotebookWiseFeedback.model_json_schema(),
    name="aspect_evaluator",
    prompt_template=chat_template,
    try_to_parse=True,
    config={"retry": 4, "params": {"temperature": 0.8}},
)

# Perform evaluation: the whole notebook text goes in as one human message.
evaluation_result = evaluator(
    [HumanMessage(f"Conversation between AI Assistant and a human User: \n {convo}")]
)
print(evaluation_result)


{'scratchpad': "The assistant provided a complete implementation of PCA from scratch in Python. The code includes data standardization, computation of the covariance matrix, eigenvalue decomposition, and projection of the original dataset onto the new feature space. The code also includes input validation and error handling.\n\nHowever, there are a few issues:\n1. The use of `np.linalg.eig` can lead to numerical instability. `np.linalg.svd` is generally preferred for PCA as it is more stable.\n2. The code does not handle the case where the standard deviation is zero, which can lead to division by zero errors.\n3. The code uses print statements for error handling, which is not ideal for a reusable function. Raising exceptions would be more appropriate.\n4. The code does not include comments, which affects readability.\n5. The code does not include a test case to verify the correctness of the PCA implementation.\n\nThese issues affect the code's efficiency, best practices, and readabilit

In [10]:
from IPython.display import display, HTML
import json

def pretty_print_html(data):
    """Render ``data`` as indented, colour-coded JSON in the notebook output.

    Object keys are shown in blue, string values in green and numeric values
    in red. Every line that is not a ``"key": value`` member (brackets, list
    items) is shown in black; other member values (booleans, null, opening
    brackets) are left uncoloured.

    Args:
        data: Any object that ``json.dumps`` can serialise.

    Returns:
        None. The generated markup is handed to ``display(HTML(...))``.

    Raises:
        TypeError: Propagated from ``json.dumps`` when ``data`` is not
            JSON-serialisable.
    """
    import html  # function-scope import: not provided by the cell's import block

    decoder = json.JSONDecoder()

    def escape(text):
        # Values frequently contain code fragments ("a < b", "<div>", "&");
        # unescaped they would be parsed as markup and corrupt the display.
        return html.escape(text, quote=False)

    def split_member(line):
        """Return ``(key, value)`` if ``line`` is an object member, else None."""
        stripped = line.lstrip()
        if not stripped.startswith('"'):
            return None
        try:
            # Decode the leading JSON string to find where it really ends, so a
            # colon inside a key or inside a list item is never used as the split.
            _, end = decoder.raw_decode(stripped)
        except ValueError:
            return None
        if stripped[end:end + 1] != ':':
            return None  # a plain string list item, not a key
        cut = len(line) - len(stripped) + end
        return line[:cut], line[cut + 1:]

    def is_number(token):
        try:
            float(token)
        except ValueError:
            return False
        return True

    # Convert the data to a JSON string with indentation for readability
    pretty_data = json.dumps(data, indent=4)

    pieces = ['<pre style="line-height: 1.35;">']
    for line in pretty_data.splitlines():
        member = split_member(line)
        if member is None:
            pieces.append(f'<span style="color: black;">{escape(line)}</span>\n')
            continue

        key, value = member
        # json.dumps appends "," to every member except the last one; ignore it
        # when classifying the value so that `"n": 3,` is still seen as a number.
        token = value.strip().rstrip(',')
        key_html = f'<span style="color: blue;">{escape(key)}</span>'
        value_html = escape(value)
        if token.startswith('"'):
            value_html = f'<span style="color: green;">{value_html}</span>'
        elif is_number(token):
            value_html = f'<span style="color: red;">{value_html}</span>'
        pieces.append(f"{key_html}:{value_html}\n")
    pieces.append('</pre>')

    display(HTML("".join(pieces)))



# Show the evaluation result returned by the evaluator as colour-coded JSON.
pretty_print_html(evaluation_result)