# Evaluating LLMs on a Classification Task

* docs: https://docs.evidentlyai.com/introduction
* repo: https://github.com/evidentlyai/evidently/

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
booking_queries = pd.read_csv("https://raw.githubusercontent.com/evidentlyai/community-examples/refs/heads/main/datasets/booking_queries_dataset.csv")

In [3]:
booking_queries.head()

Unnamed: 0,query,label
0,"booked a trip for 4 ppl, want to add a 5th now",Booking
1,"hello team, please confirm if my hotel reserva...",Booking
2,"i can’t see the payment options, dropdown just...",Technical
3,"I heard airlines sometimes overbook, what’s yo...",Policy
4,wanna reschedule my train ride to next week,Booking


In [4]:
booking_queries.shape

(200, 2)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    booking_queries['query'], booking_queries.label, test_size=0.1, random_state=42
)

## Logistic Regression

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
vectorizer = CountVectorizer()
vectorized_train = vectorizer.fit_transform(X_train)
vectorized_test = vectorizer.transform(X_test)

In [8]:
model = LogisticRegression(max_iter=1000, random_state=42)

In [9]:
model.fit(vectorized_train, y_train)

lr_train_preds = model.predict(vectorized_train)
lr_test_preds = model.predict(vectorized_test)

In [10]:
accuracy_score(lr_train_preds, y_train)

1.0

In [11]:
accuracy_score(lr_test_preds, y_test)

0.8

## Classification Quality Report

In [12]:
from evidently import Dataset, DataDefinition, Report, MulticlassClassification
from evidently.presets import ClassificationPreset

In [13]:
lr_frame = pd.DataFrame()
lr_frame['label'] = y_test.values
lr_frame['predicted_label'] = lr_test_preds
lr_frame['query'] = X_test.values

In [14]:
lr_frame

Unnamed: 0,label,predicted_label,query
0,Booking,Booking,pls assist with booking confirmation didnt get...
1,Booking,Booking,what's the easiest way to upgrade the hotel i ...
2,Escalation,Escalation,I followed all your steps to cancel my hotel b...
3,Technical,Technical,"Hi there, I booked a flight for my family, but..."
4,Booking,Booking,Hello! Could you help me figure out if I can s...
5,Booking,Booking,my itinerary disappeared from the app
6,Payment,Payment,"Hello, I was charged in USD instead of EUR. Pl..."
7,Technical,Booking,i got logged out while booking and now i lost ...
8,Booking,Booking,hi i need to change my flight
9,Technical,Policy,I’m using a VPN and the site won’t load for bo...


In [15]:
data_definition = DataDefinition(
    text_columns = ["query"],
    classification = [MulticlassClassification(target="label", prediction_labels="predicted_label")]
)

In [16]:
lr_dataset = Dataset.from_pandas(
    lr_frame,
    data_definition=data_definition
)

In [17]:
report = Report(
    metrics=[
        ClassificationPreset()
    ]
)

In [18]:
lr_report = report.run(lr_dataset)


Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



In [19]:
lr_report

## Load report to Evidently cloud

In [20]:
from evidently.ui.workspace import CloudWorkspace

In [21]:
client = CloudWorkspace(url="https://app.evidently.cloud")

In [22]:
project = client.create_project("[LLM course] Classification Task", org_id = "fd72dd71-e4ef-46dd-87cf-da6bb75b825e")
project.description = "course project for classification evals"

In [23]:
client.add_run(project.id, lr_report, include_data=True)

UUID('0196d984-7ef5-760b-8918-d60751b5e9bb')

## Zero-shot classification with OpenAI

In [24]:
from openai import OpenAI

In [25]:
OA_client = OpenAI()

In [26]:
response = OA_client.responses.create(
    instructions="",
    model="gpt-4o-mini",
    input="what is the latest version of gpt 4 model?"
)

print(response.output_text)

As of now, the latest version of the GPT-4 model is GPT-4 turbo. GPT-4 turbo is designed to be faster and more cost-effective than the base GPT-4 model while maintaining high performance. If you're looking for specific features or differences, let me know!


In [27]:
instructions = """
You are a customer support assistant trained to classify incoming boooking-related support messages.

1. Booking – for messages about making, changing, or cancelling a reservation, or checking availability.
2. Payment – for issues related to billing, refunds, invoices, or payment transactions.
3. Policy – for questions about travel policies, baggage rules, visas, check-in procedures, or similar topics not tied to a specific booking.
4. Technical – for problems with logging in, accessing the app or website, or other technical errors.
5. Escalation – for urgent requests or when the user asks to speak with a human agent.

Return only a category name. Do not include any explanation or additional text.
"""

In [28]:
X_test.values[0]

'pls assist with booking confirmation didnt get email'

In [29]:
y_test.values[0]

'Booking'

In [30]:
response = OA_client.responses.create(
    instructions=instructions,
    model="gpt-4o-mini",
    input=X_test.values[0]
)

print(response.output_text)

Booking


In [31]:
def zero_shot_predict(client, model, input):
    response = client.responses.create(
    instructions=instructions,
    model=model,
    input=input
    )

    label = response.output_text
    return label

In [32]:
llm_test_preds = [zero_shot_predict(OA_client, "gpt-4o-mini", query) for query in X_test]

In [33]:
llm_test_preds

['Booking',
 'Booking',
 'Escalation',
 'Technical',
 'Booking',
 'Technical',
 'Payment',
 'Technical',
 'Booking',
 'Technical',
 'Technical',
 'Escalation',
 'Booking',
 'Payment',
 'Booking',
 'Booking',
 'Escalation',
 'Booking',
 'Technical',
 'Booking']

In [34]:
accuracy_score(llm_test_preds, y_test)

0.85

In [35]:
llm_frame = pd.DataFrame()
llm_frame['label'] = y_test.values
llm_frame['predicted_label'] = llm_test_preds
llm_frame['query'] = X_test.values

In [36]:
llm_dataset = Dataset.from_pandas(
    llm_frame,
    data_definition=data_definition
)

In [37]:
llm_report = report.run(llm_dataset, tags = ["open ai 4.0 mini"])

In [38]:
client.add_run(project.id, llm_report, include_data=True)

UUID('0196d98a-b613-7dbe-b25f-be4e31370886')

In [39]:
better_llm_test_preds = [zero_shot_predict(OA_client, "gpt-4.1", query) for query in X_test]

In [40]:
accuracy_score(better_llm_test_preds, y_test)

0.85

In [41]:
better_llm_frame = pd.DataFrame()
better_llm_frame['label'] = y_test.values
better_llm_frame['predicted_label'] = better_llm_test_preds
better_llm_frame['query'] = X_test.values

In [42]:
better_llm_dataset = Dataset.from_pandas(
    better_llm_frame,
    data_definition=data_definition
)

In [43]:
better_llm_report = report.run(better_llm_dataset, tags = ["open ai 4.1"])

In [44]:
client.add_run(project.id, better_llm_report, include_data=True)

UUID('0196d98c-39d1-7ee0-b0bd-6c49653df083')