### **Step3-Evaluation: Benchmarking and LLM-as-a-Judge**

#### **1. Setup Evaluation Environment**
Configures parameters for evaluation (e.g., GPU ID, models to compare, and data sources).


In [1]:
#Setup inference and evaluation parameters
import re
import sys
import os
import random
# GPU exposed to CUDA; the environment variable is set before DataSynthesisLib is imported below.
GPU_ID="0"
os.environ["CUDA_VISIBLE_DEVICES"]=GPU_ID
#Customer=0 runs the Cloudera questions and format; Customer=1 runs the customer questions and format
Customer=0
# CustomPrompt=1 makes the notebook use the ready-made 'Prompt' field of each evaluation record.
CustomPrompt=1
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python"
import Libs.DataSynthesisLib as DataSynthesisLib
Config=DataSynthesisLib.GetConfig('Config/Config.py')
ModelBase=""




#1-based index of the first answer line in each raw model output; earlier lines are skipped when parsing.
#The cached base-model outputs start with a chat header line and a blank line, hence 3.
StartLineFT=1
StartLineBase=3

#Models to use
#Finetuned (merged) model under evaluation
ModelFT='./tmp/merged_AllComments_Clean_Trainr_128_a_64_d_0.05'


#Model used as the LLM-as-a-judge
EvalLLM='microsoft/phi-4'



  from .autonotebook import tqdm as notebook_tqdm
2025-04-03 18:36:28,556	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
No module named 'vllm._version'
  from vllm.version import __version__ as VLLM_VERSION


In [2]:
# Determine the model length to use: the smaller of the user-requested maximum (ModelLen)
# and the context window advertised by the model configuration.
ModelLen=32000 #Requested maximum model length; clamped below to what the model supports.


from transformers import AutoConfig
ModelConfig = AutoConfig.from_pretrained(ModelFT)
# Attribute names that different architectures use to advertise their context window.
fields = ["max_position_embeddings", "n_positions", "seq_len", "seq_length", "n_ctx", "sliding_window"]
# Keep only attributes that exist AND hold a positive integer: some configs define a field
# (e.g. sliding_window) but leave it as None, which cannot be compared with ModelLen.
context_windows = [
    value
    for value in (getattr(ModelConfig, field, None) for field in fields)
    if isinstance(value, int) and not isinstance(value, bool) and value > 0
]
if context_windows:
    # Use the most restrictive advertised window so the engine never exceeds any of them.
    ModelLen = min(ModelLen, min(context_windows))


#print(ModelConfig)
print(f"Maximum Model length used {ModelLen}")


Maximum Model length used 32000


In [3]:
#Test whether the model's chat template accepts a system message
from transformers import AutoTokenizer

# Load outside the try block so a missing or broken tokenizer is reported as such
# instead of being mistaken for "no system prompt support".
tokenizer = AutoTokenizer.from_pretrained(ModelFT)
chat = [{"role": "system", "content": ""}]

try:
    # Rendering a system-only chat is the probe: if the template rejects it, this raises.
    tokenizer.apply_chat_template(chat, tokenize=False)
    HasSystem=True
except Exception:
    # Narrowed from a bare except so KeyboardInterrupt/SystemExit still propagate.
    HasSystem=False


#### **2. Prepare Evaluation data**
- Loads evaluation dataset for Customer or Cloudera comments depending on the target evaluation.
- Extracts the customer questions used for evaluation.


In [4]:
#Load appropriate evaluation dataset
import json
import random

# Evaluation file per mode: Customer==1 -> customer comments, Customer==0 -> Cloudera comments.
_EvalDataFiles={
    1: 'Data/CustomerComments_Evaluation_Clean.json',
    0: 'Data/ClouderaComments_Evaluation_Clean.json',
}
if Customer not in _EvalDataFiles:
    # Fail here with a clear message instead of a NameError on `data` in a later cell.
    raise ValueError(f"Customer must be 0 or 1, got {Customer!r}")

with open(_EvalDataFiles[Customer], 'r') as file:
    data = json.load(file)





In [5]:
#Setup questions for LLM-as-a-judge, collect prompts for inference, and customer or Cloudera comments to be used later for evaluation
import vllm
from transformers import AutoTokenizer
# Tokenizer of the finetuned model (ModelFT).
tokenizer=AutoTokenizer.from_pretrained(ModelFT)

# System prompt used for finetuned-model inference and as the system turn of the judge chat.
System='You are a helpful assistant.'


# Question text shown to the judge, keyed by 1-based question number. The keys must line up
# with the numbered answers parsed from the model outputs: 17 questions in customer mode,
# 11 in Cloudera mode, with the last question being the free-text summary.
FixedQuestions={}
if Customer==1:
  FixedQuestions[1]="Does this comment discuss any technical information? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[2]="Does this comment relate to a customer complaint? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[3]="Customer complaint temperature or a frustration level (if there is a complain give a Score 1-4, lowest=1, highest=4. If there is no complain give a score of 0)"
  FixedQuestions[4]="Score the severity of the issue based on comment content (SCORE 1-4, lowest=1, highest=4)"
  FixedQuestions[5]="Score the urgency of the issue based on the comment content (SCORE 1-4, lowest=1, highest=4)"
  FixedQuestions[6]="Is this a request from a customer for an update? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[7]="Is there a strictly explicit and NOT an implied request from a customer for a call, meeting or a screenshare (zoom/webex/teams etc.)? Do not answer yes unless wording explicitly asks for a call. (BOOL)"
  FixedQuestions[8]="Did the customer request an escalation? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[9]="Did the customer request a priority change?  To what level? (If there is a priority change give score 1 to indicate highest priority (indicated by S1) and 4  to indicate the lowest priority (Indicated by 4). If there is no priority change give a score of 0)."
  FixedQuestions[10]="Did the customer request a transfer to another COE? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[11]="Did the customer request to speak to a manager or supervisor? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[12]="Did the customer request a SME or expert? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[13]="Does this comment discuss a bug in Cloudera software? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[14]="Does the comment include a non-Cloudera Apache JIRA link? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[15]="Does the comment have a link to Cloudera Documentation or Community article? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[16]="Does the comment have any other type of hyperlink? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[17]="Summarize the case comment condensing it as much as possible but without losing important technical details. Omit including any meeting invite information.  [TEXT]"
elif Customer==0:
  FixedQuestions[1]="Does this comment discuss any technical information? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[2]="Score the severity of the issue based on comment content (SCORE 1-4, lowest=1, highest=4)"
  FixedQuestions[3]="Score the urgency of the issue based on the comment content (SCORE 1-4, lowest=1, highest=4)"
  FixedQuestions[4]="Does the comment have a proposed solution? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[5]="Does the comment have a proposed workaround?  (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[6]="Does the comment have a request for an action from the customer?  (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[7]="Does this comment discuss a bug in Cloudera software? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[8]="Does the comment include a non-Cloudera Apache JIRA link? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[9]="Does the comment have a link to Cloudera Documentation or Community article? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[10]="Does the comment have any other type of hyperlink? (BOOL: 0 for no, 1 for yes)"
  FixedQuestions[11]="Summarize the case comment condensing it as much as possible but without losing important technical details. Omit including any meeting invite information.  [TEXT] "


# Collect, per evaluation record, the inference prompt and the raw comment
# (the comment is reused later when building the judge prompts).
Questions=[]
Comment=[]
BaselineSolutions=[]
# Both supported modes read the same record fields; nothing is collected unless CustomPrompt==1.
if CustomPrompt==1 and Customer in (0,1):
  for record in data:
    Questions.append(record['Prompt'])
    Comment.append(record['Comment'])


print(len(Questions))
lens=[len(question) for question in Questions]

500


#### **3. Prepare Prompts and Baseline Outputs**
Loads existing baseline outputs (from the unfinetuned model) and prepares prompts for inference.


In [6]:
  BaseModelName='Base'

  import  pickle
  # Cached outputs of the base (not finetuned) model, one file per mode.
  _BaselineFiles={
    1: 'Data/Eval_Customer_Questions_Solutions_'+BaseModelName+'.pickle',
    0: 'Data/Eval_Cloudera_Questions_Solutions_'+BaseModelName+'.pickle',
  }
  if Customer not in _BaselineFiles:
    # Fail here with a clear message instead of a NameError on `Output` below.
    raise ValueError(f"Customer must be 0 or 1, got {Customer!r}")

  # NOTE(security): pickle.load can execute arbitrary code; only load files produced by this project.
  with open(_BaselineFiles[Customer], 'rb') as handle:
    Output=pickle.load(handle)


  # "S" holds the raw baseline completions.
  SolutionsBase=Output["S"]
  # The judge later compares SolutionsBase[j] with the finetuned output for Questions[j],
  # so the two lists must at least have the same length.
  # NOTE(review): equal length does not prove the cached outputs belong to the same comments - confirm.
  if len(SolutionsBase)!=len(Questions):
    print(f"WARNING: {len(SolutionsBase)} baseline outputs vs {len(Questions)} evaluation prompts")

In [7]:
# Show the first inference prompt (the nested print() used before also printed a stray "None").
print(Questions[0])

You are given a Cloudera support team comment, 11 questions referring to the comment, and the possible values for each question in parenthesis following the question. Here is the Cloudera comment:
    
Hi Team,

We're experiencing issues after upgrading to AD 2019 where LDAP group synchronization is failing. The error in Cloudera Manager shows:

LDAP Result Code 32 (No Such Object): Failed to retrieve LDAP groups

We haven't changed any LDAP configuration in Cloudera Manager. Our current settings are:
- LDAP URL: ldaps://ad.company.local:636
- Base DN: dc=company,dc=local
- Group search base: ou=Groups,dc=company,dc=local

Could you please help identify what might have changed and how to resolve this?

Best regards,
John

Here are the 11 questions:
1. Does this comment discuss any technical information? (answer 0 for no, 1 for yes)
2. Score the severity of the issue based on comment content (SCORE 1-4, give 1 for lowest, 4 for highest and 2,3 for in between)
3. Score the urgency of the

In [8]:
# First cached base-model completion; the answers start on the third line, after the chat
# header and a blank line (hence StartLineBase=3).
print(SolutionsBase[0])

<|start_header_id|>assistant<|end_header_id|>

1. 1
2. 1
3. 1
4. 0
5. 0
6. 0
7. 0
8. 0
9. 0
10. 0
11. The customer is inquiring about compatibility between CDP 7.1.7 and Amazon OpenSearch 2.7 for integration, specifically for log analytics and searching capabilities.


#### **4. Generate Finetuned Model Outputs**
Runs inference on the evaluation dataset using the finetuned model.


In [9]:
#Finetuned model parameters setup and inference

from torch.multiprocessing import Pool, Process, set_start_method
from transformers import AutoConfig

try:
    set_start_method('spawn')
except RuntimeError:
    # Raised when the start method was already set (e.g. the cell is re-run); safe to ignore.
    pass
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

# Start from the project's solution config and override what this evaluation needs.
# All values are passed as strings, matching the existing entries.
ConfigS=Config["ConfigSolution"].copy()
ConfigS["gpu-id"]=GPU_ID
ConfigS["tensor_parallel_size"]="1"
ConfigS["llm-path"]=ModelFT
ConfigS["tokenizer-path"]=ModelFT
ConfigS["system"]=System
ConfigS["use-grammar"]="False"
ConfigS["AssistantPrompt"]=""

# Each setting is now assigned once. Earlier contradictory values (temperature "0.1",
# max_num_seqs "2") were overwritten before use and have been removed.
ConfigS["max_tokens"]="3000"
ConfigS["temperature"]="0"
ConfigS["max_num_seqs"]="16"
ConfigS["max_model_len"]=str(ModelLen)
ConfigS["max_num_batched_tokens"]=str(ModelLen)

ConfigS["AssistantPromptElementsRemoval"]="0"
ConfigS["tokenizer-file-gguf"]=""

# Pick the chat-template builder according to whether the model accepts a system message.
if not HasSystem:
  ConfigS["GenerateChatTemplate"]="GenerateChatWithoutSystemWithAssistantStart"
else:
  ConfigS["GenerateChatTemplate"]="GenerateChatWithSystemWithAssistantStart"

ConfigS["UseOutlines"]="False"
# Quantization is inferred from the model path (case-sensitive substring match).
if 'gptq' in ModelFT:
  ConfigS["quantization"]="gptq"
ConfigS["gpu_memory_utilization"]="0.7"


(Solutions, SolutionsLogging)=DataSynthesisLib.GetSolutions(ConfigS, Questions, ConfigS["system"])


No module named 'vllm._version'
  from vllm.version import __version__ as VLLM_VERSION


./tmp/merged_AllComments_Clean_Trainr_128_a_64_d_0.05
INFO 04-03 18:36:57 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='./tmp/merged_AllComments_Clean_Trainr_128_a_64_d_0.05', speculative_config=None, tokenizer='./tmp/merged_AllComments_Clean_Trainr_128_a_64_d_0.05', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./tmp/merged_Al

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:08<00:25,  8.49s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:10<00:09,  4.83s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:11<00:03,  3.01s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:20<00:00,  5.22s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:20<00:00,  5.05s/it]



INFO 04-03 18:37:19 model_runner.py:1071] Loading model weights took 14.9888 GB
INFO 04-03 18:37:22 gpu_executor.py:122] # GPU blocks: 6230, # CPU blocks: 0
INFO 04-03 18:37:22 gpu_executor.py:126] Maximum concurrency for 32000 tokens per request: 3.12x


Processed prompts: 100%|██████████| 500/500 [01:47<00:00,  4.65it/s, est. speed input: 2554.23 toks/s, output: 469.44 toks/s]


In [10]:
# Inference prompt of evaluation sample 43, for comparison with the model outputs printed next.
print(Questions[43])

You are given a Cloudera support team comment, 11 questions referring to the comment, and the possible values for each question in parenthesis following the question. Here is the Cloudera comment:
    
Hi Team,

I've completed a security audit of your cloud workloads and identified critical vulnerabilities in your CSPM implementation:

1. Unprotected API endpoints detected
2. Missing logging and monitoring configurations
3. Inadequate cloud asset inventory management

To address these security gaps, please implement the following measures urgently:

- Enable API Gateway protection mechanisms
- Configure comprehensive logging and monitoring
- Implement automated asset discovery and tracking

Please provide an update once these security controls are in place for verification.

Best regards,
David Chen
Principal Security Engineer
Cloudera Support

Here are the 11 questions:
1. Does this comment discuss any technical information? (answer 0 for no, 1 for yes)
2. Score the severity of the is

In [11]:
#Example finetuned model output (evaluation sample 43)
print(Solutions[43])

1. 1
2. 4
3. 4
4. 1
5. 0
6. 1
7. 0
8. 0
9. 0
10. 0
11. Critical security vulnerabilities identified in CSPM implementation including unprotected API endpoints, missing logging/monitoring, and inadequate asset management. Customer requested to implement API protection, logging/monitoring, and asset tracking mechanisms.


In [12]:
#Example base (not finetuned) model output at the same index
# NOTE(review): the summary printed here describes a different case than Questions[43];
# confirm that the cached baseline outputs are index-aligned with the evaluation set.
print(SolutionsBase[43])

<|start_header_id|>assistant<|end_header_id|>

1. 1
2. 4
3. 4
4. 0
5. 0
6. 0
7. 1
8. 0
9. 0
10. 0
11. The comment discusses performance degradation and timeout errors when joining tables between two CDW Virtual Warehouses in different environments, with an average query runtime of 25+ minutes and a timeout error after 1800 seconds.


In [13]:
#Individual questions, keyed by question number, used to evaluate each answer with LLM-as-a-judge
FixedQuestions

{1: 'Does this comment discuss any technical information? (BOOL: 0 for no, 1 for yes)',
 2: 'Score the severity of the issue based on comment content (SCORE 1-4, lowest=1, highest=4)',
 3: 'Score the urgency of the issue based on the comment content (SCORE 1-4, lowest=1, highest=4)',
 4: 'Does the comment have a proposed solution? (BOOL: 0 for no, 1 for yes)',
 5: 'Does the comment have a proposed workaround?  (BOOL: 0 for no, 1 for yes)',
 6: 'Does the comment have a request for an action from the customer?  (BOOL: 0 for no, 1 for yes)',
 7: 'Does this comment discuss a bug in Cloudera software? (BOOL: 0 for no, 1 for yes)',
 8: 'Does the comment include a non-Cloudera Apache JIRA link? (BOOL: 0 for no, 1 for yes)',
 9: 'Does the comment have a link to Cloudera Documentation or Community article? (BOOL: 0 for no, 1 for yes)',
 10: 'Does the comment have any other type of hyperlink? (BOOL: 0 for no, 1 for yes)',
 11: 'Summarize the case comment condensing it as much as possible but wit

#### **5. Parse Outputs and Validate Format for Both Base and Finetuned Model Outputs**
Ensures the model outputs adhere to the expected structure (e.g., numbered answers).


In [14]:
#Parse LLM output and extract analytics and summary
def FormatOutput(Solutions,StartLine=3,ExpectedQuestions=17):
  """Parse raw model completions into per-question answers.

  Each completion is expected to hold one answer per line in the form
  ``"<n>. <answer>"``. A line is accepted only if ``<n>`` equals the line's
  position counted from ``StartLine`` (so a blank or extra line invalidates
  every line after it). Lines positioned before ``ExpectedQuestions`` must
  carry an integer answer; from position ``ExpectedQuestions`` onwards the
  rest of the line is kept as free text (the summary).

  Args:
    Solutions: iterable of raw completion strings.
    StartLine: 1-based index of the first answer line; earlier lines are skipped.
    ExpectedQuestions: number of questions; the last one is the free-text answer.

  Returns:
    ``(Output, Correct)``. ``Output[k]`` maps question number -> parsed answer
    (int or str) for the k-th completion, with malformed lines omitted.
    ``Correct[k]`` is True when exactly ``ExpectedQuestions`` answers were parsed.
  """
  Output=[]
  Correct=[]

  for solution in Solutions:
    answers={}
    parsed=0
    lines=solution.split('\n')[StartLine-1:]

    for position,line in enumerate(lines,start=1):
      # Tolerate Windows line endings, which would otherwise turn "1" into the non-numeric "1\r".
      line=line.rstrip('\r')
      prefix=line.split('.')[0]
      # isdecimal() only accepts characters int() can convert; isdigit() also accepts
      # characters such as superscript two, which made int() raise ValueError.
      if not prefix.isdecimal():
        continue
      number=int(prefix)
      if number!=position:
        continue

      tokens=line.split(' ')
      if len(tokens)<2:
        continue

      if position<ExpectedQuestions:
        # Numeric question: the token right after "<n>." must be an integer.
        if not tokens[1].isdecimal():
          continue
        answer=int(tokens[1])
      else:
        # Free-text question: keep everything after "<n>.".
        answer=' '.join(tokens[1:])

      answers[number]=answer
      parsed+=1

    Correct.append(parsed==ExpectedQuestions)
    Output.append(answers)
  return (Output,Correct)


# Parse the finetuned outputs: the customer format has 17 questions, the Cloudera format 11.
if Customer in (0,1):
  (FTOutput,FTCorrectFormat)=FormatOutput(
    Solutions,
    StartLine=StartLineFT,
    ExpectedQuestions=(17 if Customer==1 else 11),
  )



In [15]:
#Parsed formatted output for the finetuned LLM
print(FTOutput[12])
print(len(FTOutput))
print(len(FTCorrectFormat))
# Number of finetuned outputs in which every expected question was parsed.
sum(FTCorrectFormat)

{1: 1, 2: 3, 3: 3, 4: 0, 5: 0, 6: 1, 7: 0, 8: 0, 9: 0, 10: 0, 11: 'Customer reports performance issues with their Blockchain as a Service implementation on CDP Private Cloud 7.1.8, including node synchronization failures, slow block validation (>20s), and low throughput (150 TPS vs target 500 TPS) on 3 validator nodes with 16GB RAM each. They are requesting help with identifying bottlenecks and optimization recommendations.'}
500
500


500

In [16]:

# Parse the cached base-model outputs: the customer format has 17 questions, the Cloudera format 11.
if Customer in (0,1):
  (BaselineOutput,BaselineCorrectFormat)=FormatOutput(
    SolutionsBase,
    StartLine=StartLineBase,
    ExpectedQuestions=(17 if Customer==1 else 11),
  )



In [17]:
#Parsed formatted output for the base LLM

print(BaselineOutput[2])
print(len(BaselineOutput))
print(len(BaselineCorrectFormat))
# Number of base-model outputs in which every expected question was parsed.
sum(BaselineCorrectFormat)

{1: 1, 2: 4, 3: 4, 4: 0, 5: 0, 6: 0, 7: 1, 8: 0, 9: 0, 10: 0, 11: "The comment is from a Cloudera Support Engineer, analyzing a Hive query performance issue in a CDH cluster. The engineer requests the customer to verify and share specific details, including Hive server memory settings, concurrent users, recent data volume changes, and the output of the 'SHOW LOCKS' command. Additionally, the engineer asks the customer to enable query profiling for 24 hours to identify bottlenecks."}
500
500


500

#### **5. LLM-as-a-Judge Evaluation**
- Prepares a prompt for LLM-as-a-judge along with three examples to show how to score win rate between the two competing models. 
- Uses a powerful Phi-4 model to judge which model (finetuned vs baseline) produces better answers.


In [18]:
#Prompts and 3 examples for LLM-as-a-judge to score whether model A or B wins or if there is a tie.


def _JudgeExamplePrompt(commentText, questionText, answerA, answerB):
    """Build the user turn of one few-shot example for the judge."""
    return (
        "You are given a comment and a question about the comment. You will be given the response of two systems and you need to judge which system  gave the best response. Make sure you provide the answer in json and your choice is given in a field with the value choice. Here is the comment: \n"
        + commentText
        + ".\nHere is the question:\n"
        + questionText
        + ":\n"
        + "\nHere is the answer of System A:\n"
        + answerA
        + "\nHere is the answer of System B:\n"
        + answerB
    )


def _JudgeExampleCompletion(choice):
    """Build the assistant turn of one few-shot example: a fenced JSON object with the verdict."""
    return '```json\n{\n    "choice": "' + choice + '"\n}\n```\n'


# Example 1: System B is right.
comment="Hi Support team, we're experiencing critical performance issues with our Hive queries in our production environment. All queries are taking 3-4 times longer than usual to execute, severely impacting our business operations. Can you please escalate this to a high priority case and have someone look into this ASAP? We need immediate assistance as this is affecting our SLAs."
question="Does this comment relate to a customer complaint? (BOOL)"
AnswerA="0"
AnswerB="1"
Example1Prompt=_JudgeExamplePrompt(comment, question, AnswerA, AnswerB)
Example1Completion=_JudgeExampleCompletion("B")

# Example 2: identical answers, so the verdict is a tie.
comment="We're experiencing issues with our Cloudera Data Warehouse environment where our Impala queries are failing with OOM errors. The cluster metrics show high memory utilization across all nodes. This is blocking our critical reporting pipeline. Could you please help investigate this issue? We need this resolved within the next 24 hours."
question="Customer complaint temperature or a frustration level (score 1-4, 0 if not a complaint)"
AnswerA="3"
AnswerB="3"
Example2Prompt=_JudgeExamplePrompt(comment, question, AnswerA, AnswerB)
Example2Completion=_JudgeExampleCompletion("tie")

# Example 3: System A gives the informative summary; System B's answer is deliberately poor.
comment="Our Cloudera Manager is showing multiple critical alerts for HDFS data nodes. The replication factor has dropped below the minimum threshold, and we're seeing increased latency in our data processing jobs. Can you please help us resolve this urgently? This is impacting our production workloads."
question="Summarize the case comment condensing it as much as possible but without losing important technical details. Omit including any meeting invite information.  [TEXT]"
AnswerA="Critical HDFS issues with data nodes showing alerts, reduced replication factor causing increased latency in production data processing jobs."
AnswerB="This is about critical alets."
Example3Prompt=_JudgeExamplePrompt(comment, question, AnswerA, AnswerB)
Example3Completion=_JudgeExampleCompletion("A")



In [19]:

# Few-shot conversation shared by every judge prompt: the system turn followed by the
# three worked examples (user question / assistant verdict pairs).
chat2 = [
  {"role": "system", "content": System},

  {"role": "user", "content": Example1Prompt},
  {"role": "assistant", "content": Example1Completion},
  {"role": "user", "content": Example2Prompt},
  {"role": "assistant", "content": Example2Completion},
  {"role": "user", "content": Example3Prompt},
  {"role": "assistant", "content": Example3Completion}
]

# Prefix appended after the rendered chat so the judge's generation continues right
# after the "choice" key and only has to emit the verdict value.
Completion="""```json
{
    "choice": """

# Shallow copy: appending a new turn to `chat` leaves the shared `chat2` list unchanged.
chat=chat2.copy()


In [20]:
#Setup LLM-as-a-judge final prompt
from transformers import AutoTokenizer

# From here on `tokenizer` is the judge's tokenizer; it renders the judge's chat template.
tokenizer = AutoTokenizer.from_pretrained(EvalLLM)

if Customer==1:
    ExpectedQuestions=17
elif Customer==0:
    ExpectedQuestions=11

# [question][sample] -> whether that model produced a parseable answer for the question.
FTCorrectFormat={}
BaselineCorrectFormat={}
# AllPrompts[q-1][j] is the judge prompt for question q on evaluation sample j.
AllPrompts=[]

_JudgeInstructions="You are given a comment and a question about the comment. You will be given the response of two systems, system A and system B and you need to judge which system  gave the best response or if it was a tie. Make sure you provide the answer in json and your choice is given in a field with the value choice. Here is the comment: \n"

for i in range(1,ExpectedQuestions+1):
    FTCorrectFormat[i]={}
    BaselineCorrectFormat[i]={}
    Prompts=[]
    print(i)
    for j in range(0,len(FTOutput)):
        # A missing answer is presented to the judge as an empty string.
        FTCorrectFormat[i][j]=i in FTOutput[j]
        FTCandidate=str(FTOutput[j][i]) if FTCorrectFormat[i][j] else ""

        BaselineCorrectFormat[i][j]=i in BaselineOutput[j]
        BaselineCandidate=str(BaselineOutput[j][i]) if BaselineCorrectFormat[i][j] else ""

        # Alternate which model is shown as System A: even samples -> finetuned is A,
        # odd samples -> baseline is A (presumably to offset any position preference of the judge).
        if j%2==0:
            (_SystemA,_SystemB)=(FTCandidate,BaselineCandidate)
        else:
            (_SystemA,_SystemB)=(BaselineCandidate,FTCandidate)

        user=_JudgeInstructions + Comment[j]+".\nHere is the question:\n"+FixedQuestions[i]+"\nHere is the answer of System A:\n" + _SystemA +"\nHere is the answer of System B:\n"+_SystemB + "\n"

        chat=chat2.copy()
        chat.append({"role": "user", "content": user})

        # Render the chat, open the assistant turn, and pre-fill the JSON prefix.
        Prompt=tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) + Completion
        Prompts.append(Prompt)
    AllPrompts.append(Prompts)



print(tokenizer.encode(Prompts[1]))

        
        

1
2
3
4
5
6
7
8
9
10
11
[100264, 9125, 100266, 2675, 527, 264, 11190, 18328, 13, 100265, 100264, 882, 100266, 2675, 527, 2728, 264, 4068, 323, 264, 3488, 922, 279, 4068, 13, 1472, 690, 387, 2728, 279, 2077, 315, 1403, 6067, 323, 499, 1205, 311, 11913, 902, 1887, 220, 6688, 279, 1888, 2077, 13, 7557, 2771, 499, 3493, 279, 4320, 304, 3024, 323, 701, 5873, 374, 2728, 304, 264, 2115, 449, 279, 907, 5873, 13, 5810, 374, 279, 4068, 25, 720, 13347, 9365, 2128, 11, 584, 2351, 25051, 9200, 5178, 4819, 449, 1057, 69278, 20126, 304, 1057, 5788, 4676, 13, 2052, 20126, 527, 4737, 220, 18, 12, 19, 3115, 5129, 1109, 13783, 311, 9203, 11, 35906, 74055, 1057, 2626, 7677, 13, 3053, 499, 4587, 89690, 420, 311, 264, 1579, 10844, 1162, 323, 617, 4423, 1427, 1139, 420, 67590, 30, 1226, 1205, 14247, 13291, 439, 420, 374, 28987, 1057, 17216, 2170, 35047, 8586, 374, 279, 3488, 512, 22186, 420, 4068, 29243, 311, 264, 6130, 12458, 30, 320, 10611, 7887, 8586, 374, 279, 4320, 315, 744, 362, 512, 15, 198, 8586, 374

In [21]:
#LLM-as-a-judge prompt example (first question, fourth evaluation sample)
print(AllPrompts[0][3])

<|im_start|>system<|im_sep|>You are a helpful assistant.<|im_end|><|im_start|>user<|im_sep|>You are given a comment and a question about the comment. You will be given the response of two systems and you need to judge which system  gave the best response. Make sure you provide the answer in json and your choice is given in a field with the value choice. Here is the comment: 
Hi Support team, we're experiencing critical performance issues with our Hive queries in our production environment. All queries are taking 3-4 times longer than usual to execute, severely impacting our business operations. Can you please escalate this to a high priority case and have someone look into this ASAP? We need immediate assistance as this is affecting our SLAs..
Here is the question:
Does this comment relate to a customer complaint? (BOOL):

Here is the answer of System A:
0
Here is the answer of System B:
1<|im_end|><|im_start|>assistant<|im_sep|>```json
{
    "choice": "B"
}
```
<|im_end|><|im_start|>u

In [22]:
#Load LLM-as-a-judge
import vllm

# Engine settings for the judge model.
max_num_seqs=1
max_model_len=10000
max_num_batched_tokens=10000
gpu_memory_utilization=0.7
enforce_eager=True
tensor_parallel_size=1

# Sampling settings: only a few tokens are needed because the prompt already ends
# inside the JSON object, right before the verdict value.
seed=None
max_tokens=10
temperature=0.1

llm = vllm.LLM(
    model=EvalLLM,
    max_num_seqs=max_num_seqs,
    max_model_len=max_model_len,
    max_num_batched_tokens=max_num_batched_tokens,
    gpu_memory_utilization=gpu_memory_utilization,
    enforce_eager=enforce_eager,
    tensor_parallel_size=tensor_parallel_size,
)
sampling_params = vllm.SamplingParams(seed=seed,max_tokens=max_tokens,temperature=temperature)


INFO 04-03 18:39:27 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='microsoft/phi-4', speculative_config=None, tokenizer='microsoft/phi-4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=10000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=microsoft/phi-4, use_v2_block_manager=True, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_c

Loading safetensors checkpoint shards:   0% Completed | 0/6 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  17% Completed | 1/6 [00:00<00:03,  1.25it/s]
Loading safetensors checkpoint shards:  33% Completed | 2/6 [00:01<00:03,  1.18it/s]
Loading safetensors checkpoint shards:  50% Completed | 3/6 [00:02<00:02,  1.21it/s]
Loading safetensors checkpoint shards:  67% Completed | 4/6 [00:03<00:01,  1.26it/s]
Loading safetensors checkpoint shards:  83% Completed | 5/6 [00:03<00:00,  1.28it/s]
Loading safetensors checkpoint shards: 100% Completed | 6/6 [00:04<00:00,  1.30it/s]
Loading safetensors checkpoint shards: 100% Completed | 6/6 [00:04<00:00,  1.27it/s]



INFO 04-03 18:39:37 model_runner.py:1071] Loading model weights took 27.3875 GB
INFO 04-03 18:39:39 gpu_executor.py:122] # GPU blocks: 702, # CPU blocks: 1310
INFO 04-03 18:39:39 gpu_executor.py:126] Maximum concurrency for 10000 tokens per request: 1.12x


#### **6. Final Metrics**
Summarizes the evaluation results, highlighting the finetuned model's performance improvement.


In [23]:
#Run LLM-as-a-judge for each question and tally finetuned wins, baseline wins and ties.
Outs=[]
PosPercNoTies=[]
NegPercNoTies=[]
PosPercTies=[]
NegPercTies=[]
countPosArr=[]
countNegArr=[]
countTieArr=[]
TiesPerc=[]


def _JudgeVerdict(text):
  """Return the character the tally keys on (index 2 of the judge's continuation), or None.

  The previous guard was len(text)<2, which let a 2-character output through to
  text[2] and raised IndexError.
  NOTE(review): a verdict that does not sit at index 2, or a tie not written with a
  lowercase "t", is counted in no bucket; the recorded totals below 500 suggest
  this happens - confirm against the raw outputs in Outs.
  """
  return text[2] if len(text)>2 else None


def _Share(part,whole):
  """Return part/whole, or 0.0 when whole is 0 (no verdict could be parsed)."""
  return part/whole if whole else 0.0


# The engine's batch size is fixed when `llm` is constructed, so the old per-iteration
# `max_num_seqs=16` assignment had no effect and was removed.
for j in range(0,ExpectedQuestions):
  sampling_params = vllm.SamplingParams(seed=1,max_tokens=max_tokens,temperature=temperature)

  Out=llm.generate(AllPrompts[j], sampling_params)
  Outs.append(Out)
  countPos=0
  countNeg=0
  countTie=0

  for i in range(0,len(Out)):
    verdict=_JudgeVerdict(Out[i].outputs[0].text)
    # The prompts alternate the presentation order: on even samples the finetuned model
    # is System A, on odd samples it is System B.
    (ftLabel,baseLabel)=('A','B') if i%2==0 else ('B','A')
    if verdict==ftLabel:
      countPos+=1
    elif verdict==baseLabel:
      countNeg+=1
    elif verdict=='t':
      countTie+=1

  countDecided=countPos+countNeg
  countAll=countDecided+countTie

  print("===================")
  print("Question "+str(j+1))
  print("===================")
  if countDecided > 0:
    posNoTies=str(countPos/countDecided*100)+"%"
    negNoTies=str(countNeg/countDecided*100)+"%"
    print(posNoTies)
    print(negNoTies)
  else:
    posNoTies="All outputs are tied"
    negNoTies="All outputs are tied"
    print("All outputs are tied")
  print(_Share(countPos,countAll))
  print(_Share(countNeg,countAll))

  print(countPos)
  print(countNeg)
  print(countTie)

  PosPercNoTies.append(posNoTies)
  NegPercNoTies.append(negNoTies)
  TiesPerc.append(_Share(countTie,countAll))
  PosPercTies.append(_Share(countPos,countAll))
  NegPercTies.append(_Share(countNeg,countAll))
  countPosArr.append(countPos)
  countNegArr.append(countNeg)
  countTieArr.append(countTie)

  

Processed prompts: 100%|█| 500/500 [02:49<00:00,  2.94it/s, est. speed input: 24


Question 1
78.23529411764706%
21.764705882352942%
0.27479338842975204
0.07644628099173553
133
37
314


Processed prompts: 100%|█| 500/500 [03:03<00:00,  2.72it/s, est. speed input: 22


Question 2
45.10739856801909%
54.8926014319809%
0.3888888888888889
0.4732510288065844
189
230
67


Processed prompts: 100%|█| 500/500 [03:01<00:00,  2.76it/s, est. speed input: 23


Question 3
38.57493857493858%
61.42506142506142%
0.3284518828451883
0.5230125523012552
157
250
71


Processed prompts: 100%|█| 500/500 [02:47<00:00,  2.98it/s, est. speed input: 25


Question 4
85.08771929824562%
14.912280701754385%
0.20464135021097046
0.035864978902953586
97
17
360


Processed prompts: 100%|█| 500/500 [02:48<00:00,  2.97it/s, est. speed input: 25


Question 5
100.0%
0.0%
0.022044088176352707
0.0
11
0
488


Processed prompts: 100%|█| 500/500 [02:48<00:00,  2.97it/s, est. speed input: 25


Question 6
97.68211920529801%
2.3178807947019866%
0.594758064516129
0.014112903225806451
295
7
194


Processed prompts: 100%|█| 500/500 [02:55<00:00,  2.85it/s, est. speed input: 24


Question 7
99.11111111111111%
0.8888888888888888%
0.48478260869565215
0.004347826086956522
223
2
235


Processed prompts: 100%|█| 500/500 [02:48<00:00,  2.96it/s, est. speed input: 25


Question 8
All outputs are tied
0.0
0.0
0
0
500


Processed prompts: 100%|█| 500/500 [02:47<00:00,  2.98it/s, est. speed input: 25


Question 9
100.0%
0.0%
0.04081632653061224
0.0
20
0
470


Processed prompts: 100%|█| 500/500 [02:48<00:00,  2.97it/s, est. speed input: 25


Question 10
All outputs are tied
0.0
0.0
0
0
500


Processed prompts: 100%|█| 500/500 [02:56<00:00,  2.83it/s, est. speed input: 26

Question 11
100.0%
0.0%
1.0
0.0
500
0
0





In [24]:
#Compute winrate
# Sum the per-question win rates (ties excluded); questions where every verdict was a
# tie carry no win rate and are left out of the average.
Total=0
TotalCounter=0
for counter, winrate in enumerate(PosPercNoTies, start=1):
    print(f"Question {counter} winrate")
    print(FixedQuestions[counter])
    print(winrate)
    if winrate != "All outputs are tied":
        # Entries are strings such as "78.2%"; drop the trailing percent sign.
        Total += float(winrate[:-1])
        TotalCounter+=1
    print('----------------------')


Question 1 winrate
Does this comment discuss any technical information? (BOOL: 0 for no, 1 for yes)
78.23529411764706%
----------------------
Question 2 winrate
Score the severity of the issue based on comment content (SCORE 1-4, lowest=1, highest=4)
45.10739856801909%
----------------------
Question 3 winrate
Score the urgency of the issue based on the comment content (SCORE 1-4, lowest=1, highest=4)
38.57493857493858%
----------------------
Question 4 winrate
Does the comment have a proposed solution? (BOOL: 0 for no, 1 for yes)
85.08771929824562%
----------------------
Question 5 winrate
Does the comment have a proposed workaround?  (BOOL: 0 for no, 1 for yes)
100.0%
----------------------
Question 6 winrate
Does the comment have a request for an action from the customer?  (BOOL: 0 for no, 1 for yes)
97.68211920529801%
----------------------
Question 7 winrate
Does this comment discuss a bug in Cloudera software? (BOOL: 0 for no, 1 for yes)
99.11111111111111%
----------------------


In [25]:
print('----------------------')
print('----------------------')
print('Average Score')
# TotalCounter is 0 when every question ended in an all-tie; avoid a ZeroDivisionError then.
print(Total/TotalCounter if TotalCounter else "No question had a decided (non-tie) verdict")
print('----------------------')

----------------------
----------------------
Average Score
82.64428676391772
----------------------


#### **7. Test if the workflow ran as expected**
- Run with unsloth/Meta-Llama-3.1-8B-Instruct for finetuning
- Evaluate using LLM-as-a-judge microsoft/phi-4
- Test if the average winrate is approximately 82%



In [28]:
# Smoke test: the reference run averages roughly 82%, so accept 81 <= average < 84.
_AverageWinrate=Total/TotalCounter if TotalCounter else 0.0
if 81 <= int(_AverageWinrate) < 84:
    print("Model test run as expected.")
else:
    # Report failures explicitly; the check used to stay silent when it did not pass.
    print(f"Model test did NOT run as expected: average winrate {_AverageWinrate:.2f}% is outside the expected range [81, 84).")

Model test run as expected.


In [None]:
***If this documentation includes code, including but not limited to, code examples, Cloudera makes this available to you under the terms of the Apache License, Version 2.0, including any required notices.  A copy of the Apache License Version 2.0 can be found [here](https://opensource.org/licenses/Apache-2.0).***