## Imports

In [67]:
import json
from typing import List, Dict, Any
from google import genai
from google.genai import types
from IPython.display import Markdown
import requests

# Prepare the file to be uploaded
import pathlib
import time
from moviepy import VideoFileClip
import azure.cognitiveservices.speech as speechsdk
import re
import os
import dotenv

# Load variables from a .env file into the process environment so the
# os.getenv() lookups for the API keys further down can find them.
dotenv.load_dotenv()

True

In [98]:
# Video to analyze. The extracted audio track is written next to it as
# "<video stem>_audio.wav", so switching to another interview only requires
# editing VIDEO_PATH.
VIDEO_PATH = "interview_positive.mp4"
AUDIO_PATH = str(
    pathlib.Path(VIDEO_PATH).with_name(f"{pathlib.Path(VIDEO_PATH).stem}_audio.wav")
)

## Initialize the Gemini Client

In [99]:
# Gemini API key, read from the environment; os.getenv returns None when the
# variable is not set.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
# Model used by every generate_content call in this notebook. The trailing
# "@param" comment looks like a Colab form-field annotation — keep it on the
# same line as the assignment.
MODEL_ID = "gemini-2.0-flash-exp"  # @param ["gemini-1.5-flash-8b","gemini-1.5-flash-002","gemini-1.5-pro-002","gemini-2.0-flash-exp"] {"allow-input":true}
# Shared client used for the file upload and all model requests below.
client = genai.Client(api_key=GEMINI_API_KEY)

## Extract Audio From Video

In [100]:
# Load the video file and write its audio track to AUDIO_PATH.
video_clip = VideoFileClip(VIDEO_PATH)
try:
    audio_clip = video_clip.audio
    if audio_clip is None:
        # A video without an audio stream would otherwise fail below with an
        # opaque AttributeError on None.
        raise ValueError(f"No audio track found in {VIDEO_PATH}")
    try:
        audio_clip.write_audiofile(AUDIO_PATH)
    finally:
        audio_clip.close()
finally:
    # Close the clips even when extraction fails so the readers are released.
    video_clip.close()

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'mp42', 'minor_version': '1', 'compatible_brands': 'isommp41mp42', 'creation_time': '2025-03-13T07:27:45.000000Z'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1280, 720], 'bitrate': 516, 'fps': 23.976023976023978, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'creation_time': '2025-03-13T07:27:45.000000Z', 'handler_name': 'Core Media Video', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': 'eng', 'default': True, 'fps': 44100, 'bitrate': 128, 'metadata': {'Metadata': '', 'creation_time': '2025-03-13T07:27:45.000000Z', 'handler_name': 'Core Media Audio', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 369.43, 'bitrate': 647, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264'

                                                                      

MoviePy - Done.




# Transcribe the audio file using Azure

In [101]:
def ms_to_time_format(milliseconds: int) -> str:
    """
    Render a millisecond offset as a clock-style string.

    Parameters:
    -----------
    milliseconds : int
        Offset to render, in milliseconds

    Returns:
    --------
    str
        "HH:MM:SS" when the offset reaches at least one hour, otherwise "MM:SS"
    """
    elapsed = milliseconds / 1000

    # Whole hours, then minutes within the hour, then seconds within the minute
    hh = int(elapsed // 3600)
    mm = int((elapsed % 3600) // 60)
    ss = int(elapsed % 60)

    clock = f"{mm:02d}:{ss:02d}"
    # The hour field is only shown when it is non-zero
    return f"{hh:02d}:{clock}" if hh > 0 else clock


def create_conversation_transcript(
    transcription_json: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """
    Creates a transcript of a multi-turn conversation, preserving the turn-taking structure.

    Consecutive phrases from the same speaker are merged into a single turn.
    Phrases without a "speaker" field are ignored.

    Parameters:
    -----------
    transcription_json : Dict[str, Any]
        The JSON result from Azure Speech Service transcription. Each entry of
        its "phrases" list is read for "speaker", "offsetMilliseconds",
        "durationMilliseconds", "text" and (optionally) "confidence".

    Returns:
    --------
    List[Dict[str, Any]]
        List of conversation turns in chronological order. Each turn has the
        keys "speaker_id", "start_time_ms", "end_time_ms", "start_time",
        "end_time", "text" (phrases joined with spaces) and "confidence"
        (mean of the confidences reported for the turn's phrases, or 0 when
        none were reported). Empty when the input has no "phrases" key.
    """
    if "phrases" not in transcription_json:
        print("Error: No phrases found in the transcription JSON")
        return []

    # Drop speakerless phrases *before* sorting so that an entry lacking both
    # a speaker and a timestamp is skipped instead of crashing the sort.
    phrases = sorted(
        (p for p in transcription_json["phrases"] if "speaker" in p),
        key=lambda p: p["offsetMilliseconds"],
    )

    conversation: List[Dict[str, Any]] = []
    current_turn = None
    # Accumulators for the turn being built; folded into it by _close_turn.
    texts: List[str] = []
    confidences: List[float] = []

    for phrase in phrases:
        speaker_id = phrase["speaker"]
        start_time_ms = phrase["offsetMilliseconds"]
        end_time_ms = start_time_ms + phrase["durationMilliseconds"]

        if current_turn is None or speaker_id != current_turn["speaker_id"]:
            # Speaker changed (or first phrase): flush the previous turn
            if current_turn is not None:
                conversation.append(_close_turn(current_turn, texts, confidences))

            current_turn = {
                "speaker_id": speaker_id,
                "start_time_ms": start_time_ms,
                "end_time_ms": end_time_ms,
                "start_time": ms_to_time_format(start_time_ms),
                "end_time": ms_to_time_format(end_time_ms),
                # Placeholders keep the key order stable; filled by _close_turn
                "text": "",
                "confidence": 0,
            }
            texts, confidences = [], []
        else:
            # Same speaker keeps talking: extend the current turn
            current_turn["end_time_ms"] = end_time_ms
            current_turn["end_time"] = ms_to_time_format(end_time_ms)

        texts.append(phrase["text"])
        # Only phrases that actually report a confidence take part in the mean
        if "confidence" in phrase:
            confidences.append(phrase["confidence"])

    # Add the last turn
    if current_turn is not None:
        conversation.append(_close_turn(current_turn, texts, confidences))

    return conversation


def _close_turn(
    turn: Dict[str, Any], texts: List[str], confidences: List[float]
) -> Dict[str, Any]:
    """Finalize a turn: join its phrase texts and store the mean confidence."""
    turn["text"] = " ".join(texts)
    turn["confidence"] = sum(confidences) / len(confidences) if confidences else 0
    return turn


def format_transcript_as_string(transcript: List[Dict[str, Any]]) -> str:
    """
    Render conversation turns as plain text, one line per turn.

    Parameters:
    -----------
    transcript : List[Dict[str, Any]]
        Turns as produced by create_conversation_transcript; only the
        "speaker_id" and "text" keys of each turn are read

    Returns:
    --------
    str
        Newline-separated lines of the form: speaker <id>: "<text>"
    """
    return "\n".join(
        f'speaker {turn["speaker_id"]}: "{turn["text"]}"' for turn in transcript
    )


def process_transcript_file(transcription_result: Dict, output_file: str) -> str:
    """
    Build the conversation transcript from a transcription result, save it
    as JSON, and return it as a formatted string.

    Parameters:
    -----------
    transcription_result : Dict
        The transcription result (already parsed JSON) to process
    output_file : str
        The output conversation transcript JSON file

    Returns:
    --------
    str
        The transcript rendered by format_transcript_as_string
    """
    # Create the conversation transcript (list of speaker turns)
    transcript = create_conversation_transcript(transcription_result)

    # Create the string transcript
    transcript_string = format_transcript_as_string(transcript)

    # Save the structured turns (not the string) to the output file
    with open(output_file, "w", encoding="utf-8") as json_file:
        json.dump(transcript, json_file, indent=4)

    print(f"\nConversation transcript saved to {output_file}")
    return transcript_string

In [102]:
def transcribe_audio_with_diarization(
    service_region,
    subscription_key,
    audio_file_path,
    locales=None,
    max_speakers=2,
    save_to_file=None,
    api_version="2024-11-15",
):
    """
    Transcribe an audio file with speaker diarization using Azure Speech Service.

    Parameters:
    -----------
    service_region : str
        Azure Speech service region (e.g., 'westus', 'eastus', 'southeastasia')
    subscription_key : str
        Azure Speech subscription key
    audio_file_path : str
        Path to the audio file to transcribe
    locales : list or None
        List of language locales (default: None, meaning ["en-US"])
    max_speakers : int
        Maximum number of speakers for diarization (default: 2)
    save_to_file : str or None
        Path to save the conversation transcript as JSON (default: None,
        meaning nothing is written to disk)
    api_version : str
        API version to use (default: "2024-11-15")

    Returns:
    --------
    str or None
        The formatted conversation transcript (one 'speaker N: "..."' line per
        turn), or None when the request fails or any error occurs
    """
    # Avoid a shared mutable default argument
    if locales is None:
        locales = ["en-US"]

    # API endpoint URL
    url = f"https://{service_region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version={api_version}"

    # Request headers
    headers = {"Ocp-Apim-Subscription-Key": subscription_key}

    # JSON definition for transcription parameters as a string
    definition_str = json.dumps(
        {
            "locales": locales,
            "diarization": {"maxSpeakers": max_speakers, "enabled": True},
        }
    )

    try:
        # Open the audio file
        with open(audio_file_path, "rb") as audio_file:
            # Prepare the multipart form data
            files = {"audio": audio_file, "definition": (None, definition_str)}

            # Make the POST request
            print(
                f"Sending request to Azure Speech Service to transcribe {audio_file_path}..."
            )
            response = requests.post(url, headers=headers, files=files)

        # Check the response
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            print(response.text)
            return None

        print("Transcription successful")
        transcription_result = response.json()

        # Save the transcript to a JSON file if requested
        if save_to_file:
            return process_transcript_file(transcription_result, save_to_file)

        # No output file requested: still build and return the transcript.
        # Previously this path referenced an unassigned variable and always
        # ended in the generic error handler below.
        return format_transcript_as_string(
            create_conversation_transcript(transcription_result)
        )

    except Exception as e:
        # Deliberate best-effort boundary: report the problem and signal
        # failure with None instead of propagating.
        print(f"An error occurred: {str(e)}")
        return None

In [103]:
# Replace these with your actual values
service_region = "southeastasia"
# Azure Speech key is read from the environment; os.getenv returns None when
# AZURE_SUBSCRIPTION_KEY is not set.
subscription_key = os.getenv("AZURE_SUBSCRIPTION_KEY")
# `transcript` is the formatted 'speaker N: "..."' string, or None when the
# request fails. The structured turns are also saved to the JSON file named
# below. The prompt cells further down interpolate `transcript` as text.
transcript = transcribe_audio_with_diarization(
    service_region=service_region,
    subscription_key=subscription_key,
    audio_file_path=AUDIO_PATH,
    save_to_file="transcription_result.json",
)
print(transcript)

Sending request to Azure Speech Service to transcribe interview_positive_audio.wav...
Transcription successful

Conversation transcript saved to transcription_result.json
speaker 1: "Thank you for coming to interview with us today. Let's just go ahead and get started with a quick introduction. Could you tell me a little bit about yourself?"
speaker 2: "Yeah, absolutely. First of all, thank you so much for having me. It's an honor to be here. So a bit about myself. I'm originally from Morocco, where I lived pretty much my whole life. I then came to Boston and studied industrial engineering at Northeastern University. While I was in college, I did a few different internships. I did technical ones at a local power plant and at Amazon Robotics. And I also did one in consulting at the Boston Consulting Group and was also the president of the consulting club at my university. So really I'm someone who enjoys both the business side of things, but I also do have a passion for technology. So do

## Perform Body Language Analysis


In [104]:
# Upload the file using the API
file_upload = client.files.upload(file=pathlib.Path(VIDEO_PATH))

# Poll every 10 seconds until the service has finished processing the upload
while file_upload.state == "PROCESSING":
    print("Waiting for video to be processed.")
    time.sleep(10)
    file_upload = client.files.get(name=file_upload.name)

# Fail loudly (naming the file) rather than continuing with an unusable upload
if file_upload.state == "FAILED":
    raise ValueError(
        f"Video processing failed for {VIDEO_PATH} (state: {file_upload.state})"
    )
print(f"Video processing complete: {file_upload.uri}")

Waiting for video to be processed.
Video processing complete: https://generativelanguage.googleapis.com/v1beta/files/3yzy3b6pnbs2


In [105]:
# System instruction sent with the video-analysis request.
# NOTE(review): it asks the model to "call the relevant function" with
# timecodes, but the generate_content call in this notebook configures no
# tools — confirm this wording is still intended.
SYSTEM_PROMPT = """When given a video and a query, call the relevant function only once with 
the appropriate timecodes and text for the video"""

# Template for the video-analysis request. It is rendered with str.format, so
# {transcript} is the only placeholder and literal JSON braces are doubled.
# The format block and the example use the same three keys; the reply is parsed
# into a dict and forwarded to the scoring prompt.
USER_PROMPT = """
You are tasked with analyzing an **online interview video** of a candidate. **Your goal is to analyze the candidate's behavior, communication, and emotional expressions** based on the provided video transcript.

Here is the **transcript of the video**:  
<Transcript>  
{transcript}  
</Transcript>  

### Instructions:  
1. The video may contain **both the candidate and the interviewer**.  
2. **Only analyze the candidate** — ignore the interviewer.  
3. Analyze the candidate's **verbal communication, non-verbal communication (body language and facial expressions), and emotional/vocal tone**.  
4. If both **positive and negative behaviors** are present, mention both clearly.  
5. Your analysis should be **neutral, detailed, and professional** — without making assumptions beyond observable behaviors.  
6. Focus on **summarizing the overall behavior throughout the video**, not listing actions minute by minute.  

---

### Metrics you MUST analyze (with explanation and examples):  

1. **Verbal Communication (Speech and Language Analysis)**:  
   - Focus on **what the candidate says**: word choice, clarity, structure of answers, fluency, and speech pace.  
   - **Positive example**: "The candidate speaks clearly, uses appropriate vocabulary, and provides well-structured answers."  
   - **Negative example**: "The candidate struggles to articulate responses and uses filler words frequently."  

2. **Non-Verbal Communication and Body Language**:  
   - Focus on **how the candidate behaves physically**: body posture, gestures, eye contact, facial expressions, fidgeting.  
   - **Positive example**: "The candidate maintains good eye contact and uses open hand gestures while speaking."  
   - **Negative example**: "The candidate avoids looking at the camera and frequently fidgets with their hands."  

3. **Emotional and Vocal Tone Analysis**:  
   - Focus on **how the candidate sounds and what emotions they convey**: tone of voice, modulation, energy, friendliness, confidence, nervousness.  
   - **Positive example**: "The candidate speaks in a calm and confident tone with visible enthusiasm about the role."  
   - **Negative example**: "The candidate's tone is flat and lacks energy, with signs of nervousness in their voice."  

---

### Final Report Format (JSON):  

```json
{{
  "verbal_communication": "Detailed observation here.",
  "non_verbal_communication_and_body_language": "Detailed observation here.",
  "emotional_and_vocal_tone_analysis": "Detailed observation here."
}}
```

### Example of a Proper Response (Sample Style):

```json
{{
  "verbal_communication": "The candidate spoke clearly and used structured sentences, but occasionally hesitated when answering complex questions.",
  "non_verbal_communication_and_body_language": "Maintained good posture and eye contact, with slight fidgeting when asked tough questions.",
  "emotional_and_vocal_tone_analysis": "Tone was mostly confident and calm, though slightly nervous when discussing weaknesses."
}}
```


IMPORTANT NOTES:
   - Be specific and balanced — include both positive and negative points, if present.
   - Focus only on the candidate, even if others appear in the video.
   - Avoid assumptions — base observations only on visible and audible behavior.
   - Summarize the overall behavior and communication throughout the video (no need to reference timecodes).
   - The final output must strictly follow the JSON format shared above.
   - Keep your analysis concise, clear, and to the point — professional and useful for evaluation.

"""
# Fill the {transcript} placeholder. The template name is rebound to the
# rendered prompt, so the template definition must be re-run before formatting
# again (the rendered text contains unescaped JSON braces).
USER_PROMPT = USER_PROMPT.format(transcript=transcript)

In [106]:
def extract_json_from_markdown(markdown_string):
    """
    Extract and parse JSON from a markdown code block string.

    The payload is parsed as-is first, so valid JSON (including strings that
    contain escaped newlines) is never altered. Only if that fails is the
    legacy clean-up applied: literal backslash-n sequences become newlines and
    backslash-escaped single quotes are unescaped.

    Parameters:
    -----------
    markdown_string : str
        Text containing JSON, either bare or inside a fenced code block
        (with or without the "json" language tag), optionally surrounded by
        other text

    Returns:
    --------
    dict
        Parsed JSON as a Python dictionary

    Raises:
    -------
    json.JSONDecodeError
        If no parseable JSON payload can be recovered
    """
    text = markdown_string.strip()

    # Prefer a complete fenced block anywhere in the text, so prose before or
    # after the fence does not break parsing.
    fenced = re.search(
        r"```(?:json)?[ \t]*\r?\n(.*?)\r?\n[ \t]*```",
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if fenced:
        payload = fenced.group(1)
    else:
        # No complete fence: trim a lone opening/closing marker if present
        payload = re.sub(r"^```json\n", "", text)
        payload = re.sub(r"\n```$", "", payload)

    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        # Legacy tolerance for double-escaped replies. strict=False accepts the
        # raw newlines this substitution may leave inside string values.
        repaired = payload.replace("\\n", "\n").replace("\\'", "'")
        return json.loads(repaired, strict=False)

In [107]:
# Ask the model to analyze the uploaded video: the first content item
# references the uploaded file by URI, the second is the rendered text prompt.
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        types.Content(
            role="user",
            parts=[
                types.Part.from_uri(
                    file_uri=file_upload.uri, mime_type=file_upload.mime_type
                ),
            ],
        ),
        USER_PROMPT,
    ],
    config=types.GenerateContentConfig(
        system_instruction=SYSTEM_PROMPT,
        temperature=0.0,
    ),
)
# Parse the model's JSON reply into a dict; it is embedded in the scoring
# prompt further down.
video_and_audio_analysis_report = extract_json_from_markdown(response.text)
# Last expression of the cell: the notebook renders the raw reply as Markdown.
Markdown(response.text)

```json
{
  "verbal_communication": "The candidate generally speaks clearly and fluently, using appropriate vocabulary. Answers are well-structured, often beginning with a brief summary before elaborating. There are occasional hesitations, but overall, the candidate articulates thoughts effectively and demonstrates a good command of language.",
  "non_verbal_communication_and_body_language": "The candidate maintains consistent eye contact, creating a sense of engagement. Posture is generally upright and attentive. Hand gestures are used naturally to emphasize points, contributing to a confident demeanor. There is minimal fidgeting observed, suggesting a comfortable and composed presence.",
  "emotional_and_vocal_tone_analysis": "The candidate's vocal tone is calm and confident, conveying enthusiasm and sincerity. Modulation is appropriate, varying to emphasize key points and maintain listener engagement. A friendly and approachable demeanor is evident throughout the interview. While some nervousness may be present, it does not detract from the overall impression of confidence and professionalism."
}
```

## Score the Candidate


In [108]:
# System instruction for the scoring request.
SCORING_SYSTEM_PROMPT = """
You are a hiring manager and you need to score the candidate. 
You are also very experienced in hiring and you know what is expected from a candidate in an interview.
You are also given the transcript of the video and the video and audio analysis report.
"""
# Template for the scoring request, rendered with str.format. The placeholders
# are {transcript} and {video_and_audio_analysis_report}; literal JSON braces
# are doubled. Both JSON samples sit in properly closed code fences.
SCORING_USER_PROMPT = """
You are a hiring manager and need to evaluate and score the candidate based on their **online interview performance**.  
Your goal is to **fairly and professionally assess the candidate's communication, behavior, and suitability for the role**, using the **transcript and the video/audio analysis report**.  

---

### **Evaluation Criteria (Refined and Non-Overlapping):**

1. **Verbal Communication (Speech & Content Analysis)**  
   - Clarity, vocabulary, sentence structure, fluency, coherence, and ability to articulate thoughts and experiences.  

2. **Non-Verbal Communication and Body Language**  
   - Body posture, hand gestures, facial expressions, eye contact, fidgeting, and overall physical presence.  

3. **Emotional and Vocal Tone Analysis**  
   - Tone of voice, modulation, energy, confidence, enthusiasm, emotional expressions (e.g., nervousness, excitement).  

4. **Skills, Experience, and Professional Competence**  
   - Relevance of past experiences and skills, technical or role-specific competencies, and professional knowledge.  

5. **Motivation, Adaptability, and Professional Attitude**  
   - Motivation for the role, willingness to learn, adaptability, sense of responsibility, professionalism, and interpersonal skills.  

---

### **Instructions for Scoring:**

1. **Score each criterion between 0 and 10.**  
   - **10** = Excellent, **0** = Very Poor.  
2. Each score must be **a number between 0 and 10** (whole number or decimal).  
3. You must **clearly justify each score** based on observations from both the **transcript and video/audio analysis**.  
4. **Be objective, fair, and professional** — no assumptions, only observations.  
5. Focus **only on the candidate**, even if others are present.  
6. Review the **Scoring Guidelines** carefully before assigning scores.  

---

### **Scoring Guidelines (Reference Table):**

| **Score** | **Rating**          | **Description**                                     |
|-----------|---------------------|----------------------------------------------------|
| 10        | Excellent            | Outstanding, exceeds expectations with no issues.  |
| 8-9       | Above Average        | Strong performance with minor improvements needed. |
| 6-7       | Average              | Acceptable, balanced with noticeable pros and cons.|
| 4-5       | Below Average        | Noticeable weaknesses, needs significant work.     |
| 2-3       | Poor                 | Major flaws and insufficient demonstration.        |
| 0-1       | Very Poor            | Unacceptable, fails to demonstrate basic skills.   |

---

### **Response Format (JSON):**

```json
{{
  "verbal_communication_score": ["Reason for the score", 0-10],
  "non_verbal_communication_and_body_language_score": ["Reason for the score", 0-10],
  "emotional_and_vocal_tone_analysis_score": ["Reason for the score", 0-10],
  "skills_experience_professional_competence_score": ["Reason for the score", 0-10],
  "motivation_adaptability_professional_attitude_score": ["Reason for the score", 0-10]
}}
```

### **Example of a Proper Response (Sample Style):**

```json
{{
  "verbal_communication_score": ["The candidate spoke clearly and used structured sentences, but occasionally hesitated when answering complex questions.", 8],
  "non_verbal_communication_and_body_language_score": ["Maintained good posture and eye contact, with slight fidgeting when asked tough questions.", 7],
  "emotional_and_vocal_tone_analysis_score": ["Tone was mostly confident and calm, though slightly nervous when discussing weaknesses.", 7],
  "skills_experience_professional_competence_score": ["Demonstrated relevant experience and aligned skills but could have elaborated more on technical details.", 8],
  "motivation_adaptability_professional_attitude_score": ["Candidate appeared motivated and professional, expressed willingness to learn and adapt.", 9]
}}
```

Rate the candidate based on the following transcript and the video and audio analysis report.
<Transcript>
{transcript}
</Transcript>
<Video and Audio Analysis Report>
{video_and_audio_analysis_report}
</Video and Audio Analysis Report>

### Important Notes:
 - Review the transcript and video/audio analysis report carefully.
 - Ensure each score is fully justified based on clear observations.
 - Be detailed in your reasoning — highlight both strengths and weaknesses where relevant.
 - Maintain a neutral and professional tone.
 - Not necessary to mention information from transcripts and video/audio analysis report in the scores, give reason based on the information provided.
 - Remember: 0 is the lowest score (Very Poor), 10 is the highest (Excellent).
"""

# Fill the scoring template: the transcript is inserted as plain text and the
# analysis report dict is serialized back to indented JSON. The template name
# is rebound to the rendered prompt, so the template definition must be re-run
# before formatting again.
SCORING_USER_PROMPT = SCORING_USER_PROMPT.format(
    transcript=transcript,
    video_and_audio_analysis_report=json.dumps(
        video_and_audio_analysis_report, indent=2
    ),
)
# Show the final prompt for inspection before it is sent to the model.
print(SCORING_USER_PROMPT)


You are a hiring manager and need to evaluate and score the candidate based on their **online interview performance**.  
Your goal is to **fairly and professionally assess the candidate's communication, behavior, and suitability for the role**, using the **transcript and the video/audio analysis report**.  

---

### **Evaluation Criteria (Refined and Non-Overlapping):**

1. **Verbal Communication (Speech & Content Analysis)**  
   - Clarity, vocabulary, sentence structure, fluency, coherence, and ability to articulate thoughts and experiences.  

2. **Non-Verbal Communication and Body Language**  
   - Body posture, hand gestures, facial expressions, eye contact, fidgeting, and overall physical presence.  

3. **Emotional and Vocal Tone Analysis**  
   - Tone of voice, modulation, energy, confidence, enthusiasm, emotional expressions (e.g., nervousness, excitement).  

4. **Skills, Experience, and Professional Competence**  
   - Relevance of past experiences and skills, technical or

### Good Candidate

In [None]:
# Score the candidate: text-only request carrying the rendered scoring prompt
# (transcript + analysis report); temperature 0.0 is passed to the model.
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[SCORING_USER_PROMPT],
    config=types.GenerateContentConfig(
        system_instruction=SCORING_SYSTEM_PROMPT,
        temperature=0.0,
    ),
)
# Parse the model's JSON reply into a dict and print it.
score = extract_json_from_markdown(response.text)
print(score)
# Alternative display: render the raw reply as Markdown instead of the dict.
# Markdown(response.text)

```json
{
  "verbal_communication_score": ["The candidate demonstrates strong verbal communication skills, articulating thoughts clearly and using appropriate vocabulary. The responses are well-structured and coherent, indicating a good command of language.", 8],
  "non_verbal_communication_and_body_language_score": ["The candidate maintains consistent eye contact and exhibits good posture, contributing to a confident and engaged presence. Natural hand gestures are used to emphasize points, further enhancing communication.", 8],
  "emotional_and_vocal_tone_analysis_score": ["The candidate's vocal tone is calm and confident, conveying enthusiasm and sincerity. Modulation is appropriate, and the overall demeanor is friendly and approachable, creating a positive impression.", 9],
  "skills_experience_professional_competence_score": ["The candidate effectively highlights relevant experiences and skills, demonstrating professional competence. The examples provided showcase problem-solving abilities, leadership skills, and the capacity to persuade others.", 9],
  "motivation_adaptability_professional_attitude_score": ["The candidate expresses clear motivation for the role and demonstrates a willingness to learn and adapt. The questions asked at the end of the interview indicate a genuine interest in the company's future and the evolving consulting landscape. The candidate also shows a good level of self-awareness by acknowledging a weakness and outlining steps taken to improve.", 9]
}
```

### Bad Candidate

In [97]:
# Same scoring request as the other candidate cells.
# NOTE(review): presumably run after re-executing the notebook with a
# different VIDEO_PATH so that SCORING_USER_PROMPT describes that candidate —
# confirm.
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[SCORING_USER_PROMPT],
    config=types.GenerateContentConfig(
        system_instruction=SCORING_SYSTEM_PROMPT,
        temperature=0.0,
    ),
)

# Last expression of the cell: the notebook renders the raw reply as Markdown.
Markdown(response.text)

```json
{
  "verbal_communication_score": ["The candidate's communication is conversational but lacks professional polish, including slang and disjointed answers. The candidate also tends to answer questions directly without much elaboration unless prompted.", 4],
  "non_verbal_communication_and_body_language_score": ["The candidate's non-verbal communication is inconsistent, with fidgeting and restlessness. Posture varies, sometimes relaxed and other times tense, especially when discussing potentially controversial topics or when challenged.", 5],
  "emotional_and_vocal_tone_analysis_score": ["The candidate's vocal tone is enthusiastic and friendly, but can also come across as defensive or argumentative when disagreeing with the interviewer. Confidence levels fluctuate throughout the interview.", 5],
  "skills_experience_professional_competence_score": ["The candidate's claims about skills and experience, such as attending Stanford, are misleading. The candidate does not demonstrate the professional competence expected for the role.", 3],
  "motivation_adaptability_professional_attitude_score": ["The candidate shows eagerness but demonstrates a lack of professionalism and adaptability. The candidate is argumentative about the salary and work arrangements, and the candidate's comments about women are inappropriate.", 2]
}
```

### Average Candidate

In [81]:
# Same scoring request as the other candidate cells.
# NOTE(review): presumably run after re-executing the notebook with a
# different VIDEO_PATH so that SCORING_USER_PROMPT describes that candidate —
# confirm.
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[SCORING_USER_PROMPT],
    config=types.GenerateContentConfig(
        system_instruction=SCORING_SYSTEM_PROMPT,
        temperature=0.0,
    ),
)

# Last expression of the cell: the notebook renders the raw reply as Markdown.
Markdown(response.text)

```json
{
  "verbal_communication_score": ["The candidate's speech contains filler words and hesitations. The candidate also restates the question before answering. The vocabulary is adequate, but the candidate struggles to find the precise word, leading to pauses.", 6],
  "non_verbal_communication_and_body_language_score": ["The candidate maintains consistent eye contact, and her posture is generally upright. However, she touches her face and adjusts her hair, indicating nervousness.", 7],
  "emotional_and_vocal_tone_analysis_score": ["The candidate's vocal tone is calm and friendly, but her energy level is low, and her voice lacks modulation. There are signs of nervousness when answering challenging questions.", 6],
  "skills_experience_professional_competence_score": ["The candidate demonstrates some relevant experience in manual testing, but struggles with scenario-based questions and lacks knowledge of key concepts like defect cascading, authorization vs. authentication, and API testing tools.", 5],
  "motivation_adaptability_professional_attitude_score": ["The candidate expresses a willingness to learn and adapt, as evidenced by her seeking real-time scenario answers. She is also thankful for the opportunity. However, her responses to some questions indicate a lack of in-depth understanding and problem-solving skills.", 7]
}
```