## Imports

In [4]:
import json
from typing import List, Dict, Any
from google import genai
from google.genai import types
from IPython.display import Markdown
import requests

# Prepare the file to be uploaded
import pathlib
import time
from moviepy import VideoFileClip
import azure.cognitiveservices.speech as speechsdk
import re
import os
import dotenv

dotenv.load_dotenv()

True

In [5]:
VIDEO_PATH = "interview_negative.mp4"
AUDIO_PATH = "interview_negative_audio.wav"

## Intialize the Gemini

In [6]:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
MODEL_ID = "gemini-2.0-flash-exp"  # @param ["gemini-1.5-flash-8b","gemini-1.5-flash-002","gemini-1.5-pro-002","gemini-2.0-flash-exp"] {"allow-input":true}
client = genai.Client(api_key=GEMINI_API_KEY)

## Extract Audio From Video

In [7]:
# Load the video file
video_clip = VideoFileClip(VIDEO_PATH)
audio_clip = video_clip.audio
audio_clip.write_audiofile(AUDIO_PATH)
# Close the clips
audio_clip.close()
video_clip.close()

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso6iso2avc1mp41', 'encoder': 'Lavf60.16.100'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1280, 720], 'bitrate': 185, 'fps': 30.0, 'codec_name': 'h264', 'profile': '(Main)', 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 44100, 'bitrate': 127, 'metadata': {'Metadata': '', 'handler_name': 'ISO Media file produced by Google Inc.', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 961.59, 'bitrate': 317, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile': '(Main)', 'video_size': [1280, 720], 'video_bitrate': 185,

                                                                        

MoviePy - Done.


# Transcribe the audio file using azure

In [8]:
def ms_to_time_format(milliseconds: int) -> str:
    """
    Convert milliseconds to a readable time format (HH:MM:SS or MM:SS).

    Parameters:
    -----------
    milliseconds : int
        Time in milliseconds

    Returns:
    --------
    str
        Formatted time string (HH:MM:SS or MM:SS)
    """
    # Calculate total seconds
    total_seconds = milliseconds / 1000

    # Extract hours, minutes, and seconds
    hours = int(total_seconds // 3600)
    minutes = int((total_seconds % 3600) // 60)
    seconds = int(total_seconds % 60)

    # Format the time string
    if hours > 0:
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
    else:
        return f"{minutes:02d}:{seconds:02d}"


def create_conversation_transcript(
    transcription_json: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """
    Creates a transcript of a multi-turn conversation, preserving the turn-taking structure.

    Parameters:
    -----------
    transcription_json : Dict[str, Any]
        The JSON result from Azure Speech Service transcription

    Returns:
    --------
    List[Dict[str, Any]]
        List of conversation turns, each containing speaker information and text
    """
    if "phrases" not in transcription_json:
        print("Error: No phrases found in the transcription JSON")
        return []

    conversation = []
    current_speaker = None
    current_turn = None

    # Sort phrases by their timestamp to ensure chronological order
    phrases = sorted(
        transcription_json["phrases"], key=lambda x: x["offsetMilliseconds"]
    )

    for phrase in phrases:
        if "speaker" not in phrase:
            continue

        speaker_id = phrase["speaker"]
        start_time_ms = phrase["offsetMilliseconds"]
        end_time_ms = start_time_ms + phrase["durationMilliseconds"]

        # If this is a new speaker or the first phrase
        if speaker_id != current_speaker:
            # Save the previous turn if it exists
            if current_turn is not None:
                # Join the text into a single string
                current_turn["text"] = " ".join(current_turn["text"])
                conversation.append(current_turn)

            # Start a new turn
            current_turn = {
                "speaker_id": speaker_id,
                "start_time_ms": start_time_ms,
                "end_time_ms": end_time_ms,
                "start_time": ms_to_time_format(start_time_ms),
                "end_time": ms_to_time_format(end_time_ms),
                "text": [phrase["text"]],
                "confidence": phrase.get("confidence", 0),
            }
            current_speaker = speaker_id
        else:
            # Continue the current turn
            current_turn["text"].append(phrase["text"])
            current_turn["end_time_ms"] = end_time_ms
            current_turn["end_time"] = ms_to_time_format(end_time_ms)

            # Update confidence (use average)
            if "confidence" in phrase:
                current_turn["confidence"] = (
                    current_turn["confidence"] + phrase["confidence"]
                ) / 2

    # Add the last turn
    if current_turn is not None:
        # Join the text into a single string
        current_turn["text"] = " ".join(current_turn["text"])
        conversation.append(current_turn)

    return conversation


def format_transcript_as_string(transcript: List[Dict[str, Any]]) -> str:
    """
    Format the conversation transcript as a human-readable string.

    Parameters:
    -----------
    transcript : List[Dict[str, Any]]
        The conversation transcript generated by create_conversation_transcript

    Returns:
    --------
    str
        A formatted string representation of the transcript
    """
    formatted_transcript = []

    for turn in transcript:
        speaker_id = turn["speaker_id"]
        text = turn["text"]
        formatted_line = f'speaker {speaker_id}: "{text}"'
        formatted_transcript.append(formatted_line)

    # Join all lines with newlines
    return "\n".join(formatted_transcript)


def process_transcript_file(transcription_result: Dict, output_file: str) -> None:
    """
    Process a transcription file and create a formatted conversation transcript.

    Parameters:
    -----------
    input_file : str
        The input transcription JSON file
    output_file : str
        The output conversation transcript JSON file
    """
    # Load the transcription result
    # with open(input_file, "r") as json_file:
    #    transcription_result = json.load(json_file)

    # Create the conversation transcript
    transcript = create_conversation_transcript(transcription_result)

    # Print the transcript
    # print("Conversation Transcript:")
    # for i, turn in enumerate(transcript):
    #    print(f"\nTurn {i+1} - Speaker {turn['speaker_id']}:")
    #    print(f"Time: {turn['start_time']} - {turn['end_time']}")
    #    if "confidence" in turn:
    #        print(f"Confidence: {turn['confidence']:.4f}")

    # create the string transcript
    transcript_string = format_transcript_as_string(transcript)
    # Save the transcript to a file
    with open(output_file, "w") as json_file:
        json.dump(transcript, json_file, indent=4)

    print(f"\nConversation transcript saved to {output_file}")
    return transcript_string

In [9]:
def transcribe_audio_with_diarization(
    service_region,
    subscription_key,
    audio_file_path,
    locales=["en-US"],
    max_speakers=2,
    save_to_file=None,
    api_version="2024-11-15",
):
    """
    Transcribe an audio file with speaker diarization using Azure Speech Service.

    Parameters:
    -----------
    service_region : str
        Azure Speech service region (e.g., 'westus', 'eastus', 'southeastasia')
    subscription_key : str
        Azure Speech subscription key
    audio_file_path : str
        Path to the audio file to transcribe
    locales : list
        List of language locales (default: ["en-US"])
    max_speakers : int
        Maximum number of speakers for diarization (default: 2)
    save_to_file : str or None
        Path to save the transcription results as JSON (default: None)
    api_version : str
        API version to use (default: "2024-11-15")

    Returns:
    --------
    dict
        The transcription result from Azure Speech Service
    """
    # API endpoint URL
    url = f"https://{service_region}.api.cognitive.microsoft.com/speechtotext/transcriptions:transcribe?api-version={api_version}"

    # Request headers
    headers = {"Ocp-Apim-Subscription-Key": subscription_key}

    # JSON definition for transcription parameters as a string
    definition_str = json.dumps(
        {
            "locales": locales,
            "diarization": {"maxSpeakers": max_speakers, "enabled": True},
        }
    )

    try:
        # Open the audio file
        with open(audio_file_path, "rb") as audio_file:
            # Prepare the multipart form data
            files = {"audio": audio_file, "definition": (None, definition_str)}

            # Make the POST request
            print(
                f"Sending request to Azure Speech Service to transcribe {audio_file_path}..."
            )
            response = requests.post(url, headers=headers, files=files)

        # Check the response
        if response.status_code == 200:
            print("Transcription successful")
            transcription_result = response.json()

            # Save the full response to a JSON file if requested
            if save_to_file:
                transcript_string = process_transcript_file(
                    transcription_result, save_to_file
                )

            return transcript_string
        else:
            print(f"Error: {response.status_code}")
            print(response.text)
            return None

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

In [10]:
# Replace these with your actual values
service_region = "southeastasia"
subscription_key = os.getenv("AZURE_SUBSCRIPTION_KEY")
transcript = transcribe_audio_with_diarization(
    service_region=service_region,
    subscription_key=subscription_key,
    audio_file_path=AUDIO_PATH,
    save_to_file="transcription_result.json",
)
print(transcript)

Sending request to Azure Speech Service to transcribe interview_negative_audio.wav...
Transcription successful

Conversation transcript saved to transcription_result.json
speaker 1: "No, did you notice I'm wearing blue?"
speaker 2: "Because I saw your company has blue logo, so I'm what the blue?"
speaker 1: "I mean, that's a nice shirt, but actually our logo's red, just for information."
speaker 2: "It's red."
speaker 1: "Yeah, but I like it. I mean, it's a great shirt."
speaker 2: "Oh, wait. Oh, no, it's another company. Sorry, I mixed it up."
speaker 1: "Gotcha, gotcha. I understand."
speaker 2: "I have a red sweater even. Should I go?"
speaker 1: "No, no, no, not necessary. It's not a problem. We're just getting to know each other today. Before I get going, I just want you to know there are no right or wrong answers. I just want you to answer honestly, okay? And one more thing, just to confirm, you are clear on the pay to pay a 60,000 a year with a 10% yearly bonus. Can you confirm 

## Perform Body language Analysis


In [11]:
# Upload the file using the API
file_upload = client.files.upload(file=pathlib.Path(VIDEO_PATH))

# Prepare the file to be uploaded
while file_upload.state == "PROCESSING":
    print("Waiting for video to be processed.")
    time.sleep(10)
    file_upload = client.files.get(name=file_upload.name)

if file_upload.state == "FAILED":
    raise ValueError(file_upload.state)
print(f"Video processing complete: " + file_upload.uri)

Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Waiting for video to be processed.
Video processing complete: https://generativelanguage.googleapis.com/v1beta/files/wddukrot37kx


In [12]:
SYSTEM_PROMPT = """When given a video and a query, call the relevant function only once with 
the appropriate timecodes and text for the video"""

USER_PROMPT = """
You are tasked with analyzing an **online interview video** of a candidate. **Your goal is to analyze the candidate's behavior and communication** based on the provided video transcript.

Here is the **transcript of the video**:  
<Transcript>  
{transcript}  
</Transcript>  

### Instructions:  
1. The video may contain **both the candidate and the interviewer**.  
2. **Only analyze the candidate** — ignore the interviewer.  
3. Analyze the **candidate's body language, expressions, tone of voice, speech speed, emotions, and non-verbal communication**.  
4. If both **positive and negative behaviors** are present, mention both clearly.  
5. Your analysis should be **neutral, detailed, and professional** — without making assumptions beyond observable behaviors.  

---

### Metrics you MUST analyze (explanation and examples):  

1. **Speech and Language Analysis**:  
   - **Tone of voice, speech speed, word choice, clarity, structure of sentences**.  
   - **Positive example**: "The candidate speaks clearly, uses appropriate vocabulary, and maintains a good pace."  
   - **Negative example**: "The candidate speaks too fast and uses informal words that are inappropriate for an interview."  

2. **Non-Verbal Communication**:  
   - **Body language, posture, hand movements, facial expressions, eye contact**.  
   - **Positive example**: "The candidate maintains good eye contact and uses open hand gestures."  
   - **Negative example**: "The candidate avoids eye contact and frequently looks down, which may indicate nervousness."  

3. **Emotions Expressed by the Candidate**:  
   - **Visible emotions like confidence, nervousness, excitement, frustration, boredom**.  
   - **Positive example**: "The candidate smiles genuinely and seems enthusiastic about the role."  
   - **Negative example**: "The candidate appears anxious, with a tense face and trembling hands."  

4. **Tone of Voice of the Candidate**:  
   - **Tone, energy, modulation, friendliness, confidence**.  
   - **Positive example**: "The candidate has a calm and confident tone."  
   - **Negative example**: "The candidate's tone is monotonous and lacks enthusiasm."  

5. **Body Language Analysis** (detailed):  
   - **Posture, movements, distance from camera, nervous gestures**.  
   - **Positive example**: "The candidate sits upright and leans slightly forward to show interest."  
   - **Negative example**: "The candidate slouches and fidgets with their hands, showing discomfort."  

---

### Final Report Format (JSON):  

```json
{{
  "body_language_analysis": "Detailed observation here.",
  "emotions_expressed_by_the_person_in_the_video": "Detailed observation here.",
  "non_verbal_communication": "Detailed observation here.",
  "speech_and_language_analysis": "Detailed observation here.",
  "tone_of_voice_of_the_candidate": "Detailed observation here."
}}

### IMPORTANT NOTES:
  - Be specific and balanced — mention both positive and negative points, if present.
  - Identify and focus only on the candidate, even if the interviewer is present.
  - Avoid making assumptions — focus on observable facts.
  - Your final output must strictly follow the JSON format shared above.
  - It should be concise and to the point.
  - The report doens't need to include what happened at each timecode.It should be a summary of the entire video.
"""
USER_PROMPT = USER_PROMPT.format(transcript=transcript)

In [13]:
def extract_json_from_markdown(markdown_string):
    """
    Extract and parse JSON from a markdown code block string.

    Parameters:
    -----------
    markdown_string : str
        String containing a markdown code block with JSON

    Returns:
    --------
    dict
        Parsed JSON as a Python dictionary
    """
    # Remove markdown code block indicators and newlines
    json_string = markdown_string.strip()
    json_string = re.sub(r"^```json\n", "", json_string)
    json_string = re.sub(r"\n```$", "", json_string)

    # Replace escaped newlines with actual newlines
    json_string = json_string.replace("\\n", "\n")

    # Replace escaped quotes with actual quotes
    json_string = json_string.replace("\\'", "'")

    # Parse the JSON string into a Python dictionary
    json_obj = json.loads(json_string)

    return json_obj

In [14]:
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[
        types.Content(
            role="user",
            parts=[
                types.Part.from_uri(
                    file_uri=file_upload.uri, mime_type=file_upload.mime_type
                ),
            ],
        ),
        USER_PROMPT,
    ],
    config=types.GenerateContentConfig(
        system_instruction=SYSTEM_PROMPT,
        temperature=0.0,
    ),
)
video_and_audio_analysis_report = extract_json_from_markdown(response.text)
Markdown(response.text)

```json
{
  "body_language_analysis": "The candidate's body language is generally relaxed, with occasional hand gestures for emphasis. However, there are instances where he leans back or looks away, which could indicate disengagement or discomfort. Towards the end of the interview, he touches his face and hair more frequently, suggesting increasing nervousness or impatience.",
  "emotions_expressed_by_the_person_in_the_video": "The candidate expresses a range of emotions, including enthusiasm, humor, and occasional frustration. He seems confident initially but becomes more defensive and agitated when discussing salary and the company's in-office mandate. There are moments of visible impatience and skepticism, particularly when the interviewer explains the hiring process.",
  "non_verbal_communication": "The candidate uses hand gestures to emphasize points and maintain engagement. Eye contact is generally good, but there are instances where he looks away, especially when discussing sensitive topics like salary or his 'weakness.' His posture shifts throughout the interview, sometimes leaning forward to show interest and other times leaning back, potentially indicating disagreement or skepticism.",
  "speech_and_language_analysis": "The candidate's speech is generally clear, but there are moments where he speaks rapidly, especially when excited or defensive. His word choice is sometimes informal and includes slang, which may not be appropriate for a professional interview. He occasionally interrupts the interviewer and challenges their statements directly. The structure of his sentences is generally coherent, but he sometimes goes off on tangents.",
  "tone_of_voice_of_the_candidate": "The candidate's tone is initially friendly and enthusiastic, but it becomes more assertive and challenging as the interview progresses. His tone fluctuates between confident and defensive, particularly when discussing compensation and the company's policies. There are moments where his tone becomes sarcastic or dismissive, especially when questioning the interviewer's statements or procedures."
}
```

## Score the Candidate


In [15]:
SCORING_SYSTEM_PROMPT = """
You are a hiring manager and you need to score the candidate. 
You are also very experienced in hiring and you know what is expected from a candidate in an interview.
You are also given the transcript of the video and the video and audio analysis report.
"""
SCORING_USER_PROMPT = """
You are a hiring manager and you need to score the candidate
based on the following criteria:
Score the candidate based on the following criteria:
1. Body language
2. Non-verbal communication
3. Emotions expressed by the candidate
4. Tone of voice of the candidate
5. Speech and language analysis
6. Skills and experience
7. Interest, motivation, adaptability, responsibility
9. Professionalism, interpersonal.

<Instructions>
1. The score should be between 0 and 5.
2. The score should be a number.
3. The score should be based on the candidate's transcript and the Video and Audio analysis report.
4. Make sure to give the score for each of the criteria.
5. 5 is the highest score and 0 is the lowest score.
6. You should be non-biased and fair while scoring the candidate.
7. You should be able to justify the score for each of the criteria.
8. You should be professional while scoring the candidate.
</Instructions>
<Scoring Criteria>
1. 10 - Excellent
2. >= 8 and < 10 - Above Average
3. >= 6 and < 8 - Average
4. >= 4 and < 6 - Below Average
5. >= 2 and < 4 - Poor
6. < 2 - Very Poor
</Scoring Criteria>
<Response Format>
1. You should return a JSON object with the following format:
{{
  "body_language_score": [Reason for the score,0-10],
  "non_verbal_communication_score": [Reason for the score,0-10],
  "emotions_expressed_by_the_person_in_the_video_score": [Reason for the score,0-10],
  "tone_of_voice_of_the_candidate_score": [Reason for the score,0-10],
  "speech_and_language_analysis_score": [Reason for the score,0-10],
  "skills_and_experience_score": [Reason for the score,0-10],
  "interest_motivation_adaptability_responsibility_score": [Reason for the score,0-10],
  "professionalism_interpersonal": [Reason for the score,0-10]
}}
</Response Format>
Rate the candidate based on the following transcript and the video and audio analysis report.
<Transcript>
{transcript}
</Transcript>
<Video and Audio Analysis Report>
{video_and_audio_analysis_report}
</Video and Audio Analysis Report>

### Important Notes:
1. Review the transcript and the video and audio analysis report carefully.
2. Review Scoring Criteria carefully.
"""

SCORING_USER_PROMPT = SCORING_USER_PROMPT.format(
    transcript=transcript,
    video_and_audio_analysis_report=json.dumps(
        video_and_audio_analysis_report, indent=2
    ),
)
print(SCORING_USER_PROMPT)


You are a hiring manager and you need to score the candidate
based on the following criteria:
Score the candidate based on the following criteria:
1. Body language
2. Non-verbal communication
3. Emotions expressed by the candidate
4. Tone of voice of the candidate
5. Speech and language analysis
6. Skills and experience
7. Interest, motivation, adaptability, responsibility
9. Professionalism, interpersonal.

<Instructions>
1. The score should be between 0 and 5.
2. The score should be a number.
3. The score should be based on the candidate's transcript and the Video and Audio analysis report.
4. Make sure to give the score for each of the criteria.
5. 5 is the highest score and 0 is the lowest score.
6. You should be non-biased and fair while scoring the candidate.
7. You should be able to justify the score for each of the criteria.
8. You should be professional while scoring the candidate.
</Instructions>
<Scoring Criteria>
1. 10 - Excellent
2. >= 8 and < 10 - Above Average
3. >= 6 a

In [16]:
response = client.models.generate_content(
    model=MODEL_ID,
    contents=[SCORING_USER_PROMPT],
    config=types.GenerateContentConfig(
        system_instruction=SCORING_SYSTEM_PROMPT,
        temperature=0.0,
    ),
)

Markdown(response.text)

```json
{
  "body_language_score": ["The candidate's body language is generally relaxed but shows signs of disengagement and nervousness later in the interview. Occasional hand gestures are used, but leaning back and increased face-touching suggest discomfort. Therefore, the candidate gets a below-average score.", 4],
  "non_verbal_communication_score": ["The candidate uses hand gestures and maintains generally good eye contact, but looking away during sensitive topics indicates discomfort or skepticism. Posture shifts also reflect varying levels of engagement and disagreement. Therefore, the candidate gets a below-average score.", 4],
  "emotions_expressed_by_the_person_in_the_video_score": ["The candidate expresses a wide range of emotions, including enthusiasm, humor, frustration, and defensiveness. The emotional range is broad, but the negative emotions expressed, especially regarding salary and office mandates, detract from the overall impression. Therefore, the candidate gets an average score.", 6],
  "tone_of_voice_of_the_candidate_score": ["The candidate's tone shifts from friendly and enthusiastic to assertive, challenging, and sometimes sarcastic. This inconsistency and the presence of negative tones impact the overall impression. Therefore, the candidate gets a below-average score.", 4],
  "speech_and_language_analysis_score": ["The candidate's speech is generally clear but includes rapid speech, informal word choices, and occasional interruptions. Tangents and challenges to the interviewer's statements further detract from the professionalism. Therefore, the candidate gets a below-average score.", 4],
  "skills_and_experience_score": ["Based on the transcript, the candidate's skills and experience are not clearly articulated. The 'Stanford' claim is misleading, and there's no concrete discussion of relevant skills. The focus is more on personal needs and demands than on what the candidate can offer. Therefore, the candidate gets a poor score.", 3],
  "interest_motivation_adaptability_responsibility_score": ["The candidate expresses interest but does so in a demanding and inflexible manner. There's a lack of adaptability regarding the in-office mandate and salary. Responsibility is questionable, given the 'weakness' explanation and the overall tone. Therefore, the candidate gets a poor score.", 2],
  "professionalism_interpersonal": ["The candidate demonstrates a lack of professionalism through inappropriate comments, demanding behavior, and challenges to the interviewer. Interpersonal skills are poor, as evidenced by the inability to engage in a constructive conversation. Therefore, the candidate gets a very poor score.", 1]
}
```