<a href="https://colab.research.google.com/github/dhruvpathak23/Assignment/blob/main/Dhruv_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Call Quality Analyzer - Colab-ready Python notebook

# 1) Install required packages (Run in Colab)
!pip install -q yt-dlp pydub transformers torch torchvision torchaudio librosa soundfile openai-whisper numpy scipy scikit-learn

# Optional (pyannote) - only if you provide HF_TOKEN as an environment variable
!pip install -q pyannote.audio

# %%%
# 2) Imports
import os
import sys
import math
import tempfile
from pathlib import Path
from subprocess import run
import numpy as np
import librosa
import soundfile as sf
from pydub import AudioSegment
from transformers import pipeline
import whisper
import time
import csv
import re

# %%%
# 3) Helper: download youtube audio and convert to mono WAV 16k
YOUTUBE_URL = 'https://www.youtube.com/watch?v=4ostqJD3Psc'  # test file required by assignment
OUT_WAV = 'call.wav'

def download_audio(youtube_url, out_wav=OUT_WAV):
    tmpfile = 'audio_temp.m4a'
    run(['yt-dlp', '-x', '--audio-format', 'm4a', '-o', tmpfile, youtube_url], check=False)
    audio = AudioSegment.from_file(tmpfile)
    audio = audio.set_frame_rate(16000).set_channels(1)
    audio.export(out_wav, format='wav')
    return out_wav

# %%%
# 4) Voice activity detection (simple energy-based fallback)
def get_speech_timestamps_energy(wav_path, frame_duration_ms=30, agg_energy_threshold=0.0005):
    y, sr = librosa.load(wav_path, sr=16000)
    frame_len = int(sr * (frame_duration_ms/1000.0))
    frames = [y[i:i+frame_len] for i in range(0, len(y), frame_len)]
    timestamps = []
    is_speech = False
    start = 0
    for i, f in enumerate(frames):
        energy = np.mean(f**2)
        if energy > agg_energy_threshold and not is_speech:
            is_speech = True
            start = i*frame_duration_ms/1000.0
        if energy <= agg_energy_threshold and is_speech:
            end = i*frame_duration_ms/1000.0
            timestamps.append((start, end))
            is_speech = False
    if is_speech:
        timestamps.append((start, len(y)/sr))
    return timestamps

# %%%
# 5) Load whisper model
MODEL = 'tiny'  # tiny model for speed
print('Loading Whisper model...')
start = time.time()
wh = whisper.load_model(MODEL)
print('Loaded in', time.time()-start, 's')

# %%%
# 6) High-level pipeline function
def analyze_call(wav_path):
    vad_segs = get_speech_timestamps_energy(wav_path)
    result = wh.transcribe(wav_path, word_timestamps=False)
    segments = result.get('segments', [])

    # Alternate speakers
    turns = []
    speaker = 'A'
    last_end = None
    for s in segments:
        st, ed = s['start'], s['end']
        txt = s['text'].strip()
        if last_end is not None and (st - last_end) > 0.8:
            speaker = 'B' if speaker=='A' else 'A'
        turns.append((speaker, st, ed, txt))
        last_end = ed

    # Talk-time ratio (improved: total words per speaker weighted by duration)
    dur_A = sum(ed-st for sp,st,ed,tx in turns if sp=='A')
    dur_B = sum(ed-st for sp,st,ed,tx in turns if sp=='B')
    words_A = sum(len(tx.split()) for sp,st,ed,tx in turns if sp=='A')
    words_B = sum(len(tx.split()) for sp,st,ed,tx in turns if sp=='B')
    total_time = dur_A + dur_B if (dur_A+dur_B)>0 else 1e-6
    talk_ratio = {
        'A': (0.6*(dur_A/total_time) + 0.4*(words_A/(words_A+words_B+1e-6))),
        'B': (0.6*(dur_B/total_time) + 0.4*(words_B/(words_A+words_B+1e-6)))
    }

    # Improved question detection
    num_questions = 0
    question_pattern = re.compile(r'(\?|\b(what|why|how|when|where|who|which|can|could|would|should|do|did|does|is|are|am)\b)', re.IGNORECASE)
    for sp,st,ed,txt in turns:
        if question_pattern.search(txt):
            num_questions += 1

    # Longest monologue (based on continuous speech duration)
    longest_mono = 0.0
    cur_speaker, cur_start, cur_end = None, None, None
    for sp,st,ed,tx in turns:
        if sp != cur_speaker:
            if cur_speaker is not None:
                longest_mono = max(longest_mono, cur_end - cur_start)
            cur_speaker, cur_start, cur_end = sp, st, ed
        else:
            cur_end = ed
    if cur_speaker is not None:
        longest_mono = max(longest_mono, cur_end - cur_start)

    # Sentiment (averaged across turns for robustness)
    sentiment_pipe = pipeline('sentiment-analysis')
    sentiments = [sentiment_pipe(tx[:512])[0]['label'] for sp,st,ed,tx in turns if tx]
    pos = sentiments.count('POSITIVE')
    neg = sentiments.count('NEGATIVE')
    neu = len(sentiments) - pos - neg
    if pos >= max(neg, neu):
        call_sentiment = 'POSITIVE'
    elif neg >= max(pos, neu):
        call_sentiment = 'NEGATIVE'
    else:
        call_sentiment = 'NEUTRAL'

    # Actionable insight (contextual)
    if talk_ratio['A'] > 0.7 or talk_ratio['B'] > 0.7:
        dominant = 'A' if talk_ratio['A']>talk_ratio['B'] else 'B'
        insight = f"Speaker {dominant} is dominating (~{talk_ratio[dominant]*100:.1f}%). Encourage the other speaker to participate more."
    elif num_questions < 3:
        insight = "Few questions were asked. Recommend using more open-ended questions to engage the other party."
    elif call_sentiment == 'NEGATIVE':
        insight = "Overall negative sentiment detected. Address objections proactively and ensure clarity of next steps."
    else:
        insight = "Balanced interaction with healthy engagement. Consider summarizing agreed points and next steps at the end."

    return {
        'talk_time_ratio': talk_ratio,
        'num_questions': num_questions,
        'longest_monologue_s': longest_mono,
        'call_sentiment': call_sentiment,
        'insight': insight,
        'turns': turns
    }

# %%%
# 7) Run end-to-end
print('Downloading audio...')
download_audio(YOUTUBE_URL)
print('Analyzing...')
start = time.time()
res = analyze_call(OUT_WAV)
print('Done in', time.time()-start, 's')

# %%%
# 8) Print results
print('\n--- Results ---')
print('Talk-time ratio (A/B):', res['talk_time_ratio'])
print('Number of questions:', res['num_questions'])
print('Longest monologue (s):', res['longest_monologue_s'])
print('Call sentiment:', res['call_sentiment'])
print('Actionable insight:', res['insight'])

with open('turns.csv','w',newline='',encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerow(['speaker','start','end','text'])
    for sp,st,ed,tx in res['turns']:
        w.writerow([sp,st,ed,tx])
print('Saved turns.csv')

# %%%
# Notes
# - For robust speaker-diarization, integrate pyannote.audio
# - For more accurate ASR, use faster-whisper or OpenAI API
# - Tiny Whisper is chosen for speed to stay <30s

# End of notebook

Loading Whisper model...
Loaded in 0.9832863807678223 s
Downloading audio...
Analyzing...


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Done in 15.300451517105103 s

--- Results ---
Talk-time ratio (A/B): {'A': 0.9999999988130563, 'B': 0.0}
Number of questions: 16
Longest monologue (s): 118.08
Call sentiment: POSITIVE
Actionable insight: Speaker A is dominating (~100.0%). Encourage the other speaker to participate more.
Saved turns.csv
