In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/amanaman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
import speech_recognition as sr
import librosa
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

# Make sure you have downloaded NLTK punkt (and punkt_tab if needed):
# nltk.download('punkt')
# nltk.download('punkt_tab')  # If NLTK complains about 'punkt_tab' resource.

def transcribe_audio(audio_file_path):
    """
    Transcribe audio using the SpeechRecognition library.
    Returns the recognized text as a string.
    """
    print(f"[DEBUG] Starting transcription for: {audio_file_path}")
    recognizer = sr.Recognizer()

    with sr.AudioFile(audio_file_path) as source:
        audio_data = recognizer.record(source)
    
    try:
        # Using the default Google Web Speech API (requires internet).
        recognized_text = recognizer.recognize_google(audio_data)
        print(f"[DEBUG] Transcription result: {recognized_text[:100]}...")  # Print first 100 chars
        return recognized_text
    except sr.UnknownValueError:
        print("[ERROR] SpeechRecognition could not understand the audio.")
        return ""
    except sr.RequestError as e:
        print(f"[ERROR] Could not request results from Google Speech Recognition service; {e}")
        return ""


def calculate_word_accuracy_rate(recognized_text, reference_text):
    """
    Calculate Word Accuracy Rate (WAR).
    """
    print("[DEBUG] Calculating Word Accuracy Rate (WAR).")
    recognized_words = word_tokenize(recognized_text.lower())
    reference_words = word_tokenize(reference_text.lower())

    total_words = len(reference_words)
    if total_words == 0:
        print("[WARNING] Reference text has 0 words.")
        return 0.0

    correct_count = 0
    for ref_word, rec_word in zip(reference_words, recognized_words):
        if ref_word == rec_word:
            correct_count += 1
    
    war = (correct_count / total_words) * 100
    print(f"[DEBUG] Correct word count: {correct_count}/{total_words}, WAR: {war:.2f}%")
    return war


def measure_speech_rate(recognized_text, audio_duration):
    """
    Estimate speech rate in Words Per Minute (WPM).
    """
    print("[DEBUG] Measuring speech rate.")
    words = word_tokenize(recognized_text)
    num_words = len(words)
    if audio_duration == 0:
        print("[WARNING] Audio duration is 0. Returning 0 WPM.")
        return 0.0
    
    wpm = (num_words / audio_duration) * 60
    print(f"[DEBUG] Number of words: {num_words}, Duration: {audio_duration:.2f}s, WPM: {wpm:.2f}")
    return wpm


def analyze_pitch_and_intonation(audio_file_path, sr_lib=22050):
    """
    Analyzes pitch (F0) and basic intonation using librosa.
    Returns average pitch, pitch variability, and pitch range.
    """
    print("[DEBUG] Analyzing pitch and intonation.")
    y, sr_actual = librosa.load(audio_file_path, sr=sr_lib)
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr_actual)
    
    pitch_values = []
    for i in range(pitches.shape[1]):
        pitch_slice = pitches[:, i]
        mag_slice = magnitudes[:, i]
        index = np.argmax(mag_slice)
        pitch_val = pitch_slice[index]
        if pitch_val > 0:  # 0 indicates no pitch
            pitch_values.append(pitch_val)
    
    if len(pitch_values) == 0:
        print("[WARNING] No pitch values detected.")
        return {
            "average_pitch": 0.0,
            "pitch_variability": 0.0,
            "pitch_range": 0.0
        }
    
    avg_pitch = np.mean(pitch_values)
    pitch_variability = np.std(pitch_values)
    pitch_range = np.max(pitch_values) - np.min(pitch_values)

    print(f"[DEBUG] Pitch analysis -> Avg: {avg_pitch:.2f}, Std Dev: {pitch_variability:.2f}, Range: {pitch_range:.2f}")
    return {
        "average_pitch": float(avg_pitch),
        "pitch_variability": float(pitch_variability),
        "pitch_range": float(pitch_range)
    }


def measure_signal_quality(audio_file_path, sr_lib=22050):
    """
    A simplified measure of signal quality by estimating SNR using RMS.
    """
    print("[DEBUG] Measuring signal (audio) quality.")
    y, sr_actual = librosa.load(audio_file_path, sr=sr_lib)
    
    rms = librosa.feature.rms(y=y)[0]
    avg_rms = np.mean(rms)

    sorted_rms = np.sort(rms)
    cutoff = int(0.1 * len(sorted_rms))
    if cutoff == 0:
        cutoff = 1
    low_energy_frames = sorted_rms[:cutoff]
    noise_floor = np.mean(low_energy_frames)
    
    if noise_floor == 0:
        noise_floor = 1e-10  # avoid division by zero

    signal_noise_ratio = avg_rms / noise_floor
    
    print(f"[DEBUG] avg_rms: {avg_rms:.6f}, noise_floor: {noise_floor:.6f}, SNR: {signal_noise_ratio:.2f}")
    return {
        "average_rms": float(avg_rms),
        "noise_floor": float(noise_floor),
        "signal_to_noise_ratio": float(signal_noise_ratio)
    }


def measure_volume_consistency(audio_file_path, sr_lib=22050, frame_length=1024, hop_length=512):
    """
    Measures volume consistency by computing the std deviation of RMS across frames.
    """
    print("[DEBUG] Measuring volume consistency.")
    y, sr_actual = librosa.load(audio_file_path, sr=sr_lib)
    rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
    std_rms = np.std(rms)
    print(f"[DEBUG] Volume std dev (RMS): {std_rms:.6f}")
    return {
        "volume_std_dev": float(std_rms)
    }


def analyze_comprehension_and_pausing(recognized_text, reference_text):
    """
    Very naive placeholder for comprehension/pausing analysis.
    """
    print("[DEBUG] Analyzing comprehension and pausing (naive approach).")
    ref_length = len(reference_text)
    rec_length = len(recognized_text)
    
    if ref_length == 0:
        print("[WARNING] Reference text has length 0. Cannot compute ratio.")
        return {
            "length_ratio": 0.0,
            "comprehension_score": 0.0
        }
    
    length_ratio = rec_length / ref_length
    # We'll define a 'comprehension_score' as how close this ratio is to 1.0
    comprehension_score = 1.0 - abs(1.0 - length_ratio)

    print(f"[DEBUG] Length ratio: {length_ratio:.2f}, Comprehension score: {comprehension_score:.2f}")
    return {
        "length_ratio": float(length_ratio),
        "comprehension_score": float(comprehension_score)
    }


def detect_emotion_heuristic(pitch_analysis):
    """
    A trivial emotion heuristic based on pitch variability.
    """
    print("[DEBUG] Detecting emotion (heuristic).")
    pitch_var = pitch_analysis["pitch_variability"]
    
    if pitch_var < 10:
        emotion = "Monotone"
    elif 10 <= pitch_var < 30:
        emotion = "Neutral"
    else:
        emotion = "Expressive"
    
    print(f"[DEBUG] Pitch variability: {pitch_var:.2f}, Emotion label: {emotion}")
    return emotion


def generate_report(audio_file_path, reference_text):
    """
    Main function that calls all the analysis functions and compiles a report.
    """
    print("[DEBUG] Starting analysis and report generation.")
    
    # 1. Transcription
    recognized_text = transcribe_audio(audio_file_path)
    
    # 2. Basic metrics: Word Accuracy Rate
    word_accuracy = calculate_word_accuracy_rate(recognized_text, reference_text)
    
    # 3. Audio duration
    print("[DEBUG] Loading audio with librosa to get duration.")
    y, sr_lib = librosa.load(audio_file_path, sr=None)  # sr=None => load at original sample rate
    audio_duration = librosa.get_duration(y=y, sr=sr_lib)
    print(f"[DEBUG] Audio duration: {audio_duration:.2f} seconds.")
    
    # 4. Speech rate
    speech_rate_wpm = measure_speech_rate(recognized_text, audio_duration)
    
    # 5. Prosody (pitch, intonation)
    pitch_info = analyze_pitch_and_intonation(audio_file_path)
    
    # 6. Technical features: signal quality, volume consistency
    signal_info = measure_signal_quality(audio_file_path)
    volume_consistency_info = measure_volume_consistency(audio_file_path)
    
    # 7. Comprehension (very naive placeholder)
    comprehension_info = analyze_comprehension_and_pausing(recognized_text, reference_text)
    
    # 8. Emotion detection (heuristic)
    emotion_label = detect_emotion_heuristic(pitch_info)
    
    # Combine into a holistic report
    report = {
        "Transcription": recognized_text,
        "Metrics": {
            "AccuracyOfReading": {
                "WordAccuracyRate": word_accuracy
            },
            "Prosody": {
                "AveragePitch": pitch_info["average_pitch"],
                "PitchVariability": pitch_info["pitch_variability"],
                "PitchRange": pitch_info["pitch_range"]
            },
            "SpeechClarity": {
                "SpeechRateWPM": speech_rate_wpm
            },
            "ComprehensionMetrics": {
                "ComprehensionScore": comprehension_info["comprehension_score"]
            },
            "TechnicalAudioFeatures": {
                "SignalToNoiseRatio": signal_info["signal_to_noise_ratio"],
                "VolumeStdDev": volume_consistency_info["volume_std_dev"]
            },
            "EmotionDetection": emotion_label
        }
    }
    
    print("[DEBUG] Final report generated.")
    return report


if __name__ == "__main__":
    # Example usage
    reference_text = """Hello, this is a sample reference text 
    that the speaker was supposed to read aloud exactly."""

    audio_path = "temp.wav"  # Replace with the path to your audio file
    
    final_report = generate_report(audio_path, reference_text)
    
    # Print or process the final report
    import pprint
    pprint.pprint(final_report)


[DEBUG] Starting analysis and report generation.
[DEBUG] Starting transcription for: temp.wav
[DEBUG] Transcription result: you are walking on the street and suddenly you find a piece of gold you pick it up and you get super...
[DEBUG] Calculating Word Accuracy Rate (WAR).
[DEBUG] Correct word count: 0/18, WAR: 0.00%
[DEBUG] Loading audio with librosa to get duration.
[DEBUG] Audio duration: 247.32 seconds.
[DEBUG] Measuring speech rate.
[DEBUG] Number of words: 497, Duration: 247.32s, WPM: 120.57
[DEBUG] Analyzing pitch and intonation.
[DEBUG] Pitch analysis -> Avg: 472.59, Std Dev: 666.05, Range: 3854.28
[DEBUG] Measuring signal (audio) quality.
[DEBUG] avg_rms: 0.064292, noise_floor: 0.002693, SNR: 23.87
[DEBUG] Measuring volume consistency.
[DEBUG] Volume std dev (RMS): 0.055029
[DEBUG] Analyzing comprehension and pausing (naive approach).
[DEBUG] Length ratio: 26.62, Comprehension score: -24.62
[DEBUG] Detecting emotion (heuristic).
[DEBUG] Pitch variability: 666.05, Emotion label

In [2]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/amanaman/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
import speech_recognition as sr
import librosa
import numpy as np
import nltk
from nltk.tokenize import word_tokenize



def transcribe_audio(audio_file_path):
    """
    Transcribe audio using the SpeechRecognition library.
    Returns the recognized text as a string.
    
    NOTE: In our updated code below, we will SKIP using this 
          function and instead manually set recognized_text.
    """
    print(f"[DEBUG] Starting transcription for: {audio_file_path}")
    recognizer = sr.Recognizer()

    with sr.AudioFile(audio_file_path) as source:
        audio_data = recognizer.record(source)
    
    try:
        recognized_text = recognizer.recognize_google(audio_data)
        print(f"[DEBUG] Transcription result: {recognized_text[:100]}...")  # Print first 100 chars
        return recognized_text
    except sr.UnknownValueError:
        print("[ERROR] SpeechRecognition could not understand the audio.")
        return ""
    except sr.RequestError as e:
        print(f"[ERROR] Could not request results from Google Speech Recognition service; {e}")
        return ""


def calculate_word_accuracy_rate(recognized_text, reference_text):
    """
    Calculate Word Accuracy Rate (WAR).
    """
    print("[DEBUG] Calculating Word Accuracy Rate (WAR).")
    recognized_words = word_tokenize(recognized_text.lower())
    reference_words = word_tokenize(reference_text.lower())

    total_words = len(reference_words)
    if total_words == 0:
        print("[WARNING] Reference text has 0 words.")
        return 0.0

    correct_count = 0
    for ref_word, rec_word in zip(reference_words, recognized_words):
        if ref_word == rec_word:
            correct_count += 1
    
    war = (correct_count / total_words) * 100
    print(f"[DEBUG] Correct word count: {correct_count}/{total_words}, WAR: {war:.2f}%")
    return war


def measure_speech_rate(recognized_text, audio_duration):
    """
    Estimate speech rate in Words Per Minute (WPM).
    """
    print("[DEBUG] Measuring speech rate.")
    words = word_tokenize(recognized_text)
    num_words = len(words)
    if audio_duration == 0:
        print("[WARNING] Audio duration is 0. Returning 0 WPM.")
        return 0.0
    
    wpm = (num_words / audio_duration) * 60
    print(f"[DEBUG] Number of words: {num_words}, Duration: {audio_duration:.2f}s, WPM: {wpm:.2f}")
    return wpm


def analyze_pitch_and_intonation(audio_file_path, sr_lib=22050):
    """
    Analyzes pitch (F0) and basic intonation using librosa.
    Returns average pitch, pitch variability, and pitch range.
    """
    print("[DEBUG] Analyzing pitch and intonation.")
    y, sr_actual = librosa.load(audio_file_path, sr=sr_lib)
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr_actual)
    
    pitch_values = []
    for i in range(pitches.shape[1]):
        pitch_slice = pitches[:, i]
        mag_slice = magnitudes[:, i]
        index = np.argmax(mag_slice)
        pitch_val = pitch_slice[index]
        if pitch_val > 0:  # 0 indicates no pitch
            pitch_values.append(pitch_val)
    
    if len(pitch_values) == 0:
        print("[WARNING] No pitch values detected.")
        return {
            "average_pitch": 0.0,
            "pitch_variability": 0.0,
            "pitch_range": 0.0
        }
    
    avg_pitch = np.mean(pitch_values)
    pitch_variability = np.std(pitch_values)
    pitch_range = np.max(pitch_values) - np.min(pitch_values)

    print(f"[DEBUG] Pitch analysis -> Avg: {avg_pitch:.2f}, Std Dev: {pitch_variability:.2f}, Range: {pitch_range:.2f}")
    return {
        "average_pitch": float(avg_pitch),
        "pitch_variability": float(pitch_variability),
        "pitch_range": float(pitch_range)
    }


def measure_signal_quality(audio_file_path, sr_lib=22050):
    """
    A simplified measure of signal quality by estimating SNR using RMS.
    """
    print("[DEBUG] Measuring signal (audio) quality.")
    y, sr_actual = librosa.load(audio_file_path, sr=sr_lib)
    
    rms = librosa.feature.rms(y=y)[0]
    avg_rms = np.mean(rms)

    sorted_rms = np.sort(rms)
    cutoff = int(0.1 * len(sorted_rms))
    if cutoff == 0:
        cutoff = 1
    low_energy_frames = sorted_rms[:cutoff]
    noise_floor = np.mean(low_energy_frames)
    
    if noise_floor == 0:
        noise_floor = 1e-10  # avoid division by zero

    signal_noise_ratio = avg_rms / noise_floor
    
    print(f"[DEBUG] avg_rms: {avg_rms:.6f}, noise_floor: {noise_floor:.6f}, SNR: {signal_noise_ratio:.2f}")
    return {
        "average_rms": float(avg_rms),
        "noise_floor": float(noise_floor),
        "signal_to_noise_ratio": float(signal_noise_ratio)
    }


def measure_volume_consistency(audio_file_path, sr_lib=22050, frame_length=1024, hop_length=512):
    """
    Measures volume consistency by computing the std deviation of RMS across frames.
    """
    print("[DEBUG] Measuring volume consistency.")
    y, sr_actual = librosa.load(audio_file_path, sr=sr_lib)
    rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
    std_rms = np.std(rms)
    print(f"[DEBUG] Volume std dev (RMS): {std_rms:.6f}")
    return {
        "volume_std_dev": float(std_rms)
    }


def analyze_comprehension_and_pausing(recognized_text, reference_text):
    """
    Very naive placeholder for comprehension/pausing analysis.
    """
    print("[DEBUG] Analyzing comprehension and pausing (naive approach).")
    ref_length = len(reference_text)
    rec_length = len(recognized_text)
    
    if ref_length == 0:
        print("[WARNING] Reference text has length 0. Cannot compute ratio.")
        return {
            "length_ratio": 0.0,
            "comprehension_score": 0.0
        }
    
    length_ratio = rec_length / ref_length
    # We'll define a 'comprehension_score' as how close this ratio is to 1.0
    comprehension_score = 1.0 - abs(1.0 - length_ratio)

    print(f"[DEBUG] Length ratio: {length_ratio:.2f}, Comprehension score: {comprehension_score:.2f}")
    return {
        "length_ratio": float(length_ratio),
        "comprehension_score": float(comprehension_score)
    }


def detect_emotion_heuristic(pitch_analysis):
    """
    A trivial emotion heuristic based on pitch variability.
    """
    print("[DEBUG] Detecting emotion (heuristic).")
    pitch_var = pitch_analysis["pitch_variability"]
    
    if pitch_var < 10:
        emotion = "Monotone"
    elif 10 <= pitch_var < 30:
        emotion = "Neutral"
    else:
        emotion = "Expressive"
    
    print(f"[DEBUG] Pitch variability: {pitch_var:.2f}, Emotion label: {emotion}")
    return emotion


def generate_report(audio_file_path, reference_text):
    """
    Main function that calls all the analysis functions and compiles a report.
    """
    print("[DEBUG] Starting analysis and report generation.")
    
   
    
    recognized_text = "this is a reference text this is how i am transcribing the audio"
    print("[DEBUG] Skipping actual audio transcription. Using fixed recognized_text:")
    print(f"[DEBUG] {recognized_text}")
    
 
    
    # 1. Basic metrics: Word Accuracy Rate
    word_accuracy = calculate_word_accuracy_rate(recognized_text, reference_text)
    
    # 2. Audio duration (still using the audio file for duration, pitch, etc.)
    print("[DEBUG] Loading audio with librosa to get duration.")
    y, sr_lib = librosa.load(audio_file_path, sr=None)  # sr=None => load at original sample rate
    audio_duration = librosa.get_duration(y=y, sr=sr_lib)
    print(f"[DEBUG] Audio duration: {audio_duration:.2f} seconds.")
    
    # 3. Speech rate (WPM)
    speech_rate_wpm = measure_speech_rate(recognized_text, audio_duration)
    
    # 4. Prosody (pitch, intonation)
    pitch_info = analyze_pitch_and_intonation(audio_file_path)
    
    # 5. Technical features: signal quality, volume consistency
    signal_info = measure_signal_quality(audio_file_path)
    volume_consistency_info = measure_volume_consistency(audio_file_path)
    
    # 6. Comprehension (very naive placeholder)
    comprehension_info = analyze_comprehension_and_pausing(recognized_text, reference_text)
    
    # 7. Emotion detection (heuristic)
    emotion_label = detect_emotion_heuristic(pitch_info)
    
    # Combine into a holistic report
    report = {
        "Transcription": recognized_text,
        "Metrics": {
            "AccuracyOfReading": {
                "WordAccuracyRate": word_accuracy
            },
            "Prosody": {
                "AveragePitch": pitch_info["average_pitch"],
                "PitchVariability": pitch_info["pitch_variability"],
                "PitchRange": pitch_info["pitch_range"]
            },
            "SpeechClarity": {
                "SpeechRateWPM": speech_rate_wpm
            },
            "ComprehensionMetrics": {
                "ComprehensionScore": comprehension_info["comprehension_score"]
            },
            "TechnicalAudioFeatures": {
                "SignalToNoiseRatio": signal_info["signal_to_noise_ratio"],
                "VolumeStdDev": volume_consistency_info["volume_std_dev"]
            },
            "EmotionDetection": emotion_label
        }
    }
    
    print("[DEBUG] Final report generated.")
    return report


if __name__ == "__main__":
    # Example usage
    reference_text = """Hello, this is a sample reference text 
    that the speaker was supposed to read aloud exactly."""
    
 
    audio_path = "temp.wav"
    
    final_report = generate_report(audio_path, reference_text)
    
    import pprint
    pprint.pprint(final_report)


[DEBUG] Starting analysis and report generation.
[DEBUG] Skipping actual audio transcription. Using fixed recognized_text:
[DEBUG] this is a reference text this is how i am transcribing the audio
[DEBUG] Calculating Word Accuracy Rate (WAR).
[DEBUG] Correct word count: 0/18, WAR: 0.00%
[DEBUG] Loading audio with librosa to get duration.
[DEBUG] Audio duration: 247.32 seconds.
[DEBUG] Measuring speech rate.
[DEBUG] Number of words: 13, Duration: 247.32s, WPM: 3.15
[DEBUG] Analyzing pitch and intonation.
[DEBUG] Pitch analysis -> Avg: 472.59, Std Dev: 666.05, Range: 3854.28
[DEBUG] Measuring signal (audio) quality.
[DEBUG] avg_rms: 0.064292, noise_floor: 0.002693, SNR: 23.87
[DEBUG] Measuring volume consistency.
[DEBUG] Volume std dev (RMS): 0.055029
[DEBUG] Analyzing comprehension and pausing (naive approach).
[DEBUG] Length ratio: 0.67, Comprehension score: 0.67
[DEBUG] Detecting emotion (heuristic).
[DEBUG] Pitch variability: 666.05, Emotion label: Expressive
[DEBUG] Final report gene