# ESL Speech Analysis (Remote Kernel)

**Assumptions**
- Recording happens locally.
- Audio is uploaded as .wav (16 kHz preferred) or .m4a (converted to .wav).
- Notebook runs on Paperspace (CPU or GPU).
- No microphone access.
- .m4a conversion requires ffmpeg available in the environment.

In [3]:
# Kernel smoke test: prints a fixed greeting.
print("hello")

hello


In [30]:
# Cell 0: Environment Setup (run once)
# For .m4a support, pydub needs ffmpeg available in the runtime.
!pip install faster-whisper language-tool-python pydub openai -U typing_extensions

[0m

In [None]:
# Cell 0b: Diarization Setup (run once)
# Installs pyannote and pins NumPy to avoid ABI issues with pyarrow/pandas.
# After running this cell, restart the kernel, then run it again.
!pip install -q "numpy<2" "pandas<2.2" "pyarrow<16" pyannote.audio


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spacy 3.6.1 requires typer<0.10.0,>=0.3.0, but you have typer 0.21.1 which is incompatible.
tensorflow 2.15.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 6.33.4 which is incompatible.
tensorboard 2.15.1 requires protobuf<4.24,>=3.19.6, but you have protobuf 6.33.4 which is incompatible.
torchvision 0.16.1+cu121 requires torch==2.1.1, but you have torch 2.8.0 which is incompatible.[0m[31m
[0mHF_TOKEN set for this session.


In [1]:
import os
from getpass import getpass

# Never hard-code the Hugging Face token in the notebook: anything saved in the
# file (or its outputs) is visible to everyone who can read it, and a token that
# has been committed must be treated as compromised and revoked.
# Reuse HF_TOKEN when the environment already provides it; otherwise prompt for
# it without echoing the input.
if not os.environ.get("HF_TOKEN"):
    _entered_token = getpass("Hugging Face access token: ").strip()
    if not _entered_token:
        # Fail loudly instead of reporting success with an empty token.
        raise ValueError("No Hugging Face token entered; HF_TOKEN was not set.")
    os.environ["HF_TOKEN"] = _entered_token
print("HF_TOKEN set for this session.")

HF_TOKEN set for this session.


In [2]:
import ipywidgets as widgets
from IPython.display import display

# FileUpload widget (in-memory); accepts a single .wav or .m4a file.
upload = widgets.FileUpload(accept=".wav,.m4a", multiple=False)
display(upload)

# Module-level holders for the uploaded audio. They are filled in by
# store_audio() and read by the audio-loading cell; re-running this cell
# resets both to None.
AUDIO_BYTES = None
AUDIO_FILENAME = None


def _iter_uploaded_files(value):
    """Yield ``(name, file_info)`` pairs from a FileUpload ``value``.

    Two shapes are understood:
    - a mapping-like object (anything with ``.items()``), whose keys are used
      as the names;
    - a list/tuple of entries, where the name is taken from the entry's
      ``"name"`` key when the entry is a dict and is ``None`` otherwise.

    Any other input yields nothing.
    """
    # Mapping-style value (ipywidgets v7): keys are the file names.
    if hasattr(value, "items"):
        yield from ((key, payload) for key, payload in value.items())
        return

    # Sequence-style value (ipywidgets v8): each entry carries its own name.
    if not isinstance(value, (list, tuple)):
        return
    for payload in value:
        if isinstance(payload, dict):
            yield payload.get("name"), payload
        else:
            yield None, payload


def store_audio(change):
    """Observer callback: copy the uploaded file into the module-level globals.

    The ``change`` payload is not inspected; the widget's current
    ``upload.value`` is read instead. For every dict entry found,
    ``AUDIO_BYTES`` and ``AUDIO_FILENAME`` are overwritten, and a confirmation
    is printed when both ended up non-empty.
    """
    global AUDIO_BYTES, AUDIO_FILENAME

    uploaded = upload.value
    if not uploaded:
        return

    for key, payload in _iter_uploaded_files(uploaded):
        if not isinstance(payload, dict):
            continue
        AUDIO_BYTES = payload.get("content")
        AUDIO_FILENAME = key or payload.get("name")
        if not (AUDIO_BYTES and AUDIO_FILENAME):
            continue
        print(
            f"Audio file '{AUDIO_FILENAME}' is now ready in memory for other cells."
        )


# Automatically trigger when a file is uploaded: store_audio runs on every
# change of the widget's `value` trait.
upload.observe(store_audio, names="value")


FileUpload(value=(), accept='.wav,.m4a', description='Upload')

In [3]:
# Cell 1: Load audio - prefer the in-memory upload, otherwise the most recent file in ./audio (.wav or .m4a)
import os
import io
import tempfile
from pydub import AudioSegment

AUDIO_DIR = "audio"
SUPPORTED_EXTS = {".wav", ".m4a"}


def _export_as_whisper_wav(source, dest_path):
    """Decode .m4a audio and write it to ``dest_path`` as mono 16 kHz WAV.

    Args:
        source: Path or file-like object handed to ``AudioSegment.from_file``.
        dest_path: Destination path of the exported .wav file.

    Returns:
        The converted ``AudioSegment``.
    """
    segment = AudioSegment.from_file(source, format="m4a")
    # Convert to mono/16k for best Whisper results
    segment = segment.set_channels(1).set_frame_rate(16000)
    segment.export(dest_path, format="wav")
    return segment


# Prefer in-memory upload if present. Looking both names up through globals()
# keeps this cell usable when the upload cell was never run.
if globals().get("AUDIO_BYTES") and globals().get("AUDIO_FILENAME"):
    # Keep only the final path component so an uploaded name can never point
    # outside AUDIO_DIR (backslashes are normalised for Windows-style names).
    upload_name = os.path.basename(AUDIO_FILENAME.replace("\\", "/"))
    stem, ext = os.path.splitext(upload_name)
    ext = ext.lower()
    if ext not in SUPPORTED_EXTS:
        raise ValueError("Unsupported file type. Use .wav or .m4a.")

    os.makedirs(AUDIO_DIR, exist_ok=True)
    if ext == ".m4a":
        AUDIO_PATH = os.path.join(AUDIO_DIR, stem + ".wav")
        audio = _export_as_whisper_wav(io.BytesIO(AUDIO_BYTES), AUDIO_PATH)
    else:
        # .wav uploads are written to disk unchanged.
        AUDIO_PATH = os.path.join(AUDIO_DIR, upload_name)
        with open(AUDIO_PATH, "wb") as f:
            f.write(AUDIO_BYTES)

    print(f"Audio file loaded from upload: {AUDIO_PATH}")
else:
    if not os.path.isdir(AUDIO_DIR):
        raise FileNotFoundError(f"Directory not found: {AUDIO_DIR}")

    candidates = [
        os.path.join(AUDIO_DIR, f)
        for f in os.listdir(AUDIO_DIR)
        if os.path.splitext(f)[1].lower() in SUPPORTED_EXTS
        and os.path.isfile(os.path.join(AUDIO_DIR, f))
    ]

    if not candidates:
        raise FileNotFoundError(
            "No .wav or .m4a files found in ./audio. Add a file and try again."
        )

    # Most recently modified supported file wins.
    INPUT_PATH = max(candidates, key=os.path.getmtime)

    ext = os.path.splitext(INPUT_PATH)[1].lower()

    if ext == ".m4a":
        AUDIO_PATH = os.path.splitext(INPUT_PATH)[0] + ".wav"
        audio = _export_as_whisper_wav(INPUT_PATH, AUDIO_PATH)
    else:
        # candidates is already filtered by SUPPORTED_EXTS, so this is a .wav.
        AUDIO_PATH = INPUT_PATH

    print(f"Audio file loaded: {AUDIO_PATH}")

Audio file loaded from upload: audio/20260119 102010.wav


In [None]:
# Cell 3: Speech to Text (Whisper) + True Diarization
from faster_whisper import WhisperModel
import os

# Device selection: "cuda" with float16 when use_cuda is True, otherwise
# "cpu" with int8.
use_cuda = True  # leave True only if a GPU is available; set False to run on CPU
device = "cuda" if use_cuda else "cpu"
compute_type = "float16" if use_cuda else "int8"

# Load the "medium" Whisper model on the selected device.
model = WhisperModel(
    "medium",
    device=device,
    compute_type=compute_type,
)

# Transcribe the file chosen by the audio-loading cell.
segments, info = model.transcribe(AUDIO_PATH)
# Materialize as a list: the segments are iterated here for the transcript
# and again later for speaker labeling (presumably transcribe() returns a
# one-shot iterator - confirm against the faster-whisper docs).
segments = list(segments)

# Full transcript: stripped segment texts joined by single spaces.
transcript = " ".join(s.text.strip() for s in segments)

print("TRANSCRIPT:")
print(transcript)

# True diarization using pyannote (requires HF_TOKEN)
import inspect

HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError(
        "HF_TOKEN not set. Run the diarization setup cell and restart the kernel."
    )

from pyannote.audio import Pipeline

# The auth keyword of Pipeline.from_pretrained differs between pyannote.audio
# releases (`use_auth_token` in older ones, `token` in newer ones). Passing the
# wrong one raises "unexpected keyword argument", so choose the name from the
# installed signature instead of hard-coding it.
_from_pretrained_params = inspect.signature(Pipeline.from_pretrained).parameters
_auth_kwarg = "token" if "token" in _from_pretrained_params else "use_auth_token"

pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    **{_auth_kwarg: HF_TOKEN},
)
if pipeline is None:
    # NOTE(review): some pyannote releases appear to return None rather than
    # raising when the model cannot be downloaded (e.g. gated model not yet
    # accepted) - confirm; either way a None pipeline cannot be called.
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization'. Check that HF_TOKEN is "
        "valid and that the model's access conditions were accepted."
    )

# Perform diarization on the audio file
diarization = pipeline(AUDIO_PATH)

# Map each whisper segment to the best overlapping speaker segment
# to produce a speaker-labeled transcript.

# NOTE(review): newer pyannote releases appear to wrap the result in an object
# that exposes the annotation as `.speaker_diarization` - confirm. When that
# attribute is absent, the result itself is used, as before.
_annotation = getattr(diarization, "speaker_diarization", diarization)

# Collect diarization segments
_diars = [
    {"start": turn.start, "end": turn.end, "speaker": speaker}
    for turn, _, speaker in _annotation.itertracks(yield_label=True)
]


def _best_speaker_for_segment(seg_start, seg_end, diars=None, default="SPEAKER_00"):
    """Return the speaker whose diarization turn overlaps a segment the most.

    Args:
        seg_start: Start time of the transcript segment.
        seg_end: End time of the transcript segment (same unit as the turns).
        diars: Optional iterable of dicts with "start", "end" and "speaker"
            keys. When omitted, the module-level ``_diars`` list is used.
        default: Label returned when no turn overlaps the segment.

    Returns:
        The "speaker" value of the turn with the largest positive overlap.
        On a tie the earliest such turn wins; with no overlap, ``default``.
    """
    turns = _diars if diars is None else diars
    best_speaker = default
    best_overlap = 0.0
    for turn in turns:
        # Length of the intersection of the two intervals; it is <= 0 when
        # they do not intersect, which can never beat best_overlap.
        overlap = min(seg_end, turn["end"]) - max(seg_start, turn["start"])
        if overlap > best_overlap:
            best_overlap = overlap
            best_speaker = turn["speaker"]
    return best_speaker


# Prefix every transcript segment with the speaker that overlaps it most,
# producing one "[speaker] text" line per segment.
labeled_lines = [
    f"[{_best_speaker_for_segment(seg.start, seg.end)}] {seg.text.strip()}"
    for seg in segments
]

speaker_separated = "\n".join(labeled_lines)
print("\nSPEAKER SEPARATION (diarization):")
print(speaker_separated)

# Set use_cuda=True if using GPU.

  _torch_pytree._register_pytree_node(


TRANSCRIPT:
We've been talking about a well-known person that you admire and I'd like to discuss with you one or two more general questions related to this. Let's consider first of all famous people in your country. What kind of people become famous in China? Those actors, especially the movie actors and the sports actors, sorry, the movie actors and the sports stars, they are very famous now in China because they can be seen by the people every day during the movie on the advertisements, they can be seen all the times. So they are very famous and those people who are very rich and who had a really big company and they are also on the TV, on the news, so they are very famous as well. What's different about people who were famous in the past with people who are famous these days? I think those people who were very famous in the past are very great because they do a lot to change the world, just like Newton Einstein and they found new logics, they found new way about building a new thing

  warn(

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.4.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-packages

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.4.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-packages/tornado

AttributeError: _ARRAY_API not found

Diarization failed: Pipeline.from_pretrained() got an unexpected keyword argument 'use_auth_token'

SPEAKER SEPARATION (heuristic):
[speaker 1] We've been talking about a well-known person that you admire and I'd like to discuss with you one or two more general questions related to this.
[speaker 1] Let's consider first of all famous people in your country.
[speaker 1] What kind of people become famous in China?
[speaker 2] Those actors, especially the movie actors and the sports actors, sorry, the movie actors and the sports stars, they are very famous now in China because they can be seen by the people every day during the movie on the advertisements, they can be seen all the times.
[speaker 2] So they are very famous and those people who are very rich and who had a really big company and they are also on the TV, on the news, so they are very famous as well.
[speaker 2] What's different about people who were famous in the past with people who are famous these days?
[speaker 2] I thin

In [None]:
# Cell 4: ESL Error Detection (OpenAI via HTTP)
import os
import json
import requests

# Chat model used for the ESL error check.
OPENAI_MODEL = "gpt-5.2"
# Read the API key from the environment instead of embedding it in the
# notebook: a key stored in source must be treated as compromised and revoked.
# This is None when OPENAI_API_KEY is unset; the `if not api_key` branch in
# this cell then reports that and skips the request.
api_key = os.environ.get("OPENAI_API_KEY")


def get_issue_type(match):
    """Return a type/rule label for an issue, or "UNKNOWN" if none is found.

    Lookup order:
    1. dict input: its "type" entry (default "UNKNOWN");
    2. a ``ruleId`` attribute;
    3. a ``rule_id`` attribute;
    4. a ``rule`` attribute, using its "id" key (dict) or ``id`` attribute.
    """
    if isinstance(match, dict):
        return match.get("type", "UNKNOWN")

    # Flat attribute spellings, checked in priority order.
    for attr in ("ruleId", "rule_id"):
        if hasattr(match, attr):
            return getattr(match, attr)

    # Nested rule object or dict.
    missing = object()
    rule = getattr(match, "rule", missing)
    if rule is not missing:
        if isinstance(rule, dict) and "id" in rule:
            return rule["id"]
        if hasattr(rule, "id"):
            return rule.id

    return "UNKNOWN"


if api_key:
    # Instructions for the model and the transcript to analyse.
    system_msg = (
        "You are an ESL writing assistant. Identify grammar and usage errors in the transcript. "
        "Return a JSON object with an array 'issues'. Each issue must include: "
        "type (short label), message (explain the error), context (short excerpt), suggestion (optional)."
    )
    user_msg = f"Transcript:\n{transcript}"

    # JSON schema for the reply: {"issues": [ {type, message, context,
    # suggestion}, ... ]}, every field a required string, no extra keys.
    issue_fields = ("type", "message", "context", "suggestion")
    schema = {
        "type": "object",
        "properties": {
            "issues": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        field: {"type": "string"} for field in issue_fields
                    },
                    "required": list(issue_fields),
                    "additionalProperties": False,
                },
            }
        },
        "required": ["issues"],
        "additionalProperties": False,
    }

    payload = {
        "model": OPENAI_MODEL,
        "messages": [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {"name": "esl_issues", "schema": schema, "strict": True},
        },
    }

    request_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    r = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=request_headers,
        json=payload,
        timeout=60,
    )
    # Any HTTP error status aborts the cell here.
    r.raise_for_status()
    resp = r.json()

    # Guard clause: a reply without at least one choice is unusable.
    if "choices" not in resp or len(resp["choices"]) == 0:
        raise ValueError(
            f"Unexpected response format from OpenAI: {json.dumps(resp, indent=2)}"
        )

    # The first choice's message content is itself a JSON document.
    content_str = resp["choices"][0]["message"]["content"]
    data = json.loads(content_str)
    matches = data.get("issues", [])
else:
    # No key available: skip the request and continue with no issues.
    print("OPENAI_API_KEY not set. Set it to enable OpenAI-based ESL checks.")
    matches = []

# Print every issue as a short block: message, context, optional suggestion
# (dict issues only), then a blank separator line.
print("ESL ISSUES:\n")
for m in matches:
    is_mapping = isinstance(m, dict)
    if is_mapping:
        message = m.get("message", "")
        context = m.get("context", "")
        suggestion = m.get("suggestion")
    else:
        # Object-style issues expose attributes and carry no suggestion here.
        message = m.message
        context = m.context
        suggestion = None

    print(f"- {message}")
    print(f"  Context: {context}")
    if suggestion:
        print(f"  Suggestion: {suggestion}")
    print()

ESL ISSUES:

- Using “interest or hobby” is awkward and unclear; choose one and match singular/plural with the list that follows.
  Context: My interest or hobby that I enjoy is shopping and picnics
  Suggestion: My hobbies are shopping and having picnics.

- The sentence has too many ideas joined by commas, which makes it hard to follow.
  Context: ..., to spend time with my family or friends, or especially I want to do volunteer for shopping.
  Suggestion: Break into shorter sentences or use clearer connectors.

- “Do volunteer” is incorrect; use “do volunteer work” or “volunteer.”
  Context: I want to do volunteer for shopping
  Suggestion: I especially want to do volunteer work related to shopping. / I want to volunteer.

- “Volunteer for shopping” is unclear (volunteer to shop? volunteer at a charity?).
  Context: ...volunteer for shopping
  Suggestion: Say what you mean: “volunteer at a charity shop,” “help people shop,” or “collect donations.”

- After “can,” use the base form o

In [47]:
# Cell 5: Clean Teacher-Friendly Summary
from collections import Counter

# Tally the issues per type label and list the labels, most frequent first.
error_types = Counter(map(get_issue_type, matches))

print("ERROR SUMMARY:\n")
for label, occurrences in error_types.most_common():
    print(f"{label}: {occurrences}")

ERROR SUMMARY:

grammar: 5


In [48]:
# Cell 6 (Optional): Save Results
import json

# Flatten each issue (dict or attribute-style object) into a plain record
# with the keys message, context, rule and suggestion.
errors = []
for m in matches:
    is_mapping = isinstance(m, dict)
    errors.append(
        {
            "message": m.get("message") if is_mapping else m.message,
            "context": m.get("context") if is_mapping else m.context,
            "rule": get_issue_type(m),
            "suggestion": m.get("suggestion") if is_mapping else None,
        }
    )

results = {"transcript": transcript, "errors": errors}

# Persist the transcript and issue records as indented JSON.
with open("analysis_results.json", "w") as f:
    json.dump(results, f, indent=2)

print("Saved analysis_results.json")

Saved analysis_results.json


## Why This Notebook Design Works
- Fully compatible with remote kernels.
- Deterministic and reproducible.
- Easy to iterate on analysis logic.
- Clean separation of concerns: Capture, Transcription, Analysis.