In [2]:
from PIL import Image
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText

model_path = "nanonets/Nanonets-OCR-s"

model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype="auto",
    #device_map="auto",
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)


def ocr_page_with_nanonets_s(image_path, model, processor, max_new_tokens=4096):
    prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
    image = Image.open(image_path)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {"type": "image", "image": f"file://{image_path}"},
            {"type": "text", "text": prompt},
        ]},
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
    inputs = inputs.to(model.device)

    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]

    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]

image_path = "/path/to/your/document.jpg"
result = ocr_page_with_nanonets_s(image_path, model, processor, max_new_tokens=15000)
print(result)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


video_preprocessor_config.json: 0.00B [00:00, ?B/s]

FileNotFoundError: [Errno 2] No such file or directory: '/path/to/your/document.jpg'

In [27]:

"""
One-shot speaker verification using SpeechBrain ECAPA-TDNN,
with live audio capture via modules/audio.py.
"""

import torch
import numpy as np
from torch.nn.functional import normalize
from speechbrain.pretrained import SpeakerRecognition
from modules.audio import AudioRecorder


In [None]:
# 1. Initialize audio recorder (16 kHz mono)
recorder = AudioRecorder(samplerate=16000, channels=1, dtype='int16')

In [None]:
# 2. Load pretrained ECAPA-TDNN speaker-recognition model
model = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb"
)

In [None]:
def embed_audio(wav: np.ndarray) -> torch.Tensor:
    """
    Take a 1D float32 numpy array of audio at 16 kHz,
    convert to a Torch tensor [1, time],
    and return a normalized embedding [1, dim].
    """
    tensor = torch.from_numpy(wav).unsqueeze(0)  # [1, time]
    emb = model.encode_batch(tensor)             # [1, embed_dim]
    return normalize(emb, p=2, dim=-1)

In [None]:
# --------------------------------------------------------------------
# ENROLLMENT PHASE: record 1–2 reference utterances to build your prototype
# --------------------------------------------------------------------
print("🎤 Enrollment: Please speak your passphrase twice, each ~3 seconds long.")
reference_embeddings = []
for i in range(2):
    wav = recorder.record(duration=3.0)         # record 3 seconds
    emb = embed_audio(wav)
    reference_embeddings.append(emb)

In [None]:
# average and re-normalize to get your speaker prototype
prototype = torch.mean(torch.stack(reference_embeddings), dim=0)
prototype = normalize(prototype, p=2, dim=-1)
prototype.shape

In [None]:
# --------------------------------------------------------------------
# VERIFICATION PHASE: record a test utterance and compare against prototype
# --------------------------------------------------------------------
print("\n🎤 Verification: Please speak your passphrase (~3 seconds).")
test_wav = recorder.record(duration=3.0)
test_emb = embed_audio(test_wav)
test_emb

In [None]:
test_emb.shape

In [None]:
# compute cosine similarity score
score = torch.matmul(test_emb[0], prototype[0].T).item()
threshold = 0.70  # adjust after ROC analysis on your data

print(f"\n🔍 Cosine score = {score:.3f}")
if score >= threshold:
    print("✅ Speaker verified!")
else:
    print("❌ Speaker not recognized.")

# (Optional) You can wrap the above into a loop or API server as needed.

In [1]:
import uuid
import numpy as np
from config.loader import settings
from models.ModelManager import ModelManager
from modules.audio.AudioRecogniserManager import AudioRecogniserManager
from modules.audio.io import AudioRecorder


  import pkg_resources
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
09:27:38 datasets INFO │ PyTorch version 2.7.1 available.
09:27:38 qwen_vl_utils.vision_process INFO │ set VIDEO_TOTAL_PIXELS: 90316800
09:27:38 speechbrain.utils.checkpoints DEBUG │ Registered checkpoint save hook for _speechbrain_save
09:27:38 speechbrain.utils.checkpoints DEBUG │ Registered checkpoint load hook for _speechbrain_load
09:27:38 speechbrain.utils.checkpoints DEBUG │ Registered checkpoint save hook for save
09:27:38 speechbrain.utils.checkpoints DEBUG │ Registered checkpoint load hook for load
09:27:38 speechbrain.utils.quirks INFO │ Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
09:27:38 speechbrain.utils.quirks INFO │ Excluded quirks specified by the `SB_DISABLE_QUIRKS` environ

In [2]:
ModelManager()

09:27:38 ModelManager INFO │ Project Root: /Users/saketm10/Projects/smruti


Loading Qwen/Qwen2.5-VL-3B-Instruct model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


Model loaded!
 > tts_models/en/ljspeech/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
Loading GLiNER model 'urchade/gliner_base' on mps...


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

09:27:57 sentence_transformers.SentenceTransformer INFO │ Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Model loaded!


09:28:01 speechbrain.utils.fetching INFO │ Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
09:28:01 speechbrain.utils.fetching INFO │ Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
09:28:01 speechbrain.utils.checkpoints DEBUG │ Registered checkpoint save hook for _save
09:28:01 speechbrain.utils.checkpoints DEBUG │ Registered checkpoint load hook for _load
09:28:01 speechbrain.utils.checkpoints DEBUG │ Registered parameter transfer hook for _load
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
09:28:01 speechbrain.utils.checkpoints DEBUG │ Registered checkpoint save hook for save
09:28:01 speechbrain.utils.checkpoints DEBUG │ Registered checkpoint load hook for load_if_possible
09:28:01 speechbrain.utils.parameter_transfer DEBUG │ Fetching files for pretraining (no collection directory set)
09:28:01 speechbrain.utils.fetching INFO │ Fetch embedding_model.ckp

<models.ModelManager.ModelManager at 0x37129a750>

In [4]:
audio_recogniser = AudioRecogniserManager(
    ModelManager.speaker_embedder,
    embedding_dim=ModelManager.speaker_embedder.embedding_dim,
    db_path=settings['db']['audio_recogniser']
)


.cache/audio_embeddings.npz ##########
float32 (1, 384)


AssertionError: 

In [4]:
n_utts = int(input("How many reference utterances to record? "))

# Collect WAVs
recorder = AudioRecorder(samplerate=16000, channels=1, dtype='int16')
wavs = []
for i in range(n_utts):
    input(f"Press Enter to record utterance #{i+1}...")
    wav = recorder.record(duration=3.0)
    wavs.append(wav.astype(np.float32) / np.iinfo(np.int16).max)


🎙️  Recording for 3.0 seconds...
🎙️  Recording for 3.0 seconds...


In [5]:
name = input("Enter speaker name: ").strip()
provided_id = input("Enter speaker ID (or leave empty to auto-generate): ").strip() or None


In [11]:

# Enroll and persist
sid = audio_recogniser.enroll(name=name, wavs=wavs, speaker_id=provided_id)
print(f"✅ Enrolled '{name}' with speaker_id = {sid}")


✅ Enrolled 'Saket Mohanty' with speaker_id = de61be0b612a400c8f5ef4de17b749a1


In [6]:
audio_recogniser.faiss.dim

192

In [7]:
embs = [audio_recogniser.embedder.embed(w) for w in wavs]
proto = np.mean(np.stack(embs, axis=0), axis=0)
proto = proto.astype(np.float32) / np.linalg.norm(proto)

In [8]:
proto.shape

(1, 192)

In [10]:
import os
os.makedirs(os.path.dirname(settings['db']['audio_recogniser']), exist_ok=True)