In [None]:
# necessary libraries
#transformers & datasets: To load the IndicBART model and manage the data
# torch: The deep learning backend (PyTorch)
# sentencepiece: Required for the IndicBART tokenizer to handle Hindi characters
# accelerate: To speed up training and handle GPU memory management
# jiwer, evaluate, sacrebleu: To calculate performance metrics like WER (Word Error Rate)
!pip install transformers datasets torch sentencepiece accelerate jiwer evaluate sacrebleu -q

import os
os.environ["WANDB_DISABLED"] = "true" #This prevents the trainer from asking for an API key and keeps the output clean

print("packages are installed.")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/3.2 MB[0m [31m12.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/3.2 MB[0m [31m20.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m31.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━

In [None]:
# @title
import os
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# 1. Mount Drive (if not already done)
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

#
model_path = "/content/drive/MyDrive/Project_model"

#
if os.path.exists(model_path):
    print("Folder found! Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True).to("cuda")
    print("Model loaded and ready on GPU!")
else:
    print(f"Error: Folder NOT found at {model_path}")
    print("Please check Drive folder name manually.")

Mounted at /content/drive
Folder found! Loading model...


Loading weights:   0%|          | 0/264 [00:00<?, ?it/s]

Model loaded and ready on GPU!


In [None]:
# @title
import torch
import re

def quick_test_inference(noisy_text):
    model.eval()

    # Input format: <Noisy> </s> <2hi>
    input_text = f"{noisy_text} </s> <2hi>"

    # ADDED: return_token_type_ids=False to fix the ValueError
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        return_token_type_ids=False
    ).to("cuda")

    # Token ID for the Hindi start tag
    hindi_tag_id = tokenizer.convert_tokens_to_ids("<2hi>")

    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            max_new_tokens=100,
            num_beams=5,
            decoder_start_token_id=hindi_tag_id,
            length_penalty=1.0,
            repetition_penalty=1.0,
            early_stopping=True
        )

    decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

    # Clean up any leftover language tags
    clean_text = decoded_text.replace("<2hi>", "").strip()
    return clean_text

# --- THE TEST RUN ---
test_inputs = [
    "में जानता हु की में वहा नहीं था",        # Test 1 (Basic Punctuation)
    "ये काम मुझे समझ नही आ रहा हे",          # Test 10 (Grammar + Comma/Stop)
    "नमस्ते आप बहुत दिनो बाद मिले",           # Test 7 (Ending Marker)
    "कल छुट्टी है क्या तुम घर आओगे",          # Question Marker Test
]

print(" RUNNING POST-TRAINING TEST:\n" + "="*40)
for inp in test_inputs:
    output = quick_test_inference(inp)
    print(f"INPUT : {inp}")
    print(f"OUTPUT: {output}")
    print("-" * 40)

 RUNNING POST-TRAINING TEST:
INPUT : में जानता हु की में वहा नहीं था
OUTPUT: म जनत ह क म वह नह थ।
----------------------------------------
INPUT : ये काम मुझे समझ नही आ रहा हे
OUTPUT: य कम मझ समझ नह आ रह।
----------------------------------------
INPUT : नमस्ते आप बहुत दिनो बाद मिले
OUTPUT: नमसत! आप बहत दन-दन बद मल।
----------------------------------------
INPUT : कल छुट्टी है क्या तुम घर आओगे
OUTPUT: कल छटट! ह! कय तम घर आओग।
----------------------------------------


In [None]:
import torch
import re

# 1. Function Definition
def normalize_asr_strict(noisy_text):
    model.eval()

    # Input format: <Noisy> </s> <2hi>
    input_text = f"{noisy_text} </s> <2hi>"

    # Tokenizing with return_token_type_ids=False for compatibility
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        return_token_type_ids=False
    ).to("cuda")

    # Hindi start tag ID
    hindi_tag_id = tokenizer.convert_tokens_to_ids("<2hi>")

    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            max_new_tokens=100,
            num_beams=5,
            decoder_start_token_id=hindi_tag_id,
            length_penalty=1.0,
            repetition_penalty=1.0,
            early_stopping=True
        )

    decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

    # Final cleanup of any tags
    clean_text = decoded_text.replace("<2hi>", "").strip()
    return clean_text

In [None]:
import pandas as pd
from tqdm import tqdm

# test_df
test_df = pd.read_csv('final_test_data_0.csv')

print(f" {len(test_df)} test sentences.")

 1746 test sentences.


In [6]:
from tqdm import tqdm
tqdm.pandas()
#  Processing
print(f" Processing {len(test_df)} sentences...")
test_df['Model_Output'] = test_df['Noisy'].progress_apply(normalize_asr_strict)


 Processing 1746 sentences...


100%|██████████| 1746/1746 [07:52<00:00,  3.69it/s]


In [None]:
!pip install jiwer -q

In [None]:
import pandas as pd
import re
from jiwer import wer, cer
from tqdm import tqdm
import numpy as np

def run_clean_final_report(df):
    # Converting the target and predicted text into lists
    targets = df['Target'].astype(str).tolist()
    preds = df['Model_Output'].astype(str).tolist()

    word_overlaps = []
    style_mismatches = 0

    print(f"Final Evaluation on {len(df)} samples...")

    for t, p in tqdm(zip(targets, preds), total=len(targets)):
        # 1. Word Match (Spelling & Vocabulary Focus)
        # Comparing words while ignoring punctuation and symbols
        t_w = set(re.sub(r'[^\u0900-\u097F\s]', '', t).split())
        p_w = set(re.sub(r'[^\u0900-\u097F\s]', '', p).split())

        if t_w:
            word_overlaps.append(len(t_w.intersection(p_w)) / len(t_w))

        # 2. Finding cases where words are correct but formatting (like spaces) differs
        t_clean = "".join(re.findall(r'[\u0900-\u097F0-9]', t))
        p_clean = "".join(re.findall(r'[\u0900-\u097F0-9]', p))
        if t_clean == p_clean and t != p:
            style_mismatches += 1

    # Calculating Final scores
    final_wer = wer(targets, preds)
    final_cer = cer(targets, preds)
    final_word_acc = np.mean(word_overlaps) * 100

    # Displaying the final performance report
    print("\n" + "═"*60)
    print(" FINAL ASR NORMALIZATION REPORT")
    print("═"*60)
    print(f" Word-Level Accuracy (ignores punctuation to check contextual correction): {final_word_acc:.2f}%")
    print(f" Word Error Rate (WER):           {final_wer:.4f}")
    print(f" Character Error Rate (CER):      {final_cer:.4f}")
    print(f" Formatting Only Mismatches:      {style_mismatches} sentences")
    print("═"*60)
    print("\n NOTE: Word-Level Accuracy measures contextual word identification.")
    print(" NOTE: Formatting mismatches do not affect the linguistic quality.")

    return {
        "Word_Acc": final_word_acc,
        "WER": final_wer,
        "CER": final_cer,
        "Style_Issues": style_mismatches
    }

# EXECUTE
final_metrics = run_clean_final_report(test_df.rename(columns={'Clean': 'Target'}))

In [None]:
!pip install openai-whisper

In [None]:
!pip install SpeechRecognition pydub

In [None]:
import gradio as gr
import speech_recognition as sr
from pydub import AudioSegment
import os
import uuid

# FUNCTION 1: Manual Text Normalization
def manual_text_normalize(noisy_text):
    if not noisy_text.strip():
        return "Please enter some text."
    # Calling inference function
    return quick_test_inference(noisy_text)

# FUNCTION 2: Audio Pipeline
def process_audio_pipeline(audio_path):
    if audio_path is None or not os.path.exists(audio_path):
        return "Audio signal not detected.", ""

    recognizer = sr.Recognizer()
    unique_wav = f"temp_{uuid.uuid4().hex}.wav"

    try:
        audio = AudioSegment.from_file(audio_path)
        audio.export(unique_wav, format="wav")

        with sr.AudioFile(unique_wav) as source:
            recognizer.adjust_for_ambient_noise(source, duration=0.5)
            audio_data = recognizer.record(source)
            raw_text = recognizer.recognize_google(audio_data, language="hi-IN")

        if raw_text.strip():
            # Using your model function
            clean_text = quick_test_inference(raw_text)
            return raw_text, clean_text
        else:
            return "Speech not recognized.", "Please speak clearly."

    except Exception as e:
        return f"Error: {str(e)}", "Please try again."
    finally:
        if os.path.exists(unique_wav):
            os.remove(unique_wav)

# GRADIO INTERFACE
#  light theme using custom CSS and Soft theme
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")) as demo:
    gr.Markdown("Raw ASR Normalizer")

    with gr.Tabs():
        # TAB 1: Speech to Text
        with gr.TabItem("Audio Normalizer"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Record/Upload")
                    btn_audio = gr.Button("Process Audio", variant="primary")
                with gr.Column():
                    raw_out = gr.Textbox(label="Raw ASR Output", lines=3)
                    clean_out = gr.Textbox(label="Corrected Text", lines=3)

            btn_audio.click(process_audio_pipeline, inputs=audio_input, outputs=[raw_out, clean_out])

        # TAB 2: Manual Text Input
        with gr.TabItem("Direct Text Input"):
            gr.Markdown("### Enter Noisy Hindi Text manually to clean it")
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(label="Input Noisy Text", placeholder="e.g., में वहा जा रहा हु...", lines=5)
                    btn_text = gr.Button("Normalize Text", variant="primary")
                with gr.Column():
                    text_output = gr.Textbox(label="Corrected Output", lines=5)

            btn_text.click(manual_text_normalize, inputs=text_input, outputs=text_output)

    gr.Markdown("---")
    gr.Markdown("Built with IndicBART & Google ASR Engine")

# Launch
demo.launch(debug=True, share=True)

In [None]:
if 'demo' in locals():
    demo.close()