In [None]:
import pandas as pd
import re
import random

# 1. File
# Reading the source CSV and converting the 'Clean' column into a list for processing
df_base = pd.read_csv('new.csv')
clean_list = df_base['Clean'].dropna().tolist()

def generate_variations(text):
    results = set()
    text = text.strip()

    # Helper Functions
    def apply_grammar_noise(t):
        # Simulating common ASR errors like mixing up nasal sounds (Anusvar) and Visarg
        t = t.replace("हैं", random.choice(["है", "हे"]))
        t = t.replace("मैं", "में")
        t = t.replace("हूँ", "हु")
        t = t.replace("अतः", "अत").replace("पुनः", "पुन").replace("संभवतः", "संभवत")
        return t

    def strip_dash(t):
      # Removing hyphens to simulate noise in compound words
        return t.replace("-", " ")

    # Identifying Paragraphs
    # Determining if the input is a paragraph based on multiple sentence endings
    sentence_ends = re.findall(r'[।?|]', text)
    is_paragraph = len(sentence_ends) > 1

    # Applying grammatical errors to every variant
    base_noisy_text = apply_grammar_noise(text)

    # Version 1: Extreme Noise (No Punctuation + No Dash + Grammar Error)
    extreme_noisy = re.sub(r'[!?,।|?]', '', strip_dash(base_noisy_text))
    results.add(" ".join(extreme_noisy.split()))

    #Punctuation Logic
    #Identifying which specific punctuation marks are present in the current text
    marks_to_check = ['!', ',', '।', '?', '|']
    present = [m for m in marks_to_check if m in text]

    if is_paragraph:
        # PARAGRAPH SPECIFIC RULES
        # Rule A: Removing only the boundary markers (sentence endings)
        results.add(re.sub(r'[।?|]', '', base_noisy_text))

        # Rule B: Removing internal markers like commas but keeping the sentence endings
        results.add(re.sub(r'[,!]', '', base_noisy_text))

        # Rule C: Merging sentences by removing only the first occurrence of a punctuation mark
        for mark in set(sentence_ends):
            results.add(base_noisy_text.replace(mark, " ", 1)) # Sirf pehla occurrence hataya

    else:
        #NORMAL SENTENCE RULES (9 Rules)
        #Logic for handling sentences with single, double, or multiple punctuation marks
        if len(present) == 1:
            results.add(text.replace(present[0], ""))

        elif len(present) == 2:
            m1, m2 = present[0], present[1]
            results.add(text.replace(m1, "")) # Variation without first mark
            results.add(text.replace(m2, "")) # Variation without second mark
            results.add(re.sub(r'[!?,।|?]', '', text)) # Variation without any marks
        elif len(present) >= 3:
            for m in present:
                results.add(text.replace(m, ""))
            results.add(re.sub(r'[!?,]', '', text)) # Keeping only sentence boundaries
            results.add(re.sub(r'[।?|]', '', text))# Keeping only internal punctuations
    # Dash Logic
    # Handling words like 'धीरे-धीरे' by creating variations with and without hyphens
    if "-" in text:
        no_dash = strip_dash(text)
        results.add(no_dash)
        results.add(re.sub(r'[!?,।|?]', '', no_dash))

    return results

# 2. Pairs Generatation
final_pairs = []

for sentence in clean_list:
    # Including the 'Clean-to-Clean' pair to ensure the model learns to leave correct text as is
    final_pairs.append({"Noisy": sentence, "Clean": sentence})

   # Generating and collecting all unique noisy versions for each clean sentence
    noisy_variants = generate_variations(sentence)
    for nv in noisy_variants:
        nv_clean = " ".join(nv.split()) # Normalizing whitespace
        if nv_clean != sentence.strip():
            final_pairs.append({"Noisy": nv_clean, "Clean": sentence})

# 3. Save
#Dropping any identical pairs and saving the final expanded dataset
df_final = pd.DataFrame(final_pairs).drop_duplicates()
df_final.to_csv('final_1721_0.csv', index=False, encoding='utf-8-sig')

print(f"Success! Total Rows: {len(df_final)}")

Success! Total Rows: 6727


In [None]:
# necessary libraries
#transformers & datasets: To load the IndicBART model and manage the data
# torch: The deep learning backend (PyTorch)
# sentencepiece: Required for the IndicBART tokenizer to handle Hindi characters
# accelerate: To speed up training and handle GPU memory management
# jiwer, evaluate, sacrebleu: To calculate performance metrics like WER (Word Error Rate)
!pip install transformers datasets torch sentencepiece accelerate jiwer evaluate sacrebleu -q

import os
os.environ["WANDB_DISABLED"] = "true" #This prevents the trainer from asking for an API key and keeps the output clean

print("packages are installed.")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hpackages are installed.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# 1.Loading the Augmented Base Dataset(6726 pairs)
old_df = pd.read_csv('final_1721_0.csv')

# 2. Targeted Data (1001 pairs) to fix specific failures
targeted_df = pd.read_csv('target_1001.csv')

# 3. We repeat the targeted samples twice (1001 * 2 = 2002 rows)
#This ensures these critical patterns are seen more frequently by the model
targeted_upsampled = pd.concat([targeted_df] * 2, ignore_index=True)

# 4. Merge and Shuffle
#Combining base data with the doubled targeted data for a balanced training pool
## Total Rows: 6,726 + 2002 = 8,728
df = pd.concat([old_df, targeted_upsampled], ignore_index=True)
df = shuffle(df, random_state=42).reset_index(drop=True)

def clean_text(text):
    if pd.isna(text): return ""
    text = str(text).strip()
    return ' '.join(text.split())

df['Noisy'] = df['Noisy'].apply(clean_text)
df['Clean'] = df['Clean'].apply(clean_text)
df = df[(df['Noisy'] != "") & (df['Clean'] != "")]

# IndicBART Formatting
# Adding special language and separator tokens required by IndicBART
df['input_text'] = df['Noisy'] + " </s> <2hi>"
df['target_text'] = "<2hi> " + df['Clean'] + " </s>"

# Split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
test_df.to_csv('final_test_data_0.csv', index=False, encoding='utf-8-sig')
print(f"Dataset Ready! Total Training Data: {len(train_df)},\n Total testing Data: {len(test_df)}")


Dataset Ready! Total Training Data: 6981,
 Total testing Data: 1746


In [None]:
from transformers import AutoTokenizer, MBartForConditionalGeneration
import torch

model_name = "ai4bharat/IndicBART"
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False, use_fast=False, keep_accents=True)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Stability fix
model = model.float()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print(f"IndicBART loaded on {device}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/976M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/976M [00:00<?, ?B/s]

IndicBART loaded on cuda


In [None]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df[['input_text', 'target_text']])
test_dataset = Dataset.from_pandas(test_df[['input_text', 'target_text']])

def tokenize_fn(examples):
    # Max length 200 for long paragraphs
    model_inputs = tokenizer(examples["input_text"], max_length=200, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["target_text"], max_length=200, padding="max_length", truncation=True)

    model_inputs["labels"] = [[(l if l != tokenizer.pad_token_id else -100) for l in label_seq] for label_seq in labels["input_ids"]]
    return model_inputs
#Applying the mapping across the dataset in batches for efficiency
tokenized_train = train_dataset.map(tokenize_fn, batched=True)
tokenized_test = test_dataset.map(tokenize_fn, batched=True)
print("Tokenization completed with max_length=200 ")

Map:   0%|          | 0/6981 [00:00<?, ? examples/s]



Map:   0%|          | 0/1746 [00:00<?, ? examples/s]

Tokenization completed with max_length=200 


In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
# 1. Configuring Training Parameters
args = Seq2SeqTrainingArguments(
    output_dir="./indicbart-hindi-asr-fix", # Directory where the fine-tuned model checkpoints are saved
    eval_strategy="epoch", # Performance is evaluated after every full pass of the data
    learning_rate=2e-5, # step size to adapt to new patterns without losing pre-trained knowledge
    per_device_train_batch_size=4,      # Number of samples processed at once per GPU
    gradient_accumulation_steps=2,      # Simulates a larger batch size for better numerical stability
    weight_decay=0.01,                  # Regularization to prevent overfitting

    num_train_epochs=8,               # Total training iterations over the dataset
    predict_with_generate=True,     # Enables the model to generate actual Hindi text during evaluation
    fp16=False,                     # Maintaining full precision for character accuracy
    logging_steps=100,              # Log training progress every 100 steps
    save_total_limit=1,               # Keeping the best recent checkpoint
    report_to="none"                 # Disabling external logging trackers
)
# 2. Initializing the Seq2SeqTrainer engine
trainer = Seq2SeqTrainer(
    model=model,                     # The loaded IndicBART model
    args=args,                       # The training configuration defined above
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer               # Tokenizer for decoding generated output
)

print(" Starting Training with Targeted Data focus...")
# 3. Executing the fine-tuning process
trainer.train()

  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 3, 'bos_token_id': 2}.


 Starting Training with Targeted Data focus...


Epoch,Training Loss,Validation Loss
1,0.4854,0.254908
2,0.1552,0.115823
3,0.1174,0.086356
4,0.0802,0.053325
5,0.0645,0.03994
6,0.0514,0.036251
7,0.0472,0.033295
8,0.0471,0.032609


TrainOutput(global_step=6984, training_loss=0.21180950724791825, metrics={'train_runtime': 7239.2842, 'train_samples_per_second': 7.715, 'train_steps_per_second': 0.965, 'total_flos': 1.18195970899968e+16, 'train_loss': 0.21180950724791825, 'epoch': 8.0})

In [None]:
# @title
import torch
import re

def quick_test_inference(noisy_text):
    model.eval()

    # Input format: <Noisy> </s> <2hi>
    input_text = f"{noisy_text} </s> <2hi>"

    # ADDED: return_token_type_ids=False to fix the ValueError
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        return_token_type_ids=False
    ).to("cuda")

    # Token ID for the Hindi start tag
    hindi_tag_id = tokenizer.convert_tokens_to_ids("<2hi>")

    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            max_new_tokens=100,
            num_beams=5,
            decoder_start_token_id=hindi_tag_id,
            length_penalty=1.0,
            repetition_penalty=1.0,
            early_stopping=True
        )

    decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

    # Clean up any leftover language tags
    clean_text = decoded_text.replace("<2hi>", "").strip()
    return clean_text

# --- THE TEST RUN ---
test_inputs = [
    "में जानता हु की में वहा नहीं था",        # Test 1 (Basic Punctuation)
    "ये काम मुझे समझ नही आ रहा हे",          # Test 10 (Grammar + Comma/Stop)
    "नमस्ते आप बहुत दिनो बाद मिले",           # Test 7 (Ending Marker)
    "कल छुट्टी है क्या तुम घर आओगे",          # Question Marker Test
]

print(" RUNNING POST-TRAINING TEST:\n" + "="*40)
for inp in test_inputs:
    output = quick_test_inference(inp)
    print(f"INPUT : {inp}")
    print(f"OUTPUT: {output}")
    print("-" * 40)

 RUNNING POST-TRAINING TEST:
INPUT : में जानता हु की में वहा नहीं था
OUTPUT: मैं जानता हूँ की मैं वहाँ नहीं था।
----------------------------------------
INPUT : ये काम मुझे समझ नही आ रहा हे
OUTPUT: ये काम मुझे समझ नही आ रहा है।
----------------------------------------
INPUT : नमस्ते आप बहुत दिनो बाद मिले
OUTPUT: नमस्ते! आप बहुत दिनो बाद मिले।
----------------------------------------
INPUT : कल छुट्टी है क्या तुम घर आओगे
OUTPUT: कल छुट्टी है। क्या तुम घर आओगे?
----------------------------------------


In [None]:
from google.colab import drive
import shutil
import os

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Define Paths
local_save_path = "./project_model"
drive_folder_path = "/content/drive/MyDrive/Project_model"
zip_name = "Project_model"

# 3. Save the model and tokenizer locally first
print("Saving model locally...")
trainer.save_model(local_save_path)
tokenizer.save_pretrained(local_save_path)

# 4. Copy the folder to Google Drive (Fast Backup)
if not os.path.exists(drive_folder_path):
    os.makedirs(drive_folder_path)

print(" Copying model folder to Google Drive...")
!cp -r {local_save_path}/* "{drive_folder_path}"

# 5. Create a Zip archive and copy it to Drive
print(" Creating Zip archive ...")
shutil.make_archive(zip_name, 'zip', local_save_path)
shutil.copy(f"{zip_name}.zip", f"/content/drive/MyDrive/{zip_name}.zip")

print(f"Success! Both the folder and Zip file are now safe in your Drive.")

Mounted at /content/drive
Saving model locally...
 Copying model folder to Google Drive...
 Creating Zip archive ...
Success! Both the folder and Zip file are now safe in your Drive.


In [None]:
# @title
import os
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# 1. Mount Drive (if not already done)
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

#
model_path = "/content/drive/MyDrive/Project_model"

#
if os.path.exists(model_path):
    print("Folder found! Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True).to("cuda")
    print("Model loaded and ready on GPU!")
else:
    print(f"Error: Folder NOT found at {model_path}")
    print("Please check Drive folder name manually.")

Folder found! Loading model...


AssertionError: Torch not compiled with CUDA enabled

In [None]:
import torch
import re

# 1. Function Definition
def normalize_asr_strict(noisy_text):
    model.eval()

    # Input format: <Noisy> </s> <2hi>
    input_text = f"{noisy_text} </s> <2hi>"

    # Tokenizing with return_token_type_ids=False for compatibility
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        return_token_type_ids=False
    ).to("cuda")

    # Hindi start tag ID
    hindi_tag_id = tokenizer.convert_tokens_to_ids("<2hi>")

    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            max_new_tokens=100,
            num_beams=5,
            decoder_start_token_id=hindi_tag_id,
            length_penalty=1.0,
            repetition_penalty=1.0,
            early_stopping=True
        )

    decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

    # Final cleanup of any tags
    clean_text = decoded_text.replace("<2hi>", "").strip()
    return clean_text

In [None]:
import pandas as pd
from tqdm import tqdm

# test_df
test_df = pd.read_csv('final_test_data_0.csv')

print(f" {len(test_df)} test sentences.")

 1746 test sentences.


In [None]:
from tqdm import tqdm
tqdm.pandas()
#  Processing
print(f" Processing {len(test_df)} sentences...")
test_df['Model_Output'] = test_df['Noisy'].progress_apply(normalize_asr_strict)


 Processing 1746 sentences...


100%|██████████| 1746/1746 [05:52<00:00,  4.95it/s]


In [None]:
!pip install jiwer -q

In [None]:
import pandas as pd
import re
from jiwer import wer, cer
from tqdm import tqdm
import numpy as np

def run_clean_final_report(df):
    # Converting the target and predicted text into lists
    targets = df['Target'].astype(str).tolist()
    preds = df['Model_Output'].astype(str).tolist()

    word_overlaps = []
    style_mismatches = 0

    print(f"Final Evaluation on {len(df)} samples...")

    for t, p in tqdm(zip(targets, preds), total=len(targets)):
        # 1. Word Match (Spelling & Vocabulary Focus)
        # Comparing words while ignoring punctuation and symbols
        t_w = set(re.sub(r'[^\u0900-\u097F\s]', '', t).split())
        p_w = set(re.sub(r'[^\u0900-\u097F\s]', '', p).split())

        if t_w:
            word_overlaps.append(len(t_w.intersection(p_w)) / len(t_w))

        # 2. Finding cases where words are correct but formatting (like spaces) differs
        t_clean = "".join(re.findall(r'[\u0900-\u097F0-9]', t))
        p_clean = "".join(re.findall(r'[\u0900-\u097F0-9]', p))
        if t_clean == p_clean and t != p:
            style_mismatches += 1

    # Calculating Final scores
    final_wer = wer(targets, preds)
    final_cer = cer(targets, preds)
    final_word_acc = np.mean(word_overlaps) * 100

    # Displaying the final performance report
    print("\n" + "═"*60)
    print(" FINAL ASR NORMALIZATION REPORT")
    print("═"*60)
    print(f" Word-Level Accuracy (ignores punctuation to check contextual correction): {final_word_acc:.2f}%")
    print(f" Word Error Rate (WER):           {final_wer:.4f}")
    print(f" Character Error Rate (CER):      {final_cer:.4f}")
    print(f" Formatting Only Mismatches:      {style_mismatches} sentences")
    print("═"*60)
    print("\n NOTE: Word-Level Accuracy measures contextual word identification.")
    print(" NOTE: Formatting mismatches do not affect the linguistic quality.")

    return {
        "Word_Acc": final_word_acc,
        "WER": final_wer,
        "CER": final_cer,
        "Style_Issues": style_mismatches
    }

# EXECUTE
final_metrics = run_clean_final_report(test_df.rename(columns={'Clean': 'Target'}))

Final Evaluation on 1746 samples...


100%|██████████| 1746/1746 [00:00<00:00, 29393.43it/s]


════════════════════════════════════════════════════════════
 FINAL ASR NORMALIZATION REPORT
════════════════════════════════════════════════════════════
 Word-Level Accuracy (ignores punctuation to check contextual correction): 96.95%
 Word Error Rate (WER):           0.0360
 Character Error Rate (CER):      0.0136
 Formatting Only Mismatches:      70 sentences
════════════════════════════════════════════════════════════

 NOTE: Word-Level Accuracy measures contextual word identification.
 NOTE: Formatting mismatches do not affect the linguistic quality.





In [None]:
#test
for i in range(5):
    input_text = test_df['Noisy'].iloc[i]
    prediction = normalize_asr_strict(input_text)
    print(f"Input : {input_text}")
    print(f"Model : {prediction}")
    print("-" * 30)

Input : अरे! राम और लक्ष्मण को देखो, वे कितनी वीरता से लड़ रहे हैं
Model : अरे! राम और लक्ष्मण को देखो, वे कितनी वीरता से लड़ रहे हैं।
------------------------------
Input : चुप रहो तुम बहुत बोल रहे हो क्या तुम्हे सुनाई नहीं देता
Model : चुप रहो! तुम बहुत बोल रहे हो, क्या तुम्हें सुनाई नहीं देता?
------------------------------
Input : उफ़! क्या इस गर्मी में पंखा, कूलर और एसी सब फेल हो गए हैं?
Model : उफ़! क्या इस गर्मी में पंखा, कूलर और एसी सब फेल हो गए हैं?
------------------------------
Input : मैं बचपन में बहुत शरारती था पर अब मैं काफी गंभीर हो गया हूँ| संभवतः उम्र के साथ समझदारी आ जाती है, अतः मैं अब हर कदम सोच-समझकर उठाता हूँ| क्या तुम भी ऐसा ही महसूस करते हो?
Model : मैं बचपन में बहुत शरारती था पर अब मैं काफी गंभीर हो गया हूँ| संभवतः उम्र के साथ समझदारी आ जाती है, अतः मैं अब हर कदम सोच-समझकर उठाता हूँ| क्या तुम भी ऐसा ही महसूस करते हो?
------------------------------
Input : यह संभव नहीं है, मित्र!
Model : यह संभव नहीं है, मित्र!
------------------------------


In [None]:
test = [

    "में जानता हु की में वहा नहीं था",
    "में बाज़ार में था पर मेंने तुम्हे नहीं देखा",
    "रुको कहा जा रहे हो",

    # Cases (Single & Multi-sentence)
    "क्या तुम जानते हो की वो कहा रहता है मुझे बताओ",
    "कल बारिश होगी क्या तुम्हे पता है",
    "नमस्ते आप बहुत दिनो बाद मिले",
    "वहा बहुत भीड थी मे डरा हुआ था",
    "उसे बोला था की यहा मत आना पर वो नही माना",
    "ये काम मुझे समझ नही आ रहा हे",
    "जल्दी चलो ट्रेन छूट जाएगी देर हो रही है",



    "कल छुट्टी है क्या तुम घर आओगे",
    "क्या तुम पागल हो ऐसा कोन करता है",
    "खाना तैयार है जल्दी आओ वरना ठंडा हो जाएगा",
    "नमस्ते सर में कल बाज़ार गया था वहा मेने देखा की बहुत भीड है पर मुझे समझ नही आया की लोग इतने परेशां क्यूँ थे क्या आपको पता है की वहा क्या हुआ था अगर आपको कुछ खबर मिले तो मुझे बताना में इंतज़ार करुगा"
]

print("Running Final Evaluation...\n" + "="*80)

for i, noisy in enumerate(test, 1):
    output = normalize_asr_strict(noisy)
    print(f"Test {i} (Input) : {noisy}")
    print(f"Test {i} (Output): {output}")
    print("-" * 80)

Running Final Evaluation...
Test 1 (Input) : में जानता हु की में वहा नहीं था
Test 1 (Output): मैं जानता हूँ की मैं वहाँ नहीं था।
--------------------------------------------------------------------------------
Test 2 (Input) : में बाज़ार में था पर मेंने तुम्हे नहीं देखा
Test 2 (Output): मैं बाज़ार में था पर मैंने तुम्हें नहीं देखा।
--------------------------------------------------------------------------------
Test 3 (Input) : रुको कहा जा रहे हो
Test 3 (Output): रुको! कहाँ जा रहे हो?
--------------------------------------------------------------------------------
Test 4 (Input) : क्या तुम जानते हो की वो कहा रहता है मुझे बताओ
Test 4 (Output): क्या तुम जानते हो की वह कहाँ रहता है? मुझे बताओ।
--------------------------------------------------------------------------------
Test 5 (Input) : कल बारिश होगी क्या तुम्हे पता है
Test 5 (Output): कल बारिश होगी, क्या तुम्हें पता है?
--------------------------------------------------------------------------------
Test 6 (Input) : नमस्ते आप बहुत दिन

In [None]:
# --- EXTENDED PARAGRAPH & COMPLEX TEST SUITE ---
test = [
    # 1. Official/Professional Paragraph (Testing connectives like 'kyuki', 'isliye')
    "नमस्ते सर में कल दफ्तर नहीं आ पाया क्युकी मेरी तबियत ठीक नहीं थी और में डॉक्टर के पास गया था इसलिए क्या आप मुझे कल की मीटिंग की जानकारी दे सकते है ताकि में अपना काम पूरा कर सकू",

    # 2. Daily Life/Instructional (Testing flow and urgent tone)
    "सावधान आगे रास्ता खराब है वहा बहुत पत्थर और कांच के टुकड़े पड़े है अपनी गाड़ी धीरे चलाइये वरना टायर पंक्चर हो सकता है क्या आपको मेरी बात समझ आ रही है",

    # 3. Emotional/Storytelling (Testing gender consistency and long-term context)
    "मेरी दादी ने कहा की में बहुत बहादुर हु पर में जानता हु की उस रात में बहुत डरा हुआ था वहा अँधेरा था और चारो ओर अजीब सी आवाज़े आ रही थी क्या आप कभी ऐसी स्थिति में फंसे है",

    # 4. Market/Conversation (Testing numeric and 'Me/Main' confusion)
    "बाजार मे बहुत भीड थी मेने १० किलो चीनी और कुछ फल ख़रीदे पर मुझे दुकानदार ने पैसे कम वापस दिए क्युकी उसे लगा की में हिसाब नहीं जानता हु",

    # 5. Travel/Urgency (Testing multiple sentence boundaries)
    "जल्दी चलो भाई ट्रेन छूट जाएगी हमे प्लेटफार्म नंबर ४ पर पहुचना है और हमारे पास सिर्फ ५ मिनट बचे है क्या तुम सारा सामान उठा सकते हो या में किसी की मदद लू",

    # 6. House/Domestic (Testing 'In vs I' and spelling)
    "कमरे मे बहुत धूल जमी है मुजे पूरा बिस्वास है की तुमने खिड़की खुली छोड़ी थी इसलिए अब सारा सामान साफ करो और इसे फिर से जमाओ",

    # 7. Weather/Speculation (Testing questions in middle of paragraphs)
    "कल मौसम बहुत ख़राब था क्या तुम्हे पता है की वहा ओले गिरे है में तो घर के अंदर ही था पर बाहर खड़े पेड़ गिर गए है",

    # 8. Your Original Long Case (For comparison)
    "नमस्ते सर में कल बाज़ार गया था वहा मेने देखा की बहुत भीड है पर मुझे समझ नही आया की लोग इतने परेशां क्यूँ थे क्या आपको पता है की वहा क्या हुआ था अगर आपको कुछ खबर मिले तो मुझे बताना में इंतज़ार करुगा"
]

print("Running Evaluation on Large Paragraphs & Complex Context...\n" + "="*80)

for i, noisy in enumerate(test, 1):
    output = normalize_asr_strict(noisy)
    print(f"Test {i} (Input) : {noisy}")
    print(f"Test {i} (Output): {output}")
    print("-" * 80)

Running Evaluation on Large Paragraphs & Complex Context...
Test 1 (Input) : नमस्ते सर में कल दफ्तर नहीं आ पाया क्युकी मेरी तबियत ठीक नहीं थी और में डॉक्टर के पास गया था इसलिए क्या आप मुझे कल की मीटिंग की जानकारी दे सकते है ताकि में अपना काम पूरा कर सकू
Test 1 (Output): नमस्ते सर! मैं कल दफ्तर नहीं आ पाया क्योंकि मेरी तबियत ठीक नहीं थी और मैं डॉक्टर के पास गया था| इसलिए क्या आप मुझे कल की मीटिंग की जानकारी दे सकते हैं ताकि मैं अपना काम पूरा कर सकूँ?
--------------------------------------------------------------------------------
Test 2 (Input) : सावधान आगे रास्ता खराब है वहा बहुत पत्थर और कांच के टुकड़े पड़े है अपनी गाड़ी धीरे चलाइये वरना टायर पंक्चर हो सकता है क्या आपको मेरी बात समझ आ रही है
Test 2 (Output): सावधान! आगे रास्ता खराब है, वहाँ बहुत पत्थर और कांच के टुकड़े पड़े हैं। अपनी गाड़ी धीरे चलाइये वरना टायर पंक्चर हो सकता है। क्या आपको मेरी बात समझ आ रही है?
--------------------------------------------------------------------------------
Test 3 (Input) : मेरी दादी ने कहा की में बहुत 

In [None]:
!pip install openai-whisper

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m573.4/803.2 kB[0m [31m17.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=e146fda3e0a85ef0f782a8d41d9a523b263ba4f42f206e5bc91c57e81e27f0d7
  Stored in directory: /root/.cache/pip/wheels/61/d2/20/09ec9bef734d1

In [None]:
!pip install SpeechRecognition pydub

Collecting SpeechRecognition
  Downloading speechrecognition-3.14.5-py3-none-any.whl.metadata (30 kB)
Downloading speechrecognition-3.14.5-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.5


In [None]:
import gradio as gr
import speech_recognition as sr
from pydub import AudioSegment
import os
import uuid

# FUNCTION 1: Manual Text Normalization
def manual_text_normalize(noisy_text):
    if not noisy_text.strip():
        return "Please enter some text."
    # Calling inference function
    return quick_test_inference(noisy_text)

# FUNCTION 2: Audio Pipeline
def process_audio_pipeline(audio_path):
    if audio_path is None or not os.path.exists(audio_path):
        return "Audio signal not detected.", ""
#to process audio data
    recognizer = sr.Recognizer()
    unique_wav = f"temp_{uuid.uuid4().hex}.wav"

    try:
        audio = AudioSegment.from_file(audio_path)
        audio.export(unique_wav, format="wav") #loaded audio saved on disk

        with sr.AudioFile(unique_wav) as source:
            recognizer.adjust_for_ambient_noise(source, duration=0.5)
            audio_data = recognizer.record(source)
            raw_text = recognizer.recognize_google(audio_data, language="hi-IN")

        if raw_text.strip():
            # Using model function
            clean_text = quick_test_inference(raw_text)
            return raw_text, clean_text
        else:
            return "Speech not recognized.", "Please speak clearly."

    except Exception as e:
        return f"Error: {str(e)}", "Please try again."
    finally:
        if os.path.exists(unique_wav):
            os.remove(unique_wav)

# GRADIO INTERFACE
#  light theme using custom CSS and Soft theme
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")) as demo:
    gr.Markdown("Raw ASR Normalizer")

    with gr.Tabs():
        # TAB 1: Speech to Text
        with gr.TabItem("Audio Normalizer"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Record/Upload")
                    btn_audio = gr.Button("Process Audio", variant="primary")
                with gr.Column():
                    raw_out = gr.Textbox(label="Raw ASR Output", lines=3)
                    clean_out = gr.Textbox(label="Corrected Text", lines=3)

            btn_audio.click(process_audio_pipeline, inputs=audio_input, outputs=[raw_out, clean_out])

        # TAB 2: Manual Text Input
        with gr.TabItem("Direct Text Input"):
            gr.Markdown("### Enter Noisy Hindi Text manually to clean it")
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(label="Input Noisy Text", placeholder="e.g., में वहा जा रहा हु...", lines=5)
                    btn_text = gr.Button("Normalize Text", variant="primary")
                with gr.Column():
                    text_output = gr.Textbox(label="Corrected Output", lines=5)

            btn_text.click(manual_text_normalize, inputs=text_input, outputs=text_output)

    gr.Markdown("---")
    gr.Markdown("Built with IndicBART & Google ASR Engine")

# Launch
demo.launch(debug=True, share=True)

  with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")) as demo:


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://86a1dcefe10f59f39b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
if 'demo' in locals():
    demo.close()

Closing server running on port: 7860
