In [None]:
import pandas as pd

## Normalization


In [None]:
import re
import json

def load_normalization_dict(filename):
    """Load a word-normalization mapping from a JSON file.

    Keys and values are lower-cased. When two keys become identical after
    lower-casing, the first one encountered wins.

    Args:
        filename: Path to a UTF-8 encoded JSON file whose top level is an
            object mapping spelling variants to their normalized form.

    Returns:
        dict: The lower-cased mapping. An empty dict is returned (after
        printing an error) when the file is missing, is not valid JSON, or
        its top level is not a JSON object. Entries whose value is not a
        string are skipped with a warning.
    """
    try:
        with open(filename, "r", encoding="utf-8") as file:
            raw_dict = json.load(file)
    except json.JSONDecodeError:
        print("Error: Unable to decode JSON. Please check the file format.")
        return {}
    except FileNotFoundError:
        print("Error: File not found. Please check the file path.")
        return {}

    # Valid JSON can still be a list/number/string; `.items()` would raise.
    if not isinstance(raw_dict, dict):
        print("Error: Expected a JSON object of word mappings. Please check the file format.")
        return {}

    processed_dict = {}
    for key, value in raw_dict.items():
        # JSON keys are always strings, but values may be null/number/list.
        if not isinstance(value, str):
            print(f"Warning: Skipping non-string mapping for {key!r}.")
            continue
        # setdefault keeps the first value seen for a case-insensitive key.
        processed_dict.setdefault(key.lower(), value.lower())

    return processed_dict

# Load dictionary
normalization_dict = load_normalization_dict("./dict.json")

# Show the size and a short preview instead of echoing the whole mapping.
# A count of 0 usually means the loader fell back to an empty dict.
print(f"{len(normalization_dict)} normalization entries")
dict(list(normalization_dict.items())[:10])

{'mei': 'main',
 'mai': 'main',
 'main': 'main',
 'kya': 'kya',
 'kia': 'karna',
 'kiya': 'kar',
 'mujhe': 'mujhe',
 'mujhay': 'mujhe',
 'mujhy': 'mujhe',
 'uthna': 'uthna',
 'uthay': 'uthna',
 'utha': 'uthna',
 'nashta': 'nashta',
 'nashtay': 'nashta',
 'bjay': 'baje',
 'baje': 'baje',
 'nikla': 'nikalna',
 'niklay': 'nikalna',
 'niklaa': 'nikalna',
 'niklayga': 'niklega',
 'poncha': 'pohcha',
 'pohunch': 'pohcha',
 'pohanch': 'pohcha',
 'chal': 'chala',
 'chala': 'chala',
 'chalay': 'chala',
 'aya': 'aaya',
 'aaya': 'aaya',
 'aai': 'aayi',
 'gaya': 'gaya',
 'gai': 'gayi',
 'gaye': 'gaye',
 'betha': 'baitha',
 'bethay': 'baitha',
 'bethayga': 'baithayga',
 'parhna': 'parhna',
 'parhi': 'parhna',
 'parh': 'parhna',
 'parhayi': 'parhna',
 'parhay': 'parhna',
 'parhny': 'parhna',
 'acha': 'achha',
 'accha': 'achha',
 'bohot': 'bahut',
 'boht': 'bahut',
 'kaafi': 'kaafi',
 'thoda': 'thoda',
 'thori': 'thodi',
 'thoriya': 'thodi',
 'masla': 'masla',
 'maslay': 'masla',
 'masle': 'masla',
 

### Data preprocessing

In [None]:
def preprocess_text(text):
    """Clean one raw line of text.

    In order: lower-case the text, delete every character that is neither a
    word character nor whitespace, squeeze each whitespace run (spaces, tabs,
    line breaks) into a single space and trim, then remove a leading run of
    digits together with the whitespace that follows it.

    Args:
        text: The raw string to clean.

    Returns:
        str: The cleaned string (possibly empty).
    """
    without_punct = re.sub(r'[^\w\s]', '', text.lower())
    # `\s` already matches tabs and line breaks, so this one pass leaves
    # nothing for separate tab / newline clean-ups to do.
    squeezed = re.sub(r'\s+', ' ', without_punct).strip()
    # Drops numbering at the start of the line; note the pattern removes any
    # leading digits, whether or not they were list numbering.
    return re.sub(r'^\d+\s*', '', squeezed).strip()


### Load dataset

In [None]:
import os

def _natural_sort_key(name):
    """Sort key that orders embedded numbers numerically (Day2 before Day10)."""
    parts = re.split(r'(\d+)', name)
    # re.split with one capture group alternates text / digits, so odd
    # positions are always the digit runs.
    key = [int(part) if idx % 2 else part.lower() for idx, part in enumerate(parts)]
    return (key, name)


def load_data(folder_path):
    """Read every ``.txt`` file in a folder into lists of cleaned lines.

    Directory entries are visited in natural filename order rather than the
    arbitrary order returned by ``os.listdir``, so the position of a file's
    lines in the result is reproducible between runs and machines.

    Each line is cleaned with ``preprocess_text``, then every word is looked
    up in the module-level ``normalization_dict`` (unknown words are kept
    as-is). Lines that end up empty are dropped. Every directory entry is
    printed with a running counter, whether or not it is a ``.txt`` file.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        list[list[str]]: One inner list of cleaned lines per ``.txt`` file.
    """
    all_lines = []

    entries = sorted(os.listdir(folder_path), key=_natural_sort_key)
    for count, filename in enumerate(entries, 1):
        print(f"{count}, {filename}" )

        if not filename.endswith(".txt"):
            continue

        file_lines = []
        filepath = os.path.join(folder_path, filename)

        with open(filepath, 'r', encoding='utf-8') as file:
            for raw_line in file:
                words = preprocess_text(raw_line.strip()).split()
                cleaned = " ".join(normalization_dict.get(word, word) for word in words)

                if cleaned:
                    file_lines.append(cleaned.strip())

        all_lines.append(file_lines)

    return all_lines

# Scan the notebook's working directory; load_data only reads the entries
# whose name ends in ".txt" and returns one list of cleaned lines per file.
folder_path = "./"
data = load_data(folder_path)

# print(data[:5])

1, .config
2, 0.wav
3, 22i0618_AminFahim_Day4.txt
4, 22i0618_AminFahim_Day3.txt
5, 22i0618_AminFahim_Day13.txt
6, 22i0618_AminFahim_Day7.txt
7, 22i0618_AminFahim_Day5.txt
8, 22i0618_AminFahim_Day6.txt
9, =0.9.2
10, 22i0618_AminFahim_Day9.txt
11, 22i0618_AminFahim_Day1.txt
12, 22i0618_AminFahim_Day14.txt
13, 22i0618_AminFahim_Day8.txt
14, 22i0618_AminFahim_Day2.txt
15, 22i0618_AminFahim_Day11.txt
16, 22i0618_AminFahim_Day12.txt
17, 22i0618_AminFahim_Day10.txt
18, dict.json
19, sample_data


In [None]:
# Preview only the first two files' lines; echoing all of `data` floods the output.
data[:2]

[['aj subah ma 7:30 pa uthna aur torhi deer phone use karna uska baad muh dhona chala gaya aur phir nashta karna',
  'nashta kar ka baad ma apna dost ko lakar 8 baja university karna taraf nikal gaya university phonch kar ma seedha apni class ma gaya',
  'do consecutive classes li aur phir ma apna dost ka sath cafe chala gaya aj jumma karna wajah sa hamari 2 ghanta karna break thi',
  'break ma torhi deer football khela aur phir ma dost ka sath khana khana aur phir hum sara milkar jumma parhna chala gaya',
  'jumma karna namaz ka baad ma class laina chala gaya aur class li 5 baja hamari chuti hui aur ma 530 ko university sa nikal gaya aur 610 taak ghar phonch gaya',
  'ghar aka fresh huwa aur phir dost ka sath bahar chala gaya kaafi deer unka sath tha phir ma khana khana ghar wapis a gaya',
  'khana ka baad ma dost ka sath torha game khela aur phir ma apna university ka kaam kar lag gaya',
  'kaam kar ka baad movie dekhi aur ma phir so gaya'],
 ['aj subah ma 9:00 pa uthna aur torhi dee

## Testing different TTS models

In [None]:
pip install outetts --upgrade

Collecting outetts
  Downloading outetts-0.4.0-py3-none-any.whl.metadata (8.7 kB)
Collecting llama-cpp-python==0.3.8 (from outetts)
  Downloading llama_cpp_python-0.3.8.tar.gz (67.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting encodec (from outetts)
  Downloading encodec-0.1.1.tar.gz (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers==4.48.3 (from outetts)
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.1 M

In [None]:
import outetts

# Model configuration: version 1.0 / 1B model on the llama.cpp backend with
# FP16 weights. For the transformers backend, use backend=outetts.Backend.HF.
model_config = outetts.ModelConfig.auto_config(
    model=outetts.Models.VERSION_1_0_SIZE_1B,
    backend=outetts.Backend.LLAMACPP,
    quantization=outetts.LlamaCppQuantization.FP16,
)

# Initialize the interface
interface = outetts.Interface(config=model_config)

# Load the default speaker profile.
# A custom profile can be created and reused instead:
#   speaker = interface.create_speaker("path/to/audio.wav")
#   interface.save_speaker(speaker, "speaker.json")
#   speaker = interface.load_speaker("speaker.json")
speaker = interface.load_default_speaker("EN-FEMALE-1-NEUTRAL")

# Describe the generation request: English smoke-test sentence, chunked
# generation, sampling temperature 0.4.
sampler = outetts.SamplerConfig(temperature=0.4)
generation_config = outetts.GenerationConfig(
    text="Hello, how are you doing?",
    generation_type=outetts.GenerationType.CHUNKED,
    speaker=speaker,
    sampler_config=sampler,
)

# Generate speech
output = interface.generate(config=generation_config)

# Save to file
output.save("output.wav")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

[32m2025-04-17 05:45:13.759[0m | [1mINFO    [0m | [36moutetts.models.config[0m:[36mauto_config[0m:[36m147[0m - [1mInitializing model configuration for Llama-OuteTTS-1.0-1B model with llamacpp backend.[0m


Llama-OuteTTS-1.0-1B-FP16.gguf:   0%|          | 0.00/2.50G [00:00<?, ?B/s]

[32m2025-04-17 05:45:43.499[0m | [1mINFO    [0m | [36moutetts.models.config[0m:[36mauto_config[0m:[36m154[0m - [1mModel path: /root/.cache/outeai/gguf/Llama-OuteTTS-1.0-1B-FP16.gguf[0m
[32m2025-04-17 05:45:43.502[0m | [1mINFO    [0m | [36moutetts.models.config[0m:[36mauto_config[0m:[36m179[0m - [1mLLAMA.CPP backend selected. Offloading all layers to GPU.[0m
[32m2025-04-17 05:45:43.504[0m | [1mINFO    [0m | [36moutetts.models.config[0m:[36mauto_config[0m:[36m184[0m - [1mUsing config:[0m


{ 'additional_model_config': {},
  'audio_codec_path': None,
  'backend': <Backend.LLAMACPP: 'llamacpp'>,
  'device': None,
  'dtype': None,
  'interface_version': <InterfaceVersion.V3: 3>,
  'max_seq_length': 8192,
  'model_path': '/root/.cache/outeai/gguf/Llama-OuteTTS-1.0-1B-FP16.gguf',
  'n_gpu_layers': 99,
  'tokenizer_path': 'OuteAI/Llama-OuteTTS-1.0-1B',
  'verbose': False}


weights_24khz_1.5kbps_v1.0.pth:   0%|          | 0.00/296M [00:00<?, ?B/s]

  WeightNorm.apply(module, name, dim)


tokenizer_config.json:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/18.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131k [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
load: control token: 131193 '<|energy_887|>' is not marked as EOG
load: control token: 131192 '<|energy_886|>' is not marked as EOG
load: control token: 131190 '<|energy_884|>' is not marked as EOG
load: control token: 131188 '<|energy_882|>' is not marked as EOG
load: control token: 131187 '<|energy_881|>' is not marked as EOG
load: control token: 131183 '<|energy_877|>' is not marked as EOG
load: control token: 131182 '<|energy_876|>' is not marked as EOG
load: control token: 131181 '<|energy_875|>' is not marked as EOG
load: control token: 131180 '<|energy_874|>' is not marked as EOG
load: control token: 131179 '<|energy_873|>' is not marked as EOG
load: control token: 131178 '<|energy_872|>' is not marked as EOG
load: control token: 131177 '<|energy_871|>' is not marked as EOG
load: control token: 131176 '<|energy_870|>' is not marked as EOG
load: control token: 131174 '<|energy_868|>' is not marked as EOG
load: contr

In [None]:
from IPython.display import Audio

# Embed an inline player for output.wav, the file saved by the generation cell above.
display(Audio("output.wav"))

In [None]:
# Same speaker and sampling settings as the English smoke test, now on a
# romanised sentence written in the style of the dataset.
sample_config = outetts.GenerationConfig(
    text="aJ ma subah uth ka university gaya",
    generation_type=outetts.GenerationType.CHUNKED,
    speaker=speaker,
    sampler_config=outetts.SamplerConfig(temperature=0.4),
)
output = interface.generate(config=sample_config)

# Save to file (this overwrites the output.wav written by the earlier cell)
output.save("output.wav")

[32m2025-04-17 05:56:59.723[0m | [1mINFO    [0m | [36moutetts.version.interface[0m:[36mchunk_generation[0m:[36m251[0m - [1mCreated: 1 text chunks[0m
[32m2025-04-17 05:56:59.725[0m | [1mINFO    [0m | [36moutetts.version.interface[0m:[36mchunk_generation[0m:[36m253[0m - [1mProccessing: Chunk 1 / 1[0m
0it [00:00, ?it/s]Llama.generate: 37 prefix-match hit, remaining 1807 prompt tokens to eval
622it [06:28,  1.60it/s, tokens=2466, max tokens=8192]
[32m2025-04-17 06:04:00.051[0m | [1mINFO    [0m | [36moutetts.version.playback[0m:[36msave[0m:[36m43[0m - [1mSaved audio to: output.wav[0m


In [None]:
from IPython.display import Audio

# Embed an inline player for output.wav, which the cell above has just rewritten.
display(Audio("output.wav"))

## Hugging Face Kokoro TTS model

In [None]:
!pip install -q kokoro>=0.9.2 soundfile
!apt-get -qq -y install espeak-ng > /dev/null 2>&1
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf
import torch
pipeline = KPipeline(lang_code='a')
text = '''
[Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.
'''
generator = pipeline(text, voice='af_heart')
for i, (gs, ps, audio) in enumerate(generator):
    print(i, gs, ps)
    display(Audio(data=audio, rate=24000, autoplay=i==0))
    sf.write(f'{i}.wav', audio, 24000)



config.json:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

  WeightNorm.apply(module, name, dim)


kokoro-v1_0.pth:   0%|          | 0.00/327M [00:00<?, ?B/s]

af_heart.pt:   0%|          | 0.00/523k [00:00<?, ?B/s]

0 Kokoro is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, Kokoro can be deployed anywhere from production environments to personal projects. kˈOkəɹO ɪz ɐn ˈOpᵊnwˌAt tˌitˌiˈɛs mˈɑdᵊl wɪð ˈATi tˈu mˈɪljᵊn pəɹˈæməTəɹz. dəspˈIt ɪts lˈItwˌAt ˈɑɹkətˌɛkʧəɹ, ɪt dəlˈɪvəɹz kˈɑmpəɹəbᵊl kwˈɑləTi tə lˈɑɹʤəɹ mˈɑdᵊlz wˌIl bˈiɪŋ səɡnˈɪfəkəntli fˈæstəɹ ænd mˈɔɹ kˈɔstəfˌɪʃənt. wˌɪð əpˌæʧilˈIsᵊnst wˈAts, kˈOkəɹO kæn bi dəplˈYd ˈɛniwˌɛɹ fɹʌm pɹədˈʌkʃən ənvˈIɹənmᵊnts tə pˈɜɹsᵊnəl pɹˈɑʤˌɛkts.


In [None]:
# Re-create the pipeline with lang_code='p' and run the same af_heart voice on
# a romanised sample sentence. Each yielded chunk is printed (index, text,
# phoneme string) and played inline; nothing is written to disk here.
# NOTE(review): in Kokoro's documented language codes 'p' appears to be
# Brazilian Portuguese and af_heart an American-English voice — confirm this
# pairing is the intended approximation for this text.
pipeline = KPipeline(lang_code='p')
text = '''
aj ma subah uth ka school gaya
'''
generator = pipeline(text, voice='af_heart')
for i, (gs, ps, audio) in enumerate(generator):
    print(i, gs, ps)
    display(Audio(data=audio, rate=24000, autoplay=i==0))
    # sf.write(f'{i}.wav', audio, 24000)

0 aj ma subah uth ka school gaya ˈaʒ mˈa subˈa ˈut kˈa sʃˈʊl ɡˈIæ


## Generating audio for our dataset with Kokoro

In [None]:
# Preview only the first two files' lines; echoing all of `data` floods the output.
data[:2]

[['aj subah ma 7:30 pa uthna aur torhi deer phone use karna uska baad muh dhona chala gaya aur phir nashta karna',
  'nashta kar ka baad ma apna dost ko lakar 8 baja university karna taraf nikal gaya university phonch kar ma seedha apni class ma gaya',
  'do consecutive classes li aur phir ma apna dost ka sath cafe chala gaya aj jumma karna wajah sa hamari 2 ghanta karna break thi',
  'break ma torhi deer football khela aur phir ma dost ka sath khana khana aur phir hum sara milkar jumma parhna chala gaya',
  'jumma karna namaz ka baad ma class laina chala gaya aur class li 5 baja hamari chuti hui aur ma 530 ko university sa nikal gaya aur 610 taak ghar phonch gaya',
  'ghar aka fresh huwa aur phir dost ka sath bahar chala gaya kaafi deer unka sath tha phir ma khana khana ghar wapis a gaya',
  'khana ka baad ma dost ka sath torha game khela aur phir ma apna university ka kaam kar lag gaya',
  'kaam kar ka baad movie dekhi aur ma phir so gaya'],
 ['aj subah ma 9:00 pa uthna aur torhi dee

In [None]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
from kokoro import KPipeline
from IPython.display import Audio
import soundfile as sf
from pydub import AudioSegment
import os

# Initialize Kokoro pipeline
pipeline = KPipeline(lang_code='p')


# Create output folder
os.makedirs("day_audio", exist_ok=True)

# Trial run on the first two entries of `data` only.
dairy=data[:2]

# Process and save audio.
# day_idx is the 1-based position in `dairy`, not a number parsed from a filename.
for day_idx, day in enumerate(dairy, 1):
    combined = AudioSegment.silent(duration=0)
    for i, text in enumerate(day):
        entry_file = f"day_audio/day{day_idx}_entry{i}.wav"
        entry_audio = AudioSegment.silent(duration=0)
        got_audio = False
        # The pipeline can yield several chunks for one entry. Collect them all
        # before saving, so the per-entry file holds the whole entry instead of
        # only its last chunk.
        for j, (gs, ps, audio) in enumerate(pipeline(text, voice='af_heart')):
            # Round-trip through a WAV file to turn the raw samples into an AudioSegment.
            sf.write(entry_file, audio, 24000)
            entry_audio += AudioSegment.from_wav(entry_file)
            got_audio = True
        if not got_audio:
            continue
        entry_audio.export(entry_file, format="wav")
        # One 500 ms pause per entry (not per chunk) in the combined track.
        combined += entry_audio + AudioSegment.silent(duration=500)
    combined.export(f"day_audio/day{day_idx}_full.wav", format="wav")



  WeightNorm.apply(module, name, dim)


In [None]:
from kokoro import KPipeline
from IPython.display import Audio
import soundfile as sf
from pydub import AudioSegment
import os

# Initialize Kokoro pipeline
pipeline = KPipeline(lang_code='p')


# Create output folder
os.makedirs("day_audio", exist_ok=True)

# Full run over every entry of `data`.
dairy=data

# Process and save audio.
# day_idx is the 1-based position in `dairy`, not a number parsed from a filename.
for day_idx, day in enumerate(dairy, 1):
    combined = AudioSegment.silent(duration=0)
    for i, text in enumerate(day):
        entry_file = f"day_audio/day{day_idx}_entry{i}.wav"
        entry_audio = AudioSegment.silent(duration=0)
        got_audio = False
        # The pipeline can yield several chunks for one entry. Collect them all
        # before saving, so the per-entry file holds the whole entry instead of
        # only its last chunk.
        for j, (gs, ps, audio) in enumerate(pipeline(text, voice='af_heart')):
            # Round-trip through a WAV file to turn the raw samples into an AudioSegment.
            sf.write(entry_file, audio, 24000)
            entry_audio += AudioSegment.from_wav(entry_file)
            got_audio = True
        if not got_audio:
            continue
        entry_audio.export(entry_file, format="wav")
        # One 500 ms pause per entry (not per chunk) in the combined track.
        combined += entry_audio + AudioSegment.silent(duration=500)
    combined.export(f"day_audio/day{day_idx}_full.wav", format="wav")





In [None]:
import shutil
# google.colab is only importable inside a Colab runtime.
from google.colab import files

# Zip the folder: packs the contents of ./day_audio into ./day_audio.zip
shutil.make_archive('day_audio', 'zip', 'day_audio')

# Download the zip file through the browser
files.download('day_audio.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>