### Importing Required Libraries

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

### Dataset Processing

In [2]:
import zipfile
import os
# Unzip the main dataset file from the parent directory
zip_path = '../hindi_female_english.zip'
output_dir = 'dataset'
if os.path.exists(zip_path):
    print(f"Found zip file at: {zip_path}")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(output_dir)
        print(f"Successfully extracted to '{output_dir}/'")
else:
    print(f"Error: File not found at {zip_path}")
    print("Current working directory:", os.getcwd())

Error: File not found at ../hindi_female_english.zip
Current working directory: /Users/rutwik/piper-model-training/scripts


### Some Analysis of the dataset

In [3]:
path_to_file = 'dataset/english/txt.done.data'

In [4]:
text = open(path_to_file, 'r',encoding='utf-8',
                 errors='ignore').read()

In [5]:
print(text[:1000])

( train_hindifullfemale_00001 " Author of the danger trail, Philip Steels, etc. " )
( train_hindifullfemale_00002 " Not at this particular case, Tom, apologized Whittemore. " )
( train_hindifullfemale_00003 " For the twentieth time that evening the two men shook hands. " )
( train_hindifullfemale_00004 " Lord, but I'm glad to see you again, Phil. " )
( train_hindifullfemale_00005 " Will we ever forget it. " )
( train_hindifullfemale_00006 " God bless 'em, I hope I'll go on seeing them forever. " )
( train_hindifullfemale_00007 " And you always want to see it in the superlative degree. " )
( train_hindifullfemale_00008 " Gad, your letter came just in time. " )
( train_hindifullfemale_00009 " He turned sharply, and faced Gregson across the table. " )
( train_hindifullfemale_00010 " I'm playing a single hand in what looks like a losing game. " )
( train_hindifullfemale_00011 " If I ever needed a fighter in my life I need one now. " )
( train_hindifullfemale_00012 " Gregson shoved back his

In [6]:
# The unique characters in the file
vocab = sorted(set(text))
print(vocab)
len(vocab)

['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'â€“']


77

In [7]:
char_to_ind = {u:i for i, u in enumerate(vocab)}
ind_to_char = np.array(vocab)
encoded_text = np.array([char_to_ind[c] for c in text])
seq_len = 250
total_num_seq = len(text)//(seq_len+1)
total_num_seq

2921

In [8]:
import tensorflow as tf
char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)

sequences = char_dataset.batch(seq_len+1, drop_remainder=True)

def create_seq_targets(seq):
    input_txt = seq[:-1]
    target_txt = seq[1:]
    return input_txt, target_txt

dataset = sequences.map(create_seq_targets)

2026-01-14 16:27:20.233018: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  if not hasattr(np, "object"):


### Forming the metadata.csv file

In [9]:
# Load and clean
metadata_female_fp = "dataset/english/txt.done.data"

# Read file split by double quotes (to separate filename and transcript)
metadata_female = pd.read_csv(metadata_female_fp, sep='"', usecols=[0, 1], header=None)

# Clean filename: remove leading '(' and whitespace
metadata_female[0] = metadata_female[0].str.replace(r'\(', '', regex=True).str.strip()

# Clean transcript: remove leading/trailing whitespace
metadata_female[1] = metadata_female[1].str.strip()

# Optional: add speaker info
metadata_female[2] = 'female'

# Show shape and preview
print(metadata_female.shape)
metadata_female.head()


(6541, 3)


Unnamed: 0,0,1,2
0,train_hindifullfemale_00001,"Author of the danger trail, Philip Steels, etc.",female
1,train_hindifullfemale_00002,"Not at this particular case, Tom, apologized W...",female
2,train_hindifullfemale_00003,For the twentieth time that evening the two me...,female
3,train_hindifullfemale_00004,"Lord, but I'm glad to see you again, Phil.",female
4,train_hindifullfemale_00005,Will we ever forget it.,female


### Bulding the metadata.csv file

In [10]:
metadata = metadata_female
metadata.to_csv('dataset/english/metadata.csv', sep='|', index=False, header=False)


### Preprocessing

In [11]:
import os

# 1. Define paths
# Absolute path to your dataset folder (the one containing metadata.csv)
dataset_path = os.path.abspath("dataset/english")

# Absolute path to the 'src/python' folder inside the cloned piper_repo
# This is where the 'piper_train' python package lives
piper_src_path = os.path.abspath("../piper_repo/src/python")

print(f"Dataset Path: {dataset_path}")
print(f"Piper Source Path: {piper_src_path}")

# 2. Run the command with PYTHONPATH set
# This tells Python: "Look in 'piper_src_path' when I ask for 'piper_train'"
!PYTHONPATH="{piper_src_path}" python3 -m piper_train.preprocess \
  --language en \
  --input-dir "{dataset_path}" \
  --output-dir training_dir \
  --dataset-format ljspeech \
  --single-speaker \
  --sample-rate 22050

Dataset Path: /Users/rutwik/piper-model-training/scripts/dataset/english
Piper Source Path: /Users/rutwik/piper-model-training/piper_repo/src/python
INFO:preprocess:Single speaker dataset
INFO:preprocess:Wrote dataset config
INFO:preprocess:Processing 6541 utterance(s) with 12 worker(s)


### Building the monotonic align 

In [12]:
import os
import shutil

# 1. Define paths
# Points to: /Users/rutwik/piper-model-training/piper_repo/src/python
piper_src_path = os.path.abspath("../piper_repo/src/python")

# Source files we need to compile
monotonic_align_src = os.path.join(piper_src_path, "piper_train/vits/monotonic_align")
temp_build_dir = "temp_build"

# 2. Create temp directory
os.makedirs(temp_build_dir, exist_ok=True)
print(f"Created {temp_build_dir}")

# 3. Copy necessary files (core.pyx and setup.py)
for filename in ["core.pyx", "setup.py"]:
    src_file = os.path.join(monotonic_align_src, filename)
    dst_file = os.path.join(temp_build_dir, filename)
    if os.path.exists(src_file):
        shutil.copy(src_file, dst_file)
        print(f"Copied {filename}")
    else:
        print(f"Error: Could not find {filename} at {src_file}")

# 4. Move into the build directory
%cd {temp_build_dir}

Created temp_build
Copied core.pyx
Copied setup.py
/Users/rutwik/piper-model-training/scripts/temp_build


### Copying the monotonic align file core.so 

In [17]:
!python3 setup.py build_ext --inplace

# Copy the compiled .so file back to the SOURCE location in piper_repo
# This is crucial so the training script can find it later
import shutil
import glob
import os

# Find the compiled file (e.g., core.cpython-311-darwin.so)
compiled_files = glob.glob("core*.so")

if compiled_files:
    src_so = compiled_files[0]
    # Destination: piper_train/vits/monotonic_align/ inside your repo
    dest_dir = os.path.join(piper_src_path, "piper_train/vits/monotonic_align")
    
    print(f"Copying {src_so} to {dest_dir}...")
    shutil.copy(src_so, dest_dir)
    print("Success: Monotonic alignment module built and installed.")
else:
    print("Error: Build failed, no .so file found.")

Compiling /Users/rutwik/piper-model-training/scripts/temp_build/core.pyx because it changed.
[1/1] Cythonizing /Users/rutwik/piper-model-training/scripts/temp_build/core.pyx
  tree = Parsing.p_module(s, pxd, full_module_name)
Copying core.cpython-311-darwin.so to /Users/rutwik/piper-model-training/piper_repo/src/python/piper_train/vits/monotonic_align...
Success: Monotonic alignment module built and installed.


### Checking if the core.so is formed

In [18]:
import os

# Define the correct path to the monotonic_align directory
# Assuming 'piper_src_path' is still defined from previous cells as ../piper_repo/src/python
piper_src_path = os.path.abspath("../../piper_repo/src/python") # Adjusting for being inside temp_build
monotonic_align_dir = os.path.join(piper_src_path, "piper_train/vits/monotonic_align")

print(f"Listing contents of: {monotonic_align_dir}")

if os.path.exists(monotonic_align_dir):
    files = os.listdir(monotonic_align_dir)
    for f in files:
        print(f)
else:
    print("Directory not found. Please check the path.")

Listing contents of: /Users/rutwik/piper-model-training/piper_repo/src/python/piper_train/vits/monotonic_align
Makefile
__init__.py
core.c
setup.py
core.cpython-311-darwin.so
core.pyx


### Checking if the CSV file is Correct

In [24]:
# 1. Switch back to the main scripts directory
%cd ..

import os

# 2. Define the correct path relative to 'scripts'
dataset_csv_path = os.path.abspath("dataset/english/metadata.csv")

print(f"Reading: {dataset_csv_path}")

# 3. Read and print the file
if os.path.exists(dataset_csv_path):
    with open(dataset_csv_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            print(line.strip())
            if i == 4:  # Show only first 5 lines
                break
else:
    print(f"Error: File not found at {dataset_csv_path}")

/Users/rutwik/piper-model-training/scripts
Reading: /Users/rutwik/piper-model-training/scripts/dataset/english/metadata.csv
train_hindifullfemale_00001|Author of the danger trail, Philip Steels, etc.|female
train_hindifullfemale_00002|Not at this particular case, Tom, apologized Whittemore.|female
train_hindifullfemale_00003|For the twentieth time that evening the two men shook hands.|female
train_hindifullfemale_00004|Lord, but I'm glad to see you again, Phil.|female
train_hindifullfemale_00005|Will we ever forget it.|female
