# Import libraries

In [1]:
import pandas as pd
import os
from pathlib import Path
import shutil
from typing import List
import json 
from tqdm import tqdm

# Read the validated tsv

In [2]:
validated_data = pd.read_csv('../data/raw/cv-corpus-20.0-2024-12-06/th/validated.tsv', sep='\t')

  validated_data = pd.read_csv('../data/raw/cv-corpus-20.0-2024-12-06/th/validated.tsv', sep='\t')


# Define word replacement function

In [3]:
def replace_words(dataframe: pd.DataFrame, column_name: str, replacing_pairs: List[List[str]]) -> pd.DataFrame:
    for old_word, new_word in replacing_pairs:
        dataframe[column_name] = dataframe[column_name].apply(lambda x: x.replace(old_word, new_word))
    return dataframe

replacing_word = [
    ['เพฃร', 'เพชร'],
]

# Filter and group client_id that have over 100 records

In [4]:
filtered_data = validated_data[
    validated_data['client_id'].map(
        validated_data['client_id'].value_counts() >= 100
    )
]
filtered_data = replace_words(filtered_data, 'sentence', replacing_word)
grouped = filtered_data.groupby('client_id').agg(list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column_name] = dataframe[column_name].apply(lambda x: x.replace(old_word, new_word))


# Define new id instead of client_id

In [5]:
id_mapper = {id_: f'cv{str(i+1).zfill(3)}' for i, id_ in enumerate(filtered_data['client_id'].unique())}

grouped_data = {
    id_mapper[client_id]: list(zip(sentences, paths))
    for (client_id, (sentences, paths)) in grouped[['sentence', 'path']].iterrows()
}

# Moving files to the new folder

In [6]:
# Define paths
DEST_DIR = "../data/converted/commonvoice-to-vctk"
DEST_TEXT_PATH = os.path.join(DEST_DIR, "txt")
DEST_AUDIO_PATH = os.path.join(DEST_DIR, "wav48")
SRC_AUDIO_PATH = "../data/raw/cv-corpus-20.0-2024-12-06/th/clips"

# Clean and create directories
if os.path.exists(DEST_DIR):
   print("Clearing destination folder")
   shutil.rmtree(DEST_DIR)
os.makedirs(DEST_TEXT_PATH, exist_ok=True)
os.makedirs(DEST_AUDIO_PATH, exist_ok=True)

all_chars = set()

# Process files with progress bar
for client_id, data in tqdm(grouped_data.items(), desc="Processing"):
   client_text_dir = os.path.join(DEST_TEXT_PATH, client_id)
   client_mp3_dir = os.path.join(DEST_AUDIO_PATH, client_id)
   os.makedirs(client_text_dir, exist_ok=True)
   os.makedirs(client_mp3_dir, exist_ok=True)

   for i, d in enumerate(data):
       # Write text file
       text_path = os.path.join(client_text_dir, f"{client_id}_{(i + 1):03d}.txt")
       with open(text_path, 'w') as f:
           f.write(d[0])
           all_chars.update(d[0])

       # Copy audio file  
       shutil.copyfile(
           os.path.join(SRC_AUDIO_PATH, d[1]),
           os.path.join(client_mp3_dir, f"{client_id}_{i:03d}.mp3")
       )

print("Restructuring complete")

Processing: 100%|██████████| 134/134 [00:55<00:00,  2.41it/s]

Restructuring complete





# Create metadata

In [7]:
DEST_DIR = Path(DEST_DIR)

json_files = {
   'grouped_data.json': [
       {"client_id": cid, "data": [{"path": d[1], "sentence": d[0]} for d in data]}
       for cid, data in grouped_data.items()
   ],
   'language_ids.json': {'th': 0},
   'speakers_ids.json': {cid: i for i, cid in enumerate(grouped_data)},
   'id_mapper.json': id_mapper
}

# Write JSON files
for filename, data in json_files.items():
   with open(DEST_DIR / filename, 'w') as f:
       json.dump(data, f, indent=2)

# Write character files
sorted_chars = sorted(all_chars)
with open(DEST_DIR / 'all_chars_unicode.txt', 'w') as f:
   f.write(''.join(c.encode('unicode_escape').decode('ascii') for c in sorted_chars))
   
with open(DEST_DIR / 'all_chars.txt', 'w') as f:
   f.write(''.join(sorted_chars))