In [1]:
import os
os.environ["HF_TOKEN"] = "hf_"

In [2]:
from huggingface_hub import whoami, HfApi

print(whoami())
# or
api = HfApi(token=os.environ["HF_TOKEN"])
print(api.whoami())


{'type': 'user', 'id': '68728a71f3dc2e92b272ae2b', 'name': 'karenlu653', 'fullname': 'Karen Lu', 'email': 'karenlu653@gmail.com', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '/avatars/527d466532c7bbd5df76ca20d7c1c9c5.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'dialect_model', 'role': 'write', 'createdAt': '2025-09-04T19:18:37.535Z'}}}
{'type': 'user', 'id': '68728a71f3dc2e92b272ae2b', 'name': 'karenlu653', 'fullname': 'Karen Lu', 'email': 'karenlu653@gmail.com', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '/avatars/527d466532c7bbd5df76ca20d7c1c9c5.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'dialect_model', 'role': 'write', 'createdAt': '2025-09-04T19:18:37.535Z'}}}


In [3]:
import os
# os.makedirs("/mnt/sagemaker-nvme/hf_datasets_cache", exist_ok=True)
os.environ["HF_DATASETS_CACHE"] = "/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/hf_datasets_cache"

In [4]:
from datasets import load_dataset

In [5]:
try:
    shanghai_corpus = load_dataset("TingChen-ppmc/Shanghai_Dialect_Conversational_Speech_Corpus") #3.79k rows
    print("Shanghai corpus loaded successfully")
except Exception as e:
    print(f"Error loading Shanghai corpus: {e}") 
    shanghai_corpus = None

Shanghai corpus loaded successfully


In [6]:
# filepath = "/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/data/shanghai_corpus.parquet"
# shanghai_corpus["train"].to_parquet(filepath)

Saving the dataset (1/1 shards): 100%|██████████| 3792/3792 [00:04<00:00, 813.21 examples/s] 


In [10]:
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

def process_and_save_shanghai_parquet(shanghai_corpus, out_path, max_samples=None):
    train_ds = shanghai_corpus["train"]  # Get the train split

    data = []
    num_rows = len(train_ds) if max_samples is None else min(max_samples, len(train_ds))

    for i, row in enumerate(tqdm(train_ds, total=num_rows, desc="Processing Shanghai")):
        audio = row['audio']['array']
        sampling_rate = row['audio']['sampling_rate']
        label = 'shanghai'
        text = row.get('transcription', '')
        audio_length = len(audio) / sampling_rate
        gender = row.get('gender', None)
        data.append({
            "audio": audio,
            "sampling_rate": sampling_rate,
            "label": label,
            "text": text,
            "audio_length": audio_length,
            "gender": gender
        })
        if max_samples is not None and i >= max_samples - 1:
            break

    if data:
        table = pa.Table.from_pylist(data)
        pq.write_table(table, out_path)
        print(f"Saved {len(data)} rows to {out_path}")
    else:
        print("No data to save.")

# Example usage:
out_path = "/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/data/shanghai_processed.parquet"
process_and_save_shanghai_parquet(shanghai_corpus, out_path, max_samples=None)  # set max_samples to None to process all


Processing Shanghai: 100%|██████████| 3792/3792 [00:21<00:00, 178.41it/s]


Saved 3792 rows to /mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/data/shanghai_processed.parquet


In [11]:
try:
    mandarin_corpus = load_dataset("urarik/free_st_chinese_mandarin_corpus") #10k rows 
    print("Mandarin corpus loaded successfully")
except Exception as e:
    print(f"Error loading Mandarin corpus: {e}")
    mandarin_corpus = None

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/23 [00:00<?, ?it/s]

Mandarin corpus loaded successfully


In [13]:
# filepath = "/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/data/mandarin_corpus.parquet"
# mandarin_corpus["train"].to_parquet(filepath)

In [15]:
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

def process_and_save_mandarin_parquet(mandarin_corpus, split, out_path, max_samples):
    ds = mandarin_corpus[split]  # e.g., split = "train" or "test"

    data = []
    num_rows = len(ds) if max_samples is None else min(max_samples, len(ds))

    for i, row in enumerate(tqdm(ds, total=num_rows, desc=f"Processing Mandarin ({split})")):
        audio = row['audio']['array']
        sampling_rate = row['audio']['sampling_rate']
        label = 'mandarin'
        text = row.get('sentence', '')
        audio_length = len(audio) / sampling_rate
        data.append({
            "audio": audio,
            "sampling_rate": sampling_rate,
            "label": label,
            "text": text,
            "audio_length": audio_length
        })
        if max_samples is not None and i >= max_samples - 1:
            break

    if data:
        table = pa.Table.from_pylist(data)
        pq.write_table(table, out_path)
        print(f"Saved {len(data)} rows to {out_path}")
    else:
        print("No data to save.")

#for "test" split:
out_path_test = "/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/data/mandarin_test.parquet"
process_and_save_mandarin_parquet(mandarin_corpus, "test", out_path_test, max_samples=None)


Processing Mandarin (test): 100%|██████████| 10260/10260 [00:51<00:00, 199.54it/s]


Saved 10260 rows to /mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/data/mandarin_test.parquet


In [11]:
try:
    sichuan_corpus = load_dataset("wanghaikuan/sichuan") #6k rows 
    print("Sichuan corpus loaded successfully")
except Exception as e:
    print(f"Error loading Sichuan corpus: {e}")
    sichuan_corpus = None

Error loading Sichuan corpus: BuilderConfig AudioFolderConfig(name='default', version=0.0.0, data_dir=None, data_files={NamedSplit('train'): ['hf://datasets/wanghaikuan/sichuan@e2744f14db7c6e985da24b18ba2d458e50f07839/sichuan.zip']}, description=None, features=None, drop_labels=None, drop_metadata=None, metadata_filenames=None, filters=None) doesn't have a 'use_auth_token' key.


In [None]:
# filepath = "/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/data/sichuan_corpus.parquet"
# sichuan_corpus["train"].to_parquet(filepath)

In [5]:
try:
    cantonese_corpus = load_dataset("ziyou-li/cantonese_daily", split="train") #4k rows
    print("Cantonese corpus loaded successfully")
except Exception as e:
    print(f"Error loading Cantonese corpus: {e}")
    cantonese_corpus = None

Resolving data files:   0%|          | 0/4061 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/4061 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/4060 [00:00<?, ? examples/s]

Error loading Cantonese corpus: An error occurred while generating the dataset


In [16]:
from datasets import load_dataset
from tqdm import tqdm
import pyarrow as pa
import pyarrow.parquet as pq

out_path = "/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/data/cantonese_corpus.parquet"

# Adjust the dataset name and fields to your use case
ds_stream = load_dataset("ziyou-li/cantonese_daily", split="train", streaming=True)
rows = []
processed = 0

for row in tqdm(ds_stream, desc="Processing Cantonese (streaming)"):
    try:
        audio = row['audio']['array']
        sampling_rate = row['audio']['sampling_rate']
        label = 'cantonese'
        # Adjust field names as per your dataset; using 'sentence' as an example
        text = row.get('sentence', '')
        audio_length = len(audio) / sampling_rate
        gender = row.get('gender', None)

        rows.append({
            "audio": audio,
            "sampling_rate": sampling_rate,
            "label": label,
            "text": text,
            "audio_length": audio_length,
            "gender": gender
        })
        processed += 1

    except Exception as e:
        print(f"Skipping row {processed} due to error: {e}")

# Save all rows to Parquet at the end
if rows:
    table = pa.Table.from_pylist(rows)
    pq.write_table(table, out_path)
    print(f"Saved {len(rows)} rows to {out_path}")
else:
    print("No data to save.")


Resolving data files:   0%|          | 0/4061 [00:00<?, ?it/s]

Processing Cantonese (streaming): 4060it [13:04,  5.18it/s]


Saved 4060 rows to /mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/data/cantonese_corpus.parquet


In [8]:
%%time
from datasets import load_dataset
from tqdm import tqdm
import pyarrow as pa
import pyarrow.parquet as pq

out_path = "/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/sichuan_subset.parquet"

ds_stream = load_dataset("wanghaikuan/sichuan", split="train", streaming=True)
rows = []
processed = 0

for row in tqdm(ds_stream, desc="Processing Sichuan (streaming)"):
    try:
        audio = row['audio']['array']
        sampling_rate = row['audio']['sampling_rate']
        label = 'sichuan'
        text = row.get('sentence', '')
        audio_length = len(audio) / sampling_rate
        gender = row.get('gender', None)

        rows.append({
            "audio": audio,
            "sampling_rate": sampling_rate,
            "label": label,
            "text": text,
            "audio_length": audio_length,
            "gender": gender
        })
        processed += 1

    except Exception as e:
        print(f"Skipping row {processed} due to error: {e}")

# After collecting all rows, save ONCE
if rows:
    table = pa.Table.from_pylist(rows)
    pq.write_table(table, out_path)
    print(f"Saved {len(rows)} rows to {out_path}")
else:
    print("No data to save.")



Processing Sichuan (streaming): 6522it [24:40,  4.41it/s]


Saved 6522 rows to /mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/sichuan_subset.parquet
CPU times: user 11min 43s, sys: 45.3 s, total: 12min 28s
Wall time: 25min


In [19]:
## test parquet data 

import pandas as pd

df = pd.read_parquet("/mnt/custom-file-systems/efs/fs-0a84517bf3cf54d59_fsap-04bd72a3f345b82c0/dialect-modeling/data/sichuan_subset.parquet")
print(df.head())        # Show first 5 rows
print(df.columns)       # See all available columns


                                               audio  sampling_rate    label  \
0  [-9.1552734e-05, -0.00021362305, -0.0001525878...          16000  sichuan   
1  [0.00015258789, 6.1035156e-05, 0.00021362305, ...          16000  sichuan   
2  [0.00012207031, -0.00012207031, 3.0517578e-05,...          16000  sichuan   
3  [9.1552734e-05, 0.00079345703, 0.00024414062, ...          16000  sichuan   
4  [0.00079345703, -0.00018310547, -0.00018310547...          16000  sichuan   

  text  audio_length gender  
0           3.384688   None  
1           3.176938   None  
2           3.940000   None  
3           4.750000   None  
4           2.920000   None  
Index(['audio', 'sampling_rate', 'label', 'text', 'audio_length', 'gender'], dtype='object')
