# Import the datasets from Hugging Face 

In [None]:
# #increases rate limits 

# from huggingface_hub import login
# login()  # Then paste your token when prompted

In [1]:
from datasets import load_dataset, Audio
import torch
torch.multiprocessing.set_sharing_strategy('file_system') # This is necessary to avoid issues with multiprocessing in PyTorch
shanghai_corpus = load_dataset("TingChen-ppmc/Shanghai_Dialect_Conversational_Speech_Corpus", split = "train")
mandarin_corpus = load_dataset("urarik/free_st_chinese_mandarin_corpus", split="train", streaming=True)
sichuan_corpus = load_dataset("wanghaikuan/sichuan", split="train", streaming=True) #6k rows 
cantonese_corpus = load_dataset("ziyou-li/cantonese_daily", split="train", streaming=True)
# note: streaming=True prevents a download that exceeds my computer space limit but can load full on in container 



Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/4061 [00:00<?, ?it/s]

In [None]:
# from datasets import config
# print(config.HF_DATASETS_CACHE) #to see where the datasets are stored

/Users/karenlu/.cache/huggingface/datasets


In [6]:
mandarin_corpus

IterableDataset({
    features: ['audio', 'sentence'],
    num_shards: 23
})

## Restructure datasets 

Iterates over the data to pull out the relevant features from each dataset and balance each sample amount (currently set at 3000) 

In [9]:
#pulls out audio, sample, transcription, and label from datasets and organizes them into a list of tuples
def process_shanghai(shanghai_corpus, max_samples=3000):
    data = []
    for i, row in enumerate(shanghai_corpus):
        audio = row['audio']['array']
        sampling_rate = row['audio']['sampling_rate']
        label = 'shanghai'  # Label for Shanghai dataset
        text = row.get('transcription', '')  # Get transcription if available
        audio_length = len(audio) / sampling_rate  # Calculate audio length in seconds
        gender = row.get('gender', None)  # Get gender if available
        data.append((audio, sampling_rate, label, text, audio_length, gender))
        if i >= max_samples - 1:
            break
    return data

def process_mandarin(mandarin_corpus, max_samples=3000):
    data = []
    for i, row in enumerate(mandarin_corpus):
        audio = row['audio']['array']
        sampling_rate = row['audio']['sampling_rate']
        label = 'mandarin'  # Label for Mandarin dataset
        text = row.get('sentence', '')  # Get sentence if available
        audio_length = len(audio) / sampling_rate  # Calculate audio length in seconds
        gender = row.get('gender', None)  # Get gender if available
        data.append((audio, sampling_rate, label, text, audio_length, gender))
        if i >= max_samples - 1:
            break
    return data

# Process both datasets
shanghai_data = process_shanghai(shanghai_corpus)
mandarin_data = process_mandarin(mandarin_corpus)

# Combine the datasets
combined_data = shanghai_data + mandarin_data

# print the first 5 samples with text
# print the first 5 samples with text, audio length, and gender
for audio, sampling_rate, label, text, audio_length, gender in combined_data[-5:]:
    print(f"Label: {label}, Text: {text[:30]}..., Length: {audio_length:.2f}s, Gender: {gender}")

Label: mandarin, Text: 亲爱滴我愿意你做永远的局长..., Length: 4.04s, Gender: None
Label: mandarin, Text: 摸摸你的胸还有小穴啊..., Length: 3.87s, Gender: None
Label: mandarin, Text: 俩二娃么时候回威海给我那介绍下..., Length: 4.74s, Gender: None
Label: mandarin, Text: 卡盘中心孔多大四爪..., Length: 4.71s, Gender: None
Label: mandarin, Text: 我家和鲅鱼圈都是周日..., Length: 3.53s, Gender: None


In [None]:
from tqdm import tqdm
# Process Sichuan and Cantonese datasets with progress bars since they are streaming 
# Takes incredibly long to process the full datasets, so we limit the number of samples processed for demo

def process_sichuan(sichuan_corpus, max_samples=100):
    data = []
    for i, row in enumerate(tqdm(sichuan_corpus, desc="Processing Sichuan", total=max_samples)):
        audio = row['audio']['array']
        sampling_rate = row['audio']['sampling_rate']
        label = 'sichuan'
        text = row.get('sentence', '')
        audio_length = len(audio) / sampling_rate  # Calculate audio length in seconds
        gender = row.get('gender', None)  # Get gender if available
        data.append((audio, sampling_rate, label, text, audio_length, gender))
        if i >= max_samples - 1:
            break
    return data

def process_cantonese(cantonese_corpus, max_samples=100):
    data = []
    for i, row in enumerate(tqdm(cantonese_corpus, desc="Processing Cantonese", total=max_samples)):
        audio = row['audio']['array']
        sampling_rate = row['audio']['sampling_rate']
        label = 'cantonese'
        text = row.get('sentence', '')
        audio_length = len(audio) / sampling_rate  # Calculate audio length in seconds
        gender = row.get('gender', None)  # Get gender if available
        data.append((audio, sampling_rate, label, text, audio_length, gender))
        if i >= max_samples - 1:
            break
    return data

# Process all datasets with progress bars
sichuan_data = process_sichuan(sichuan_corpus)
cantonese_data = process_cantonese(cantonese_corpus)

In [22]:
combined_data = combined_data + sichuan_data + cantonese_data

In [44]:
cantonese_data

[(array([ 0.01287842,  0.01242065,  0.01013184, ..., -0.00253296,
         -0.00299072, -0.00253296]),
  16000,
  'cantonese',
  '霸道，我唔食呢一套噶！',
  3.22,
  None),
 (array([ 0.0017395 ,  0.00115967,  0.0012207 , ..., -0.00140381,
         -0.00204468, -0.00164795]),
  16000,
  'cantonese',
  '你易不得叫我早啲瞓！',
  3.25,
  None),
 (array([-0.00527954, -0.00509644, -0.00430298, ..., -0.00317383,
         -0.00222778, -0.00158691]),
  16000,
  'cantonese',
  '知道哥哥嘿嘿。',
  3.31,
  None),
 (array([-0.00469971, -0.00460815, -0.00402832, ...,  0.00222778,
          0.00256348,  0.00247192]),
  16000,
  'cantonese',
  '仲有一啲事我哋又唔好意思讲！',
  4.24,
  None),
 (array([-0.00152588, -0.00112915, -0.0012207 , ...,  0.00061035,
          0.0007019 ,  0.0007019 ]),
  16000,
  'cantonese',
  '我而家只会爱你一生一世！',
  4.12,
  None),
 (array([-0.00564575, -0.00668335, -0.00796509, ..., -0.00308228,
         -0.00360107, -0.00408936]),
  16000,
  'cantonese',
  '你日日都喺度做乜嘢啊？',
  2.98,
  None),
 (array([-0.00100708, -0.00085449, 

# Add Features 

Adds the following: gender, snr (signal-to-noise ratio), tokens, sentiment/emotion 

### Add gender

In [38]:
# Adding gender detection using a pre-trained model
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

# Load model and processor
model_name = "prithivMLmods/Common-Voice-Gender-Detection"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model.eval()

# Label mapping
id2label = {
    0: "female",
    1: "male"
}

def predict_gender(audio_array, sampling_rate):
    # all audio arrays should be 16kHz, so we don't need to resample
    # Prepare input
    inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred_id = logits.argmax(dim=-1).item()
    return id2label[pred_id]

In [None]:
# Apply gender prediction only to rows where gender is None (i.e., not 'shanghai')
from tqdm import tqdm

combined_data_with_gender = []
for audio, sampling_rate, label, text, audio_length, gender in tqdm(combined_data, desc="Gender Detection"):
    # gender is already present for 'shanghai'), keep it
    if gender is not None:
        combined_data_with_gender.append((audio, sampling_rate, label, text, audio_length, gender))
    else:
        # Predict gender for other dialects
        predicted_gender = predict_gender(audio, sampling_rate)
        combined_data_with_gender.append((audio, sampling_rate, label, text, audio_length, predicted_gender))

# Now each tuple is (audio, sampling_rate, label, text, audio_length, gender)
# print the first 5 with gender
for row in combined_data_with_gender[:5]:
    print(f"Label: {row[2]}, Text: {row[3][:30]}, Length: {row[4]:.2f}s, Gender: {row[5]}")

Gender Detection: 100%|██████████| 6200/6200 [11:44<00:00,  8.80it/s]   

Label: shanghai, Text: 北京爱数智慧语音采集, Length: 3.98s, Gender: male
Label: shanghai, Text: 北京爱数智慧语音采集, Length: 2.69s, Gender: female
Label: shanghai, Text: 阿拉两个拧来聊聊金融方面呃, Length: 2.66s, Gender: male
Label: shanghai, Text: 金融方面嘛, Length: 1.32s, Gender: male
Label: shanghai, Text: 搿呃，阿姨喃，应该讲，侬已经交关年数辣辣了解了, Length: 5.37s, Gender: male





### Add tokens from transcription (sichuan does not have this)

In [47]:
from transformers import BertTokenizer, ErnieModel
tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-3.0-nano-zh")
model = ErnieModel.from_pretrained("nghuyong/ernie-3.0-nano-zh")

Some weights of ErnieModel were not initialized from the model checkpoint at nghuyong/ernie-3.0-nano-zh and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
combined_data_with_tokens = []
for audio, sampling_rate, label, text, audio_length, gender in combined_data_with_gender: 
    token = tokenizer(text, padding='max_length', truncation=True, max_length = 128, return_tensors="pt")
    combined_data_with_tokens.append((audio, sampling_rate, label, text, audio_length, gender, token))

In [None]:
# test = combined_data_with_tokens[:1][0][-1]  # Show the token for the first entry
# tokenizer.decode(test.input_ids[0], skip_special_tokens=True)

## Add sentiment

In [62]:
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
import torch

tokenizer=BertTokenizer.from_pretrained('IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment')
model=BertForSequenceClassification.from_pretrained('IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment')

text='今天心情不好'

output=model(torch.tensor([tokenizer.encode(text)]))
print(torch.nn.functional.softmax(output.logits,dim=-1))

tensor([[0.9551, 0.0449]], grad_fn=<SoftmaxBackward0>)


In [71]:
def get_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        sentiment_id = torch.argmax(probs, dim=-1).item()
    sentiment_map = {0: "negative", 1: "neutral", 2: "positive"}
    return sentiment_map.get(sentiment_id, "unknown")

from tqdm import tqdm

combined_data_with_sentiment = []
for audio, sampling_rate, label, text, audio_length, gender, token in tqdm(combined_data_with_tokens, desc="Sentiment Analysis"):
    sentiment = get_sentiment(text)
    combined_data_with_sentiment.append((audio, sampling_rate, label, text, audio_length, gender, token, sentiment))

Sentiment Analysis: 100%|██████████| 6200/6200 [07:54<00:00, 13.08it/s]


In [None]:
# # Example: print first 3 rows with sentiment
# for row in combined_data_with_sentiment[:3]:
#     print(f"Text: {row[3][:30]}, Audio Length: {row[4]}, Gender: {row[5]}, Sentiment: {row[7]}")

Text: 北京爱数智慧语音采集, Audio Length: 3.98, Gender: male, Sentiment: neutral
Text: 北京爱数智慧语音采集, Audio Length: 2.69, Gender: female, Sentiment: neutral
Text: 阿拉两个拧来聊聊金融方面呃, Audio Length: 2.66, Gender: male, Sentiment: neutral


#### Alternative sentiment model

In [72]:
from collections import Counter

# Extract sentiment labels from combined_data_with_sentiment
sentiments = [row[7] for row in combined_data_with_sentiment]
sentiment_counts = Counter(sentiments)

print("Sentiment distribution:")
for sentiment, count in sentiment_counts.items():
    print(f"{sentiment}: {count}")

Sentiment distribution:
neutral: 2936
negative: 3264


In [74]:
from transformers import AutoModelForAudioClassification, AutoProcessor
import torch

# Load model and processor
model_name = "emotion2vec/emotion2vec_base"
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForAudioClassification.from_pretrained(model_name)
model.eval()

# Example function to predict emotion from audio
def predict_emotion(audio_array, sampling_rate):
    # Preprocess audio
    inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred_id = logits.argmax(dim=-1).item()
    # Get label mapping from model config
    id2label = model.config.id2label
    return id2label[str(pred_id)]

# Usage example (replace with your audio array and sampling rate)
# emotion = predict_emotion(audio_array, sampling_rate)
# print("Predicted emotion:", emotion)

# Usage example: predict emotion for the first audio sample in combined_data
first_audio, first_sampling_rate, *_ = combined_data[0]
emotion = predict_emotion(first_audio, first_sampling_rate)
print("Predicted emotion:", emotion)

ValueError: Unrecognized model in emotion2vec/emotion2vec_base. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, arcee, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, bitnet, blenderbot, blenderbot-small, blip, blip-2, blip_2_qformer, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, colqwen2, conditional_detr, convbert, convnext, convnextv2, cpmant, csm, ctrl, cvt, d_fine, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deepseek_v3, deformable_detr, deit, depth_anything, depth_pro, deta, detr, dia, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, donut-swin, dots1, dpr, dpt, efficientformer, efficientnet, electra, emu3, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_h1, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, gemma3, gemma3_text, gemma3n, gemma3n_audio, gemma3n_text, gemma3n_vision, git, glm, glm4, glm4v, glm4v_text, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granite_speech, granitemoe, granitemoehybrid, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hgnet_v2, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, internvl, internvl_vision, jamba, janus, jetmoe, jukebox, kosmos-2, kyutai_speech_to_text, layoutlm, layoutlmv2, layoutlmv3, led, levit, lightglue, lilt, llama, llama4, llama4_text, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, minimax, mistral, mistral3, mixtral, mlcd, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phi4_multimodal, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prompt_depth_anything, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_omni, qwen2_5_vl, qwen2_5_vl_text, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, qwen2_vl_text, qwen3, qwen3_moe, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, sam_hq, sam_hq_vision_model, sam_vision_model, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, shieldgemma2, siglip, siglip2, siglip_vision_model, smollm3, smolvlm, smolvlm_vision, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, t5gemma, table-transformer, tapas, textnet, time_series_transformer, timesfm, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, vjepa2, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zamba2, zoedepth

## Add SNR (signal to noise ratio)

# Rebalance dataset with features 

Note that some datasets have more/less of a feature, e.g. more female speakers than male in shanghai dataset, so it gets rebalanced