# Import the datasets from Hugging Face 

In [None]:
# #increases rate limits 

# from huggingface_hub import login
# login()  # Then paste your token when prompted

In [1]:
from datasets import load_dataset, Audio
import torch
torch.multiprocessing.set_sharing_strategy('file_system') # This is necessary to avoid issues with multiprocessing in PyTorch
shanghai_corpus = load_dataset("TingChen-ppmc/Shanghai_Dialect_Conversational_Speech_Corpus", split = "train")
mandarin_corpus = load_dataset("urarik/free_st_chinese_mandarin_corpus", split="train", streaming=True)
sichuan_corpus = load_dataset("wanghaikuan/sichuan", split="train", streaming=True) #6k rows 
cantonese_corpus = load_dataset("ziyou-li/cantonese_daily", split="train", streaming=True)
# note: streaming=True prevents a download that exceeds my computer space limit but can load full on in container 



Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/4061 [00:00<?, ?it/s]

In [None]:
# from datasets import config
# print(config.HF_DATASETS_CACHE) #to see where the datasets are stored

/Users/karenlu/.cache/huggingface/datasets


In [2]:
mandarin_corpus

IterableDataset({
    features: ['audio', 'sentence'],
    num_shards: 23
})

## Restructure datasets 

Iterates over the data to pull out the relevant features from each dataset and balance each sample amount (currently set at 3000) 

In [3]:
#pulls out audio, sample, transcription, and label from datasets and organizes them into a list of tuples
def process_shanghai(shanghai_corpus, max_samples=3000):
    data = []
    for i, row in enumerate(shanghai_corpus):
        audio = row['audio']['array']
        sampling_rate = row['audio']['sampling_rate']
        label = 'shanghai'  # Label for Shanghai dataset
        text = row.get('transcription', '')  # Get transcription if available
        audio_length = len(audio) / sampling_rate  # Calculate audio length in seconds
        gender = row.get('gender', None)  # Get gender if available
        data.append((audio, sampling_rate, label, text, audio_length, gender))
        if i >= max_samples - 1:
            break
    return data

def process_mandarin(mandarin_corpus, max_samples=3000):
    data = []
    for i, row in enumerate(mandarin_corpus):
        audio = row['audio']['array']
        sampling_rate = row['audio']['sampling_rate']
        label = 'mandarin'  # Label for Mandarin dataset
        text = row.get('sentence', '')  # Get sentence if available
        audio_length = len(audio) / sampling_rate  # Calculate audio length in seconds
        gender = row.get('gender', None)  # Get gender if available
        data.append((audio, sampling_rate, label, text, audio_length, gender))
        if i >= max_samples - 1:
            break
    return data

# Process both datasets
shanghai_data = process_shanghai(shanghai_corpus)
mandarin_data = process_mandarin(mandarin_corpus)

# Combine the datasets
combined_data = shanghai_data + mandarin_data

# print the first 5 samples with text
# print the first 5 samples with text, audio length, and gender
for audio, sampling_rate, label, text, audio_length, gender in combined_data[-5:]:
    print(f"Label: {label}, Text: {text[:30]}..., Length: {audio_length:.2f}s, Gender: {gender}")

Label: mandarin, Text: 亲爱滴我愿意你做永远的局长..., Length: 4.04s, Gender: None
Label: mandarin, Text: 摸摸你的胸还有小穴啊..., Length: 3.87s, Gender: None
Label: mandarin, Text: 俩二娃么时候回威海给我那介绍下..., Length: 4.74s, Gender: None
Label: mandarin, Text: 卡盘中心孔多大四爪..., Length: 4.71s, Gender: None
Label: mandarin, Text: 我家和鲅鱼圈都是周日..., Length: 3.53s, Gender: None


In [None]:
from tqdm import tqdm
# Process Sichuan and Cantonese datasets with progress bars since they are streaming 
# Takes incredibly long to process the full datasets, so we limit the number of samples processed for demo

def process_sichuan(sichuan_corpus, max_samples=100):
    data = []
    for i, row in enumerate(tqdm(sichuan_corpus, desc="Processing Sichuan", total=max_samples)):
        audio = row['audio']['array']
        sampling_rate = row['audio']['sampling_rate']
        label = 'sichuan'
        text = row.get('sentence', '')
        audio_length = len(audio) / sampling_rate  # Calculate audio length in seconds
        gender = row.get('gender', None)  # Get gender if available
        data.append((audio, sampling_rate, label, text, audio_length, gender))
        if i >= max_samples - 1:
            break
    return data

def process_cantonese(cantonese_corpus, max_samples=100):
    data = []
    for i, row in enumerate(tqdm(cantonese_corpus, desc="Processing Cantonese", total=max_samples)):
        audio = row['audio']['array']
        sampling_rate = row['audio']['sampling_rate']
        label = 'cantonese'
        text = row.get('sentence', '')
        audio_length = len(audio) / sampling_rate  # Calculate audio length in seconds
        gender = row.get('gender', None)  # Get gender if available
        data.append((audio, sampling_rate, label, text, audio_length, gender))
        if i >= max_samples - 1:
            break
    return data

# Process all datasets with progress bars
sichuan_data = process_sichuan(sichuan_corpus)
cantonese_data = process_cantonese(cantonese_corpus)

In [6]:
combined_data = combined_data + sichuan_data + cantonese_data

# Add Features 

Adds the following: gender, snr (signal-to-noise ratio), tokens, sentiment/emotion 

### Add gender

In [8]:
# Adding gender detection using a pre-trained model
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

# Load model and processor
model_name = "prithivMLmods/Common-Voice-Gender-Detection"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model.eval()

# Label mapping
id2label = {
    0: "female",
    1: "male"
}

def predict_gender(audio_array, sampling_rate):
    # all audio arrays should be 16kHz, so we don't need to resample
    # Prepare input
    inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred_id = logits.argmax(dim=-1).item()
    return id2label[pred_id]

In [9]:
# Apply gender prediction only to rows where gender is None (i.e., not 'shanghai')
from tqdm import tqdm

combined_data_with_gender = []
for audio, sampling_rate, label, text, audio_length, gender in tqdm(combined_data, desc="Gender Detection"):
    # gender is already present for 'shanghai'), keep it
    if gender is not None:
        combined_data_with_gender.append((audio, sampling_rate, label, text, audio_length, gender))
    else:
        # Predict gender for other dialects
        predicted_gender = predict_gender(audio, sampling_rate)
        combined_data_with_gender.append((audio, sampling_rate, label, text, audio_length, predicted_gender))

# Now each tuple is (audio, sampling_rate, label, text, audio_length, gender)
# print the first 5 with gender
for row in combined_data_with_gender[:5]:
    print(f"Label: {row[2]}, Text: {row[3][:30]}, Length: {row[4]:.2f}s, Gender: {row[5]}")

Gender Detection: 100%|██████████| 6200/6200 [07:29<00:00, 13.78it/s]  

Label: shanghai, Text: 北京爱数智慧语音采集, Length: 3.98s, Gender: male
Label: shanghai, Text: 北京爱数智慧语音采集, Length: 2.69s, Gender: female
Label: shanghai, Text: 阿拉两个拧来聊聊金融方面呃, Length: 2.66s, Gender: male
Label: shanghai, Text: 金融方面嘛, Length: 1.32s, Gender: male
Label: shanghai, Text: 搿呃，阿姨喃，应该讲，侬已经交关年数辣辣了解了, Length: 5.37s, Gender: male





### Add tokens from transcription (sichuan does not have this)

In [10]:
from transformers import BertTokenizer, ErnieModel
tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-3.0-nano-zh")
model = ErnieModel.from_pretrained("nghuyong/ernie-3.0-nano-zh")

Some weights of ErnieModel were not initialized from the model checkpoint at nghuyong/ernie-3.0-nano-zh and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
combined_data_with_tokens = []
for audio, sampling_rate, label, text, audio_length, gender in combined_data_with_gender: 
    token = tokenizer(text, padding='max_length', truncation=True, max_length = 128, return_tensors="pt")
    combined_data_with_tokens.append((audio, sampling_rate, label, text, audio_length, gender, token))

In [None]:
# test = combined_data_with_tokens[:1][0][-1]  # Show the token for the first entry
# tokenizer.decode(test.input_ids[0], skip_special_tokens=True)

'北 京 爱 数 智 慧 语 音 采 集'

## Add sentiment

In [13]:
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
import torch

tokenizer=BertTokenizer.from_pretrained('IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment')
model=BertForSequenceClassification.from_pretrained('IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment')

text='今天心情不好'

output=model(torch.tensor([tokenizer.encode(text)]))
print(torch.nn.functional.softmax(output.logits,dim=-1))

tensor([[0.9551, 0.0449]], grad_fn=<SoftmaxBackward0>)


In [14]:
def get_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        sentiment_id = torch.argmax(probs, dim=-1).item()
    sentiment_map = {0: "negative", 1: "neutral", 2: "positive"}
    return sentiment_map.get(sentiment_id, "unknown")

from tqdm import tqdm

combined_data_with_sentiment = []
for audio, sampling_rate, label, text, audio_length, gender, token in tqdm(combined_data_with_tokens, desc="Sentiment Analysis"):
    sentiment = get_sentiment(text)
    combined_data_with_sentiment.append((audio, sampling_rate, label, text, audio_length, gender, token, sentiment))

Sentiment Analysis: 100%|██████████| 6200/6200 [05:42<00:00, 18.09it/s]


In [None]:
# # Example: print first 3 rows with sentiment
# for row in combined_data_with_sentiment[:3]:
#     print(f"Text: {row[3][:30]}, Audio Length: {row[4]}, Gender: {row[5]}, Sentiment: {row[7]}")

Text: 北京爱数智慧语音采集, Audio Length: 3.98, Gender: male, Sentiment: neutral
Text: 北京爱数智慧语音采集, Audio Length: 2.69, Gender: female, Sentiment: neutral
Text: 阿拉两个拧来聊聊金融方面呃, Audio Length: 2.66, Gender: male, Sentiment: neutral


In [16]:
from collections import Counter

# Extract sentiment labels from combined_data_with_sentiment
sentiments = [row[7] for row in combined_data_with_sentiment]
sentiment_counts = Counter(sentiments)

print("Sentiment distribution:")
for sentiment, count in sentiment_counts.items():
    print(f"{sentiment}: {count}")

Sentiment distribution:
neutral: 2936
negative: 3264


#### Alternative sentiment model

In [None]:

# from funasr import AutoModel

# # Load the emotion2vec_plus_base model
# model = AutoModel(model="iic/emotion2vec_plus_base")

# # Run inference on a sample of 10 audio samples in combined_data_with_tokens
# results = []
# sample_size = 10
# for i, (audio, sampling_rate, label, text, audio_length, gender, token) in enumerate(combined_data_with_tokens[:sample_size]):
#     import soundfile as sf
#     import tempfile
#     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
#         sf.write(tmp_wav.name, audio, sampling_rate)
#         wav_file = tmp_wav.name
#         # Run inference
#         res = model.generate(wav_file, output_dir=None, granularity="utterance", extract_embedding=False)
#         results.append((label, text, res))
#     import os
#     os.remove(wav_file)

# # Print the results
# for r in results:
#     label, text, res = r
#     print(f"Label: {label}, Text: {text[:30]}, Emotion: {res}")

In [23]:
# Print just the highest emotion label and score for each result in results
for entry in results:
    label, text, emotions = entry
    if isinstance(emotions, list) and len(emotions) > 0 and "labels" in emotions[0] and "scores" in emotions[0]:
        labels = emotions[0]["labels"]
        scores = emotions[0]["scores"]
        max_idx = scores.index(max(scores))
        top_emotion = labels[max_idx]
        top_score = scores[max_idx]
        print(f"Label: {label}, Text: {text[:30]}, Top Emotion: {top_emotion}, Score: {top_score:.4f}")

Label: shanghai, Text: 北京爱数智慧语音采集, Top Emotion: 中立/neutral, Score: 1.0000
Label: shanghai, Text: 北京爱数智慧语音采集, Top Emotion: 中立/neutral, Score: 1.0000
Label: shanghai, Text: 阿拉两个拧来聊聊金融方面呃, Top Emotion: 中立/neutral, Score: 0.6006
Label: shanghai, Text: 金融方面嘛, Top Emotion: 中立/neutral, Score: 0.5899
Label: shanghai, Text: 搿呃，阿姨喃，应该讲，侬已经交关年数辣辣了解了, Top Emotion: 中立/neutral, Score: 1.0000
Label: shanghai, Text: 葛末，吾辣辣金融方面已经有的三四年了, Top Emotion: 中立/neutral, Score: 1.0000
Label: shanghai, Text: 最少辰光阿拉是做撒呃喃，有钞票就是到银行里保本保息, Top Emotion: 中立/neutral, Score: 1.0000
Label: shanghai, Text: 吾已经做了已经到八七年了, Top Emotion: 中立/neutral, Score: 1.0000
Label: shanghai, Text: 八七年呃，当时辰光辣里哴相做理财呃, Top Emotion: 中立/neutral, Score: 0.9999
Label: shanghai, Text: 是两级风险，三级风险，四级风险吾侪做呃，信托咯撒侪做呃, Top Emotion: 中立/neutral, Score: 1.0000


## Add SNR (signal to noise ratio)

# Rebalance dataset with features 

Note that some datasets have more/less of a feature, e.g. more female speakers than male in shanghai dataset, so it gets rebalanced