In [23]:
#import needed libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
import soundfile as sf
import tempfile
import os
import time
import torch
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
from transformers import BertTokenizer, ErnieModel
from tqdm import tqdm
from pyannote.audio import Model, Inference
from transformers import BertForSequenceClassification
from transformers import BertTokenizer

# Add Features 

generates the following (if doesn't already exist): gender, snr (signal-to-noise ratio), tokens, sentiment/emotion 

### Add gender

In [24]:

# Load model and processor
model_name = "prithivMLmods/Common-Voice-Gender-Detection"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model.eval()

# Label mapping
id2label = {
    0: "female",
    1: "male"
}

def predict_gender(audio_array, sampling_rate):
    # all audio arrays should be 16kHz, so we don't need to resample
    # Prepare input
    inputs = processor(audio_array, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred_id = logits.argmax(dim=-1).item()
    return id2label[pred_id]

In [None]:
# Apply gender prediction only to rows where gender is None (i.e., not 'shanghai')

combined_data_with_gender = []
for audio, sampling_rate, label, text, audio_length, gender in tqdm(loaded_data, desc="Gender Detection"):
    # gender is already present for 'shanghai'), keep it
    if gender is not None:
        combined_data_with_gender.append((audio, sampling_rate, label, text, audio_length, gender))
    else:
        # Predict gender for other dialects
        predicted_gender = predict_gender(audio, sampling_rate)
        combined_data_with_gender.append((audio, sampling_rate, label, text, audio_length, predicted_gender))

# Now each tuple is (audio, sampling_rate, label, text, audio_length, gender)
# print the first 5 with gender
for row in combined_data_with_gender[:5]:
    print(f"Label: {row[2]}, Text: {row[3][:30]}, Length: {row[4]:.2f}s, Gender: {row[5]}")

Gender Detection: 100%|██████████| 6200/6200 [13:45<00:00,  7.51it/s]  

Label: shanghai, Text: 北京爱数智慧语音采集, Length: 3.98s, Gender: male
Label: shanghai, Text: 北京爱数智慧语音采集, Length: 2.69s, Gender: female
Label: shanghai, Text: 阿拉两个拧来聊聊金融方面呃, Length: 2.66s, Gender: male
Label: shanghai, Text: 金融方面嘛, Length: 1.32s, Gender: male
Label: shanghai, Text: 搿呃，阿姨喃，应该讲，侬已经交关年数辣辣了解了, Length: 5.37s, Gender: male





### Add tokens from transcription (sichuan does not have this)

In [None]:
tokenizer = BertTokenizer.from_pretrained("nghuyong/ernie-3.0-nano-zh")
model = ErnieModel.from_pretrained("nghuyong/ernie-3.0-nano-zh")

Some weights of ErnieModel were not initialized from the model checkpoint at nghuyong/ernie-3.0-nano-zh and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
combined_data_with_tokens = []
for audio, sampling_rate, label, text, audio_length, gender in combined_data_with_gender: 
    token = tokenizer(text, padding='max_length', truncation=True, max_length = 128, return_tensors="pt")
    combined_data_with_tokens.append((audio, sampling_rate, label, text, audio_length, gender, token))

In [None]:
# test = combined_data_with_tokens[:1][0][-1]  # Show the token for the first entry
# tokenizer.decode(test.input_ids[0], skip_special_tokens=True)

'北 京 爱 数 智 慧 语 音 采 集'

## Add sentiment

In [None]:
#load and test 

tokenizer=BertTokenizer.from_pretrained('IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment')
model=BertForSequenceClassification.from_pretrained('IDEA-CCNL/Erlangshen-Roberta-110M-Sentiment')

text='今天心情不好'

output=model(torch.tensor([tokenizer.encode(text)]))
print(torch.nn.functional.softmax(output.logits,dim=-1))

tensor([[0.9551, 0.0449]], grad_fn=<SoftmaxBackward0>)


In [14]:
def get_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        sentiment_id = torch.argmax(probs, dim=-1).item()
    sentiment_map = {0: "negative", 1: "neutral", 2: "positive"}
    return sentiment_map.get(sentiment_id, "unknown")

from tqdm import tqdm

combined_data_with_sentiment = []
for audio, sampling_rate, label, text, audio_length, gender, token in tqdm(combined_data_with_tokens, desc="Sentiment Analysis"):
    sentiment = get_sentiment(text)
    combined_data_with_sentiment.append((audio, sampling_rate, label, text, audio_length, gender, token, sentiment))

Sentiment Analysis: 100%|██████████| 6200/6200 [05:42<00:00, 18.09it/s]


In [None]:
# # Example: print first 3 rows with sentiment
# for row in combined_data_with_sentiment[:3]:
#     print(f"Text: {row[3][:30]}, Audio Length: {row[4]}, Gender: {row[5]}, Sentiment: {row[7]}")

Text: 北京爱数智慧语音采集, Audio Length: 3.98, Gender: male, Sentiment: neutral
Text: 北京爱数智慧语音采集, Audio Length: 2.69, Gender: female, Sentiment: neutral
Text: 阿拉两个拧来聊聊金融方面呃, Audio Length: 2.66, Gender: male, Sentiment: neutral


In [16]:
from collections import Counter

# Extract sentiment labels from combined_data_with_sentiment
sentiments = [row[7] for row in combined_data_with_sentiment]
sentiment_counts = Counter(sentiments)

print("Sentiment distribution:")
for sentiment, count in sentiment_counts.items():
    print(f"{sentiment}: {count}")

Sentiment distribution:
neutral: 2936
negative: 3264


#### Alternative sentiment model

In [None]:

# from funasr import AutoModel

# # Load the emotion2vec_plus_base model
# model = AutoModel(model="iic/emotion2vec_plus_base")

# # Run inference on a sample of 10 audio samples in combined_data_with_tokens
# results = []
# sample_size = 10
# for i, (audio, sampling_rate, label, text, audio_length, gender, token) in enumerate(combined_data_with_tokens[:sample_size]):
#     import soundfile as sf
#     import tempfile
#     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
#         sf.write(tmp_wav.name, audio, sampling_rate)
#         wav_file = tmp_wav.name
#         # Run inference
#         res = model.generate(wav_file, output_dir=None, granularity="utterance", extract_embedding=False)
#         results.append((label, text, res))
#     import os
#     os.remove(wav_file)

# # Print the results
# for r in results:
#     label, text, res = r
#     print(f"Label: {label}, Text: {text[:30]}, Emotion: {res}")

In [None]:
# # Print just the highest emotion label and score for each result in results
# for entry in results:
#     label, text, emotions = entry
#     if isinstance(emotions, list) and len(emotions) > 0 and "labels" in emotions[0] and "scores" in emotions[0]:
#         labels = emotions[0]["labels"]
#         scores = emotions[0]["scores"]
#         max_idx = scores.index(max(scores))
#         top_emotion = labels[max_idx]
#         top_score = scores[max_idx]
#         print(f"Label: {label}, Text: {text[:30]}, Top Emotion: {top_emotion}, Score: {top_score:.4f}")

Label: shanghai, Text: 北京爱数智慧语音采集, Top Emotion: 中立/neutral, Score: 1.0000
Label: shanghai, Text: 北京爱数智慧语音采集, Top Emotion: 中立/neutral, Score: 1.0000
Label: shanghai, Text: 阿拉两个拧来聊聊金融方面呃, Top Emotion: 中立/neutral, Score: 0.6006
Label: shanghai, Text: 金融方面嘛, Top Emotion: 中立/neutral, Score: 0.5899
Label: shanghai, Text: 搿呃，阿姨喃，应该讲，侬已经交关年数辣辣了解了, Top Emotion: 中立/neutral, Score: 1.0000
Label: shanghai, Text: 葛末，吾辣辣金融方面已经有的三四年了, Top Emotion: 中立/neutral, Score: 1.0000
Label: shanghai, Text: 最少辰光阿拉是做撒呃喃，有钞票就是到银行里保本保息, Top Emotion: 中立/neutral, Score: 1.0000
Label: shanghai, Text: 吾已经做了已经到八七年了, Top Emotion: 中立/neutral, Score: 1.0000
Label: shanghai, Text: 八七年呃，当时辰光辣里哴相做理财呃, Top Emotion: 中立/neutral, Score: 0.9999
Label: shanghai, Text: 是两级风险，三级风险，四级风险吾侪做呃，信托咯撒侪做呃, Top Emotion: 中立/neutral, Score: 1.0000


## Add SNR (signal to noise ratio)

In [None]:
# 1. visit hf.co/pyannote/brouhaha and accept user conditions
# 2. visit hf.co/settings/tokens to create an access token
# 3. instantiate pretrained model

from pyannote.audio import Model, Inference
model = Model.from_pretrained("pyannote/brouhaha", 
                              use_auth_token="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")

# Create inference object
inference = Inference(model)

def extract_snr_from_audio(audio_array, sampling_rate):
    """
    Extract SNR from audio array using pyannote.audio brouhaha model
    
    Args:
        audio_array: numpy array of audio data
        sampling_rate: sampling rate of the audio
    
    Returns:
        float: average SNR value
    """
    # Create a temporary WAV file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
        # Write audio array to temporary file
        sf.write(tmp_wav.name, audio_array, sampling_rate)
        
        try:
            # Apply model inference
            output = inference(tmp_wav.name)
            
            # Extract SNR values
            snr_values = []
            for frame, (vad, snr, c50) in output:
                snr_values.append(snr)
            
            # Calculate average SNR (you can modify this to use median, max, etc.)
            avg_snr = np.mean(snr_values) if snr_values else 0.0
            
        except Exception as e:
            print(f"Error processing audio: {e}")
            avg_snr = 0.0
        
        finally:
            # Clean up temporary file
            os.unlink(tmp_wav.name)
    
    return avg_snr


def add_snr_to_dataset(combined_data_with_sentiment):
    """
    Add SNR feature to existing dataset
    
    Args:
        combined_data_with_sentiment: your existing dataset with audio arrays
    
    Returns:
        list: final_data with SNR added
    """
    final_data = []
    
    print(f"Processing {len(combined_data_with_sentiment)} audio samples...")
    
    for i, row in enumerate(combined_data_with_sentiment):
        # Extract existing data
        audio, sampling_rate, label, text, audio_length, gender, token, sentiment = row
        
        # Extract SNR from the audio array
        snr = extract_snr_from_audio(audio, sampling_rate)
        
        # Add SNR to the row
        updated_row = (audio, sampling_rate, label, text, audio_length, gender, token, sentiment, snr)
        final_data.append(updated_row)
        
        # Print progress for first few items and every 100th item
        if i < 3 or i % 100 == 0:
            print(f"Processed item {i}: SNR = {snr:.2f} dB")
    
    print(f"Completed! Added SNR to {len(final_data)} samples.")
    return final_data

# Test with just one instance first
print("=== Testing with one instance ===")
test_row = combined_data_with_sentiment[0]
audio, sampling_rate, label, text, audio_length, gender, token, sentiment = test_row

print(f"Original row length: {len(test_row)}")
print(f"Audio data type: {type(audio)}")
print(f"Audio data shape: {audio.shape if hasattr(audio, 'shape') else 'No shape attribute'}")
print(f"Sampling rate: {sampling_rate}")

# Extract SNR for the test instance
snr = extract_snr_from_audio(audio, sampling_rate)
print(f"Extracted SNR: {snr:.2f} dB")

# Create updated row
updated_test_row = (audio, sampling_rate, label, text, audio_length, gender, token, sentiment, snr)
print(f"Updated row length: {len(updated_test_row)}")

# If the test works, process the entire dataset
print("\n=== Processing entire dataset ===")
final_data = add_snr_to_dataset(combined_data_with_sentiment)

# Update your columns list
updated_columns = ["audio", "sampling_rate", "label", "text", "audio_length", "gender", "token", "sentiment", "snr"]
print(f"\nUpdated columns: {updated_columns}")

# Verify the final dataset
print(f"\nFinal dataset size: {len(final_data)}")
print(f"First row structure: {len(final_data[0])} elements")
print(f"First row SNR: {final_data[0][8]:.2f} dB")  # SNR should be at index 8

# Optional: Convert to DataFrame if needed
import pandas as pd
df_final = pd.DataFrame(final_data, columns=updated_columns)
print(f"\nDataFrame shape: {df_final.shape}")
print(f"SNR column statistics:")
print(df_final['snr'].describe())

# # Example of how to add SNR to your existing data structure
# def add_snr_to_dataset(combined_data_with_sentiment, dialect_corpus):
#     """
#     Add SNR feature to existing dataset
    
#     Args:
#         combined_data_with_sentiment: your existing dataset
#         dialect_corpus: the original dataset with audio data
    
#     Returns:
#         list: updated dataset with SNR added
#     """
#     updated_data = []
    
#     for i, row in enumerate(combined_data_with_sentiment):
#         # Extract existing data
#         audio, sampling_rate, label, text, audio_length, gender, token, sentiment = row
        
#         # Get the corresponding audio data from the original dataset
#         # You'll need to map this correctly based on your data structure
#         original_audio_data = dialect_corpus['train'][i]['audio']['array']
#         original_sampling_rate = dialect_corpus['train'][i]['audio']['sampling_rate']
        
#         # Extract SNR
#         snr = extract_snr_from_audio(original_audio_data, original_sampling_rate)
        
#         # Add SNR to the row
#         updated_row = (audio, sampling_rate, label, text, audio_length, gender, token, sentiment, snr)
#         updated_data.append(updated_row)
        
#         # Print progress for first few items
#         if i < 3:
#             print(f"Processed item {i}: SNR = {snr:.2f} dB")
    
#     return updated_data

# # Test with just one instance first
# print("\n=== Testing with one instance ===")
# test_row = combined_data_with_sentiment[0]  # Assuming this exists
# audio, sampling_rate, label, text, audio_length, gender, token, sentiment = test_row

# # Get corresponding audio data (you'll need to adjust this mapping)
# original_audio_data = dialect_corpus['train'][0]['audio']['array']
# original_sampling_rate = dialect_corpus['train'][0]['audio']['sampling_rate']

# # Extract SNR
# snr = extract_snr_from_audio(original_audio_data, original_sampling_rate)

# # Create updated row
# updated_test_row = (audio, sampling_rate, label, text, audio_length, gender, token, sentiment, snr)
# print(f"Original row length: {len(test_row)}")
# print(f"Updated row length: {len(updated_test_row)}")
# print(f"Added SNR: {snr:.2f} dB")

# # Update your columns list
# updated_columns = ["audio", "sampling_rate", "label", "text", "audio_length", "gender", "token", "sentiment", "snr"]

# Rebalance dataset with features 

Note that some datasets have more/less of a feature, e.g. more female speakers than male in shanghai dataset, so it gets rebalanced

# Export data 

In [None]:
### FIX THIS 
# 
# # If your list is a list of tuples, specify column names:
columns = ["audio", "sampling_rate", "label", "text", "audio_length", "gender", "token", "sentiment"]
df = pd.DataFrame(combined_data_with_sentiment, columns=columns)

df.to_csv("final_data.csv", index=False)