**Noise Reduction using several methods and Feature Extraction for voice data**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#unzip files in google drive
import zipfile
zip_file_path = '/content/drive/MyDrive/dataset.zip'
extract_dir = '/content/dataset_unzipped'

# Extract the contents of the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
print("File extracted successfully!")

File extracted successfully!


In [None]:
import os
# Directory containing audio files
audio_directory = '/content/dataset_unzipped/dataset/voices/'

# Target sampling rate
target_sr = 44100

#  sorting filenames
def custom_sort(filename):
    return int(filename.split('_')[1].split('.')[0])

sorted_filenames = sorted(os.listdir(audio_directory), key=custom_sort)

high and low with wiener

In [None]:

import scipy.signal as signal
import soundfile as sf
import librosa
import numpy as np
import pandas as pd
import scipy as sc
import warnings
warnings.filterwarnings('ignore')
# Feature extraction parameters
n_mfcc = 13  # Number of MFCC coefficients
n_temporal_features = 5

# Data storage
data = []

# Wiener filter parameters
wiener_factor = 1.5

# Frequency cutoffs for high-pass and low-pass filters
low_cutoff = 8000
high_cutoff = 100

# Iterate through each file in the directory
for filename in sorted_filenames:
    if filename.endswith(".mp3"):
        audio_path = os.path.join(audio_directory, filename)

        # Load audio file
        audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)

        # Apply high-pass filter
        highpass_filtered_audio = signal.butter(4, high_cutoff, 'high', fs=target_sr, output='sos')
        audio_highpass = signal.sosfilt(highpass_filtered_audio, audio)

        # Apply low-pass filter
        lowpass_filtered_audio = signal.butter(4, low_cutoff, 'low', fs=target_sr, output='sos')
        audio_lowpass = signal.sosfilt(lowpass_filtered_audio, audio_highpass)

        # Apply Wiener filter to the low-pass filtered signal
        denoised_audio = wiener_factor * signal.wiener(audio_lowpass)


        # Feature extraction
        pitch, _ = librosa.core.piptrack(y=denoised_audio, sr=target_sr)
        mfcc = librosa.feature.mfcc(y=denoised_audio, sr=target_sr, n_mfcc=n_mfcc)
        temporal_features = [np.max(np.abs(denoised_audio)), np.std(denoised_audio), sc.stats.skew(denoised_audio),
                             sc.stats.kurtosis(denoised_audio), np.mean(librosa.feature.zero_crossing_rate(denoised_audio)[0]), np.mean(librosa.feature.spectral_rolloff(y=denoised_audio)[0])]

        # Store the features in a dictionary
        features_dict = {'Filename': filename, 'mean_Pitch': np.mean(pitch), 'std_pitch': np.std(pitch),
            'max': temporal_features[0],
            'std': temporal_features[1],
            'skewness': temporal_features[2],
            'Temporal_kurtosis': temporal_features[3],
            'Zero_crossing_rate': temporal_features[4],
            'roll_off' : temporal_features[5] }
        for i in range(n_mfcc):
            features_dict[f'MFCC_{i + 1}'] = np.mean(mfcc[i])

        # Append the dictionary to the data list
        data.append(features_dict)

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data)
df.to_csv('/content/feature_1.csv', index=False)

wiener only

In [None]:
import scipy.signal as signal
import soundfile as sf
import librosa
import numpy as np
import pandas as pd
import scipy as sc


# Feature extraction parameters
n_mfcc = 13  # Number of MFCC coefficients
n_temporal_features = 5

# Data storage
data = []

# Iterate through each file in the directory
for filename in sorted_filenames:
    if filename.endswith(".mp3"):
        audio_path = os.path.join(audio_directory, filename)

        # Load audio file
        audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)

        # Apply Wiener filter for noise reduction
        denoised_audio = signal.wiener(audio)

        # Feature extraction
        pitch, _ = librosa.core.piptrack(y=denoised_audio, sr=target_sr)
        mfcc = librosa.feature.mfcc(y=denoised_audio, sr=target_sr, n_mfcc=n_mfcc)
        temporal_features = [np.max(np.abs(denoised_audio)), np.std(denoised_audio), sc.stats.skew(denoised_audio),
                             sc.stats.kurtosis(denoised_audio), np.mean(librosa.feature.zero_crossing_rate(denoised_audio)[0]), np.mean(librosa.feature.spectral_rolloff(y=denoised_audio)[0])]

        # Store the features in a dictionary
        features_dict = {'Filename': filename, 'mean_Pitch': np.mean(pitch), 'std_pitch': np.std(pitch),
            'max': temporal_features[0],
            'std': temporal_features[1],
            'skewness': temporal_features[2],
            'Temporal_kurtosis': temporal_features[3],
            'Zero_crossing_rate': temporal_features[4],
            'roll_off' : temporal_features[5] }
        for i in range(n_mfcc):
            features_dict[f'MFCC_{i + 1}'] = np.mean(mfcc[i])

        # Append the dictionary to the data list
        data.append(features_dict)

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data)
df.to_csv('/content/feature_2.csv', index=False)

Spectral noise reduction

In [None]:
import pandas as pd
import librosa
import numpy as np
import scipy as sc


n_fft = 4096
hop_length = 256
n_std_thresh = 1.1

# Feature extraction parameters
n_mfcc = 13  # Number of MFCC coefficients
n_temporal_features = 5  # Example: mean, std, skewness, kurtosis, zero-crossing rate

# Data storage
data = []

# Iterate through each file in the directory
for filename in sorted_filenames:
    if filename.endswith(".mp3"):
        audio_path = os.path.join(audio_directory, filename)

        # Load audio file
        audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)

        # Apply spectral noise reduction
        stft_matrix = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
        magnitude = np.abs(stft_matrix)
        mean_magnitude = np.mean(magnitude, axis=1, keepdims=True)
        mask = (magnitude - mean_magnitude) > n_std_thresh * np.std(magnitude, axis=1, keepdims=True)
        denoised_stft = stft_matrix * mask

        # Inverse STFT to get denoised audio
        denoised_audio = librosa.istft(denoised_stft, hop_length=hop_length)


        # Feature extraction
        pitch, _ = librosa.core.piptrack(y=denoised_audio, sr=target_sr)
        mfcc = librosa.feature.mfcc(y=denoised_audio, sr=target_sr, n_mfcc=n_mfcc)
        temporal_features = [np.max(np.abs(denoised_audio)), np.std(denoised_audio), sc.stats.skew(denoised_audio),
                             sc.stats.kurtosis(denoised_audio), np.mean(librosa.feature.zero_crossing_rate(denoised_audio)[0]), np.mean(librosa.feature.spectral_rolloff(y=denoised_audio)[0])]

        # Store the features in a dictionary
        features_dict = {'Filename': filename, 'mean_Pitch': np.mean(pitch), 'std_pitch': np.std(pitch),
            'max': temporal_features[0],
            'std': temporal_features[1],
            'skewness': temporal_features[2],
            'Temporal_kurtosis': temporal_features[3],
            'Zero_crossing_rate': temporal_features[4],
            'roll_off' : temporal_features[5] }
        for i in range(n_mfcc):
            features_dict[f'MFCC_{i + 1}'] = np.mean(mfcc[i])

        # Append the dictionary to the data list
        data.append(features_dict)

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data)
df.to_csv('/content/feature_3.csv', index=False)

median

In [None]:
import scipy.signal as signal
import soundfile as sf
import numpy as np
import pandas as pd
import librosa
import scipy as sc

# Feature extraction parameters
n_mfcc = 13  # Number of MFCC coefficients
n_temporal_features = 5

# Data storage
data = []

# Iterate through each file in the directory
for filename in sorted_filenames:
    if filename.endswith(".mp3"):
        audio_path = os.path.join(audio_directory, filename)

        # Load audio file
        audio, sr = librosa.load(audio_path, sr=target_sr, mono=True)

        window_size = 15
        denoised_audio = signal.medfilt(audio, kernel_size=window_size)

        # Feature extraction
        pitch, _ = librosa.core.piptrack(y=denoised_audio, sr=target_sr)
        mfcc = librosa.feature.mfcc(y=denoised_audio, sr=target_sr, n_mfcc=n_mfcc)
        temporal_features = [np.max(np.abs(denoised_audio)), np.std(denoised_audio), sc.stats.skew(denoised_audio),
                             sc.stats.kurtosis(denoised_audio), np.mean(librosa.feature.zero_crossing_rate(denoised_audio)[0]), np.mean(librosa.feature.spectral_rolloff(y=denoised_audio)[0])]

        # Store the features in a dictionary
        features_dict = {'Filename': filename, 'mean_Pitch': np.mean(pitch), 'std_pitch': np.std(pitch),
            'max': temporal_features[0],
            'std': temporal_features[1],
            'skewness': temporal_features[2],
            'Temporal_kurtosis': temporal_features[3],
            'Zero_crossing_rate': temporal_features[4],
            'roll_off' : temporal_features[5] }
        for i in range(n_mfcc):
            features_dict[f'MFCC_{i + 1}'] = np.mean(mfcc[i])

        # Append the dictionary to the data list
        data.append(features_dict)

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data)
df.to_csv('/content/feature_4.csv', index=False)

**Cleaning Transcripts**

In [2]:
import pandas as pd

#first upload the transcripts which is in our files
df = pd.read_csv("transcripts.csv")
def remove_punctuation_inplace(row):
    if pd.notna(row['transcript']):
        row['transcript'] = row['transcript'].lstrip('–ئ،؟?!.').rstrip('–ئ،؟?!.')

# Apply the function to the DataFrame in place
df.apply(remove_punctuation_inplace, axis=1)

# Function to clean the 'gender' column in-place
def clean_gender_inplace(row):
    if pd.notna(row['gender']):
        # Convert variations of 'man' to 'male'
        if 'مرد' in row['gender'].lower():
            row['gender'] = 'male'
        # Convert 'Female' to 'female'
        elif 'Female' in row['gender'].lower():
            row['gender'] = 'female'

# Apply the function to the 'gender' column
df.apply(clean_gender_inplace, axis=1)

# Function to clean the 'accent' column in-place
def clean_accent_inplace(row):
    if pd.notna(row['accent']):
        # Convert '(farsi)' to 'persian'
        if 'farsi' in row['accent'].lower():
            row['accent'] = 'فارسی'

# Apply the function to the 'accent' column
df.apply(clean_accent_inplace, axis=1)

# Function to clean the 'tone' column in-place
def clean_tone_inplace(row):
    if pd.notna(row['tone']):
        # Convert '(Normal)' or '(nomal)' to 'normal'
        if 'Normal' in row['tone'].lower() or 'nomal' in row['tone'].lower():
            row['tone'] = 'normal'
        # Convert '(Question)' to 'question'
        elif 'Question' in row['tone'].lower():
            row['tone'] = 'question'

# Apply the function to the 'tone' column
df.apply(clean_tone_inplace, axis=1)


# Remove rows where 'transcript' is empty
df = df.dropna(subset=['transcript'])

# Function to process 'transcript' and 'tone' columns in place
def process_transcript_inplace(row):
    if pd.notna(row['transcript']):
        # Check if 'tone' is 'question' and 'transcript' doesn't end with '?'
        if row['tone'] == 'question' and not row['transcript'].endswith('؟'):
            row['transcript'] =row['transcript'] +'؟'

        # Check if 'tone' is 'normal' and 'transcript' doesn't end with '.'
        elif row['tone'] == 'normal' and not row['transcript'].endswith('.'):
            row['transcript'] ='.'+row['transcript']

        # Check if 'tone' is 'exclamatory' and 'transcript' doesn't end with '!'
        elif row['tone'] == 'exclamatory' and not row['transcript'].endswith('!'):
            row['transcript'] = '!'+row['transcript']
        elif row['tone'] == 'imperative' and not row['transcript'].endswith('!'):
            row['transcript'] = '!'+row['transcript']

# Apply the function to the DataFrame in place
df.apply(process_transcript_inplace, axis=1)
# Save the modified DataFrame back to the Excel file
##df.to_excel(excel_file_path, index=False)

print("Processing completed.")

Processing completed.


In [3]:
df

Unnamed: 0,voice_filename,transcript,accent,gender,tone
0,voice_1.mp3,چرا این‌‌‌‌طور فکر می‌‌‌‌کنی؟,فارسی,male,question
1,voice_2.mp3,. همیشه من و تو راجع به آن با هم صحبت کرده‌‌‌‌ایم,فارسی,male,normal
2,voice_3.mp3,. دنیا در حال گذار به‌‌‌‌سمت پایداری است,فارسی,male,normal
3,voice_4.mp3,.شاخصی که باید عملکرد تسلا را با آن اندازه بگیریم,فارسی,male,normal
4,voice_5.mp3,.باید تعداد واقعاً غیرقابل‌‌‌‌تصوری باتری تولی...,فارسی,male,normal
...,...,...,...,...,...
6037,voice_6038.mp3,.تا روی آن بنشیند و آن را گرم نگهدارد تا جوجه ...,فارسی,male,normal
6038,voice_6039.mp3,.یک روز تخم شکست و جوجه عقاب از آن بیرون آمد. ...,فارسی,male,normal
6039,voice_6040.mp3,.و طولی نکشید که جوجه عقاب باور کرد که چیزی جز...,فارسی,male,normal
6040,voice_6041.mp3,.او زندگی و خانواده‌اش را دوست داشت اما چیزی ا...,فارسی,male,normal
