<a href="https://colab.research.google.com/github/chaulagai2001/Speech_for_timit_dataset/blob/main/features_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Move the uploaded Kaggle API key to the required directory
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Replace "your_dataset" with the Kaggle dataset username/dataset name
!kaggle datasets download -d mfekadu/darpa-timit-acousticphonetic-continuous-speech

Downloading darpa-timit-acousticphonetic-continuous-speech.zip to /content
100% 826M/829M [00:28<00:00, 33.1MB/s]
100% 829M/829M [00:28<00:00, 30.6MB/s]


In [2]:
!unzip -q darpa-timit-acousticphonetic-continuous-speech.zip -d darpa-timit-acousticphonetic-continuous-speech

In [3]:
import os
import numpy as np
import librosa
from sklearn import preprocessing

In [5]:
# Phoneme mapping dictionary, adjust based on your actual mapping:
phn_map ={
  "iy": 1,
  "ix": 2,
  "ih": 3,
  "eh": 4,
  "ae": 5,
  "ax": 6,
  "ah": 7,
  "ax-h": 8,
  "uw": 9,
  "ux": 10,
  "uh": 11,
  "ao": 12,
  "aa": 13,
  "ey": 14,
  "ay": 15,
  "oy": 16,
  "aw": 17,
  "ow": 18,
  "er": 19,
  "axr": 20,
  "l": 21,
  "el": 22,
  "r": 23,
  "w": 24,
  "y": 25,
  "m": 26,
  "em": 27,
  "n": 28,
  "en": 29,
  "nx": 30,
  "ng": 31,
  "eng": 32,
  "v": 33,
  "f": 34,
  "dh": 35,
  "th": 36,
  "z": 37,
  "s": 38,
  "zh": 39,
  "sh": 40,
  "jh": 41,
  "ch": 42,
  "b": 43,
  "p": 44,
  "d": 45,
  "dx": 46,
  "t": 47,
  "g": 48,
  "k": 49,
  "hh": 50,
  "hv": 51,
  "bcl": 52,
  "pcl": 53,
  "dcl": 54,
  "tcl": 55,
  "gcl": 56,
  "kcl": 57,
  "q": 58,
  "epi": 59,
  "pau": 60,
  "h#": 61
}

In [6]:
def load_phoneme_labels(label_filename):
    phenome = []
    with open(label_filename, 'r') as f:
        for line in f.read().splitlines():
            start_time, end_time, phoneme = line.split()
            phenome.append((float(start_time), float(end_time), phn_map.get(phoneme)))
    return phenome

def align_labels(audio_length, phenome, win_len, win_step):
    aligned_labels = []
    current_time = 0.0
    for i in range(len(phenome)):
        start_time, end_time, phoneme = phenome[i]
        while current_time < end_time:
            if current_time >= start_time:
                aligned_labels.append(phoneme)
            current_time += win_step
    return aligned_labels

In [7]:
phoneme_filename = "/content/darpa-timit-acousticphonetic-continuous-speech/data/TEST/DR1/FAKS0/SA1.PHN"  # Replace "your_phoneme_file.txt" with the actual filename

# Call the load_phoneme_labels function with the filename
phoneme_labels = load_phoneme_labels(phoneme_filename)
phoneme_labels

[(0.0, 9640.0, 61),
 (9640.0, 11240.0, 40),
 (11240.0, 12783.0, 1),
 (12783.0, 14078.0, 51),
 (14078.0, 16157.0, 5),
 (16157.0, 16880.0, 54),
 (16880.0, 17103.0, 45),
 (17103.0, 17587.0, 25),
 (17587.0, 18760.0, 19),
 (18760.0, 19720.0, 54),
 (19720.0, 19962.0, 45),
 (19962.0, 21514.0, 13),
 (21514.0, 22680.0, 23),
 (22680.0, 23800.0, 57),
 (23800.0, 24104.0, 49),
 (24104.0, 26280.0, 38),
 (26280.0, 28591.0, 9),
 (28591.0, 29179.0, 46),
 (29179.0, 30337.0, 3),
 (30337.0, 31880.0, 31),
 (31880.0, 32500.0, 56),
 (32500.0, 33170.0, 48),
 (33170.0, 33829.0, 23),
 (33829.0, 35150.0, 1),
 (35150.0, 37370.0, 38),
 (37370.0, 38568.0, 1),
 (38568.0, 40546.0, 24),
 (40546.0, 42357.0, 13),
 (42357.0, 45119.0, 40),
 (45119.0, 45624.0, 59),
 (45624.0, 46855.0, 24),
 (46855.0, 48680.0, 13),
 (48680.0, 49240.0, 46),
 (49240.0, 51033.0, 19),
 (51033.0, 52378.0, 58),
 (52378.0, 54500.0, 12),
 (54500.0, 55461.0, 21),
 (55461.0, 57395.0, 25),
 (57395.0, 59179.0, 1),
 (59179.0, 60600.0, 20),
 (60600.0, 63

In [38]:
def process_audio_files(audio_file_path, label_file_path, save_directory, mode, feature_len, win_len, win_step, seq2seq, save):
    filenameNoSuffix = os.path.splitext(os.path.basename(audio_file_path))[0]

    # Load and process audio
    sig, rate = librosa.load(audio_file_path, sr=16000)
    feat = librosa.feature.mfcc(y=sig, sr=rate, n_mfcc=feature_len, hop_length=int(rate*win_step), n_fft=int(rate*win_len))

    # Transpose feature matrix to have frames as rows
    feat = np.transpose(feat)

    # Read phoneme labels
    phenome = load_phoneme_labels(label_file_path)
    aligned_labels = align_labels(len(sig) / rate, phenome, win_len, win_step)

    # Save data (modify paths as needed)
    feat_dir = os.path.join(save_directory, 'features')
    label_dir = os.path.join(save_directory, 'labels')
    os.makedirs(feat_dir, exist_ok=True)
    os.makedirs(label_dir, exist_ok=True)

    # Save each feature set in individual .npy files
    for i, feature_set in enumerate(feat):
        feature_filename = os.path.join(feat_dir, f'{filenameNoSuffix}_feat_{i}.npy')
        np.save(feature_filename, feature_set)

    # Save aligned labels in a single .npy file
    label_filename = os.path.join(label_dir, filenameNoSuffix + '.npy')
    np.save(label_filename, aligned_labels)

# Example usage:
audio_file_path = "/content/darpa-timit-acousticphonetic-continuous-speech/data/TEST/DR1/FAKS0/SA1.WAV.wav"  # Replace with the path to your .wav file
label_file_path = "/content/darpa-timit-acousticphonetic-continuous-speech/data/TEST/DR1/FAKS0/SA1.PHN"  # Replace with the path to your .phn file
save_directory = "/content/SA1"    # Replace with the directory where you want to save features and labels
mode = "mfcc"
feature_len = 13  # Example value, adjust according to your needs
win_len = 0.025   # Example value, adjust according to your needs
win_step = 0.01   # Example value, adjust according to your needs
seq2seq = False   # Example value, adjust according to your needs
save = True       # Example value, adjust according to your needs

process_audio_files(audio_file_path, label_file_path, save_directory, mode, feature_len, win_len, win_step, seq2seq, save)


In [44]:
features

[]

In [40]:
labels

[]

In [None]:
# Load MFCC features and aligned phoneme labels
mfcc_features = np.load(feature_filename)  # Load MFCC features
aligned_labels = np.load(label_filename)   # Load aligned phoneme labels

# Now you can associate each frame of MFCC features with its corresponding phoneme label
for frame, label in zip(mfcc_features, aligned_labels):
    print("Frame:", frame, "Label:", label)


In [50]:
feature = np.load("/content/SA1/features/SA1.WAV_feat_103.npy")
feature.shape

(13,)

In [53]:
array = np.load("/content/SA1/labels/SA1.WAV.npy")
array.shape

(6344000,)

In [47]:
# Import numpy
import numpy as np

# Load the array
labels = np.load("/content/SA1/labels/SA1.WAV.npy")

# Choose a filename
output_file = "labels.txt"

# Save the array to a text file with comma-separated values
np.savetxt(output_file, labels, delimiter=",")

print(f"Array saved to: {output_file}")


Array saved to: labels.txt


In [51]:
features_dir = "/content/SA1/features"  # Replace with your actual directory path

features = []

# Loop through all files in the directory
for filename in os.listdir(features_dir):
    # Check if the file is a `.npy` file
    if filename.endswith(".npy"):
        feature_path = os.path.join(features_dir, filename)
        feature = np.load(feature_path)
        features.append(feature)

print(f"Loaded {len(features)} features")


Loaded 397 features


In [57]:
combined_features = np.concatenate(features, axis=0)

In [59]:
combined_features

array([-411.34244 ,  111.17155 ,  -65.11557 , ...,   -8.515079,
        -16.239136,   -9.489464], dtype=float32)

In [58]:
combined_features.shape

(5161,)