In [2]:
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import os

# Dla jednego pliku

In [4]:

# Initialize the Wav2Vec2 processor and model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Define the audio file path
audio_file_path = 'data/train/audio/happy/0ab3b47d_nohash_0.wav'  # Replace with your audio file path

# Load the audio file
audio, rate = librosa.load(audio_file_path, sr=16000)  # Wav2Vec2 expects 16kHz sample rate

# Preprocess the audio file
inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)

# Make the prediction
logits = model(**inputs).logits

# Compute the predicted IDs
predicted_ids = torch.argmax(logits, dim=-1)

# Decode the IDs to get the transcription
transcription = processor.decode(predicted_ids[0])

print(f"Transcription for {audio_file_path}: {transcription}")

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Transcription for data/train/audio/happy/0ab3b47d_nohash_0.wav: HAPPY


In [5]:
# Extract the label from the file path
real_label = os.path.basename(os.path.dirname(audio_file_path))

print(f"Real label for {audio_file_path}: {real_label}")

Real label for data/train/audio/happy/0ab3b47d_nohash_0.wav: happy


Dla 2 folderów:

In [17]:
# Define the directory paths for 'bed' and 'bird'
dirs = ['data/train/audio/bed', 'data/train/audio/bird']

# Traverse through the specified directories
for data_dir in dirs:
    # Ensure the directory is indeed a directory
    if os.path.isdir(data_dir):
        # List all files in the directory
        all_files = os.listdir(data_dir)

        # Filter out the audio files
        audio_files = [file for file in all_files if file.endswith('.wav')]

        # Load and preprocess the audio files
        for audio_file in audio_files:
            audio_path = os.path.join(data_dir, audio_file)
            audio, rate = librosa.load(audio_path, sr=16000)  # Wav2Vec2 expects 16kHz sample rate
            inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)
            logits = model(**inputs).logits

            # Compute the predicted IDs
            predicted_ids = torch.argmax(logits, dim=-1)

            # Decode the IDs to get the transcription
            transcription = processor.decode(predicted_ids[0])

            print(f"Transcription for {audio_file}: {transcription}")

            # Extract the label from the file path
            real_label = os.path.basename(os.path.dirname(audio_path))

            print(f"Real label for {audio_file}: {real_label}")

Transcription for 00176480_nohash_0.wav: BED
Real label for 00176480_nohash_0.wav: bed
Transcription for 004ae714_nohash_0.wav: BAYARD
Real label for 004ae714_nohash_0.wav: bed
Transcription for 004ae714_nohash_1.wav: BAOT
Real label for 004ae714_nohash_1.wav: bed
Transcription for 00f0204f_nohash_0.wav: BED
Real label for 00f0204f_nohash_0.wav: bed
Transcription for 00f0204f_nohash_1.wav: BAD
Real label for 00f0204f_nohash_1.wav: bed
Transcription for 012c8314_nohash_0.wav: BED
Real label for 012c8314_nohash_0.wav: bed
Transcription for 012c8314_nohash_1.wav: BED
Real label for 012c8314_nohash_1.wav: bed
Transcription for 0132a06d_nohash_0.wav: BEN
Real label for 0132a06d_nohash_0.wav: bed
Transcription for 0135f3f2_nohash_0.wav: BAD
Real label for 0135f3f2_nohash_0.wav: bed
Transcription for 0137b3f4_nohash_0.wav: THAT
Real label for 0137b3f4_nohash_0.wav: bed
Transcription for 014f9f65_nohash_0.wav: BED
Real label for 014f9f65_nohash_0.wav: bed
Transcription for 01648c51_nohash_0.wa

In [6]:
# Initialize a counter for the correct predictions
correct_predictions = 0
total_predictions = 0
dirs = ['data/train/audio/bed', 'data/train/audio/bird']
# Traverse through the specified directories
for data_dir in dirs:
    # Ensure the directory is indeed a directory
    if os.path.isdir(data_dir):
        # List all files in the directory
        all_files = os.listdir(data_dir)

        # Filter out the audio files
        audio_files = [file for file in all_files if file.endswith('.wav')]

        # Load and preprocess the audio files
        for audio_file in audio_files:
            audio_path = os.path.join(data_dir, audio_file)
            audio, rate = librosa.load(audio_path, sr=16000)  # Wav2Vec2 expects 16kHz sample rate
            inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)
            logits = model(**inputs).logits

            # Compute the predicted IDs
            predicted_ids = torch.argmax(logits, dim=-1)

            # Decode the IDs to get the transcription
            transcription = processor.decode(predicted_ids[0])

            # Convert the transcription to lowercase
            transcription = transcription.lower()

            # Extract the label from the file path
            real_label = os.path.basename(os.path.dirname(audio_path))

            # Convert the real label to lowercase
            real_label = real_label.lower()

            # Check if the prediction is correct
            if transcription == real_label:
                correct_predictions += 1

            total_predictions += 1

# Compute the accuracy
accuracy = correct_predictions / total_predictions

print(f"Accuracy: {accuracy}")

Accuracy: 0.6187572590011614


In [7]:
import Levenshtein

# Lista dostępnych etykiet
available_labels = ["happy", "sad", "angry", "neutral", "excited"]

# Wynik klasyfikacji
classification_result = "happu"

# Znajdź najbliższe słowo na podstawie minimalnej odległości Levenshteina
closest_label = min(available_labels, key=lambda label: Levenshtein.distance(label, classification_result))

print("Closest label:", closest_label)


Closest label: happy


In [8]:
distances = {label: Levenshtein.distance(label, classification_result) for label in available_labels}

# Wydrukuj odległości
for label, distance in distances.items():
    print(f"Distance to {label}: {distance}")

Distance to happy: 1
Distance to sad: 4
Distance to angry: 5
Distance to neutral: 7
Distance to excited: 7


In [9]:
import os

# Ścieżka do folderu audio
audio_dir = 'data/train/audio'

# Pobierz wszystkie podkatalogi w folderze audio
all_subdirs = os.listdir(audio_dir)

# Filtruj tylko katalogi (pomijając pliki)
available_labels = [dir_name for dir_name in all_subdirs if os.path.isdir(os.path.join(audio_dir, dir_name))]


# Initialize a counter for the correct predictions
correct_predictions = 0
total_predictions = 0
dirs = ['data/train/audio/bed', 'data/train/audio/bird']

# Threshold for Levenshtein distance
levenshtein_threshold = 2  # Adjust this value based on your requirements

# Traverse through the specified directories
for data_dir in dirs:
    # Ensure the directory is indeed a directory
    if os.path.isdir(data_dir):
        # List all files in the directory
        all_files = os.listdir(data_dir)

        # Filter out the audio files
        audio_files = [file for file in all_files if file.endswith('.wav')]

        # Load and preprocess the audio files
        for audio_file in audio_files:
            audio_path = os.path.join(data_dir, audio_file)
            audio, rate = librosa.load(audio_path, sr=16000)  # Wav2Vec2 expects 16kHz sample rate
            inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)
            logits = model(**inputs).logits

            # Compute the predicted IDs
            predicted_ids = torch.argmax(logits, dim=-1)

            # Decode the IDs to get the transcription
            transcription = processor.decode(predicted_ids[0])

            # Convert the transcription to lowercase
            transcription = transcription.lower()

            # Find the closest label based on Levenshtein distance
            closest_label = min(available_labels, key=lambda label: Levenshtein.distance(label, transcription))

            # If the Levenshtein distance is above the threshold, predict 'unknown'
            if Levenshtein.distance(closest_label, transcription) > levenshtein_threshold:
                closest_label = 'unknown'

            # Extract the label from the file path
            real_label = os.path.basename(os.path.dirname(audio_path))
            
            print(f"Predicted label for {audio_file}: {closest_label}")
            print(f"Real label for {audio_file}: {real_label}")

            # Convert the real label to lowercase
            real_label = real_label.lower()

            # Check if the prediction is correct
            if closest_label == real_label:
                correct_predictions += 1

            total_predictions += 1

# Compute the accuracy
accuracy = correct_predictions / total_predictions

print(f"Accuracy: {accuracy}")

Predicted label for 00176480_nohash_0.wav: bed
Real label for 00176480_nohash_0.wav: bed
Predicted label for 004ae714_nohash_0.wav: unknown
Real label for 004ae714_nohash_0.wav: bed
Predicted label for 004ae714_nohash_1.wav: cat
Real label for 004ae714_nohash_1.wav: bed
Predicted label for 00f0204f_nohash_0.wav: bed
Real label for 00f0204f_nohash_0.wav: bed
Predicted label for 00f0204f_nohash_1.wav: bed
Real label for 00f0204f_nohash_1.wav: bed
Predicted label for 012c8314_nohash_0.wav: bed
Real label for 012c8314_nohash_0.wav: bed
Predicted label for 012c8314_nohash_1.wav: bed
Real label for 012c8314_nohash_1.wav: bed
Predicted label for 0132a06d_nohash_0.wav: bed
Real label for 0132a06d_nohash_0.wav: bed
Predicted label for 0135f3f2_nohash_0.wav: bed
Real label for 0135f3f2_nohash_0.wav: bed
Predicted label for 0137b3f4_nohash_0.wav: cat
Real label for 0137b3f4_nohash_0.wav: bed
Predicted label for 014f9f65_nohash_0.wav: bed
Real label for 014f9f65_nohash_0.wav: bed
Predicted label f

In [10]:
import os

# Ścieżka do folderu audio
audio_dir = 'data/train/audio'

# Ścieżki do plików z listami testowymi i walidacyjnymi
testing_list_path = 'data/train/testing_list.txt'
validation_list_path = 'data/train/validation_list.txt'

# Wczytaj zawartość plików do list
with open(testing_list_path, 'r') as file:
    testing_list = file.read().splitlines()

with open(validation_list_path, 'r') as file:
    validation_list = file.read().splitlines()

# Pobierz wszystkie podkatalogi w folderze audio
all_subdirs = os.listdir(audio_dir)

# Filtruj tylko katalogi (pomijając pliki)
available_labels = [dir_name for dir_name in all_subdirs if os.path.isdir(os.path.join(audio_dir, dir_name))]

# Inicjalizuj listę plików treningowych
training_files = []

# Przejdź przez wszystkie dostępne etykiety
for label in available_labels:
    label_dir = os.path.join(audio_dir, label)

    # Upewnij się, że katalog istnieje
    if os.path.isdir(label_dir):
        # Pobierz wszystkie pliki w katalogu
        all_files = os.listdir(label_dir)

        # Filtruj tylko pliki audio
        audio_files = [file for file in all_files if file.endswith('.wav')]

        # Przejdź przez wszystkie pliki audio
        for audio_file in audio_files:
            # Utwórz ścieżkę do pliku audio
            audio_path = os.path.join(label, audio_file)

            # Jeśli plik audio nie jest na liście testowej ani walidacyjnej, dodaj go do listy treningowej
            if audio_path not in testing_list and audio_path not in validation_list:
                training_files.append(audio_path)

print(f"Number of training files: {len(training_files)}")

Number of training files: 64727


In [11]:
# Ścieżka do nowego pliku
training_list_path = 'data/train/training_list.txt'

# Otwórz plik w trybie zapisu
with open(training_list_path, 'w') as file:
    # Przejdź przez wszystkie pliki treningowe
    for training_file in training_files:
        # Dodaj ścieżkę do pliku do pliku treningowego
        file.write(training_file + '\n')

print(f"Training file paths have been written to {training_list_path}")

Training file paths have been written to data/train/training_list.txt


In [12]:
from datasets import Dataset
from transformers import Wav2Vec2Processor
import soundfile as sf

# Ścieżka do folderu audio
audio_dir = 'data/train/audio'

# Inicjalizuj procesor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Inicjalizuj listy do przechowywania danych
input_values = []
labels = []

# Przejdź przez wszystkie pliki treningowe
for audio_file in training_files:
    # Utwórz pełną ścieżkę do pliku audio
    audio_path = os.path.join(audio_dir, audio_file)

    # Wczytaj plik audio
    audio, _ = sf.read(audio_path)

    # Przetwórz audio za pomocą procesora
    input_value = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000).input_values[0]

    # Dodaj wartości wejściowe do listy
    input_values.append(input_value)

    # Wyodrębnij etykietę z nazwy pliku
    label = os.path.basename(os.path.dirname(audio_file))

    # Przetwórz etykietę za pomocą procesora
    label_id = processor.tokenizer(label, return_tensors="pt").input_ids[0]

    # Dodaj identyfikator etykiety do listy
    labels.append(label_id)

# Utwórz zestaw danych
dataset = Dataset.from_dict({"input_values": input_values, "labels": labels})

MemoryError: 

In [None]:
import numpy as np
from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer
from datasets import load_metric

# Inicjalizuj model
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Zdefiniuj argumenty treningowe
training_args = TrainingArguments(
    output_dir='./results',          # ścieżka wyjściowa dla zapisywania modelu
    num_train_epochs=3,              # liczba epok treningowych
    per_device_train_batch_size=16,  # rozmiar batcha dla treningu
    per_device_eval_batch_size=64,   # rozmiar batcha dla ewaluacji
    warmup_steps=500,                # liczba kroków rozgrzewki
    weight_decay=0.01,               # współczynnik zaniku wagi
    logging_dir='./logs',            # ścieżka do zapisywania logów
)

# Zdefiniuj funkcję do obliczania metryki podczas treningu
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer_metric = load_metric("wer")
    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

# Zainicjuj trenera
trainer = Trainer(
    model=model,                         # model do trenowania
    args=training_args,                  # argumenty treningowe
    train_dataset=dataset,               # dane treningowe
    compute_metrics=compute_metrics,     # funkcja do obliczania metryk
)

# Rozpocznij trening
trainer.train()
# Step 1: Evaluate the model
eval_result = trainer.evaluate()

# Print the evaluation result
print(f"Evaluation Result: {eval_result}")

# Step 2: Save the model
model_path = "./wav2vec2_trained_model"
model.save_pretrained(model_path)
processor.save_pretrained(model_path)

# Step 3: Use the model for inference
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the saved model and processor
model = Wav2Vec2ForCTC.from_pretrained(model_path)
processor = Wav2Vec2Processor.from_pretrained(model_path)

# Assume we have a new audio file for inference
new_audio_file = "path_to_your_audio_file.wav"

# Load and preprocess the audio file
audio, rate = librosa.load(new_audio_file, sr=16000)
inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)

# Make the prediction
logits = model(**inputs).logits

# Compute the predicted IDs
predicted_ids = torch.argmax(logits, dim=-1)

# Decode the IDs to get the transcription
transcription = processor.decode(predicted_ids[0])

print(f"Transcription for {new_audio_file}: {transcription}")