In [2]:
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import os

# Dla jednego pliku

In [4]:

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Define the audio file path
audio_file_path = 'data/train/audio/happy/0ab3b47d_nohash_0.wav'  # Replace with your audio file path

# Load the audio file
audio, rate = librosa.load(audio_file_path, sr=16000)  # Wav2Vec2 expects 16kHz sample rate

# Preprocess the audio file
inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)

# Make the prediction
logits = model(**inputs).logits

# Compute the predicted IDs
predicted_ids = torch.argmax(logits, dim=-1)

# Decode the IDs to get the transcription
transcription = processor.decode(predicted_ids[0])

print(f"Transcription for {audio_file_path}: {transcription}")

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Transcription for data/train/audio/happy/0ab3b47d_nohash_0.wav: HAPPY


In [5]:
# Extract the label from the file path
real_label = os.path.basename(os.path.dirname(audio_file_path))

print(f"Real label for {audio_file_path}: {real_label}")

Real label for data/train/audio/happy/0ab3b47d_nohash_0.wav: happy


# Dla 2 folderów:

In [17]:
# Define the directory paths for 'bed' and 'bird'
dirs = ['data/train/audio/bed', 'data/train/audio/bird']

# Traverse through the specified directories
for data_dir in dirs:
    # Ensure the directory is indeed a directory
    if os.path.isdir(data_dir):
        # List all files in the directory
        all_files = os.listdir(data_dir)

        # Filter out the audio files
        audio_files = [file for file in all_files if file.endswith('.wav')]

        # Load and preprocess the audio files
        for audio_file in audio_files:
            audio_path = os.path.join(data_dir, audio_file)
            audio, rate = librosa.load(audio_path, sr=16000)  # Wav2Vec2 expects 16kHz sample rate
            inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)
            logits = model(**inputs).logits

            # Compute the predicted IDs
            predicted_ids = torch.argmax(logits, dim=-1)

            # Decode the IDs to get the transcription
            transcription = processor.decode(predicted_ids[0])

            print(f"Transcription for {audio_file}: {transcription}")

            # Extract the label from the file path
            real_label = os.path.basename(os.path.dirname(audio_path))

            print(f"Real label for {audio_file}: {real_label}")

Transcription for 00176480_nohash_0.wav: BED
Real label for 00176480_nohash_0.wav: bed
Transcription for 004ae714_nohash_0.wav: BAYARD
Real label for 004ae714_nohash_0.wav: bed
Transcription for 004ae714_nohash_1.wav: BAOT
Real label for 004ae714_nohash_1.wav: bed
Transcription for 00f0204f_nohash_0.wav: BED
Real label for 00f0204f_nohash_0.wav: bed
Transcription for 00f0204f_nohash_1.wav: BAD
Real label for 00f0204f_nohash_1.wav: bed
Transcription for 012c8314_nohash_0.wav: BED
Real label for 012c8314_nohash_0.wav: bed
Transcription for 012c8314_nohash_1.wav: BED
Real label for 012c8314_nohash_1.wav: bed
Transcription for 0132a06d_nohash_0.wav: BEN
Real label for 0132a06d_nohash_0.wav: bed
Transcription for 0135f3f2_nohash_0.wav: BAD
Real label for 0135f3f2_nohash_0.wav: bed
Transcription for 0137b3f4_nohash_0.wav: THAT
Real label for 0137b3f4_nohash_0.wav: bed
Transcription for 014f9f65_nohash_0.wav: BED
Real label for 014f9f65_nohash_0.wav: bed
Transcription for 01648c51_nohash_0.wa

In [6]:
# Initialize a counter for the correct predictions
correct_predictions = 0
total_predictions = 0
dirs = ['data/train/audio/bed', 'data/train/audio/bird']
# Traverse through the specified directories
for data_dir in dirs:
    # Ensure the directory is indeed a directory
    if os.path.isdir(data_dir):
        # List all files in the directory
        all_files = os.listdir(data_dir)

        # Filter out the audio files
        audio_files = [file for file in all_files if file.endswith('.wav')]

        # Load and preprocess the audio files
        for audio_file in audio_files:
            audio_path = os.path.join(data_dir, audio_file)
            audio, rate = librosa.load(audio_path, sr=16000)  # Wav2Vec2 expects 16kHz sample rate
            inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)
            logits = model(**inputs).logits

            # Compute the predicted IDs
            predicted_ids = torch.argmax(logits, dim=-1)

            # Decode the IDs to get the transcription
            transcription = processor.decode(predicted_ids[0])

            # Convert the transcription to lowercase
            transcription = transcription.lower()

            # Extract the label from the file path
            real_label = os.path.basename(os.path.dirname(audio_path))

            # Convert the real label to lowercase
            real_label = real_label.lower()

            # Check if the prediction is correct
            if transcription == real_label:
                correct_predictions += 1

            total_predictions += 1

# Compute the accuracy
accuracy = correct_predictions / total_predictions

print(f"Accuracy: {accuracy}")

Accuracy: 0.6187572590011614


In [5]:
# Initialize a counter for the correct predictions
correct_predictions = 0
total_predictions = 0
dirs = ['data/train/audio/bed', 'data/train/audio/bird']
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("../wav2vec2_finetuned_testing_data")
# Traverse through the specified directories
for data_dir in dirs:
    # Ensure the directory is indeed a directory
    if os.path.isdir(data_dir):
        # List all files in the directory
        all_files = os.listdir(data_dir)

        # Filter out the audio files
        audio_files = [file for file in all_files if file.endswith('.wav')]

        # Load and preprocess the audio files
        for audio_file in audio_files:
            audio_path = os.path.join(data_dir, audio_file)
            audio, rate = librosa.load(audio_path, sr=16000)  # Wav2Vec2 expects 16kHz sample rate
            inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)
            logits = model(**inputs).logits

            # Compute the predicted IDs
            predicted_ids = torch.argmax(logits, dim=-1)

            # Decode the IDs to get the transcription
            transcription = processor.decode(predicted_ids[0])

            # Convert the transcription to lowercase
            transcription = transcription.lower()

            # Extract the label from the file path
            real_label = os.path.basename(os.path.dirname(audio_path))

            # Convert the real label to lowercase
            real_label = real_label.lower()

            # Check if the prediction is correct
            if transcription == real_label:
                correct_predictions += 1

            total_predictions += 1

# Compute the accuracy
accuracy = correct_predictions / total_predictions

print(f"Accuracy: {accuracy}")

KeyboardInterrupt: 

In [7]:
import Levenshtein

# Lista dostępnych etykiet
available_labels = ["happy", "sad", "angry", "neutral", "excited"]

# Wynik klasyfikacji
classification_result = "happu"

# Znajdź najbliższe słowo na podstawie minimalnej odległości Levenshteina
closest_label = min(available_labels, key=lambda label: Levenshtein.distance(label, classification_result))

print("Closest label:", closest_label)


Closest label: happy


In [8]:
distances = {label: Levenshtein.distance(label, classification_result) for label in available_labels}

# Wydrukuj odległości
for label, distance in distances.items():
    print(f"Distance to {label}: {distance}")

Distance to happy: 1
Distance to sad: 4
Distance to angry: 5
Distance to neutral: 7
Distance to excited: 7


In [9]:
import os

# Ścieżka do folderu audio
audio_dir = 'data/train/audio'

# Pobierz wszystkie podkatalogi w folderze audio
all_subdirs = os.listdir(audio_dir)

# Filtruj tylko katalogi (pomijając pliki)
available_labels = [dir_name for dir_name in all_subdirs if os.path.isdir(os.path.join(audio_dir, dir_name))]


# Initialize a counter for the correct predictions
correct_predictions = 0
total_predictions = 0
dirs = ['data/train/audio/bed', 'data/train/audio/bird']

# Threshold for Levenshtein distance
levenshtein_threshold = 2  # Adjust this value based on your requirements

# Traverse through the specified directories
for data_dir in dirs:
    # Ensure the directory is indeed a directory
    if os.path.isdir(data_dir):
        # List all files in the directory
        all_files = os.listdir(data_dir)

        # Filter out the audio files
        audio_files = [file for file in all_files if file.endswith('.wav')]

        # Load and preprocess the audio files
        for audio_file in audio_files:
            audio_path = os.path.join(data_dir, audio_file)
            audio, rate = librosa.load(audio_path, sr=16000)  # Wav2Vec2 expects 16kHz sample rate
            inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)
            logits = model(**inputs).logits

            # Compute the predicted IDs
            predicted_ids = torch.argmax(logits, dim=-1)

            # Decode the IDs to get the transcription
            transcription = processor.decode(predicted_ids[0])

            # Convert the transcription to lowercase
            transcription = transcription.lower()

            # Find the closest label based on Levenshtein distance
            closest_label = min(available_labels, key=lambda label: Levenshtein.distance(label, transcription))

            # If the Levenshtein distance is above the threshold, predict 'unknown'
            if Levenshtein.distance(closest_label, transcription) > levenshtein_threshold:
                closest_label = 'unknown'

            # Extract the label from the file path
            real_label = os.path.basename(os.path.dirname(audio_path))
            
            print(f"Predicted label for {audio_file}: {closest_label}")
            print(f"Real label for {audio_file}: {real_label}")

            # Convert the real label to lowercase
            real_label = real_label.lower()

            # Check if the prediction is correct
            if closest_label == real_label:
                correct_predictions += 1

            total_predictions += 1

# Compute the accuracy
accuracy = correct_predictions / total_predictions

print(f"Accuracy: {accuracy}")

Predicted label for 00176480_nohash_0.wav: bed
Real label for 00176480_nohash_0.wav: bed
Predicted label for 004ae714_nohash_0.wav: unknown
Real label for 004ae714_nohash_0.wav: bed
Predicted label for 004ae714_nohash_1.wav: cat
Real label for 004ae714_nohash_1.wav: bed
Predicted label for 00f0204f_nohash_0.wav: bed
Real label for 00f0204f_nohash_0.wav: bed
Predicted label for 00f0204f_nohash_1.wav: bed
Real label for 00f0204f_nohash_1.wav: bed
Predicted label for 012c8314_nohash_0.wav: bed
Real label for 012c8314_nohash_0.wav: bed
Predicted label for 012c8314_nohash_1.wav: bed
Real label for 012c8314_nohash_1.wav: bed
Predicted label for 0132a06d_nohash_0.wav: bed
Real label for 0132a06d_nohash_0.wav: bed
Predicted label for 0135f3f2_nohash_0.wav: bed
Real label for 0135f3f2_nohash_0.wav: bed
Predicted label for 0137b3f4_nohash_0.wav: cat
Real label for 0137b3f4_nohash_0.wav: bed
Predicted label for 014f9f65_nohash_0.wav: bed
Real label for 014f9f65_nohash_0.wav: bed
Predicted label f

In [13]:
import os

# Ścieżka do folderu audio
audio_dir = 'data/train/audio'

# Ścieżki do plików z listami testowymi i walidacyjnymi
testing_list_path = 'data/train/testing_list.txt'
validation_list_path = 'data/train/validation_list.txt'

# Wczytaj zawartość plików do list
with open(testing_list_path, 'r') as file:
    testing_list = file.read().splitlines()

with open(validation_list_path, 'r') as file:
    validation_list = file.read().splitlines()

# Pobierz wszystkie podkatalogi w folderze audio
all_subdirs = os.listdir(audio_dir)

# Filtruj tylko katalogi (pomijając pliki)
available_labels = [dir_name for dir_name in all_subdirs if os.path.isdir(os.path.join(audio_dir, dir_name))]

# Inicjalizuj listę plików treningowych
training_files = []

# Przejdź przez wszystkie dostępne etykiety
for label in available_labels:
    label_dir = os.path.join(audio_dir, label)

    # Upewnij się, że katalog istnieje
    if os.path.isdir(label_dir):
        # Pobierz wszystkie pliki w katalogu
        all_files = os.listdir(label_dir)

        # Filtruj tylko pliki audio
        audio_files = [file for file in all_files if file.endswith('.wav')]

        # Przejdź przez wszystkie pliki audio
        for audio_file in audio_files:
            # Utwórz ścieżkę do pliku audio
            audio_path = os.path.join(label, audio_file)

            # Jeśli plik audio nie jest na liście testowej ani walidacyjnej, dodaj go do listy treningowej
            if audio_path not in testing_list and audio_path not in validation_list:
                training_files.append(audio_path)

print(f"Number of training files: {len(training_files)}")

Number of training files: 64727


In [11]:
# Ścieżka do nowego pliku
training_list_path = 'data/train/training_list.txt'

# Otwórz plik w trybie zapisu
with open(training_list_path, 'w') as file:
    # Przejdź przez wszystkie pliki treningowe
    for training_file in training_files:
        # Dodaj ścieżkę do pliku do pliku treningowego
        file.write(training_file + '\n')

print(f"Training file paths have been written to {training_list_path}")

Training file paths have been written to data/train/training_list.txt


In [None]:
# Open the file in read mode
with open('data/train/testing_list.txt', 'r') as file:
    # Read all lines and remove trailing newline characters
    training_files = [line.rstrip() for line in file]

# Now training_files is a list containing all lines in testing_list.txt
print(training_files)

# Create datasets locally

In [6]:
from datasets import Dataset, DatasetDict
import os
import librosa
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
audio_dir = "./data/train/audio" 
with open('data/train/testing_list.txt', 'r') as file:
    training_list = [line.rstrip() for line in file]

dataset_dict = DatasetDict()

for audio_file in training_list:
    audio_path = os.path.join(audio_dir, audio_file)
    audio, rate = librosa.load(audio_path, sr=16000)  
    inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)

    label_id = processor.tokenizer(os.path.basename(os.path.dirname(audio_file)).upper(), return_tensors="pt").input_ids[0]
    print(audio_file, label_id)

    dataset = Dataset.from_dict({"input_values": inputs["input_values"][0], "labels": label_id})
    dataset_dict[audio_file] = dataset

dataset_dict.save_to_disk("./data/dataset_testing")

bed/0c40e715_nohash_0.wav tensor([24,  5, 14])


ArrowInvalid: Column 1 named labels expected length 16000 but got length 3

In [1]:
from datasets import load_from_disk

# Load the dataset from the directory
dataset = load_from_disk('./dataset_testing')

In [3]:
dataset[0]

{'input_values': [0.0013349769869819283,
  0.0006778182578273118,
  0.0023207152262330055,
  0.0023207152262330055,
  2.065944136120379e-05,
  0.0006778182578273118,
  0.002977874130010605,
  0.0013349769869819283,
  -0.0006364993751049042,
  -0.0003079199814237654,
  0.0013349769869819283,
  0.0033064535818994045,
  0.001663556438870728,
  -0.0026079758536070585,
  -0.001950817066244781,
  0.001992135774344206,
  0.0023207152262330055,
  0.0006778182578273118,
  0.0013349769869819283,
  0.000349238864146173,
  -0.0012936581624671817,
  0.001992135774344206,
  0.0049493503756821156,
  0.0013349769869819283,
  -0.0016222377307713032,
  0.0006778182578273118,
  0.0033064535818994045,
  0.005606509745121002,
  0.0062636686488986015,
  0.003963612485677004,
  0.0013349769869819283,
  0.0013349769869819283,
  2.065944136120379e-05,
  -0.0026079758536070585,
  -0.0022793966345489025,
  -0.0009650788269937038,
  -0.0016222377307713032,
  -0.0012936581624671817,
  -0.0003079199814237654,
  -0.

# mdoel

In [None]:
import numpy as np
from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer
from datasets import load_metric
from torch.nn.utils.rnn import pad_sequence

# Inicjalizuj model
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Zdefiniuj argumenty treningowe
training_args = TrainingArguments(
    output_dir='./results',          # ścieżka wyjściowa dla zapisywania modelu
    num_train_epochs=3,              # liczba epok treningowych
    per_device_train_batch_size=16,  # rozmiar batcha dla treningu
    per_device_eval_batch_size=64,   # rozmiar batcha dla ewaluacji
    warmup_steps=500,                # liczba kroków rozgrzewki
    weight_decay=0.01,               # współczynnik zaniku wagi
    logging_dir='./logs',            # ścieżka do zapisywania logów
)

# Zdefiniuj funkcję do obliczania metryki podczas treningu
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer_metric = load_metric("wer")
    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

# Zainicjuj trenera
trainer = Trainer(
    model=model,                         # model do trenowania
    args=training_args,                  # argumenty treningowe
    train_dataset=dataset,               # dane treningowe
    compute_metrics=compute_metrics,     # funkcja do obliczania metryk
    data_collator=lambda data: {"input_values": pad_sequence([torch.tensor(f["input_values"]) for f in data], batch_first=True), "labels": pad_sequence([torch.tensor(f["labels"]) for f in data], batch_first=True)}, 
)

# Rozpocznij trening
trainer.train()
# Step 1: Evaluate the model
eval_result = trainer.evaluate()

# Print the evaluation result
print(f"Evaluation Result: {eval_result}")

# Step 2: Save the model
model_path = "./wav2vec2_trained_model"
model.save_pretrained(model_path)
processor.save_pretrained(model_path)
# 
# # Step 3: Use the model for inference
# from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# 
# # Load the saved model and processor
# model = Wav2Vec2ForCTC.from_pretrained(model_path)
# processor = Wav2Vec2Processor.from_pretrained(model_path)
# 
# # Assume we have a new audio file for inference
# new_audio_file = "path_to_your_audio_file.wav"
# 
# # Load and preprocess the audio file
# audio, rate = librosa.load(new_audio_file, sr=16000)
# inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)
# 
# # Make the prediction
# logits = model(**inputs).logits
# 
# # Compute the predicted IDs
# predicted_ids = torch.argmax(logits, dim=-1)
# 
# # Decode the IDs to get the transcription
# transcription = processor.decode(predicted_ids[0])
# 
# print(f"Transcription for {new_audio_file}: {transcription}")

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Step,Training Loss


Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
   ---------------------------------------- 0.0/297.6 kB ? eta -:--:--
   ---------------------------------------- 0.0/297.6 kB ? eta -:--:--
   --------- ------------------------------ 71.7/297.6 kB ? eta -:--:--
   --------- ------------------------------ 71.7/297.6 kB ? eta -:--:--
   -------------- ----------------------- 112.6/297.6 kB 939.4 kB/s eta 0:00:01
   -------------- ----------------------- 112.6/297.6 kB 939.4 kB/s eta 0:00:01
   -------------- ----------------------- 112.6/297.6 kB 939.4 kB/s eta 0:00:01
   ------------------- ------------------ 153.6/297.6 kB 706.2 kB/s eta 0:00:01
   ------------------------ ------------- 194.6/297.6 kB 692.9 kB/s eta 0:00:01
   ---------------------------- --------- 225.3/297.6 kB 724.0 kB/s eta 0:00:01
   ---------------------------- --------- 225.3/297.6 kB 724.0 kB/s eta 0:00:01
   -------

In [3]:
from transformers import Wav2Vec2CTCTokenizer

# Initialize the tokenizer
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('facebook/wav2vec2-base-960h')

# Convert label to id
label_id = tokenizer.convert_tokens_to_ids('BIRD')

print(label_id)

3


In [4]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
label_id = processor.tokenizer('BIRDS', return_tensors="pt").input_ids[0]

In [5]:
label_id

tensor([24, 10, 13, 14, 12])

# finetune pretrained model

In [29]:
from datasets import Dataset
import os
import librosa
from transformers import Wav2Vec2Processor

# Initialize the Wav2Vec2 processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Define the path to your audio directory
audio_dir = "./data/train/audio"  # Replace with your actual path
# Initialize dictionaries to store your data
input_values = []
labels = []

# Read the training_list.txt file and store the audio file paths in a list
with open('data/train/training_list.txt', 'r') as file:
    training_list = [line.rstrip() for line in file]

# For each audio file path in the training list
for audio_file in training_list:
    # Construct the full path to the audio file
    audio_path = os.path.join(audio_dir, audio_file)

    # Load the audio file
    audio, rate = librosa.load(audio_path, sr=16000)  # Wav2Vec2 expects 16kHz sample rate

    # Preprocess the audio file
    inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)

    # Tokenize the label (subdirectory name)
    label_id = processor.tokenizer(os.path.basename(os.path.dirname(audio_file)).upper(), return_tensors="pt").input_ids[0]

    # Append the preprocessed audio data and the tokenized label to the data list
    input_values.append(inputs["input_values"][0])
    labels.append(label_id)
    print(audio_file, label_id)

# Create a dictionary of lists (columns)
column_data = {"input_values": input_values, "labels": labels}

# Convert the data list into a Dataset object
dataset = Dataset.from_dict(column_data)

bed\00176480_nohash_0.wav tensor([24,  5, 14])
bed\004ae714_nohash_0.wav tensor([24,  5, 14])
bed\004ae714_nohash_1.wav tensor([24,  5, 14])
bed\00f0204f_nohash_0.wav tensor([24,  5, 14])
bed\00f0204f_nohash_1.wav tensor([24,  5, 14])
bed\012c8314_nohash_0.wav tensor([24,  5, 14])
bed\012c8314_nohash_1.wav tensor([24,  5, 14])
bed\0132a06d_nohash_0.wav tensor([24,  5, 14])
bed\0135f3f2_nohash_0.wav tensor([24,  5, 14])
bed\0137b3f4_nohash_0.wav tensor([24,  5, 14])
bed\014f9f65_nohash_0.wav tensor([24,  5, 14])
bed\01648c51_nohash_0.wav tensor([24,  5, 14])
bed\01648c51_nohash_1.wav tensor([24,  5, 14])
bed\016e2c6d_nohash_0.wav tensor([24,  5, 14])
bed\01b4757a_nohash_0.wav tensor([24,  5, 14])
bed\01b4757a_nohash_1.wav tensor([24,  5, 14])
bed\01bcfc0c_nohash_0.wav tensor([24,  5, 14])
bed\0227998e_nohash_0.wav tensor([24,  5, 14])
bed\026290a7_nohash_0.wav tensor([24,  5, 14])
bed\02746d24_nohash_0.wav tensor([24,  5, 14])
bed\035de8fe_nohash_0.wav tensor([24,  5, 14])
bed\0362539c_

MemoryError: 

In [7]:
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load your data
dataset = load_dataset("text", data_files="training_list.txt")

# Iimport os

# Initialize the Wav2Vec2 processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Preprocess your data
def preprocess_data(examples):
    # Load the audio file
    audio, rate = librosa.load(examples["path"], sr=16000)  # Wav2Vec2 expects 16kHz sample rate

    # Preprocess the audio file
    inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)

    # Extract the label from the directory name
    label = os.path.basename(os.path.dirname(examples["path"]))

    return {"input_values": inputs["input_values"][0], "labels": label}

dataset = dataset.map(preprocess_data)

# Tokenize your transcriptions
def tokenize_data(examples):
    # Tokenize the transcriptions
    label_id = processor.tokenizer(examples["labels"], return_tensors="pt").input_ids[0]

    return {"input_values": examples["input_values"], "labels": label_id}

dataset = dataset.map(tokenize_data)

FileNotFoundError: Unable to find 'C:/Users/wojew/Documents/DS/sem1/Deep Learning/CINIC10_Proj1/P2_Transformers_Speech_Classification\training_list.txt'

# sprawdz czy tunowanie cos zrobiło             

In [None]:
bez tunowania

In [8]:
with open('data/train/testing_list.txt', 'r') as file:
    training_list = [line.rstrip() for line in file]
audio_dir = 'data/train/audio'
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

correct_predictions = 0
total_predictions = 0
# For each audio file path in the training list
for audio_file in training_list:
    # Construct the full path to the audio file
    audio_path = os.path.join(audio_dir, audio_file)

    audio, rate = librosa.load(audio_path, sr=16000)  # Wav2Vec2 expects 16kHz sample rate
    inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)
    logits = model(**inputs).logits

    # Compute the predicted IDs
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the IDs to get the transcription
    transcription = processor.decode(predicted_ids[0])

    # Convert the transcription to lowercase
    transcription = transcription

    # Extract the label from the file path
    real_label = os.path.basename(os.path.dirname(audio_path)).upper()

    # Convert the real label to lowercase
    real_label = real_label
    print(audio_file, real_label, transcription)

    # Check if the prediction is correct
    if transcription == real_label:
        correct_predictions += 1

    total_predictions += 1

# Compute the accuracy
accuracy = correct_predictions / total_predictions

print(f"Accuracy: {accuracy}")

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

bed/0c40e715_nohash_0.wav BED FED
bed/0ea0e2f4_nohash_0.wav BED BED
bed/0ea0e2f4_nohash_1.wav BED BAD
bed/105a0eea_nohash_0.wav BED BED
bed/1528225c_nohash_0.wav BED BED
bed/1528225c_nohash_1.wav BED BED
bed/1528225c_nohash_2.wav BED BED
bed/1528225c_nohash_3.wav BED BED
bed/1b4c9b89_nohash_0.wav BED BED
bed/1cb788bc_nohash_0.wav BED BED
bed/1cb788bc_nohash_1.wav BED BED
bed/20d3f11f_nohash_0.wav BED BED
bed/210f3aa9_nohash_0.wav BED BED
bed/2796ac50_nohash_0.wav BED BED
bed/283d7a53_nohash_0.wav BED BED
bed/28497c5b_nohash_0.wav BED BED
bed/2c6d3924_nohash_0.wav BED BED
bed/2d82a556_nohash_0.wav BED BED
bed/2d82a556_nohash_1.wav BED BED
bed/370844f7_nohash_0.wav BED BAD
bed/37dca74f_nohash_0.wav BED BED
bed/3df9a3d4_nohash_0.wav BED 
bed/3df9a3d4_nohash_1.wav BED AT
bed/3f170018_nohash_0.wav BED BED
bed/3f2b358d_nohash_0.wav BED 
bed/3ff840aa_nohash_0.wav BED BED
bed/3ff840aa_nohash_1.wav BED BED
bed/422d3197_nohash_0.wav BED BED
bed/4290ca61_nohash_0.wav BED BED
bed/43fc47a7_nohash_0

z tunowaniem

In [11]:
with open('data/train/testing_list.txt', 'r') as file:
    training_list = [line.rstrip() for line in file]
    
audio_dir = 'data/train/audio'
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("../wav2vec2_finetuned_testing_data")

correct_predictions = 0
total_predictions = 0
# For each audio file path in the training list
for audio_file in training_list:
    # Construct the full path to the audio file
    audio_path = os.path.join(audio_dir, audio_file)

    audio, rate = librosa.load(audio_path, sr=16000)  # Wav2Vec2 expects 16kHz sample rate
    inputs = processor(audio, return_tensors="pt", padding=True, sampling_rate=16000)
    logits = model(**inputs).logits

    # Compute the predicted IDs
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the IDs to get the transcription
    transcription = processor.decode(predicted_ids[0])

    # Convert the transcription to lowercase
    transcription = transcription

    # Extract the label from the file path
    real_label = os.path.basename(os.path.dirname(audio_path)).upper()

    # Convert the real label to lowercase
    real_label = real_label

    # Check if the prediction is correct
    if transcription == real_label:
        correct_predictions += 1
    print(audio_file, real_label, transcription)
    total_predictions += 1

# Compute the accuracy
accuracy = correct_predictions / total_predictions

print(f"Accuracy: {accuracy}")

bed/0c40e715_nohash_0.wav BED 
bed/0ea0e2f4_nohash_0.wav BED 
bed/0ea0e2f4_nohash_1.wav BED 
bed/105a0eea_nohash_0.wav BED 
bed/1528225c_nohash_0.wav BED 
bed/1528225c_nohash_1.wav BED 
bed/1528225c_nohash_2.wav BED 
bed/1528225c_nohash_3.wav BED 
bed/1b4c9b89_nohash_0.wav BED 
bed/1cb788bc_nohash_0.wav BED 
bed/1cb788bc_nohash_1.wav BED 
bed/20d3f11f_nohash_0.wav BED 
bed/210f3aa9_nohash_0.wav BED 
bed/2796ac50_nohash_0.wav BED 
bed/283d7a53_nohash_0.wav BED 
bed/28497c5b_nohash_0.wav BED 
bed/2c6d3924_nohash_0.wav BED 
bed/2d82a556_nohash_0.wav BED 
bed/2d82a556_nohash_1.wav BED 
bed/370844f7_nohash_0.wav BED 
bed/37dca74f_nohash_0.wav BED 
bed/3df9a3d4_nohash_0.wav BED 
bed/3df9a3d4_nohash_1.wav BED 
bed/3f170018_nohash_0.wav BED 
bed/3f2b358d_nohash_0.wav BED 
bed/3ff840aa_nohash_0.wav BED 
bed/3ff840aa_nohash_1.wav BED 
bed/422d3197_nohash_0.wav BED 
bed/4290ca61_nohash_0.wav BED 
bed/43fc47a7_nohash_0.wav BED 
bed/44260689_nohash_0.wav BED 
bed/4620dc14_nohash_0.wav BED 
bed/47d0

KeyboardInterrupt: 