In [1]:
import os
import pandas as pd
from datasets import Dataset, load_dataset
import torchaudio
from transformers import Wav2Vec2Processor
from transformers import HubertModel, Wav2Vec2FeatureExtractor, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_wav_files(directory):
    mp3_files = []
    for file in os.listdir(directory):
        if file.endswith(".wav"):
            file_path = os.path.join(directory, file)
            mp3_files.append(file_path)
    return mp3_files

def create_huggingface_dataset(mp3_files):
    df = pd.DataFrame({"file_path": mp3_files})
    dataset = Dataset.from_pandas(df)
    return dataset

# Replace 'music-segments' with the path to your folder containing .mp3 files
music_segments_dir = "music-segments/"
wav_files = load_wav_files(music_segments_dir)

# Create the Hugging Face dataset
music_dataset = create_huggingface_dataset(wav_files)

In [3]:
def preprocess_data(file_path: str, processor: Wav2Vec2Processor, target_sampling_rate: int = 16000):
    waveform, sampling_rate = torchaudio.load(file_path)
    
    if sampling_rate != target_sampling_rate:
        waveform = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)(waveform)

    # Process the waveform and extract the required features
    input_values = processor(waveform.numpy(), sampling_rate=target_sampling_rate, return_tensors="pt").input_values

    return input_values

# Initialize the processor
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")

In [4]:
def preprocess_function(example):
    input_values = preprocess_data(example["file_path"], processor)
    return {"input_values": input_values.squeeze()}


In [5]:
music_dataset = music_dataset.map(preprocess_function, remove_columns=["file_path"], num_proc=4)

                                                                               

In [5]:
# music_dataset.save_to_disk("music_dataset")

                                                                                                  

In [5]:
# from datasets import load_from_disk

# # # Load the dataset from disk
# music_dataset = load_from_disk("music_dataset")

In [6]:
# Load the model and processor
model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
# processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-xlarge-ls960-ft")

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertModel: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Training configuration
training_args = TrainingArguments(
    output_dir="./hubert_output",
    per_device_train_batch_size=8,
    num_train_epochs=5,
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_steps=500,
    eval_steps=500,
    logging_dir="./logs",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=music_dataset,
    tokenizer=processor,
)

In [8]:
# Train the model
trainer.train()



ValueError: The model did not return a loss from the inputs, only the following keys: last_hidden_state. For reference, the inputs it received are input_values,attention_mask.

# OLD

In [17]:
def preprocess_function(example):
    input_values = preprocess_data(example["file_path"], processor)
    return {"input_values": input_values}

music_dataset = music_dataset.map(preprocess_function, remove_columns=["file_path"], num_proc=4)

                                                                               

In [18]:
music_dataset.save_to_disk("music_dataset")

                                                                                                

In [None]:
# from datasets import load_from_disk

# # Load the dataset from disk
# loaded_music_dataset = load_from_disk("music_dataset")

In [21]:
# Load the model and tokenizer
model = HubertModel.from_pretrained("facebook/hubert-xlarge-ls960-ft")
tokenizer = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-xlarge-ls960-ft")

Downloading pytorch_model.bin: 100%|██████████| 3.85G/3.85G [03:35<00:00, 17.8MB/s]
Some weights of the model checkpoint at facebook/hubert-xlarge-ls960-ft were not used when initializing HubertModel: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# Training configuration
training_args = TrainingArguments(
    output_dir="./hubert_output",
    per_device_train_batch_size=8,
    num_train_epochs=5,
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_steps=500,
    eval_steps=500,
    logging_dir="./logs",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=music_dataset,
    tokenizer=tokenizer,
)

In [23]:
trainer.train()



RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [8, 1, 1, 1, 160000]