In [1]:
from datasets import load_from_disk, load_dataset, Audio

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local checkpoint directory; the feature extractor, pipeline and models in the
# cells below are all loaded from this path.
model_path = "saved_model/best_model_wav2vec_base"

In [3]:
from transformers import AutoFeatureExtractor

# Load the feature extractor stored at model_path; preprocess_function and
# extract_embedding use it to turn raw waveforms into model inputs.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_path)

In [4]:
from transformers import pipeline

# Ready-made audio-classification pipeline built from the same checkpoint.
# NOTE(review): `classifier` is not referenced by any later cell shown here.
classifier = pipeline(task="audio-classification", model=model_path)

In [5]:
# Load the dataset previously saved to the local "data" directory.
# NOTE(review): the variable name `datasets` matches the name of the Hugging Face
# package imported from in the first cell; a later `import datasets` would clash.
datasets = load_from_disk("data")

In [6]:
# Display the loaded splits with their columns and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 70578
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 9951
    })
    new_unseen: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 29556
    })
    drift: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 42697
    })
})

In [7]:
def preprocess_function(examples):
    """Turn a batch of audio examples into feature-extractor outputs.

    Pulls the ``"array"`` entry out of every item in ``examples["audio"]`` and
    passes the resulting list to the notebook-level ``feature_extractor``, using
    the extractor's own sampling rate and truncating to ``max_length=16000``.

    Returns:
        Whatever ``feature_extractor`` returns for the batch.
    """
    waveforms = [audio["array"] for audio in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

def convert_label(example):
    """Replace the ``'gender'`` value of ``example`` with its integer class id.

    The current value is looked up in the notebook-level ``label2id`` mapping and
    the result is converted with ``int``. The example is modified in place and
    the same object is returned.

    Raises:
        KeyError: if the current ``'gender'`` value is not a key of ``label2id``.
    """
    gender_name = example['gender']
    class_id = int(label2id[gender_name])
    example['gender'] = class_id
    return example

    

In [8]:
# Class names in id order, plus the two lookup tables derived from them.
# Ids are kept as strings in both tables; convert_label casts them with int().
labels = ["male", "female"]
label2id = {name: str(index) for index, name in enumerate(labels)}
id2label = {str(index): name for index, name in enumerate(labels)}

In [9]:
# Pull the four splits out of the loaded DatasetDict into individual names.
train_dataset, test_dataset, new_unseen_dataset, drift_dataset = (
    datasets[split_name]
    for split_name in ("train", "test", "new_unseen", "drift")
)

In [10]:
# Cast the "audio" column of every split to Audio(sampling_rate=16_000).
train_dataset, test_dataset, new_unseen_dataset, drift_dataset = (
    split.cast_column("audio", Audio(sampling_rate=16_000))
    for split in (train_dataset, test_dataset, new_unseen_dataset, drift_dataset)
)

In [11]:
# Apply convert_label to every split so 'gender' holds the integer class id.
# NOTE(review): re-running this cell on already-converted splits would look up
# integers in label2id (whose keys are the label names) and raise KeyError.
train_dataset, test_dataset, new_unseen_dataset, drift_dataset = (
    split.map(convert_label)
    for split in (train_dataset, test_dataset, new_unseen_dataset, drift_dataset)
)

#encoded_train_audios = train_dataset.map(preprocess_function, remove_columns="audio", batched=True)
#encoded_train_audios = encoded_train_audios.rename_column("gender", "label")

100%|██████████| 70578/70578 [00:16<00:00, 4400.32ex/s]
100%|██████████| 9951/9951 [00:02<00:00, 4216.79ex/s]
100%|██████████| 29556/29556 [00:06<00:00, 4465.86ex/s]
100%|██████████| 42697/42697 [00:09<00:00, 4487.51ex/s]


In [12]:
from transformers import AutoModel, AutoTokenizer
import torch

In [13]:
# Load the checkpoint through AutoModel; per the warning printed below, the
# 'classifier' and 'projector' weights are not used by this class.
# NOTE(review): `model` is re-assigned by the AutoModelForAudioClassification
# load in a later cell, so this instance is not the one used for extraction.
model = AutoModel.from_pretrained(model_path)

Some weights of the model checkpoint at saved_model/best_model_wav2vec_base were not used when initializing Wav2Vec2Model: ['classifier.weight', 'projector.weight', 'classifier.bias', 'projector.bias']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Put the AutoModel instance loaded above into evaluation mode.
# NOTE(review): `model` is re-assigned by the AutoModelForAudioClassification
# load in a later cell, so this call does not apply to the model that
# extract_embedding actually runs.
model.eval()

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1): Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (2): Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (3): Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (4): Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5): Wav2Vec2NoLay

In [None]:
#inputs = feature_extractor(datasets["test"][0]["audio"]["array"], sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")

In [None]:
#inputs = feature_extractor(datasets["test"][0]["audio"]["array"], sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")

In [15]:
from transformers import AutoModelForAudioClassification

# Reload the checkpoint as an audio-classification model, replacing the earlier
# `model`. output_hidden_states=True is required because extract_embedding reads
# outputs.hidden_states in addition to outputs.logits.
model = AutoModelForAudioClassification.from_pretrained(model_path, output_hidden_states=True)

In [None]:
#encoded_train_audios[1]

In [22]:
import numpy as np
from tqdm import tqdm
import h5py
import os

def extract_embedding(dataset):
    """Run the classifier over every sample and collect embeddings, labels and accents.

    Uses the notebook-level ``feature_extractor`` and ``model``; the model must
    return ``hidden_states`` (i.e. be loaded with ``output_hidden_states=True``).
    Samples are processed one at a time, on whatever device the model lives on.

    Args:
        dataset: Iterable of samples. Each sample must provide
            ``sample["audio"]["array"]`` (the waveform), ``sample["gender"]``
            (a key of ``model.config.id2label``) and ``sample["accent"]``.

    Returns:
        A 6-tuple ``(embedding_matrix, original_label_ids, original_label_names,
        predicted_label_ids, predicted_label_names, accents)``.
        ``embedding_matrix`` is a NumPy array with one row per sample, each row
        being the last entry of ``hidden_states`` averaged over dim 1; for an
        empty ``dataset`` it has zero rows. The other five items are lists in
        dataset order.

    Raises:
        KeyError: if a sample's ``gender`` is not a key of ``model.config.id2label``.
    """
    # Loop-invariant lookups, hoisted out of the per-sample loop.
    device = model.device
    id2label = model.config.id2label
    sampling_rate = feature_extractor.sampling_rate

    original_label_ids = []
    predicted_label_ids = []
    original_label_names = []
    predicted_label_names = []
    pooled_embeddings = []
    accents = []

    for sample in tqdm(dataset):
        inputs = feature_extractor(
            sample["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt"
        )
        # Keep inputs on the same device as the model so this also works on GPU.
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Average the last hidden state over dim 1, then drop the leading
        # size-1 dim (one sample per forward pass) to get a 1-D vector.
        pooled = torch.mean(outputs.hidden_states[-1], dim=1).squeeze(0)
        # .cpu() is a no-op on CPU and required before .numpy() on GPU.
        pooled_embeddings.append(pooled.cpu().numpy())

        # argmax over the class axis; with a single sample this is one index.
        predicted_label_id = int(outputs.logits.argmax(dim=-1).item())
        original_label_id = sample['gender']

        original_label_ids.append(original_label_id)
        predicted_label_ids.append(predicted_label_id)
        original_label_names.append(id2label[original_label_id])
        predicted_label_names.append(id2label[predicted_label_id])
        accents.append(sample['accent'])

    if pooled_embeddings:
        embedding_matrix = np.vstack(pooled_embeddings)
    else:
        # np.vstack([]) raises ValueError; return an empty matrix instead.
        hidden_size = getattr(model.config, "hidden_size", 0)
        embedding_matrix = np.empty((0, hidden_size), dtype=np.float32)

    return (
        embedding_matrix,
        original_label_ids,
        original_label_names,
        predicted_label_ids,
        predicted_label_names,
        accents,
    )


def save_embedding(output_path, E, Y_original, Y_original_names, Y_predicted, Y_predicted_names, accents):
    """Write one split's embeddings, labels and accents to an HDF5 file.

    Creates (or overwrites) ``output_path`` with six gzip-compressed datasets
    named ``E``, ``Y_original``, ``Y_original_names``, ``Y_predicted``,
    ``Y_predicted_names`` and ``accents``. The parent directory is created if it
    does not exist.

    Args:
        output_path: Destination ``.hdf5`` file path.
        E: Embedding matrix.
        Y_original: Original label ids.
        Y_original_names: Original label names (text).
        Y_predicted: Predicted label ids.
        Y_predicted_names: Predicted label names (text).
        accents: Accent value per sample (text; ``None`` is stored as ``""``).

    Returns:
        None.

    Note:
        The three text datasets are stored as variable-length UTF-8 strings.
        h5py cannot store NumPy unicode ('<U') arrays, which is what a plain
        list of ``str`` is converted to. Per the h5py documentation, recent
        h5py versions return such data as ``bytes`` on read unless
        ``Dataset.asstr()`` is used.
    """
    text_dtype = h5py.string_dtype(encoding="utf-8")

    def _as_text(values):
        # Object array of str; missing entries become empty strings so that
        # every element can be written as a variable-length string.
        return np.array(
            ["" if value is None else str(value) for value in values],
            dtype=object,
        )

    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # The context manager closes the file even if a create_dataset call raises.
    with h5py.File(output_path, "w") as fp:
        fp.create_dataset("E", data=E, compression="gzip")
        fp.create_dataset("Y_original", data=Y_original, compression="gzip")
        fp.create_dataset(
            "Y_original_names", data=_as_text(Y_original_names), dtype=text_dtype, compression="gzip"
        )
        fp.create_dataset("Y_predicted", data=Y_predicted, compression="gzip")
        fp.create_dataset(
            "Y_predicted_names", data=_as_text(Y_predicted_names), dtype=text_dtype, compression="gzip"
        )
        fp.create_dataset("accents", data=_as_text(accents), dtype=text_dtype, compression="gzip")
    return
        
        
        
        

In [23]:
# Directory that the save_embedding calls below write their .hdf5 files into.
embedding_dir = os.path.join("saved_embedding", "gender_classification", "wav2vec")

In [None]:
# Embeddings, labels, predictions and accents for the train split.
(
    E_train,
    Y_original_train,
    Y_original_names_train,
    Y_predicted_train,
    Y_predicted_names_train,
    accents_train,
) = extract_embedding(train_dataset)

  4%|▍         | 2861/70578 [05:56<2:29:10,  7.57it/s]

In [None]:
# Persist the train-split results under embedding_dir.
save_embedding(
    output_path=os.path.join(embedding_dir, "train_embedding.hdf5"),
    E=E_train,
    Y_original=Y_original_train,
    Y_original_names=Y_original_names_train,
    Y_predicted=Y_predicted_train,
    Y_predicted_names=Y_predicted_names_train,
    accents=accents_train,
)

In [None]:
# Embeddings, labels, predictions and accents for the test split.
(
    E_test,
    Y_original_test,
    Y_original_names_test,
    Y_predicted_test,
    Y_predicted_names_test,
    accents_test,
) = extract_embedding(test_dataset)

In [None]:
# Persist the test-split results under embedding_dir.
save_embedding(
    output_path=os.path.join(embedding_dir, "test_embedding.hdf5"),
    E=E_test,
    Y_original=Y_original_test,
    Y_original_names=Y_original_names_test,
    Y_predicted=Y_predicted_test,
    Y_predicted_names=Y_predicted_names_test,
    accents=accents_test,
)

In [None]:
# Embeddings, labels, predictions and accents for the new_unseen split.
(
    E_new_unseen,
    Y_original_new_unseen,
    Y_original_names_new_unseen,
    Y_predicted_new_unseen,
    Y_predicted_names_new_unseen,
    accents_new_unseen,
) = extract_embedding(new_unseen_dataset)

In [None]:
# Persist the new_unseen-split results under embedding_dir.
save_embedding(
    output_path=os.path.join(embedding_dir, "new_unseen_embedding.hdf5"),
    E=E_new_unseen,
    Y_original=Y_original_new_unseen,
    Y_original_names=Y_original_names_new_unseen,
    Y_predicted=Y_predicted_new_unseen,
    Y_predicted_names=Y_predicted_names_new_unseen,
    accents=accents_new_unseen,
)

In [None]:
# Embeddings, labels, predictions and accents for the drift split.
(
    E_drift,
    Y_original_drift,
    Y_original_names_drift,
    Y_predicted_drift,
    Y_predicted_names_drift,
    accents_drift,
) = extract_embedding(drift_dataset)

In [None]:
# Persist the drift-split results under embedding_dir.
save_embedding(
    output_path=os.path.join(embedding_dir, "drift_embedding.hdf5"),
    E=E_drift,
    Y_original=Y_original_drift,
    Y_original_names=Y_original_names_drift,
    Y_predicted=Y_predicted_drift,
    Y_predicted_names=Y_predicted_names_drift,
    accents=accents_drift,
)