In [None]:
# when executed in a Google Colab setting, we must install the required libraries

# !pip install torch
# (os is part of the Python standard library and does not need to be installed)
# !pip install transformers
# !pip install numpy
# !pip install pandas
# !pip install soundfile
# !pip install librosa

In [None]:
import os
import torch
from torch import nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel, AutoProcessor, HubertModel, Wav2Vec2Model, Wav2Vec2Processor
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import soundfile as sf
import librosa
import pickle
import base64

In [None]:
#### Edit variables and filepaths here ####
# Root directory under which the 'audio' folder and the per-model-size /
# per-seed dataset folders are looked up.
DATASET_FILEPATH = './drive/MyDrive/Thesis/'
RANDOM_SEED = 2 # Identifies the dataset by the random seed that was used to shuffle it at creation time; also used as a sub-directory name
M_SIZE = 'base' # 'base' or 'large': selects the HuBERT / wav2vec 2.0 checkpoints ('base' gives 768-dimensional embeddings, 'large' 1024-dimensional); BERT always uses 'bert-base-uncased'

In [None]:
# Directory holding the '<split>_classification_details.txt' files for the
# configured model size and random seed.
CLASSIFICATION_DETAILS_FILEPATH = os.path.join(DATASET_FILEPATH, f'{M_SIZE}/{RANDOM_SEED}/')
# Directory holding the audio clips referenced by the classification files.
AUDIO_FILEPATH = os.path.join(DATASET_FILEPATH, 'audio')

# Files to save our datasets with embeddings to.
# These are derived from DATASET_FILEPATH (via CLASSIFICATION_DETAILS_FILEPATH)
# rather than a hard-coded root, so changing the configured dataset location
# moves inputs and outputs together.
PROCESSED_DATASET_DIR = os.path.join(CLASSIFICATION_DETAILS_FILEPATH, 'processed')
optimised_train_csv_file = os.path.join(PROCESSED_DATASET_DIR, 'train_dataset.csv')
optimised_validation_csv_file = os.path.join(PROCESSED_DATASET_DIR, 'validation_dataset.csv')
optimised_test_csv_file = os.path.join(PROCESSED_DATASET_DIR, 'test_dataset.csv')

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device)

Device:  cuda


In [None]:
def generate_bert_embedding(text, tokenizer, model):
    """
    Generate a single, token-averaged BERT embedding for the provided text.

    The text is tokenized (special tokens included), run through the model
    without gradient tracking, and the token embeddings of the first output
    are mean-pooled over the sequence dimension.

    :param text: The input text for which embeddings are to be generated.
    :param tokenizer: The tokenizer specific to the BERT model.
    :param model: Pre-trained BERT model, already placed on its target device.
    :returns: A CPU tensor containing averaged BERT embeddings for the input text.
    """
    # Run on whichever device the model lives on instead of relying on a
    # notebook-level global being in sync with the model.
    model_device = next(model.parameters()).device

    # add_special_tokens adds [CLS], [SEP], <s>... in the right way for each model.
    # truncation=True caps the sequence at the tokenizer's maximum length so an
    # over-long text is shortened instead of overflowing the model's position
    # embeddings and raising inside the forward pass.
    token_ids = tokenizer.encode(text, add_special_tokens=True, truncation=True)
    input_ids = torch.tensor([token_ids]).to(model_device)

    with torch.no_grad():
        outputs = model(input_ids)
    embeddings = outputs[0][0]  # first output, first (only) batch item: per-token embeddings

    # Mean-pool over tokens and move the result off the accelerator.
    averaged_embeddings = torch.mean(embeddings, dim=0).cpu()

    del input_ids
    torch.cuda.empty_cache()

    return averaged_embeddings

In [None]:
def generate_hubert_embedding(filename, processor, model):
    """
    Generate HuBERT embeddings for the audio file using the provided processor and model.

    The clip is loaded from AUDIO_FILEPATH as 16 kHz mono audio, run through
    the processor and the model without gradient tracking, and the model's
    last hidden state is returned with only the batch dimension removed.

    :param filename: Name of the audio file, e.g. 'Group 1: 14:06.8 - 14:07.5 - 1.wav'.
    :param processor: The processor, for HuBERT we use wav2vec's processor.
    :param model: Pre-trained HuBERT model, already placed on its target device.
    :returns: A CPU tensor containing embeddings for the input audio file,
              one row per output frame.
    """
    target_sampling_rate = 16_000
    file = os.path.join(AUDIO_FILEPATH, filename)

    # Run on whichever device the model lives on instead of relying on a
    # notebook-level global being in sync with the model.
    model_device = next(model.parameters()).device

    # librosa resamples to the requested rate and, with mono=True, down-mixes
    # multi-channel audio itself, so no manual channel averaging is needed.
    speech_array, _ = librosa.load(file, sr=target_sampling_rate, mono=True)

    input_values = processor(
        speech_array, return_tensors="pt", sampling_rate=target_sampling_rate
    ).input_values
    input_values = input_values.to(model_device)

    with torch.no_grad():
        embedding = model(input_values).last_hidden_state

    # Remove only the batch dimension. A bare squeeze() would also collapse
    # the time axis for clips so short that they yield a single frame.
    embedding = embedding.squeeze(0).cpu()

    del input_values
    torch.cuda.empty_cache()

    return embedding

In [None]:
def generate_wav2vec2_embedding(filename, processor, model):
    """
    Generate Wav2Vec2 embeddings for the audio file using the provided processor and model.

    The clip is loaded from AUDIO_FILEPATH as 16 kHz mono audio, run through
    the processor and the model without gradient tracking, and the model's
    last hidden state is returned with only the batch dimension removed.

    :param filename: Name of the audio file, e.g. 'Group 1: 14:06.8 - 14:07.5 - 1.wav'.
    :param processor: The processor specific to the Wav2Vec2 model.
    :param model: Pre-trained Wav2Vec2 model, already placed on its target device.
    :returns: A CPU tensor containing embeddings for the input audio file,
              one row per output frame.
    """
    target_sampling_rate = 16_000
    file = os.path.join(AUDIO_FILEPATH, filename)

    # Run on whichever device the model lives on instead of relying on a
    # notebook-level global being in sync with the model.
    model_device = next(model.parameters()).device

    # librosa resamples to the requested rate and, with mono=True, down-mixes
    # multi-channel audio itself, so no manual channel averaging is needed.
    speech_array, _ = librosa.load(file, sr=target_sampling_rate, mono=True)

    inputs = processor(
        speech_array, return_tensors="pt", sampling_rate=target_sampling_rate, padding=True
    )
    # Only the raw input values are fed to the model; move them once.
    input_values = inputs.input_values.to(model_device)

    with torch.no_grad():
        outputs = model(input_values)

    # Remove only the batch dimension. A bare squeeze() would also collapse
    # the time axis for clips so short that they yield a single frame.
    embedding = outputs.last_hidden_state.squeeze(0).cpu()

    del inputs, input_values
    torch.cuda.empty_cache()
    return embedding

In [None]:
#### Helper functions ####

def to_base64_str(tensor):
    """
    Serialise a PyTorch tensor into a base64 text string.

    The object is pickled first and the resulting bytes are base64 encoded,
    which makes the value safe to store in a text cell of a CSV file.

    :param tensor: Input PyTorch tensor.
    :returns: A base64 encoded string representation of the tensor.
    """
    pickled_bytes = pickle.dumps(tensor)
    encoded_bytes = base64.b64encode(pickled_bytes)
    return encoded_bytes.decode()
def to_tensor(base64_str):
    """
    Rebuild a PyTorch tensor from its base64 text representation.

    The string is base64 decoded and the resulting bytes are unpickled.
    NOTE: unpickling can execute arbitrary code, so only pass strings that
    come from files you produced yourself.

    :param base64_str: A base64 encoded string representation of a tensor.
    :returns: Decoded PyTorch tensor.
    """
    encoded_bytes = base64_str.encode()
    pickled_bytes = base64.b64decode(encoded_bytes)
    return pickle.loads(pickled_bytes)

def save_dataframe(df, csv_file):
    """
    Save a pandas dataframe to a CSV file (without the index column).

    Missing parent directories of the target path are created first, so the
    save does not fail just because the output folder has not been made yet.

    :param df: Input pandas dataframe.
    :param csv_file: Path to the output CSV file. Non-path targets (e.g. an
                     open buffer) are passed straight through to ``to_csv``.
    """
    # Only path-like targets have a parent directory to prepare.
    if isinstance(csv_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(csv_file))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(csv_file, index=False)

def label_encoding(df):
    """
    Replace the textual labels in the 'classification' column with integers.

    'interruption' becomes 1 and 'non-interruption' becomes 0. The dataframe
    is modified in place (side effect); nothing is returned.

    :param df: Input pandas dataframe with a 'classification' column.
    """
    encoded_labels = df['classification'].map({'interruption': 1, 'non-interruption': 0})
    df['classification'] = encoded_labels

def read_dataset(element):
    """
    Read a dataset given the type ('train', 'test', 'validation') and return as a pandas dataframe.

    The file '<element>_classification_details.txt' is read from
    CLASSIFICATION_DETAILS_FILEPATH. It has no header row and separates its
    three fields with '||'. The 'classification' column is label-encoded
    before the dataframe is returned.

    :param element: Type of the dataset. Can be either 'train', 'test', or 'validation'.
    :returns: Pandas dataframe with the columns 'audio_file_name',
              'classification' and 'conversational_history'.
    """
    details_file = os.path.join(CLASSIFICATION_DETAILS_FILEPATH, f'{element}_classification_details.txt')
    df = pd.read_csv(
        details_file,
        # Raw string: the separator is a regex matching a literal '||'. A
        # non-raw '\|' is an invalid escape sequence and triggers a warning.
        delimiter=r'\|\|',
        header=None,
        names=['audio_file_name', 'classification', 'conversational_history'],
        engine='python',  # multi-character / regex separators need the python engine
    )
    label_encoding(df)
    return df

In [None]:
# ---- HuBERT audio embeddings ----
if M_SIZE == 'base':
    # The base HuBERT checkpoint is paired with the wav2vec2-base processor.
    hubert_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
    hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
else:
    hubert_processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
    hubert_model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
hubert_model = hubert_model.to(device)
hubert_model.eval()

# Load the three dataset splits; the embedding columns are added to them below.
train_df = read_dataset('train')
validation_df = read_dataset('validation')
test_df = read_dataset('test')

for split_df in (train_df, validation_df, test_df):
    split_df['hubert_embeddings'] = split_df['audio_file_name'].apply(
        lambda name: to_base64_str(generate_hubert_embedding(name, hubert_processor, hubert_model))
    )

# Release the model before loading the next one.
del hubert_model, hubert_processor
torch.cuda.empty_cache()

# ---- BERT text embeddings ----
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
bert_model.eval()

for split_df in (train_df, validation_df, test_df):
    # Double quotes are stripped from the conversational history before embedding.
    split_df['bert_embeddings'] = split_df['conversational_history'].apply(
        lambda history: to_base64_str(generate_bert_embedding(history.replace('"', ''), bert_tokenizer, bert_model))
    )

del bert_model, bert_tokenizer
torch.cuda.empty_cache()

# ---- wav2vec 2.0 audio embeddings ----
if M_SIZE == 'base':
    wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
    wav2vec2_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base", output_hidden_states=True)
else:
    wav2vec2_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
    wav2vec2_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h", output_hidden_states=True)
wav2vec2_model = wav2vec2_model.to(device)
wav2vec2_model.eval()

for split_df in (train_df, validation_df, test_df):
    split_df['wav2vec_embeddings'] = split_df['audio_file_name'].apply(
        lambda name: to_base64_str(generate_wav2vec2_embedding(name, wav2vec2_processor, wav2vec2_model))
    )

# Finally, write each split (now carrying all three embedding columns) to its CSV file.
for split_df, output_csv in (
    (train_df, optimised_train_csv_file),
    (validation_df, optimised_validation_csv_file),
    (test_df, optimised_test_csv_file),
):
    save_dataframe(split_df, output_csv)

Downloading (…)rocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



Downloading (…)olve/main/vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

In [None]:
#### Tests and checks ####
# Reload the processed training CSV that was just saved. The raw
# 'train_classification_details.txt' file is '||'-delimited, has no header
# and contains no embedding columns, so it cannot satisfy `usecols` below.
train_csv_file = optimised_train_csv_file

selected_columns = ['audio_file_name','classification', 'wav2vec_embeddings', 'hubert_embeddings']
# NOTE: to_tensor unpickles the stored strings; only load CSV files produced by this notebook.
train_df = pd.read_csv(train_csv_file, usecols=selected_columns, converters={'wav2vec_embeddings': to_tensor, 'hubert_embeddings' : to_tensor})
# Sanity check: print the shape of the first row's audio embeddings.
print('Wav2Vec2 shape ', train_df.iloc[0]['wav2vec_embeddings'].shape)
print('HuBERT shape ', train_df.iloc[0]['hubert_embeddings'].shape)

Wav2Vec2 shape  torch.Size([14, 768])
HuBERT shape  torch.Size([14, 768])
