In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install datasets --quiet
! pip install evaluate --quiet
! pip install transformers --quiet
!pip install huggingface_hub --quiet

!pip install accelerate -U --quiet
!pip install transformers[torch] --quiet
!pip install shap --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m503.5 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m533.5/533.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h

### Generate Labels for Data

In [None]:
import os
import csv

audio_directory = "/content/drive/MyDrive/all_audio"
#text_directory = "/content/drive/MyDrive/all_audio/all_text"
data = []

for filename in os.listdir(audio_directory):
    if filename.endswith(".wav"):
        # Extract label
        label = filename.split("_")[1]
        label = 1 if label == 'lie' else 0

        # Append data to the list
        data.append((filename, label))

# Specify the path for the metadata CSV file
csv_file_path = "/content/drive/MyDrive/all_audio/metadata.csv"

# Write data to the CSV file
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["file_name", "label"])
    writer.writerows(data)

print(f"Metadata file created at {csv_file_path}")


Metadata file created at /content/drive/MyDrive/all_audio/metadata.csv


### Split data into sliding windows

In [19]:
# Windowing function
def window_audio(audio_array, window_size=10, overlap=0.75):
    sr = 16000
    window_size_samples = int(window_size * sr)
    overlap_samples = int(window_size_samples * overlap)

    windows = []
    for i in range(0, len(audio_array) - window_size_samples, overlap_samples):
        window = audio_array[i:i + window_size_samples]
        windows.append(window)

    return windows


In [18]:
def get_text_embeddings(texts, tokenizer, model):
    encoded_texts = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        model_output = model(**encoded_texts)

    text_embeddings = model_output.last_hidden_state[:, 0, :]

    return text_embeddings

In [17]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000*10, truncation=True)
    return inputs

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)

    # Calculate precision, recall, and f1 score
    precision = precision_score(y_true=eval_pred.label_ids, y_pred=predictions, average='weighted')
    recall = recall_score(y_true=eval_pred.label_ids, y_pred=predictions, average='weighted')
    f1 = f1_score(y_true=eval_pred.label_ids, y_pred=predictions, average='weighted')

    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1}

In [6]:
from collections import defaultdict
from datasets import Dataset as DT

def apply_window(dataset):
    windowed_dataset = defaultdict(list)

    for instance_id, example in enumerate(dataset):
        audio_array = example["audio"]["array"]
        windows = window_audio(audio_array)

        for window in windows:
            windowed_dataset["audio"].append({"array": window})
            windowed_dataset["label"].append(example["label"])
            windowed_dataset["instance_id"].append(instance_id)

    windowed_dataset = DT.from_dict(windowed_dataset)
    return windowed_dataset


### Training


In [7]:
from datasets import load_dataset
from datasets import Audio
from datasets import DatasetDict
from datasets import Dataset as DT
from transformers import AutoFeatureExtractor
import evaluate
import numpy as np
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score,confusion_matrix,precision_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt
import librosa
from collections import defaultdict
from transformers import EarlyStoppingCallback
import torch

### TO BE RUN AFTER QUESTION PART IS TAKEN OUT FROM THE CUSTOM DATASET.
### BECAUSE WE WONT NEED WINDOWS

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def train_model_custom_data(dataset):
  for fold, (train_idx, test_idx) in enumerate(kf.split(dataset)):
      print(f"\n----- Fold {fold + 1} -----")

      train_dataset = dataset.select(train_idx)
      test_dataset = dataset.select(test_idx)
      print(train_dataset)
      print(test_dataset)
      train_dataset = apply_window(train_dataset)
      test_dataset = apply_window(test_dataset)
      print(train_dataset)
      print(test_dataset)

      encoded_train_dataset = train_dataset.map(preprocess_function, remove_columns="audio", batched=True)
      encoded_test_dataset = test_dataset.map(preprocess_function, remove_columns="audio", batched=True)

      num_labels = 2
      model = AutoModelForAudioClassification.from_pretrained(
         "facebook/hubert-base-ls960", num_labels=num_labels
      )
      model.to(device)

      training_args = TrainingArguments(
          output_dir= "hubert_deception-1",
          evaluation_strategy="epoch",
          save_strategy="epoch",
          learning_rate=3e-5,
          per_device_train_batch_size=8,
          gradient_accumulation_steps=4,
          per_device_eval_batch_size=8,
          num_train_epochs=10,
          warmup_ratio=0.1,
          logging_steps=10,
          load_best_model_at_end=True,
          push_to_hub=False,
        )

      feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/hubert-base-ls960")

      trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_train_dataset,
        eval_dataset=encoded_test_dataset,
        tokenizer=feature_extractor,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(3, 0.0)]
    )

      trainer.train()

      eval_results = trainer.evaluate()


      print(f"Fold {fold + 1} - F1 Score: {eval_results['eval_f1_score']}")
      all_f1_scores.append(eval_results['eval_f1_score'])

  mean_f1_score = np.mean(all_f1_scores)
  print(f"\nMean F1 Score across all folds: {mean_f1_score}")



Using device: cuda


In [8]:
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score
import torch
from datasets import load_dataset
from datasets import Audio
from datasets import DatasetDict
from datasets import Dataset as DT
from transformers import AutoFeatureExtractor
import evaluate
import numpy as np
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score,confusion_matrix,precision_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt
import librosa
from collections import defaultdict
from transformers import EarlyStoppingCallback
import torch

In [20]:
def train_model(dataset):
    for fold, (train_idx, test_idx) in enumerate(kf.split(dataset)):
        print(f"\n----- Fold {fold + 1} -----")

        train_dataset = dataset.select(train_idx)
        test_dataset = dataset.select(test_idx)
        print(train_dataset)
        print(test_dataset)
        train_dataset = apply_window(train_dataset)
        test_dataset = apply_window(test_dataset)
        print(train_dataset)
        print(test_dataset)

        encoded_train_dataset = train_dataset.map(preprocess_function, remove_columns="audio", batched=True)
        encoded_test_dataset = test_dataset.map(preprocess_function, remove_columns="audio", batched=True)


        num_labels = 2
        model = AutoModelForAudioClassification.from_pretrained(
            "facebook/hubert-base-ls960", num_labels=num_labels
        )
        model.to(device)

        training_args = TrainingArguments(
            output_dir="hubert_deception-1",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=3e-5,
            per_device_train_batch_size=8,
            gradient_accumulation_steps=4,
            per_device_eval_batch_size=8,
            num_train_epochs=10,
            warmup_ratio=0.1,
            logging_steps=10,
            load_best_model_at_end=True,
            push_to_hub=False,
        )

        feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/hubert-base-ls960")

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=encoded_train_dataset,
            eval_dataset=encoded_test_dataset,
            tokenizer=feature_extractor,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(3, 0.0)]
        )

        trainer.train()

        eval_results = trainer.evaluate()


        predictions = trainer.predict(encoded_test_dataset)

        # Apply averaging using instance_id
        instance_id_predictions = defaultdict(list)
        for i, instance_id in enumerate(test_dataset["instance_id"]):
            instance_id_predictions[instance_id].append(predictions.predictions[i])


        for instance_id, instance_predictions in instance_id_predictions.items():
            mean_prediction = np.mean(instance_predictions, axis=0)
            eval_results[f'eval_predictions_{instance_id}'] = mean_prediction


        print(f"Fold {fold + 1} - F1 Score: {eval_results['eval_f1_score']}")
        all_f1_scores.append(eval_results['eval_f1_score'])

    mean_f1_score = np.mean(all_f1_scores)
    print(f"\nMean F1 Score across all folds: {mean_f1_score}")


In [13]:
#read data
dataset = load_dataset("audiofolder", data_dir="/content/drive/MyDrive/all_audio",split='train')
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
print(dataset)

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
num_labels = 2
kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_f1_scores = []
validation_indices = list(range(15)) + list(range(len(dataset['label']) - 25, 15))
validation_dataset = dataset.select(validation_indices)
dataset = dataset.select([i for i in range(len(dataset)) if i not in validation_indices])

train_model(dataset)

Resolving data files:   0%|          | 0/116 [00:00<?, ?it/s]

Dataset({
    features: ['audio', 'label'],
    num_rows: 115
})

----- Fold 1 -----
Dataset({
    features: ['audio', 'label'],
    num_rows: 80
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 20
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 233
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 61
})


Map:   0%|          | 0/233 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.700474,0.631672,0.87623,0.704758
2,0.660900,0.767249,0.631672,0.87623,0.704758
4,0.553700,0.74283,0.631672,0.87623,0.704758


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 - F1 Score: 0.7047578669681552

----- Fold 2 -----
Dataset({
    features: ['audio', 'label'],
    num_rows: 80
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 20
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 233
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 61
})


Map:   0%|          | 0/233 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.686581,0.760669,0.557377,0.848965
2,0.663100,0.590267,0.775093,0.622951,0.530727
4,0.548800,0.401764,0.854287,0.852459,0.851237
6,0.333300,0.326562,0.901639,0.901639,0.901639
8,0.270100,0.336436,0.886098,0.885246,0.885433
9,0.260700,0.330942,0.886098,0.885246,0.885433


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 2 - F1 Score: 0.9016393442622951

----- Fold 3 -----
Dataset({
    features: ['audio', 'label'],
    num_rows: 80
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 20
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 234
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 60
})


Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.698655,0.6525,0.9,0.72931
2,0.650400,0.708292,0.6525,0.9,0.72931
4,0.558800,0.748442,0.666925,0.583333,0.553337


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 3 - F1 Score: 0.7293103448275863

----- Fold 4 -----
Dataset({
    features: ['audio', 'label'],
    num_rows: 80
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 20
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 235
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 59
})


Map:   0%|          | 0/235 [00:00<?, ? examples/s]

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.706425,0.967576,0.704237,0.811663
2,0.657300,0.586065,0.971751,0.830508,0.882893
4,0.533500,0.265893,0.977401,0.932203,0.94792
6,0.303400,0.225156,0.979661,0.949153,0.959361
8,0.252600,0.206903,0.979661,0.949153,0.959361
9,0.225100,0.207464,0.979661,0.949153,0.959361


Fold 4 - F1 Score: 0.971448748991122

----- Fold 5 -----
Dataset({
    features: ['audio', 'label'],
    num_rows: 80
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 20
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 241
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 53
})


Map:   0%|          | 0/241 [00:00<?, ? examples/s]

Map:   0%|          | 0/53 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.676697,0.861534,0.641509,0.50141
1,0.690300,0.645466,0.861534,0.641509,0.50141
2,0.669100,0.601098,0.861534,0.641509,0.50141
4,0.580300,0.452929,0.817385,0.811321,0.813179
5,0.475800,0.38296,0.888256,0.867925,0.870362
6,0.349500,0.36634,0.851198,0.830189,0.833323
8,0.302400,0.410573,0.832619,0.830189,0.831098
9,0.253100,0.411121,0.832619,0.830189,0.831098


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 5 - F1 Score: 0.833322885977559

Mean F1 Score across all folds: 0.8280958382053436


# CUSTOM DATASET

In [14]:
dataset = load_dataset("audiofolder", data_dir="/content/drive/MyDrive/RecordingsWav",split='train')
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
print(dataset)

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
num_labels = 2
kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_f1_scores = []
validation_indices = list(range(15)) + list(range(len(dataset['label']) - 25, 15))
validation_dataset = dataset.select(validation_indices)
dataset = dataset.select([i for i in range(len(dataset)) if i not in validation_indices])

train_model(dataset)

Resolving data files:   0%|          | 0/121 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['audio', 'label'],
    num_rows: 120
})

----- Fold 1 -----
Dataset({
    features: ['audio', 'label'],
    num_rows: 84
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 21
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 104
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 24
})


Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.668011,0.894444,0.666667,0.533333
1,No log,0.641099,0.894444,0.666667,0.533333
2,No log,0.631175,0.894444,0.666667,0.533333
4,0.666200,0.632132,0.894444,0.666667,0.533333
5,0.666200,0.632541,0.894444,0.666667,0.533333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 - F1 Score: 0.5333333333333333

----- Fold 2 -----
Dataset({
    features: ['audio', 'label'],
    num_rows: 84
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 21
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 105
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 23
})


Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.699264,0.639036,0.884783,0.713505
2,0.664200,0.798666,0.639036,0.884783,0.713505
4,0.664200,0.827266,0.639036,0.884783,0.713505


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 2 - F1 Score: 0.7135046113306983

----- Fold 3 -----
Dataset({
    features: ['audio', 'label'],
    num_rows: 84
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 21
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 101
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 27
})


Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.681838,0.945199,0.703704,0.58132
1,No log,0.651877,0.945199,0.703704,0.58132
2,No log,0.637392,0.945199,0.703704,0.58132
4,0.671400,0.624011,0.945199,0.703704,0.58132
5,0.671400,0.62451,0.945199,0.703704,0.58132
6,0.648500,0.625722,0.945199,0.703704,0.58132
8,0.648500,0.62713,0.945199,0.703704,0.58132


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 3 - F1 Score: 0.5813204508856683

----- Fold 4 -----
Dataset({
    features: ['audio', 'label'],
    num_rows: 84
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 21
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 99
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 29
})


Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.682248,0.793639,0.586207,0.883283
1,No log,0.669815,0.793639,0.586207,0.883283
2,No log,0.674357,0.793639,0.586207,0.883283
4,0.659000,0.701163,0.793639,0.586207,0.883283


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 4 - F1 Score: 0.8832833583208396

----- Fold 5 -----
Dataset({
    features: ['audio', 'label'],
    num_rows: 84
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 21
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 103
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 25
})


Map:   0%|          | 0/103 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.64857,0.7744,0.88,0.82383
1,No log,0.607086,0.7744,0.88,0.82383
2,No log,0.555938,0.7744,0.88,0.82383


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.64857,0.7744,0.88,0.82383
1,No log,0.607086,0.7744,0.88,0.82383
2,No log,0.555938,0.7744,0.88,0.82383
4,0.683600,0.537885,0.7744,0.88,0.82383
5,0.683600,0.537069,0.7744,0.88,0.82383
6,0.668400,0.536427,0.7744,0.88,0.82383


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 5 - F1 Score: 0.8238297872340425

Mean F1 Score across all folds: 0.7070543082209164


# Train Model on Bag of Lies Dataset

### Generate Metadata

In [None]:
import pandas as pd
import re
import os
import csv

annot = pd.read_csv('/content/drive/MyDrive/bag-of-lies/BagOfLies/Annotations.csv')
annot[['video','truth']]
annot['video'] = annot['video'].str.replace("./Finalised/","")
annot['video'] = annot['video'].str.replace("/","_")
annot['video'] = annot['video'].str.replace("_video.mp4","")
annot['video'] = annot['video'].str.replace("_video.mp4","")
def rearrange_string(input_str):
    parts = input_str.split('_')
    rearranged_str = f"{parts[2]}_{parts[3]}_{parts[0]}_{parts[1]}"
    return rearranged_str

for i in range(len(annot['video'])):
  annot['video'][i] = rearrange_string(annot['video'][i])


directory = "/content/drive/MyDrive/bag-of-lies/BagOfLies/Audio"
data = []
for filename in os.listdir(directory):
    if filename.endswith(".wav"):
        user_run = filename.replace(".wav", "")
        label = annot.copy().loc[annot['video'] == user_run]['truth'].reset_index()['truth'][0]
        if label == 1:
          label = 0
        else:
          label = 1
        data.append((filename, label))

csv_file_path = "/content/drive/MyDrive/bag-of-lies/BagOfLies/Audio/metadata.csv"


with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["file_name", "label"])
    writer.writerows(data)

print(f"Metadata file created at {csv_file_path}")



  annot['video'] = annot['video'].str.replace("./Finalised/","")
  annot['video'] = annot['video'].str.replace("_video.mp4","")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annot['video'][i] = rearrange_string(annot['video'][i])


Metadata file created at /content/drive/MyDrive/bag-of-lies/BagOfLies/Audio/metadata.csv


### Training

In [None]:
from datasets import load_dataset
from datasets import Audio
from datasets import DatasetDict, Dataset
from transformers import AutoFeatureExtractor
import evaluate
import numpy as np
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score,confusion_matrix,precision_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt
import librosa
from collections import defaultdict
from transformers import EarlyStoppingCallback

In [16]:
#read data
dataset = load_dataset("audiofolder", data_dir="/content/drive/MyDrive/Audio-BagofLies",split='train')
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
print(dataset)

first_indices = list(range(15))
last_indices = list(range(len(dataset['label']) - 15, len(dataset['label'])))
validation_indices = first_indices + last_indices
validation_dataset = dataset.select(validation_indices)

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
num_labels = 2
kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_f1_scores = []

dataset = dataset.select([i for i in range(len(dataset)) if i not in validation_indices])

train_model(dataset)


Resolving data files:   0%|          | 0/326 [00:00<?, ?it/s]

Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 325
})

----- Fold 1 -----
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 236
})
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 59
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 212
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 60
})


Map:   0%|          | 0/212 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.692386,0.7525,0.55,0.840323
1,0.695700,0.677804,0.726573,0.666667,0.656109
2,0.681000,0.659097,0.639583,0.616667,0.613784
4,0.661600,0.614443,0.715476,0.65,0.636573
5,0.635300,0.607497,0.661927,0.633333,0.629241
6,0.635300,0.61176,0.578114,0.566667,0.566667
8,0.615700,0.616854,0.639583,0.616667,0.613784


  _warn_prf(average, modifier, msg_start, len(result))


Fold 1 - F1 Score: 0.6292410714285714

----- Fold 2 -----
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 236
})
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 59
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 224
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 48
})


Map:   0%|          | 0/224 [00:00<?, ? examples/s]

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
1,No log,0.690126,0.597222,0.5625,0.547561
2,0.690700,0.677298,0.6293,0.5625,0.529432
3,0.668700,0.663293,0.657171,0.520833,0.885939
4,0.668700,0.639626,0.606481,0.5625,0.904789
5,0.636700,0.660634,0.770833,0.541667,0.9
6,0.614300,0.650057,0.770833,0.541667,0.9
7,0.614300,0.656285,0.770833,0.541667,0.9


Fold 2 - F1 Score: 0.9047887323943662

----- Fold 3 -----
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 236
})
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 59
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 214
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 58
})


Map:   0%|          | 0/214 [00:00<?, ? examples/s]

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
0,No log,0.702953,0.776579,0.743103,0.708772
1,0.687300,0.722381,0.581094,0.812069,0.642492
2,0.666500,0.738356,0.922079,0.846552,0.836299
4,0.634800,0.686725,0.577737,0.586207,0.581307
5,0.632600,0.7122,0.526922,0.517241,0.521508
6,0.632600,0.70334,0.55546,0.568966,0.560784


  _warn_prf(average, modifier, msg_start, len(result))


Fold 3 - F1 Score: 0.6596366995073891

----- Fold 4 -----
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 236
})
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 59
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 217
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 55
})


Map:   0%|          | 0/217 [00:00<?, ? examples/s]

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
1,No log,0.711484,0.830435,0.85,0.823679
2,0.684000,0.713575,0.838581,0.940909,0.945623
3,0.665800,0.741539,0.838581,0.940909,0.945623
4,0.665800,0.72224,0.838581,0.940909,0.945623


Fold 4 - F1 Score: 0.823678963110668

----- Fold 5 -----
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 236
})
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 59
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 221
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 51
})


Map:   0%|          | 0/221 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
1,No log,0.697177,0.61955,0.861765,0.690196
2,0.686600,0.702742,0.61955,0.861765,0.690196
3,0.670300,0.631127,0.608997,0.54902,0.539948
4,0.670300,0.593075,0.664516,0.666667,0.665334
5,0.635000,0.57271,0.786281,0.784314,0.784994
6,0.624300,0.580577,0.723982,0.705882,0.708158
7,0.624300,0.629248,0.714795,0.647059,0.642157
8,0.636900,0.595921,0.695652,0.666667,0.668464


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 5 - F1 Score: 0.7849938556423751

Mean F1 Score across all folds: 0.760467864416674
