In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! pip install datasets --quiet
! pip install evaluate --quiet
! pip install transformers --quiet
!pip install huggingface_hub --quiet

!pip install accelerate -U --quiet
!pip install transformers[torch] --quiet
!pip install shap --quiet

In [14]:
from datasets import load_dataset
from datasets import Audio
from datasets import DatasetDict
from datasets import Dataset as DT

import numpy as np

from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from transformers import AutoFeatureExtractor
from transformers import EarlyStoppingCallback

import torch
import evaluate

from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import librosa
from collections import defaultdict
from transformers import EarlyStoppingCallback
import torch

from collections import defaultdict
from datasets import Dataset as DT

import os
import csv



In [4]:

audio_directory = "/content/drive/MyDrive/all_audio"
#text_directory = "/content/drive/MyDrive/all_audio/all_text"
data = []

for filename in os.listdir(audio_directory):
    if filename.endswith(".wav"):
        # Extract label
        label = filename.split("_")[1]
        label = 1 if label == 'lie' else 0

        # Append data to the list
        data.append((filename, label))

# Specify the path for the metadata CSV file
csv_file_path = "/content/drive/MyDrive/all_audio/metadata.csv"

# Write data to the CSV file
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["file_name", "label"])
    writer.writerows(data)

print(f"Metadata file created at {csv_file_path}")

Metadata file created at /content/drive/MyDrive/all_audio/metadata.csv


In [5]:
def window_audio(audio_array, window_size=10, overlap=0.75):
    sr = 16000
    window_size_samples = int(window_size * sr)
    overlap_samples = int(window_size_samples * overlap)

    windows = []
    for i in range(0, len(audio_array) - window_size_samples, overlap_samples):
        window = audio_array[i:i + window_size_samples]
        windows.append(window)

    return windows

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [7]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000*10, truncation=True)
    return inputs

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)

    f1 = f1_score(y_true=eval_pred.label_ids, y_pred=predictions, average='weighted')
    accuracy = accuracy_score(y_true=eval_pred.label_ids, y_pred=predictions)

    return {
        "f1_score": f1,
        "accuracy": accuracy
    }


In [8]:
def apply_window(dataset):
    windowed_dataset = defaultdict(list)

    for instance_id, example in enumerate(dataset):
        audio_array = example["audio"]["array"]
        windows = window_audio(audio_array)

        for window in windows:
            windowed_dataset["audio"].append({"array": window})
            windowed_dataset["label"].append(example["label"])
            windowed_dataset["instance_id"].append(instance_id)

    windowed_dataset = DT.from_dict(windowed_dataset)
    return windowed_dataset

In [17]:
def evaluate_test_set(test_dataset,train_val_dataset,training_args,modelname):
    encoded_train_val_dataset = train_val_dataset.map(preprocess_function, remove_columns="audio", batched=True)
    encoded_test_dataset = test_dataset.map(preprocess_function, remove_columns="audio", batched=True)

    model = AutoModelForAudioClassification.from_pretrained(
        "facebook/hubert-base-ls960", num_labels=num_labels
    )
    model.to(device)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_train_val_dataset,
        eval_dataset=encoded_test_dataset,
        tokenizer=feature_extractor,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(3, 0.0)]
    )

    trainer.train()
    trainer.save_model(f"/content/drive/MyDrive/{modelname}/model_{modelname}_final")
    eval_results = trainer.evaluate()

    predictions = trainer.predict(encoded_test_dataset)

    # Apply averaging using instance_id
    instance_id_predictions = defaultdict(list)
    for i, instance_id in enumerate(test_dataset["instance_id"]):
        instance_id_predictions[instance_id].append(predictions.predictions[i])

    for instance_id, instance_predictions in instance_id_predictions.items():
        mean_prediction = np.mean(instance_predictions, axis=0)
        eval_results[f'eval_predictions_{instance_id}'] = mean_prediction

    print(f"\nTest Set Evaluation - F1 Score: {eval_results['eval_f1_score']}")

In [33]:
def train_model(train_val_dataset, test_dataset,modelname):
    loo = LeaveOneOut()
    all_f1_scores = []
    all_accuracies = []
    for fold, (train_idx, test_idx) in enumerate(loo.split(train_val_dataset)):
        print(f"\n----- Fold {fold + 1} Test Index : {test_idx} -----")

        train_dataset = train_val_dataset.select(train_idx)
        test_dataset = train_val_dataset.select(test_idx)
        print(train_dataset)
        print(test_dataset)
        train_dataset = apply_window(train_dataset)
        test_dataset = apply_window(test_dataset)
        print(train_dataset)
        print(test_dataset)

        encoded_train_dataset = train_dataset.map(preprocess_function, remove_columns="audio", batched=True)
        encoded_test_dataset = test_dataset.map(preprocess_function, remove_columns="audio", batched=True)


        num_labels = 2
        model = AutoModelForAudioClassification.from_pretrained(
            "facebook/hubert-base-ls960", num_labels=num_labels
        )
        model.to(device)

        training_args = TrainingArguments(
            output_dir="hubert_deception-1",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=3e-5,
            per_device_train_batch_size=8,
            gradient_accumulation_steps=4,
            per_device_eval_batch_size=8,
            num_train_epochs=2,
            warmup_ratio=0.1,
            logging_steps=10,
            load_best_model_at_end=True,
            push_to_hub=False,
        )

        feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/hubert-base-ls960")

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=encoded_train_dataset,
            eval_dataset=encoded_test_dataset,
            tokenizer=feature_extractor,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(3, 0.0)]
        )

        trainer.train()
        #trainer.save_model(f"/content/drive/MyDrive/hubert_deception-1/model_{modelname}")
        eval_results = trainer.evaluate()


        predictions = trainer.predict(encoded_test_dataset)

        # Apply averaging using instance_id
        instance_id_predictions = defaultdict(list)
        for i, instance_id in enumerate(test_dataset["instance_id"]):
            instance_id_predictions[instance_id].append(predictions.predictions[i])


        for instance_id, instance_predictions in instance_id_predictions.items():
            mean_prediction = np.mean(instance_predictions, axis=0)
            eval_results[f'eval_predictions_{instance_id}'] = mean_prediction


        print(f"Fold {fold + 1} - F1 Score: {eval_results['eval_f1_score']}")
        all_f1_scores.append(eval_results['eval_f1_score'])

    mean_f1_score = np.mean(all_f1_scores)
    print(f"\nMean F1 Score across all folds: {mean_f1_score}")

    evaluate_test_set(test_dataset,train_val_dataset=train_val_dataset,training_args = training_args,modelname = modelname)




##Trial Data

In [35]:
dataset = load_dataset("audiofolder", data_dir="/content/drive/MyDrive/all_audio",split='train')
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
print(dataset)

train_val_dataset = dataset.select(range(2))
last_20_indices = range(len(train_val_dataset) - 1, len(train_val_dataset))
test_dataset = dataset.select(last_20_indices)
print(train_val_dataset)
print(test_dataset)


feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
num_labels = 2
train_model(train_val_dataset,test_dataset,"trial")

Resolving data files:   0%|          | 0/116 [00:00<?, ?it/s]

Dataset({
    features: ['audio', 'label'],
    num_rows: 115
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 2
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 1
})

----- Fold 1 Test Index : [0] -----
Dataset({
    features: ['audio', 'label'],
    num_rows: 1
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 1
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 7
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 1
})


Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Score,Accuracy
1,No log,0.68096,1.0,1.0
2,No log,0.653334,1.0,1.0


Checkpoint destination directory hubert_deception-1/checkpoint-1 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory hubert_deception-1/checkpoint-2 already exists and is non-empty. Saving will proceed but saved results may be invalid.


Fold 1 - F1 Score: 1.0

----- Fold 2 Test Index : [1] -----
Dataset({
    features: ['audio', 'label'],
    num_rows: 1
})
Dataset({
    features: ['audio', 'label'],
    num_rows: 1
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 1
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 7
})


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Score,Accuracy
1,No log,0.695386,0.444444,0.285714
2,No log,0.671336,1.0,1.0


Checkpoint destination directory hubert_deception-1/checkpoint-1 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory hubert_deception-1/checkpoint-2 already exists and is non-empty. Saving will proceed but saved results may be invalid.


Fold 2 - F1 Score: 1.0

Mean F1 Score across all folds: 1.0


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Score,Accuracy
1,No log,0.695386,0.444444,0.285714
2,No log,0.661727,1.0,1.0


Checkpoint destination directory hubert_deception-1/checkpoint-1 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory hubert_deception-1/checkpoint-2 already exists and is non-empty. Saving will proceed but saved results may be invalid.



Test Set Evaluation - F1 Score: 1.0


## Bag of Lies Data

In [36]:
dataset = load_dataset("audiofolder", data_dir="/content/drive/MyDrive/Audio-BagofLies",split='train')
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
print(dataset)

train_val_dataset = dataset.select(range(260))
last_20_indices = range(len(train_val_dataset) - 65, len(train_val_dataset))
test_dataset = dataset.select(last_20_indices)
print(train_val_dataset)
print(test_dataset)


feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
num_labels = 2
train_model(train_val_dataset,test_dataset,"trial")

Resolving data files:   0%|          | 0/326 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 325
})
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 80
})
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 20
})

----- Fold 1 Test Index : [0] -----
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 79
})
Dataset({
    features: ['audio', 'label', 'text_x', 'text_y'],
    num_rows: 1
})


KeyboardInterrupt: 

## Custom Dataset

In [38]:
dataset = load_dataset("audiofolder", data_dir="/content/drive/MyDrive/RecordingsWav",split='train')
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
print(dataset)

train_val_dataset = dataset.select(range(96))
last_20_indices = range(len(train_val_dataset) - 24, len(train_val_dataset))
test_dataset = dataset.select(last_20_indices)
print(train_val_dataset)
print(test_dataset)


feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
num_labels = 2
train_model(train_val_dataset,test_dataset,"trial")

Resolving data files:   0%|          | 0/121 [00:00<?, ?it/s]

Dataset({
    features: ['audio', 'label', 'transcription'],
    num_rows: 120
})
Dataset({
    features: ['audio', 'label', 'transcription'],
    num_rows: 96
})
Dataset({
    features: ['audio', 'label', 'transcription'],
    num_rows: 24
})

----- Fold 1 Test Index : [0] -----
Dataset({
    features: ['audio', 'label', 'transcription'],
    num_rows: 95
})
Dataset({
    features: ['audio', 'label', 'transcription'],
    num_rows: 1
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 111
})
Dataset({
    features: ['audio', 'label', 'instance_id'],
    num_rows: 1
})


Map:   0%|          | 0/111 [00:00<?, ? examples/s]

KeyboardInterrupt: 