In [1]:
import os
import librosa
import numpy as np
import pandas as pd
import IPython.display as ipd
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
import spacy
import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import sentiwordnet as swn

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import StratifiedKFold, KFold
import en_core_web_sm
nltk.download("sentiwordnet")
nltk.download('punkt')
nlp = en_core_web_sm.load()

# new packages to combine
from sklearn.ensemble import VotingClassifier

import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore all warnings
warnings.filterwarnings("ignore")

# Ignore specific warnings
# For example, to ignore ConvergenceWarning:
warnings.filterwarnings("ignore", category=ConvergenceWarning)

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Preparing dataset (uncomment for first time)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def create_data_array(audio_dir, text_dir):
    data_array = []
    audio_files = os.listdir(audio_dir)
    for audio_file in audio_files:
        audio_path = os.path.join(audio_dir, audio_file)
        text_file = audio_file.replace('.wav', '.txt')
        text_path = os.path.join(text_dir, text_file)

        if audio_file == 'metadata.csv':
          continue

        if 'truth' in audio_path:
          label = 0
        elif 'lie' in audio_path:
          label =1

        data = {
            'audio_path': audio_path,
            'text_path': text_path,
            'label': label
        }

        data_array.append(data)
    return data_array


In [4]:
audio_dir = '/content/drive/MyDrive/all_audio'
audio_files = os.listdir(audio_dir)

text_dir = '/content/drive/MyDrive/all_text'
text_files = os.listdir(text_dir)


data_ = create_data_array(audio_dir,text_dir)
data_


[{'audio_path': '/content/drive/MyDrive/all_audio/trial_truth_052.wav',
  'text_path': '/content/drive/MyDrive/all_text/trial_truth_052.txt',
  'label': 0},
 {'audio_path': '/content/drive/MyDrive/all_audio/trial_truth_002.wav',
  'text_path': '/content/drive/MyDrive/all_text/trial_truth_002.txt',
  'label': 0},
 {'audio_path': '/content/drive/MyDrive/all_audio/trial_truth_038.wav',
  'text_path': '/content/drive/MyDrive/all_text/trial_truth_038.txt',
  'label': 0},
 {'audio_path': '/content/drive/MyDrive/all_audio/trial_lie_055.wav',
  'text_path': '/content/drive/MyDrive/all_text/trial_lie_055.txt',
  'label': 1},
 {'audio_path': '/content/drive/MyDrive/all_audio/trial_truth_023.wav',
  'text_path': '/content/drive/MyDrive/all_text/trial_truth_023.txt',
  'label': 0},
 {'audio_path': '/content/drive/MyDrive/all_audio/trial_truth_035.wav',
  'text_path': '/content/drive/MyDrive/all_text/trial_truth_035.txt',
  'label': 0},
 {'audio_path': '/content/drive/MyDrive/all_audio/trial_lie_01

# Functions for Text Features

In [6]:
glove_file = "/content/drive/MyDrive/INTERSPEECH /glove.6B.50d.txt"
filler_words1 = ['like','well','actually','basically','honestly','anyway','right','i mean','sort of','kind of','just',\
                      'i guess']
filler_words2 = ['um','uh','hm','ah','so','ok']
first_person_pronounces = ['i','we','me']
third_person_pronounces = ['he','she','they']
informal_words = ['gonna', 'wanna', 'gotta', "ain't", 'gonna', 'dunno', 'kinda', 'sorta',
                     'shoulda', 'coulda', 'woulda', "might've", "must've", 'oughta', 'gonna', 'ya',
                     "ya'll", 'wanna', 'gonna', 'gonna']
formal_prons=['we','they','their','theirs','themselves','us','our','ours','ourselves','it','its','itself']
informal_prons=['I','you','me','my','mine','your','yours','myself','yourself','yourselves']

flatten = lambda l: [item for sublist in l for item in sublist]
nltk_to_sentiwordnet = {
    "NN": "n",
    "VB": "v",
    "JJ": "a",
    "RB": "r",
}

In [7]:
def load_glove_model(glove_file):
    print("Loading GloVe Model")
    with open(glove_file, encoding='utf8') as f:
        content = f.readlines()
    model = {}
    for line in content:
        split_line = line.split()
        word = split_line[0]
        embedding = np.array([float(val) for val in split_line[1:]])
        model[word] = embedding
    print("Done.", len(model), " words loaded!")
    return model

In [8]:
def tokenize_text(text):
    text = text.lower()
    sentences = sent_tokenize(text)
    words = [word_tokenize(sentence) for sentence in sentences]
    return sentences,words

In [9]:
def is_valid_word(word):
    pattern = r'\b' + re.escape(word) + r'\b'
    is_word = re.match(pattern, word, flags=re.IGNORECASE)
    return bool(is_word)

def get_valid_words(text):
    sentences, words = tokenize_text(text)
    wordslist = [element for nestedlist in words for element in nestedlist]
    words = [word for word in wordslist if is_valid_word(word)]
    return words

In [10]:
def extract_linguistic_complexity(text):
    sentences, words = tokenize_text(text)

    if len(sentences)!=0:
        #Compute average words per sentence
        avg_words_per_sent = sum(len(sent.split()) for sent in sentences)/len(sentences)

        #Compute average characters per word
        valid_words = get_valid_words(text)
        avg_characters_per_word = sum(len(word) for word in valid_words)/len(valid_words)

        #Compute average characters per sentence
        avg_characters_per_sent = sum(len([ele for ele in sent if ele.isalpha() or ele.isdigit()]) for sent in sentences)/len(sentences)

        #Compute unique words per total no of words ration
        types = set(valid_words)
        type_token_ratio = len(types)/len(words)
        return avg_words_per_sent,avg_characters_per_word,avg_characters_per_sent, type_token_ratio
    else:
        return 0,0,0,0

In [11]:
def get_word_embeddings(text, glove_model):
    embeddings = []
    words = get_valid_words(text)
    for word in words:
        if word in glove_model:
            embeddings.append(glove_model[word])
    avg_embeddings = np.mean(embeddings, axis=0)
    return np.array(avg_embeddings)

In [12]:
def extract_filler_words(text):
    text = text.lower()
    pattern1 = r'\b(?:%s)\b' % '|'.join(filler_words1)
    pattern2 = r'\b(?:%s)\w*\b' % '|'.join(filler_words2)
    matches1 = re.findall(pattern1, text)
    matches2 = re.findall(pattern2, text)
    filler_words_count = len(matches1) + len(matches2)
    return filler_words_count

In [13]:
def extract_pronounces(text):
    words = get_valid_words(text)
    first_person_count = sum(words.count(item) for item in first_person_pronounces)
    third_person_count = sum(words.count(item) for item in third_person_pronounces)
    return first_person_count, third_person_count

In [14]:
# get sentiment score
def get_sentiment(text):
    pos_scores = []
    neg_scores = []
    subj_scores = []
    sentences, words = tokenize_text(text)
    tagged_sentence_words = flatten(nltk.pos_tag_sents(words))

    for word, pos in tagged_sentence_words:
        swn_pos = nltk_to_sentiwordnet.get(pos[:2], None)
        if swn_pos == None:
            continue
        synsets = list(swn.senti_synsets(word.lower(), pos=swn_pos))

        if len(synsets) == 0:
            continue

        for synset in synsets[:1]:
            pos_scores.append(synset.pos_score())
            neg_scores.append(synset.neg_score())
            subj_scores.append(1 - synset.obj_score())
    if sum(subj_scores) == 0:
        return 0,0,0
    return np.average(pos_scores, weights=subj_scores) , np.average(neg_scores, weights=subj_scores), np.mean(subj_scores)

In [15]:
# get coherence score
def calculate_coherence(text):
    # Tokenize the text into sentences
    sentences, words = tokenize_text(text)

    if len(sentences)>1:
        # Vectorize the sentences
        sentence_vectors = [nlp(sent).vector for sent in sentences]

        # Calculate cosine similarity between consecutive sentence pairs
        similarities = cosine_similarity(sentence_vectors[:-1], sentence_vectors[1:])

        # Calculate average similarity as coherence score
        coherence = similarities.mean()
        return coherence
    else:
        coherence = 0
        num_dependencies = 0
        tokens = nlp(text)
        for token in tokens:
            if token.dep_ != 'punct':  # Ignore punctuation tokens
                num_dependencies += 1
                if token.dep_ != 'ROOT':  # Ignore root token
                    coherence += 1

        coherence_score = coherence / num_dependencies if num_dependencies > 0 else 0
        return coherence_score


In [16]:
def extract_informal_linguistics(text):
    text = text.lower()
    pattern1 = r'\b(?:%s)\b' % '|'.join(informal_words)
    pattern2 = r'\b(?:%s)\b' % '|'.join(informal_prons)
    informal_words_count = len(re.findall(pattern1, text))
    informal_prons_count = len(re.findall(pattern2, text))
    valid_words = get_valid_words(text)
    informal_words_ratio = informal_words_count/len(valid_words)
    informal_prons_ratio = informal_prons_count/len(valid_words)
    return informal_words_ratio, informal_prons_ratio

In [17]:
def extract_features_text(glove_model, text_path = 0, text = 0):
    if text_path:
        with open(text_path, encoding='utf8') as f:
            text = f.readline()

    glove_features_text = get_word_embeddings(text, glove_model)
    linguistic_complexity_features_text = extract_linguistic_complexity(text)
    filler_words_features_text = extract_filler_words(text)
    pronounces_features_text = extract_pronounces(text)
    senti_score1, senti_score2, senti_score3 = get_sentiment(text)
    coherence_score = calculate_coherence(text)
    informal_words_ratio, informal_prons_ratio  = extract_informal_linguistics(text)
    avg_words_per_sent,avg_characters_per_word,avg_characters_per_sent, type_token_ratio = extract_linguistic_complexity(text)
    combined_features_text = np.concatenate((glove_features_text,linguistic_complexity_features_text, filler_words_features_text, \
                                        pronounces_features_text, senti_score1, senti_score2, senti_score3, coherence_score, \
                                        informal_words_ratio, informal_prons_ratio, avg_words_per_sent,avg_characters_per_word,\
                                        avg_characters_per_sent, type_token_ratio ), axis=None)
    return combined_features_text


# Functions for Audio Features

In [18]:
def extract_mfcc(audio_data,sampling_rate):
    mfcc_features_audio = librosa.feature.mfcc(y=audio_data,sr=sampling_rate,n_mfcc=13)
    mfcc_scaled_features_audio = np.mean(mfcc_features_audio.T,axis=0)
    return mfcc_scaled_features_audio

In [19]:
def extract_spectral_centroid(audio_data,sampling_rate):
    spectral_centroids = librosa.feature.spectral_centroid(y=audio_data, sr=sampling_rate)
    spectral_centroids_mean = np.mean(spectral_centroids.T,axis=0)
    return spectral_centroids_mean

In [20]:
def extract_spectral_contrast(audio_data,sampling_rate):
    spectral_contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sampling_rate)
    spectral_contrast_mean = np.mean(spectral_contrast.T,axis=0)
    return spectral_contrast_mean

In [21]:
def extract_spectral_rolloff(audio_data,sampling_rate):
    spectral_rolloff = librosa.feature.spectral_rolloff(y=audio_data, sr=sampling_rate)
    spectral_rolloff_mean = np.mean(spectral_rolloff.T,axis=0)
    return spectral_rolloff_mean


In [22]:
def extract_spectral_bandwidth(audio_data,sampling_rate):
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_data, sr=sampling_rate)
    spectral_bandwidth_mean = np.mean(spectral_bandwidth.T,axis=0)
    return spectral_bandwidth_mean

In [23]:
def extract_zcr(audio_data):
    zcr = librosa.feature.zero_crossing_rate(y=audio_data)
    zcr_mean = np.mean(zcr.T,axis=0)
    return zcr_mean


In [24]:
def extract_rms(audio_data):
    rms = librosa.feature.rms(y=audio_data)
    rms_mean = np.mean(rms.T,axis=0)
    return rms_mean

In [25]:
def extract_pitch(audio_data, sampling_rate):
    pitches = librosa.yin(audio_data, fmin=50, fmax=2000)
    pitches_mean = np.mean(pitches.T, axis=0)
    return pitches_mean


In [26]:
def extract_pause_duration(audio_data,sampling_rate):
    # Calculate the number of speech frames
    speech_frames = librosa.effects.split(audio_data, top_db=20)

    # Calculate the pause duration
    speech_time = [librosa.samples_to_time(frame, sr=sampling_rate) for frame in speech_frames]
    speech_duration = sum(np.subtract(item[1],item[0]) for item in speech_time)

    # Calculate the total duration of the audio signal
    total_duration = librosa.get_duration(y=audio_data, sr=sampling_rate)

    pause_duration = total_duration - speech_duration

    # Calculate the pause ratio
    pause_ratio = pause_duration / total_duration

    return pause_duration

In [27]:
def extract_features_audio(file_name):
    audio_data, sampling_rate = librosa.load(file_name)
    mfcc_features_audio = extract_mfcc(audio_data, sampling_rate)
    spectral_centroid = extract_spectral_centroid(audio_data, sampling_rate)
    spectral_contrast = extract_spectral_contrast(audio_data, sampling_rate)
    spectral_rolloff = extract_spectral_rolloff(audio_data, sampling_rate)
    zcr = extract_zcr(audio_data)
    rms = extract_rms(audio_data)
    pitch = extract_pitch(audio_data, sampling_rate)
    pause_duration = extract_pause_duration(audio_data, sampling_rate)

    combined_features_audio = np.concatenate(
        (mfcc_features_audio, spectral_centroid, spectral_contrast, spectral_rolloff, zcr, rms, pitch, pause_duration),
        axis=None
    )

    return combined_features_audio


# Feature extraction from both

In [28]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

extracted_features = []
extracted_features_text = []
extracted_features_audio = []
glove_model = load_glove_model(glove_file)

for audio in tqdm(data_):
    audio_features = extract_features_audio(audio['audio_path'])
    text_features = extract_features_text(glove_model, text_path = audio['text_path'],)
    combined_features = np.concatenate((audio_features,text_features))
    extracted_features.append([combined_features, audio['label']])
    extracted_features_text.append([text_features, audio['label']])
    extracted_features_audio.append([audio_features, audio['label']])


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading GloVe Model
Done. 400000  words loaded!


100%|██████████| 115/115 [01:23<00:00,  1.39it/s]


In [29]:
print('Shape of extracted_features_audio:\n')
print('number of data: ', len(extracted_features))
print('dimensions of each vectorised audio: ', len(extracted_features[0][0]))

Shape of extracted_features_audio:

number of data:  115
dimensions of each vectorised audio:  93


# Time for Testing

In [30]:
from sklearn.metrics import make_scorer
def evaluate_classifier(extracted_features, classifier_name, params,
                        parameter_mode=0, list_of_ids=None, n_exp=-1,
                        train_percentage=0.80):

    extracted_features_df = pd.DataFrame(extracted_features,columns=['features','label'])
    X = np.array(extracted_features_df['features'].tolist())
    y = np.array(extracted_features_df['label'].tolist())

    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=1-train_percentage,random_state=7)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    for Ci, C in enumerate(params):
        if classifier_name == "svm_rbf":
            classifier = SVC(C = C, kernel='rbf')
            classifier.fit(X_train,y_train)
            Y_pred = classifier.predict(X_test)
            Y_train_pred = classifier.predict(X_train)

        cm_train = confusion_matrix(y_train,Y_train_pred)
        accuracy_train = float(cm_train.diagonal().sum())/len(y_train)

        cm_test = confusion_matrix(y_test,Y_pred)
        accuracy_test = float(cm_test.diagonal().sum())/len(y_test)

        print ("\nc : ",C )

        #print ("confusion matrix for training dataset: ",cm_train)
        print("Accuracy for training dataset: ", accuracy_train)
        print("F1 Score for training dataset: ", f1_score(y_train, Y_train_pred))
        #print ("confusion matrix for testing dataset: ",cm_test)
        print("Accuracy for testing dataset: ", accuracy_test)
        print("F1 Score for testing dataset: ", f1_score(y_test, Y_pred))

        return classifier

def weighted_accuracy(true_labels, predicted_labels):

    # Calculate the size of each class
    class_1_size = sum(true_labels)
    class_0_size = len(true_labels) - class_1_size

    # Calculate the weights based on the proportion of samples in each class
    total_samples = len(true_labels)
    weight_0 = class_0_size / total_samples
    weight_1 = class_1_size / total_samples

    # Compute accuracy
    accuracy = accuracy_score(true_labels, predicted_labels)

    # Compute weighted accuracy
    weighted_accuracy = (accuracy * weight_0) + ((1 - accuracy) * weight_1)

    return weighted_accuracy

def evaluate_classifier_k_fold(extracted_features, classifier_name, params,
                        parameter_mode=0, list_of_ids=None, n_exp=-1,
                        train_percentage=0.80, random_state=7, k=5, return_df = False):
    scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1_score': make_scorer(f1_score, average='weighted')
    }
    extracted_features_df = pd.DataFrame(extracted_features,columns=['features','label'])
    print(extracted_features_df.shape)
    X = np.array(extracted_features_df['features'].tolist())
    y = np.array(extracted_features_df['label'].tolist())
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=1-train_percentage,random_state=7)

    # dataframe to save results
    df = pd.DataFrame()
    df['fold'] = list(range(1,6))

    # Initialize the KFold object
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state = 42)
    # kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # Split your data using the kf object

    for Ci, C in enumerate(params):
        accuracy_ls = []
        F1_score_ls = []
        for fold, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
            X_train_val, X_val = X_train[train_index], X_train[test_index]
            y_train_val, y_val = y_train[train_index], y_train[test_index]
            # scale x features
            scaler = StandardScaler()
            scaler.fit(X_train_val)
            X_train_val = scaler.transform(X_train_val)
            X_val = scaler.transform(X_val)

            # modelling
            classifier = SVC(C = C, kernel='rbf')
            classifier.fit(X_train_val,y_train_val)
            Y_val_pred = classifier.predict(X_val)
            Y_train_pred = classifier.predict(X_train_val)

            # evaluation
            # training set
            train_report = classification_report(y_train_val, Y_train_pred, output_dict=True)
            accuracy_train = train_report['weighted avg']['precision']
            f1_train = train_report['weighted avg']['f1-score']
            # validation set
            val_report = classification_report(y_val, Y_val_pred, output_dict=True)
            accuracy_val = val_report['weighted avg']['precision']
            f1_val = val_report['weighted avg']['f1-score']

            accuracy_ls.append(accuracy_val)
            F1_score_ls.append(f1_val)

        df[str(C) + '_accuracy'] = accuracy_ls
        df[str(C) + '_F1'] = F1_score_ls
        if return_df:
            continue

        # test set
        classifier = SVC(C = C, kernel='rbf')
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        classifier.fit(X_train,y_train)
        X_test = scaler.transform(X_test)
        Y_pred = classifier.predict(X_test)

        print ("\nc : ", C )
        print("Accuracy for training dataset: ", sum(accuracy_ls)/5)
        print("F1 Score for training dataset: ", sum(F1_score_ls)/5)

        test_report = classification_report(y_test, Y_pred, output_dict=True)
        accuracy_test = test_report['weighted avg']['precision']
        f1_test = test_report['weighted avg']['f1-score']

        print("Accuracy for testing dataset: ", accuracy_test)  # accuracy_score(y_test, Y_pred)
        print("F1 Score for testing dataset: ", f1_test)  # f1_score(y_test, Y_pred)

    if return_df:
        return df


In [31]:
trial_clf = evaluate_classifier(extracted_features, 'svm_rbf', np.array([20.0]),
                        train_percentage=0.80)


c :  20.0
Accuracy for training dataset:  1.0
F1 Score for training dataset:  1.0
Accuracy for testing dataset:  0.6956521739130435
F1 Score for testing dataset:  0.6666666666666666


In [32]:
trial_aud = evaluate_classifier_k_fold(extracted_features_audio, 'svm_rbf', np.array([20.0]),
                        train_percentage=0.80)

(115, 2)

c :  20.0
Accuracy for training dataset:  0.7219117140169772
F1 Score for training dataset:  0.7025644920381763
Accuracy for testing dataset:  0.8350586611456177
F1 Score for testing dataset:  0.8240802675585283


In [33]:
df = evaluate_classifier_k_fold(extracted_features, 'svm_rbf', np.array([20.0]),
                        train_percentage=0.80, return_df = True)
df

(115, 2)


Unnamed: 0,fold,20.0_accuracy,20.0_F1
0,1,0.689223,0.678828
1,2,0.423392,0.421053
2,3,0.607504,0.607407
3,4,0.721501,0.719577
4,5,0.675325,0.6625


## Test Trial-SVM on Bag of Lies dataset

In [34]:
metadata = pd.read_csv('/content/drive/MyDrive/Audio-BagofLies/metadata.csv')
audio_dir_bog = '/content/drive/MyDrive/Audio-BagofLies'

output_dir = '/content/drive/MyDrive/text_bag_of_lies'

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Iterate through each row in the DataFrame
for index, row in metadata.iterrows():
    # Extract the text and ID
    text = row['text_x']
    file_name = f"{row['file_name']}.txt"
    file_name = file_name.replace('.wav','')

    # Write the text to a separate text file
    with open(os.path.join(output_dir, file_name), 'w') as file:
        file.write(text)

In [35]:
metadata[metadata['file_name'] == 'run_9_User_28.wav']['label'].reset_index()['label'][0]

0

In [36]:
def create_data_array(audio_dir, text,metadata):
    data_array = []
    audio_files = os.listdir(audio_dir)
    for idx, audio_file in enumerate(audio_files):
        if (audio_file == 'metadata.csv') | (audio_file == '.ipynb_checkpoints'):
            continue

        label = metadata[metadata['file_name'] == audio_file]['label'].reset_index()['label'][0]
        data = {
            'audio_path':  audio_dir + '/' +  audio_file,
            'text_path': text_dir + '/' + audio_file.replace('.wav','.txt'),
            'label': label
        }
        data_array.append(data)
    return data_array

audio_dir = '/content/drive/MyDrive/Audio-BagofLies'
text_dir = '/content/drive/MyDrive/text_bag_of_lies'
data = create_data_array(audio_dir, text_dir,metadata)
print(data)
extracted_features_new = []
extracted_features_text_new = []
extracted_features_audio_new = []
# glove_model = load_glove_model(glove_file)

for audio in tqdm(data):
    audio_features = extract_features_audio(audio['audio_path'])
    text_features = extract_features_text(glove_model, text_path = audio['text_path'],)
    combined_features = np.concatenate((audio_features,text_features))
    extracted_features_new.append([combined_features, audio['label']])
    extracted_features_text_new.append([text_features, audio['label']])
    extracted_features_audio_new.append([audio_features, audio['label']])


[{'audio_path': '/content/drive/MyDrive/Audio-BagofLies/run_8_User_27.wav', 'text_path': '/content/drive/MyDrive/text_bag_of_lies/run_8_User_27.txt', 'label': 1}, {'audio_path': '/content/drive/MyDrive/Audio-BagofLies/run_0_User_20.wav', 'text_path': '/content/drive/MyDrive/text_bag_of_lies/run_0_User_20.txt', 'label': 0}, {'audio_path': '/content/drive/MyDrive/Audio-BagofLies/run_2_User_18.wav', 'text_path': '/content/drive/MyDrive/text_bag_of_lies/run_2_User_18.txt', 'label': 1}, {'audio_path': '/content/drive/MyDrive/Audio-BagofLies/run_8_User_20.wav', 'text_path': '/content/drive/MyDrive/text_bag_of_lies/run_8_User_20.txt', 'label': 0}, {'audio_path': '/content/drive/MyDrive/Audio-BagofLies/run_1_User_18.wav', 'text_path': '/content/drive/MyDrive/text_bag_of_lies/run_1_User_18.txt', 'label': 1}, {'audio_path': '/content/drive/MyDrive/Audio-BagofLies/run_3_User_18.wav', 'text_path': '/content/drive/MyDrive/text_bag_of_lies/run_3_User_18.txt', 'label': 0}, {'audio_path': '/content/dr

100%|██████████| 324/324 [02:06<00:00,  2.57it/s]


In [42]:
bag_of_lies_clf = evaluate_classifier_k_fold(extracted_features_audio_new, 'svm_rbf', np.array([20.0]),
                        train_percentage=0.80)
bag_of_lies_clf = evaluate_classifier(extracted_features_audio_new, 'svm_rbf', np.array([20.0]),
                        train_percentage=0.80)

(324, 2)

c :  20.0
Accuracy for training dataset:  0.5448288971095122
F1 Score for training dataset:  0.542887386845489
Accuracy for testing dataset:  0.5287912087912088
F1 Score for testing dataset:  0.5104948182747044

c :  20.0
Accuracy for training dataset:  0.8996138996138996
F1 Score for training dataset:  0.8907563025210083
Accuracy for testing dataset:  0.5076923076923077
F1 Score for testing dataset:  0.5294117647058824


Train SVM Model on DeceitBank Dataset

In [43]:
def create_data_array(audio_dir, text):
    data_array = []
    audio_files = os.listdir(audio_dir)
    print(len(audio_files))
    print(text)
    for idx, audio_file in enumerate(audio_files):
        if (audio_file == 'metadata.csv') | (audio_file == '.ipynb_checkpoints'):
          continue
        if 'lie' in audio_file:
            label = 1
        else:
            label = 0
        data = {
            'audio_path':  audio_dir + '/' +  audio_file,
            'text': text.loc[idx, 'transcription'],
            'label': label
        }
        data_array.append(data)
    return data_array

text = pd.read_csv('/content/drive/MyDrive/Copy of transcriptions.csv', index_col=0)
audio_dir = '/content/drive/MyDrive/RecordingsWav'

data = create_data_array(audio_dir, text)

extracted_features_new1 = []
extracted_features_text_new1 = []
extracted_features_audio_new1 = []
# glove_model = load_glove_model(glove_file)

for audio in tqdm(data):
    audio_features = extract_features_audio(audio['audio_path'])
    text_features = extract_features_text(glove_model, text = audio['text'])
    combined_features = np.concatenate((audio_features,text_features))
    extracted_features_new1.append([combined_features, audio['label']])
    extracted_features_text_new1.append([text_features, audio['label']])
    extracted_features_audio_new1.append([audio_features, audio['label']])


122
     Unnamed: 0                                      transcription  \
0             0   What is the due date for your next credit car...   
1             1   What is the current interest rate on your sav...   
2             2   What is your date of birth? My date of birth ...   
3             3   can you provide details about your most recen...   
4             4   What is the credit limit on your credit card?...   
..          ...                                                ...   
115         115   How did you hear about our bank or services? ...   
116         116   Can you provide your account number? Yes, my ...   
117         117   Can you provide details about your most recen...   
118         118     What is the credit limits on your credit card?   
119         119   What is the due date for your next credit car...   

                                             file_name  
0    ../data/bankcall2/Ava Thompson_lie_audio107082...  
1    ../data/bankcall2/Ava Thompson_lie_a

100%|██████████| 120/120 [00:46<00:00,  2.59it/s]


In [52]:
deceitbank_clf = evaluate_classifier_k_fold(extracted_features_new1, 'svm_rbf', np.array([10.0]),
                        train_percentage=0.80)
deceitbank_clf = evaluate_classifier(extracted_features_new1, 'svm_rbf', np.array([10.0]),
                        train_percentage=0.80)

(120, 2)

c :  10.0
Accuracy for training dataset:  0.5762575556538405
F1 Score for training dataset:  0.5860671868294374
Accuracy for testing dataset:  0.5833333333333334
F1 Score for testing dataset:  0.5833333333333334

c :  10.0
Accuracy for training dataset:  1.0
F1 Score for training dataset:  1.0
Accuracy for testing dataset:  0.5833333333333334
F1 Score for testing dataset:  0.5454545454545454


Cross -Dataset Evaluations

In [40]:
from sklearn import svm
from sklearn.metrics import f1_score, accuracy_score

def cross_dataset(model,data):
  extracted_features_df = pd.DataFrame(data,columns=['features','label'])
  X_new = np.array(extracted_features_df['features'].tolist())
  y_new = np.array(extracted_features_df['label'].tolist())
  y_pred = model.predict(X_new)

  f1 = f1_score(y_new, y_pred, average='binary')
  print("F1 Score:", f1)

  # Compute the accuracy
  accuracy = accuracy_score(y_new, y_pred)
  print("Accuracy:", accuracy)


In [45]:
cross_dataset(trial_clf,extracted_features_new)

F1 Score: 0.6470588235294118
Accuracy: 0.4782608695652174


In [46]:
cross_dataset(trial_clf,extracted_features_new1)

F1 Score: 0.5542168674698795
Accuracy: 0.38333333333333336


In [47]:
cross_dataset(bag_of_lies_clf,extracted_features_new1)

F1 Score: 0.0
Accuracy: 0.6166666666666667


In [49]:
cross_dataset(bag_of_lies_clf,extracted_features)

F1 Score: 0.0
Accuracy: 0.5030864197530864


In [50]:
cross_dataset(deceitbank_clf,extracted_features)

F1 Score: 0.0
Accuracy: 0.5217391304347826


In [51]:
cross_dataset(deceitbank_clf,extracted_features_new)

F1 Score: 0.0
Accuracy: 0.6166666666666667
