In [34]:
import pandas as pd
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import os

In [9]:
# Correct the re-filtering by reloading the file and excluding the correct columns only
df = pd.read_csv('data-files/transcript_data_w_lies_and_LDAtopics.csv').drop(columns=['Unnamed: 0', 'YouTube URL', 'Chunk Number', 'Video ID', 
                                                   'Speech ID', 'Speaker', 'Middle Class / American Dream', 
                                                   'National Security / Immigration', 'Appreciation', 
                                                   'Lives at Stake', 'Economic Growth / Job Creation'])


In [11]:
sentiment_df = pd.read_csv('data-files/individual_sentiment_analysis.csv')

In [None]:
# Identify the max sentiment for each row and assign the respective label ('neg', 'neu', 'pos')
sentiment_df['sentiment'] = sentiment_df[['neg', 'neu', 'pos']].idxmax(axis=1)

# Select only 'Chunk Filename' and 'sentiment' columns for merging
sentiment_column = sentiment_df[['Chunk Filename', 'sentiment']]

# Merge this sentiment column with the previously filtered DataFrame on 'Chunk Filename'
df_with_sentiment = df.merge(sentiment_column, on='Chunk Filename', how='left')

# Move the 'lie' column to the end
df_with_sentiment = df_with_sentiment[[col for col in df_with_sentiment.columns if col != 'lie'] + ['lie']]


In [32]:
# Modify the file paths in the 'Chunk Filename' column by replacing the old path with 'audio-files/'
def update_file_paths(df):
    df['Chunk Filename'] = df['Chunk Filename'].apply(
        lambda x: x.replace("/Users/milanvaghani/Desktop/Unstructed Machine Learning/", "audio-files/")
        if x.startswith("/Users/milanvaghani/Desktop/Unstructed Machine Learning/VP") else x
    )
    return df
# Re-apply the process to filter out rows where the audio file does not exist
def file_exists(row):
    return os.path.isfile(row['Chunk Filename'])

In [35]:
df_with_sentiment = update_file_paths(df_with_sentiment)
# Filter out rows where the file does not exist
df_with_sentiment = df_with_sentiment[df_with_sentiment.apply(file_exists, axis=1)]

In [36]:
combined_file_path = 'data-files/combined.csv'
df_with_sentiment.to_csv(combined_file_path, index=False)

In [41]:
def extract_audio_features(file_name):
    try:
        audio_data, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40)
        mfccs_scaled = np.mean(mfccs.T, axis=0)
        mfccs_scaled_std = np.std(mfccs.T, axis=0)
        features = np.hstack((mfccs_scaled, mfccs_scaled_std))
        return features
    except Exception as e:
        print(f"Error encountered while parsing file: {file_name}. Exception: {e}")
        return np.zeros(80)

def main():
    # Load the combined data
    combined_df = pd.read_csv('data-files/combined.csv')

    # Extract features and labels
    audio_features = []
    labels = []

    for index, row in combined_df.iterrows():
        audio_path = row['Chunk Filename']
        label = row['lie']
        
        # Extract audio features
        audio_data = extract_audio_features(audio_path)
        
        # Collect other features and convert to float
        other_data = pd.to_numeric(row.drop(['Chunk Filename', 'Transcript', 'lie']), errors='coerce').values.astype(float)
        
        # Combine features and append
        combined_data = np.hstack((audio_data, other_data))
        audio_features.append(combined_data)
        labels.append(label)

    # Convert lists to arrays
    X = np.array(audio_features, dtype=float)
    y = np.array([int(label) for label in labels])

    # Handle any NaNs in case of remaining issues
    X = np.nan_to_num(X)

    # Split the dataset
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scale the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Build the neural network model with class weight adjustment
    model = Sequential([
        Dense(256, input_shape=(X_train.shape[1],), activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile with class weights to emphasize "True" class
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )

    # Set class weights
    class_weights = {0: 1, 1: 30} 

    # Train the model with class weights
    history = model.fit(
        X_train,
        y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_val, y_val),
        class_weight=class_weights,
        verbose=1
    )

    # Evaluate the model
    y_pred_prob = model.predict(X_val).flatten()
    y_pred = (y_pred_prob > 0.5).astype(int)

    print("Classification Report:")
    print(classification_report(y_val, y_pred, target_names=['False', 'True']))
    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_pred))

    # Predict on test data
    test_features = []
    for index, row in combined_df.iterrows():
        audio_path = row['Chunk Filename']
        audio_data = extract_audio_features(audio_path)
        other_data = pd.to_numeric(row.drop(['Chunk Filename', 'Transcript', 'lie']), errors='coerce').values.astype(float)
        combined_data = np.hstack((audio_data, other_data))
        test_features.append(combined_data)

    X_test = np.array(test_features, dtype=float)
    X_test = np.nan_to_num(X_test)
    X_test = scaler.transform(X_test)

    test_predictions_prob = model.predict(X_test).flatten()
    test_predictions = (test_predictions_prob > 0.5).astype(int)

    # Map numeric predictions back to boolean values
    test_labels = [bool(pred) for pred in test_predictions]

    # Save predictions
    combined_df['predicted_label'] = test_labels
    combined_df['probability_true'] = test_predictions_prob
    combined_df.to_csv('test_predictions_combined.csv', index=False)
    print("Predictions saved to 'test_predictions_combined.csv'.")

if __name__ == "__main__":
    main()

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6638 - loss: 3.2848 - val_accuracy: 0.3143 - val_loss: 0.8141
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3709 - loss: 1.9309 - val_accuracy: 0.2381 - val_loss: 1.2187
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2742 - loss: 1.7156 - val_accuracy: 0.3714 - val_loss: 1.1008
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3591 - loss: 1.8489 - val_accuracy: 0.4095 - val_loss: 1.0793
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3591 - loss: 1.6518 - val_accuracy: 0.4190 - val_loss: 0.9398
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4880 - loss: 1.4692 - val_accuracy: 0.4952 - val_loss: 0.9131
Epoch 7/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━