In [31]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

EEG_DATA_DIR = r'C:\Users\User\Documents\Lie detect data\AugmentedEEGData'
POLY_DATA_DIR = r'C:\Users\User\Documents\Lie detect data\CombinedPolyData'
K_FOLDS = 5  # Number of folds for cross-validation

def pad_sequence(sequence, target_length):
    """Pad the sequence to the target length."""
    pad_length = target_length - sequence.shape[1]
    if pad_length > 0:
        return np.pad(sequence, ((0, 0), (0, pad_length)), mode='constant')
    else:
        return sequence[:, :target_length]

def load_eeg_data(data_dir):
    X, y = [], []
    lie_count, truth_count = 0, 0
    file_sample_counts = []
    for file_name in os.listdir(data_dir):
        if file_name.endswith('.pkl'):
            file_path = os.path.join(data_dir, file_name)
            data = pd.read_pickle(file_path)
            print(f"EEG File: {file_name}, Shape: {data.shape}, Type: {type(data)}")
            label = 0 if 'lie' in file_name.lower() else 1
            X.extend(data)
            y.extend([label] * data.shape[0])
            file_sample_counts.append(data.shape[0])
            if label == 0:
                lie_count += data.shape[0]
            else:
                truth_count += data.shape[0]
    print(f"Loaded from EEG {data_dir}: {lie_count} lie samples, {truth_count} truth samples")
    return np.array(X), np.array(y), file_sample_counts

def load_poly_data(data_dir):
    X, y = [], []
    lie_count, truth_count = 0, 0
    max_length = 0
    for file_name in os.listdir(data_dir):
        if file_name.endswith('.pkl'):
            file_path = os.path.join(data_dir, file_name)
            data = pd.read_pickle(file_path)
            print(f"Poly File: {file_name}, Shape: {data.shape}, Type: {type(data)}")
            max_length = max(max_length, data.shape[1])
            label = 0 if 'lie' in file_name.lower() else 1
            X.append(data)
            y.append(label)
            if label == 0:
                lie_count += 1
            else:
                truth_count += 1
    print(f"Loaded from Poly {data_dir}: {lie_count} lie samples, {truth_count} truth samples")
    
    # Pad all poly samples to the maximum length
    X_padded = np.array([pad_sequence(x, max_length) for x in X])
    return X_padded, np.array(y)

class CombinedDataset(Dataset):
    def __init__(self, eeg_X, eeg_y, poly_X, poly_y, file_sample_counts):
        self.eeg_X = torch.tensor(eeg_X, dtype=torch.float32)
        self.eeg_y = torch.tensor(eeg_y, dtype=torch.long)
        self.poly_X = torch.tensor(poly_X, dtype=torch.float32)
        self.poly_y = torch.tensor(poly_y, dtype=torch.long)
        
        # Create a mapping from EEG sample index to Poly file index
        self.eeg_to_poly_map = []
        poly_index = 0
        for count in file_sample_counts:
            self.eeg_to_poly_map.extend([poly_index] * count)
            poly_index += 1

    def __len__(self):
        return len(self.eeg_X)

    def __getitem__(self, idx):
        eeg_sample = self.eeg_X[idx]
        eeg_label = self.eeg_y[idx]
        poly_idx = self.eeg_to_poly_map[idx]
        poly_sample = self.poly_X[poly_idx]
        poly_label = self.poly_y[poly_idx]
        
        return eeg_sample, eeg_label, poly_sample, poly_label

# ... (keep the data loading and preprocessing code)

# Create full dataset
full_dataset = CombinedDataset(eeg_X, eeg_y, poly_X, poly_y, file_sample_counts)

# Print dataset size and shapes
print("Total number of samples:", len(full_dataset))
print("EEG data shape:", eeg_X.shape)
print("Poly data shape:", poly_X.shape)

# Initialize K-Fold
kfold = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)

# Example of how to use the dataset with DataLoader
train_loader = DataLoader(full_dataset, batch_size=32, shuffle=True)

# Print a sample batch and verify alignment
for eeg_batch, eeg_labels, poly_batch, poly_labels in train_loader:
    print("EEG batch shape:", eeg_batch.shape)
    print("EEG labels shape:", eeg_labels.shape)
    print("Poly batch shape:", poly_batch.shape)
    print("Poly labels shape:", poly_labels.shape)
    
    # Verify label alignment
    print("Labels match:", torch.all(eeg_labels == poly_labels))
    
    # Print a few sample indices and their corresponding data
    for i in range(5):  # Print first 5 samples in the batch
        eeg_sample = eeg_batch[i]
        poly_sample = poly_batch[i]
        print(f"Sample {i}:")
        print(f"  EEG label: {eeg_labels[i].item()}, Poly label: {poly_labels[i].item()}")
        print(f"  EEG data (first 5 values): {eeg_sample[0, :5]}")
        print(f"  Poly data (first 5 values): {poly_sample[0, :5]}")
    
    break  # Just print the first batch

Total number of samples: 935
EEG data shape: (935, 65, 125)
Poly data shape: (90, 4, 4475)
EEG batch shape: torch.Size([32, 65, 125])
EEG labels shape: torch.Size([32])
Poly batch shape: torch.Size([32, 4, 4475])
Poly labels shape: torch.Size([32])
Labels match: tensor(True)
Sample 0:
  EEG label: 1, Poly label: 1
  EEG data (first 5 values): tensor([-0.1975, -0.1601, -0.1389, -0.1437, -0.1563])
  Poly data (first 5 values): tensor([-0.1500, -0.1582, -0.1666, -0.1751, -0.1834])
Sample 1:
  EEG label: 0, Poly label: 0
  EEG data (first 5 values): tensor([-0.0554, -0.0628, -0.0727, -0.0840, -0.0956])
  Poly data (first 5 values): tensor([0.0327, 0.0250, 0.0169, 0.0086, 0.0002])
Sample 2:
  EEG label: 0, Poly label: 0
  EEG data (first 5 values): tensor([0.0052, 0.0109, 0.0121, 0.0067, 0.0043])
  Poly data (first 5 values): tensor([0.1210, 0.1152, 0.1088, 0.1020, 0.0951])
Sample 3:
  EEG label: 0, Poly label: 0
  EEG data (first 5 values): tensor([-0.0308, -0.0299, -0.0163,  0.0025,  0.01

In [28]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, SubsetRandomSampler
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

class EEGNet(nn.Module):
    def __init__(self, num_classes=2):
        super(EEGNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, (1, 51), padding='same')
        self.batchnorm1 = nn.BatchNorm2d(16)
        self.depthwiseConv2d = nn.Conv2d(16, 32, (65, 1), groups=16, padding='same')
        self.batchnorm2 = nn.BatchNorm2d(32)
        self.activation = nn.ELU()
        self.pooling = nn.AvgPool2d((1, 4))
        self.dropout = nn.Dropout(0.5)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(32 * 65 * 31, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension
        x = self.conv1(x)
        x = self.batchnorm1(x)
        x = self.depthwiseConv2d(x)
        x = self.batchnorm2(x)
        x = self.activation(x)
        x = self.pooling(x)
        x = self.dropout(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x

class PolygraphNet(nn.Module):
    def __init__(self, num_classes=2):
        super(PolygraphNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, (1, 51), padding='same')
        self.batchnorm1 = nn.BatchNorm2d(16)
        self.depthwiseConv2d = nn.Conv2d(16, 32, (4, 1), groups=16, padding='same')
        self.batchnorm2 = nn.BatchNorm2d(32)
        self.activation = nn.ELU()
        self.pooling = nn.AvgPool2d((1, 4))
        self.dropout = nn.Dropout(0.5)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(32 * 4 * 1118, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension
        x = self.conv1(x)
        x = self.batchnorm1(x)
        x = self.depthwiseConv2d(x)
        x = self.batchnorm2(x)
        x = self.activation(x)
        x = self.pooling(x)
        x = self.dropout(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x

class EnsembleModel(nn.Module):
    def __init__(self, eeg_model, poly_model):
        super(EnsembleModel, self).__init__()
        self.eeg_model = eeg_model
        self.poly_model = poly_model
        self.fc = nn.Linear(4, 2)

    def forward(self, eeg_input, poly_input):
        eeg_output = self.eeg_model(eeg_input)
        poly_output = self.poly_model(poly_input)
        combined_output = torch.cat((eeg_output, poly_output), dim=1)
        output = self.fc(combined_output)
        return output

def evaluate_model(model, data_loader, criterion, device):
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    all_labels, all_predictions = [], []

    with torch.no_grad():
        for eeg_X_batch, eeg_y_batch, poly_X_batch, poly_y_batch in data_loader:
            eeg_X_batch, poly_X_batch = eeg_X_batch.to(device), poly_X_batch.to(device)
            labels = eeg_y_batch.to(device)  # Use EEG labels (they should be the same as Poly labels)
            
            outputs = model(eeg_X_batch, poly_X_batch)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    accuracy = correct / total
    precision = precision_score(all_labels, all_predictions, average='binary', zero_division=1)
    recall = recall_score(all_labels, all_predictions, average='binary', zero_division=1)
    f1 = f1_score(all_labels, all_predictions, average='binary', zero_division=1)
    conf_matrix = confusion_matrix(all_labels, all_predictions, labels=[0, 1])

    return total_loss / len(data_loader), accuracy, precision, recall, f1, conf_matrix

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for eeg_X_batch, eeg_y_batch, poly_X_batch, poly_y_batch in train_loader:
            eeg_X_batch, poly_X_batch = eeg_X_batch.to(device), poly_X_batch.to(device)
            labels = eeg_y_batch.to(device)  # Use EEG labels (they should be the same as Poly labels)
            
            optimizer.zero_grad()
            outputs = model(eeg_X_batch, poly_X_batch)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        train_loss /= len(train_loader)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss}')
        
        # Evaluate on validation set
        val_loss, val_accuracy, val_precision, val_recall, val_f1, val_conf_matrix = evaluate_model(model, val_loader, criterion, device)
        print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
        print(f'Precision: {val_precision}, Recall: {val_recall}, F1-score: {val_f1}')
        
        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = model.state_dict()
    
    return best_model_state

# K-Fold Cross-Validation
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_epochs = 100
results = []

for fold, (train_ids, val_ids) in enumerate(kfold.split(full_dataset)):
    print(f'FOLD {fold}')
    print('--------------------------------')

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = SubsetRandomSampler(train_ids)
    val_subsampler = SubsetRandomSampler(val_ids)

    # Define data loaders for training and validation data in this fold
    train_loader = DataLoader(full_dataset, batch_size=32, sampler=train_subsampler)
    val_loader = DataLoader(full_dataset, batch_size=32, sampler=val_subsampler)

    # Initialize models
    eeg_model = EEGNet(num_classes=2).to(device)
    poly_model = PolygraphNet(num_classes=2).to(device)
    ensemble_model = EnsembleModel(eeg_model, poly_model).to(device)
    criterion = nn.CrossEntropyLoss()
    ensemble_optimizer = optim.Adam(ensemble_model.parameters(), lr=0.001)

    # Train the model
    best_model_state = train_model(ensemble_model, train_loader, val_loader, criterion, ensemble_optimizer, num_epochs, device)

    # Load the best model and evaluate on validation set
    ensemble_model.load_state_dict(best_model_state)
    val_loss, val_accuracy, val_precision, val_recall, val_f1, val_conf_matrix = evaluate_model(ensemble_model, val_loader, criterion, device)
    
    print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print(f'Precision: {val_precision}, Recall: {val_recall}, F1-score: {val_f1}')
    print('Confusion Matrix:')
    print(val_conf_matrix)
    
    results.append({
        'fold': fold,
        'val_loss': val_loss,
        'val_accuracy': val_accuracy,
        'val_precision': val_precision,
        'val_recall': val_recall,
        'val_f1': val_f1
    })

# Print average results
avg_accuracy = np.mean([r['val_accuracy'] for r in results])
avg_precision = np.mean([r['val_precision'] for r in results])
avg_recall = np.mean([r['val_recall'] for r in results])
avg_f1 = np.mean([r['val_f1'] for r in results])

print("\nAverage results across all folds:")
print(f"Accuracy: {avg_accuracy:.4f}")
print(f"Precision: {avg_precision:.4f}")
print(f"Recall: {avg_recall:.4f}")
print(f"F1-score: {avg_f1:.4f}")

FOLD 0
--------------------------------
Epoch 1/100, Loss: 0.5202799517816553
Validation Loss: 0.0633908886811696, Validation Accuracy: 0.9679144385026738
Precision: 0.9824561403508771, Recall: 0.9655172413793104, F1-score: 0.9739130434782609
Epoch 2/100, Loss: 0.07112023739076297
Validation Loss: 0.016409436907755055, Validation Accuracy: 0.9946524064171123
Precision: 0.9914529914529915, Recall: 1.0, F1-score: 0.9957081545064378
Epoch 3/100, Loss: 0.02421794292604318
Validation Loss: 0.011455707989322642, Validation Accuracy: 1.0
Precision: 1.0, Recall: 1.0, F1-score: 1.0
Epoch 4/100, Loss: 0.017079513973537058
Validation Loss: 0.007598093691437195, Validation Accuracy: 0.9946524064171123
Precision: 1.0, Recall: 0.9913793103448276, F1-score: 0.9956709956709957
Epoch 5/100, Loss: 0.009633043012399867
Validation Loss: 0.003353922327126687, Validation Accuracy: 1.0
Precision: 1.0, Recall: 1.0, F1-score: 1.0
Epoch 6/100, Loss: 0.0025871901602840808
Validation Loss: 0.001970864577742759, V