In [3]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold


# Constants
# The data directories default to the original workstation paths but can be
# overridden with environment variables, so the notebook can run elsewhere
# without editing this cell.
EEG_DATA_DIR = os.environ.get(
    'LIE_DETECT_EEG_DIR',
    r'C:\Users\User\Documents\Lie detect data\AugmentedEEGData',
)
POLY_DATA_DIR = os.environ.get(
    'LIE_DETECT_POLY_DIR',
    r'C:\Users\User\Documents\Lie detect data\CombinedPolyData',
)
K_FOLDS = 5  # Number of folds for cross-validation

def pad_sequence(sequence, target_length):
    """Force a 2-D array to have exactly ``target_length`` columns.

    Arrays that are too short are zero-padded on the right of axis 1; arrays
    that are long enough are sliced down to the first ``target_length``
    columns. Axis 0 is never changed.
    """
    current_length = sequence.shape[1]
    if current_length >= target_length:
        # Already long enough: keep only the leading columns.
        return sequence[:, :target_length]
    missing = target_length - current_length
    return np.pad(sequence, ((0, 0), (0, missing)), mode='constant')

def load_eeg_data(data_dir):
    """Load every ``.pkl`` file in ``data_dir`` and stack their samples.

    Files are visited in sorted name order so the returned per-file counts
    have a deterministic order on every platform (``os.listdir`` makes no
    ordering promise). Each file is expected to unpickle to an array-like
    with a ``.shape`` whose first axis indexes samples. A file is labelled
    0 when its name contains ``'lie'`` (case-insensitive) and 1 otherwise;
    every sample inherits its file's label.

    Returns:
        tuple: ``(X, y, file_sample_counts)`` where ``X`` is the array of all
        samples, ``y`` the matching labels, and ``file_sample_counts`` a list
        with the number of samples contributed by each successfully loaded
        file, in load order.

    Files that fail to load are reported on stdout and skipped, so they do
    not appear in ``file_sample_counts``.
    """
    X, y = [], []
    lie_count, truth_count = 0, 0
    file_sample_counts = []
    for file_name in sorted(os.listdir(data_dir)):
        if not file_name.endswith('.pkl'):
            continue
        file_path = os.path.join(data_dir, file_name)
        try:
            # SECURITY: unpickling executes arbitrary code; only point this at
            # trusted data directories.
            data = pd.read_pickle(file_path)
            print(f"EEG File: {file_name}, Shape: {data.shape}, Type: {type(data)}")
            n_samples = data.shape[0]
            label = 0 if 'lie' in file_name.lower() else 1
            X.extend(data)
            y.extend([label] * n_samples)
            file_sample_counts.append(n_samples)
            if label == 0:
                lie_count += n_samples
            else:
                truth_count += n_samples
        except Exception as e:
            # Best-effort: report and continue with the remaining files.
            print(f"Error loading EEG file {file_name}: {str(e)}")
    print(f"Loaded from EEG {data_dir}: {lie_count} lie samples, {truth_count} truth samples")
    return np.array(X), np.array(y), file_sample_counts

def load_poly_data(data_dir):
    """Load every ``.pkl`` file in ``data_dir`` as one polygraph recording.

    Files are visited in sorted name order so that recording ``i`` here lines
    up with the ``i``-th file of a directory listed the same way
    (``os.listdir`` makes no ordering promise). Each file is expected to
    unpickle to a 2-D array-like; axis 1 is the one that gets padded. A file
    is labelled 0 when its name contains ``'lie'`` (case-insensitive) and 1
    otherwise.

    Returns:
        tuple: ``(X_padded, y)`` where ``X_padded`` stacks all recordings
        after padding axis 1 with zeros up to the longest recording, and
        ``y`` holds one label per recording.

    Files that fail to load are reported on stdout and skipped.
    """
    X, y = [], []
    lie_count, truth_count = 0, 0
    max_length = 0
    for file_name in sorted(os.listdir(data_dir)):
        if not file_name.endswith('.pkl'):
            continue
        file_path = os.path.join(data_dir, file_name)
        try:
            # SECURITY: unpickling executes arbitrary code; only point this at
            # trusted data directories.
            data = pd.read_pickle(file_path)
            print(f"Poly File: {file_name}, Shape: {data.shape}, Type: {type(data)}")
            max_length = max(max_length, data.shape[1])
            label = 0 if 'lie' in file_name.lower() else 1
            X.append(data)
            y.append(label)
            if label == 0:
                lie_count += 1
            else:
                truth_count += 1
        except Exception as e:
            # Best-effort: report and continue with the remaining files.
            print(f"Error loading Poly file {file_name}: {str(e)}")
    print(f"Loaded from Poly {data_dir}: {lie_count} lie samples, {truth_count} truth samples")

    # Pad all poly samples to the maximum length so they stack into one array
    X_padded = np.array([pad_sequence(x, max_length) for x in X])
    return X_padded, np.array(y)

class CombinedDataset(Dataset):
    """Pairs every EEG sample with the polygraph recording of its source file.

    EEG samples are assumed to be laid out file after file, with
    ``file_sample_counts[i]`` consecutive samples belonging to file ``i``;
    every one of those samples is paired with ``poly_X[i]``.

    Args:
        eeg_X: EEG samples, converted to a float32 tensor.
        eeg_y: One label per EEG sample, converted to a long tensor.
        poly_X: One polygraph recording per file, converted to float32.
        poly_y: One label per polygraph recording, converted to long.
        file_sample_counts: Number of EEG samples contributed by each file,
            in the same order as ``poly_X``.

    Raises:
        ValueError: if the counts do not add up to the number of EEG samples
            or reference more files than ``poly_X`` contains.
    """

    def __init__(self, eeg_X, eeg_y, poly_X, poly_y, file_sample_counts):
        self.eeg_X = torch.tensor(eeg_X, dtype=torch.float32)
        self.eeg_y = torch.tensor(eeg_y, dtype=torch.long)
        self.poly_X = torch.tensor(poly_X, dtype=torch.float32)
        self.poly_y = torch.tensor(poly_y, dtype=torch.long)

        # Map each EEG sample index to the index of its Poly recording.
        self.eeg_to_poly_map = [
            poly_index
            for poly_index, count in enumerate(file_sample_counts)
            for _ in range(count)
        ]

        # Fail early on inconsistent inputs instead of raising IndexError (or
        # silently mispairing modalities) deep inside a DataLoader worker.
        if len(self.eeg_to_poly_map) != len(self.eeg_X):
            raise ValueError(
                f"file_sample_counts sum to {len(self.eeg_to_poly_map)} "
                f"but there are {len(self.eeg_X)} EEG samples"
            )
        if len(file_sample_counts) > len(self.poly_X):
            raise ValueError(
                f"file_sample_counts describes {len(file_sample_counts)} files "
                f"but only {len(self.poly_X)} Poly recordings were given"
            )

    def __len__(self):
        """Return the number of EEG samples (one dataset item per EEG sample)."""
        return len(self.eeg_X)

    def __getitem__(self, idx):
        """Return ``(eeg_sample, eeg_label, poly_sample, poly_label)`` for ``idx``."""
        poly_idx = self.eeg_to_poly_map[idx]
        return (
            self.eeg_X[idx],
            self.eeg_y[idx],
            self.poly_X[poly_idx],
            self.poly_y[poly_idx],
        )

def create_file_based_splits(file_sample_counts, n_splits=5):
    """Build cross-validation folds over files rather than over samples.

    Every file is its own group for ``GroupKFold``, so a file's index ends up
    in exactly one of the train/validation arrays of a fold. Only the number
    of entries in ``file_sample_counts`` is used; the counts themselves are
    not.

    Returns:
        list: ``n_splits`` pairs ``(train_file_indices, val_file_indices)``.
    """
    file_ids = np.arange(len(file_sample_counts))
    splitter = GroupKFold(n_splits=n_splits)
    folds = [
        (train_ids, val_ids)
        for train_ids, val_ids in splitter.split(X=file_ids, groups=file_ids)
    ]
    return folds

# Read both modalities from disk
print("Loading EEG data...")
eeg_X, eeg_y, file_sample_counts = load_eeg_data(EEG_DATA_DIR)
print("Loading Poly data...")
poly_X, poly_y = load_poly_data(POLY_DATA_DIR)

# Report how many items fall into each class, per modality
unique, counts = np.unique(eeg_y, return_counts=True)
eeg_class_distribution = dict(zip(unique, counts))
print("EEG Class distribution:", eeg_class_distribution)
unique, counts = np.unique(poly_y, return_counts=True)
poly_class_distribution = dict(zip(unique, counts))
print("Poly Class distribution:", poly_class_distribution)

# Report the dimensions of the stacked arrays
print("EEG data shape:", np.shape(eeg_X))
print("Poly data shape:", np.shape(poly_X))



Loading EEG data...
EEG File: augmented_lie_1.pkl, Shape: (6, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_10.pkl, Shape: (9, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_11.pkl, Shape: (5, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_12.pkl, Shape: (6, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_13.pkl, Shape: (6, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_14.pkl, Shape: (6, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_15.pkl, Shape: (5, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_16.pkl, Shape: (9, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_17.pkl, Shape: (10, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_18.pkl, Shape: (10, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_19.pkl, Shape: (11, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_2.pkl, Shape: (5, 65, 125), Type: <class 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, SubsetRandomSampler
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
from sklearn.preprocessing import StandardScaler

num_epochs = 50  # Number of training epochs passed to train_model for each fold

class EEGNet(nn.Module):
    """Compact convolutional classifier for ``(batch, channels, samples)`` input.

    Pipeline: temporal conv (1x51, 'same') -> batch norm -> grouped spatial
    conv (num_channels x 1, 'same') -> batch norm -> ELU -> average pool over
    time (width 4) -> dropout -> flatten -> linear classifier.

    Because both convolutions use ``padding='same'``, the feature map keeps
    the input's channel/time extent until pooling, so the classifier sees
    ``32 * num_channels * (num_samples // 4)`` features.

    Args:
        num_classes: Number of output logits.
        num_channels: Size of the channel axis of the input (default 65).
        num_samples: Size of the time axis of the input (default 125).

    Raises:
        ValueError: if the input would be empty after pooling.
    """

    def __init__(self, num_classes=2, num_channels=65, num_samples=125):
        super(EEGNet, self).__init__()
        pool_width = 4
        pooled_samples = num_samples // pool_width
        if num_channels < 1 or pooled_samples < 1:
            raise ValueError(
                f"num_channels must be >= 1 and num_samples >= {pool_width}; "
                f"got num_channels={num_channels}, num_samples={num_samples}"
            )
        self.conv1 = nn.Conv2d(1, 16, (1, 51), padding='same')
        self.batchnorm1 = nn.BatchNorm2d(16)
        # groups=16: each temporal filter gets its own pair of spatial filters.
        self.depthwiseConv2d = nn.Conv2d(16, 32, (num_channels, 1), groups=16, padding='same')
        self.batchnorm2 = nn.BatchNorm2d(32)
        self.activation = nn.ELU()
        self.pooling = nn.AvgPool2d((1, pool_width))
        self.dropout = nn.Dropout(0.5)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(32 * num_channels * pooled_samples, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = x.unsqueeze(1)  # Add the single input-feature-map dimension
        x = self.conv1(x)
        x = self.batchnorm1(x)
        x = self.depthwiseConv2d(x)
        x = self.batchnorm2(x)
        x = self.activation(x)
        x = self.pooling(x)
        x = self.dropout(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x

class PolygraphNet(nn.Module):
    """Compact convolutional classifier for ``(batch, channels, samples)`` input.

    Same layer stack as the EEG network in this notebook: temporal conv
    (1x51, 'same') -> batch norm -> grouped spatial conv (num_channels x 1,
    'same') -> batch norm -> ELU -> average pool over time (width 4) ->
    dropout -> flatten -> linear classifier.

    The classifier sees ``32 * num_channels * (num_samples // 4)`` features.
    The defaults reproduce the original fixed layer size of ``32 * 4 * 1118``.

    Args:
        num_classes: Number of output logits.
        num_channels: Size of the channel axis of the input (default 4).
        num_samples: Size of the time axis of the input. Any value in
            4472..4475 pools to 1118 steps; the default is 4472.
            NOTE(review): the real padded Poly length is not visible here —
            pass the actual ``poly_X.shape[-1]`` if it differs.

    Raises:
        ValueError: if the input would be empty after pooling.
    """

    def __init__(self, num_classes=2, num_channels=4, num_samples=4472):
        super(PolygraphNet, self).__init__()
        pool_width = 4
        pooled_samples = num_samples // pool_width
        if num_channels < 1 or pooled_samples < 1:
            raise ValueError(
                f"num_channels must be >= 1 and num_samples >= {pool_width}; "
                f"got num_channels={num_channels}, num_samples={num_samples}"
            )
        self.conv1 = nn.Conv2d(1, 16, (1, 51), padding='same')
        self.batchnorm1 = nn.BatchNorm2d(16)
        # groups=16: each temporal filter gets its own pair of spatial filters.
        self.depthwiseConv2d = nn.Conv2d(16, 32, (num_channels, 1), groups=16, padding='same')
        self.batchnorm2 = nn.BatchNorm2d(32)
        self.activation = nn.ELU()
        self.pooling = nn.AvgPool2d((1, pool_width))
        self.dropout = nn.Dropout(0.5)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(32 * num_channels * pooled_samples, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = x.unsqueeze(1)  # Add the single input-feature-map dimension
        x = self.conv1(x)
        x = self.batchnorm1(x)
        x = self.depthwiseConv2d(x)
        x = self.batchnorm2(x)
        x = self.activation(x)
        x = self.pooling(x)
        x = self.dropout(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x

class EnsembleModel(nn.Module):
    """Late-fusion head over an EEG model and a polygraph model.

    Each sub-model is run on its own input, their outputs are concatenated
    along dim 1, and a single linear layer maps the result to the final
    logits.

    Args:
        eeg_model: Module applied to ``eeg_input``.
        poly_model: Module applied to ``poly_input``.
        num_classes: Number of logits each sub-model emits and the ensemble
            returns. The fusion layer is ``Linear(2 * num_classes,
            num_classes)``; the default of 2 gives ``Linear(4, 2)``.
    """

    def __init__(self, eeg_model, poly_model, num_classes=2):
        super(EnsembleModel, self).__init__()
        self.eeg_model = eeg_model
        self.poly_model = poly_model
        self.fc = nn.Linear(2 * num_classes, num_classes)

    def forward(self, eeg_input, poly_input):
        """Return fused logits of shape ``(batch, num_classes)``."""
        eeg_output = self.eeg_model(eeg_input)
        poly_output = self.poly_model(poly_input)
        combined_output = torch.cat((eeg_output, poly_output), dim=1)
        return self.fc(combined_output)

def evaluate_model(model, data_loader, criterion, device):
    """Run ``model`` over ``data_loader`` without gradients and score it.

    The loader must yield ``(eeg_X, eeg_y, poly_X, poly_y)`` batches. Only
    the EEG labels are used as targets; the Poly labels are ignored. The
    model is switched to eval mode and left that way.

    Returns:
        tuple: ``(mean_batch_loss, accuracy, precision, recall, f1,
        confusion_matrix)``. The loss is the mean of the per-batch losses;
        precision/recall/F1 use binary averaging with ``zero_division=1``;
        the confusion matrix is laid out for labels ``[0, 1]``.
    """
    model.eval()
    running_loss = 0.0
    n_correct = 0
    n_seen = 0
    y_true, y_pred = [], []

    with torch.no_grad():
        for eeg_batch, eeg_targets, poly_batch, _poly_targets in data_loader:
            eeg_batch = eeg_batch.to(device)
            poly_batch = poly_batch.to(device)
            # EEG labels serve as the target for the combined model.
            targets = eeg_targets.to(device)

            logits = model(eeg_batch, poly_batch)
            running_loss += criterion(logits, targets).item()

            batch_pred = torch.max(logits, 1)[1]
            n_seen += targets.size(0)
            n_correct += (batch_pred == targets).sum().item()

            y_true.extend(targets.cpu().numpy())
            y_pred.extend(batch_pred.cpu().numpy())

    accuracy = n_correct / n_seen
    metric_options = dict(average='binary', zero_division=1)
    precision = precision_score(y_true, y_pred, **metric_options)
    recall = recall_score(y_true, y_pred, **metric_options)
    f1 = f1_score(y_true, y_pred, **metric_options)
    conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    mean_batch_loss = running_loss / len(data_loader)

    return mean_batch_loss, accuracy, precision, recall, f1, conf_matrix

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device, save_path):
    """Train ``model`` and checkpoint the epoch with the lowest validation loss.

    Both loaders must yield ``(eeg_X, eeg_y, poly_X, poly_y)`` batches; the
    EEG labels are used as targets. After every epoch the model is scored
    with ``evaluate_model`` on ``val_loader`` and, if the validation loss
    improved, a checkpoint (model/optimizer state plus validation metrics) is
    written to ``save_path``.

    The first completed epoch is always checkpointed, even if its validation
    loss is NaN/inf, so ``save_path`` never points at a missing file or at a
    stale checkpoint from an earlier run once training has run.

    Returns:
        The ``save_path`` that was passed in.
    """
    best_val_loss = float('inf')
    checkpoint_written = False
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for eeg_X_batch, eeg_y_batch, poly_X_batch, _poly_y_batch in train_loader:
            eeg_X_batch, poly_X_batch = eeg_X_batch.to(device), poly_X_batch.to(device)
            labels = eeg_y_batch.to(device)  # Use EEG labels (they should be the same as Poly labels)

            optimizer.zero_grad()
            outputs = model(eeg_X_batch, poly_X_batch)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss}')

        # Evaluate on validation set
        val_loss, val_accuracy, val_precision, val_recall, val_f1, val_conf_matrix = evaluate_model(model, val_loader, criterion, device)
        print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
        print(f'Precision: {val_precision}, Recall: {val_recall}, F1-score: {val_f1}')

        # Save the best model. A NaN loss never compares as "less than", so the
        # first epoch is saved unconditionally to guarantee a fresh checkpoint.
        improved = val_loss < best_val_loss
        if improved:
            best_val_loss = val_loss
        if improved or not checkpoint_written:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                # Plain floats keep the checkpoint free of NumPy scalar objects.
                'val_loss': float(val_loss),
                'val_accuracy': float(val_accuracy),
                'val_precision': float(val_precision),
                'val_recall': float(val_recall),
                'val_f1': float(val_f1),
            }, save_path)
            checkpoint_written = True
            print(f"Model saved to {save_path}")

    return save_path

# Load data
print("Loading EEG data...")
eeg_X, eeg_y, file_sample_counts = load_eeg_data(EEG_DATA_DIR)
print("Loading Poly data...")
poly_X, poly_y = load_poly_data(POLY_DATA_DIR)

# Fold i of EEG samples is paired with Poly recording i, so both modalities
# must have loaded the same number of files.
if len(file_sample_counts) != len(poly_X):
    raise ValueError(
        f"Loaded {len(file_sample_counts)} EEG files but {len(poly_X)} Poly files; "
        "the modalities cannot be paired by file index"
    )

# Defined here so the cell runs on a fresh kernel; `results` is reset on every
# run so re-executing the cell does not mix in metrics from earlier runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
results = []


def _scale_fold(train_X, val_X):
    """Standardize train/val arrays with statistics fitted on train only.

    Each array is flattened to 2-D with its last axis as the feature axis,
    scaled, and reshaped back. Returns ``(train_scaled, val_scaled, scaler)``.
    """
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(
        train_X.reshape(-1, train_X.shape[-1])
    ).reshape(train_X.shape)
    val_scaled = scaler.transform(
        val_X.reshape(-1, val_X.shape[-1])
    ).reshape(val_X.shape)
    return train_scaled, val_scaled, scaler


# Create splits based on file indices
splits = create_file_based_splits(file_sample_counts, n_splits=K_FOLDS)

# file_offsets[i] is the index of file i's first EEG sample; file_offsets[i+1]
# is one past its last. Computed once instead of re-summing per file.
file_offsets = np.concatenate(([0], np.cumsum(file_sample_counts))).astype(int)

for fold, (train_file_indices, val_file_indices) in enumerate(splits):
    print(f'FOLD {fold}')
    print('--------------------------------')

    # Split data based on file indices
    train_sample_counts = [file_sample_counts[i] for i in train_file_indices]
    val_sample_counts = [file_sample_counts[i] for i in val_file_indices]

    train_eeg_indices = np.concatenate([np.arange(file_offsets[i], file_offsets[i + 1])
                                        for i in train_file_indices])
    val_eeg_indices = np.concatenate([np.arange(file_offsets[i], file_offsets[i + 1])
                                      for i in val_file_indices])

    # Split EEG data
    train_eeg_X, train_eeg_y = eeg_X[train_eeg_indices], eeg_y[train_eeg_indices]
    val_eeg_X, val_eeg_y = eeg_X[val_eeg_indices], eeg_y[val_eeg_indices]

    # Split Poly data
    train_poly_X, train_poly_y = poly_X[train_file_indices], poly_y[train_file_indices]
    val_poly_X, val_poly_y = poly_X[val_file_indices], poly_y[val_file_indices]

    # Normalize data for this fold (scalers are fitted on the training part only)
    train_eeg_X_scaled, val_eeg_X_scaled, eeg_scaler = _scale_fold(train_eeg_X, val_eeg_X)
    train_poly_X_scaled, val_poly_X_scaled, poly_scaler = _scale_fold(train_poly_X, val_poly_X)

    # Create datasets for this fold
    train_dataset = CombinedDataset(train_eeg_X_scaled, train_eeg_y, train_poly_X_scaled, train_poly_y, train_sample_counts)
    val_dataset = CombinedDataset(val_eeg_X_scaled, val_eeg_y, val_poly_X_scaled, val_poly_y, val_sample_counts)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

    # Initialize fresh models for every fold
    eeg_model = EEGNet(num_classes=2).to(device)
    poly_model = PolygraphNet(num_classes=2).to(device)
    ensemble_model = EnsembleModel(eeg_model, poly_model).to(device)
    criterion = nn.CrossEntropyLoss()
    ensemble_optimizer = optim.Adam(ensemble_model.parameters(), lr=0.0001)

    # Train the model and save it
    save_path = f'ensemble_model_fold_{fold}.pth'
    best_model_path = train_model(ensemble_model, train_loader, val_loader, criterion, ensemble_optimizer, num_epochs, device, save_path)

    # Load the best model and evaluate on validation set.
    # map_location lets a checkpoint written on GPU be restored on CPU.
    checkpoint = torch.load(best_model_path, map_location=device)
    ensemble_model.load_state_dict(checkpoint['model_state_dict'])
    val_loss, val_accuracy, val_precision, val_recall, val_f1, val_conf_matrix = evaluate_model(ensemble_model, val_loader, criterion, device)

    print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    print(f'Precision: {val_precision}, Recall: {val_recall}, F1-score: {val_f1}')
    print('Confusion Matrix:')
    print(val_conf_matrix)

    results.append({
        'fold': fold,
        'val_loss': val_loss,
        'val_accuracy': val_accuracy,
        'val_precision': val_precision,
        'val_recall': val_recall,
        'val_f1': val_f1
    })

# Print average results
avg_accuracy = np.mean([r['val_accuracy'] for r in results])
avg_precision = np.mean([r['val_precision'] for r in results])
avg_recall = np.mean([r['val_recall'] for r in results])
avg_f1 = np.mean([r['val_f1'] for r in results])

print("\nAverage results across all folds:")
print(f"Accuracy: {avg_accuracy:.4f}")
print(f"Precision: {avg_precision:.4f}")
print(f"Recall: {avg_recall:.4f}")
print(f"F1-score: {avg_f1:.4f}")

# Save the final model. NOTE: `ensemble_model` is the best checkpoint of the
# LAST fold only; the stored averages describe all folds, not this one model.
# (You can choose to save the model from the best fold instead.)
final_model_path = 'final_ensemble_model.pth'
torch.save({
    'model_state_dict': ensemble_model.state_dict(),
    'avg_accuracy': float(avg_accuracy),
    'avg_precision': float(avg_precision),
    'avg_recall': float(avg_recall),
    'avg_f1': float(avg_f1),
}, final_model_path)
print(f"Final model saved to {final_model_path}")

Loading EEG data...
EEG File: augmented_lie_1.pkl, Shape: (6, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_10.pkl, Shape: (9, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_11.pkl, Shape: (5, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_12.pkl, Shape: (6, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_13.pkl, Shape: (6, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_14.pkl, Shape: (6, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_15.pkl, Shape: (5, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_16.pkl, Shape: (9, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_17.pkl, Shape: (10, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_18.pkl, Shape: (10, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_19.pkl, Shape: (11, 65, 125), Type: <class 'numpy.ndarray'>
EEG File: augmented_lie_2.pkl, Shape: (5, 65, 125), Type: <class 