# Data Processing

In [7]:
import os
import pickle
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold

# Directory that holds the pickled input files. This is an absolute,
# machine-specific Windows path, so it has to be edited before the code can
# run on another machine.
# NOTE(review): the folder name mentions "npy", but load_data() only reads
# files ending in ".pkl" — confirm this is the intended directory.
data_path = r"C:\Users\User\Documents\Lie detect data\npy 데이터"

# Function to load data
def load_data(data_path):
    """Load every ``.pkl`` file found directly inside ``data_path``.

    Args:
        data_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        A dict mapping each file name (not the full path) to the object
        unpickled from that file. Entries whose name does not end with
        ``.pkl`` are ignored.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so this must
    only be pointed at trusted files.
    """
    loaded = {}
    pickle_names = (entry for entry in os.listdir(data_path) if entry.endswith('.pkl'))
    for entry in pickle_names:
        full_path = os.path.join(data_path, entry)
        with open(full_path, 'rb') as handle:
            loaded[entry] = pickle.load(handle)
    return loaded

# Function to extract subject IDs
def extract_subject_ids(data):
    """Group the subject IDs encoded in the file names into truth and lie.

    The subject ID is read from the underscore-separated file name: the
    fifth field when the name has five or more fields, the fourth field when
    it has exactly four. Names with fewer fields are skipped. Any ``.pkl``
    text is stripped from the chosen field.

    A name counts as "truth" when it contains the substring ``'truth'``;
    otherwise it counts as "lie" when it contains ``'lie'``. Names with
    neither substring contribute nothing.

    Args:
        data: Mapping whose keys are file names (values are not inspected).

    Returns:
        A tuple ``(truth_subjects, lie_subjects)`` of two lists. A subject
        that appears in any lie file is reported only in ``lie_subjects``.
        Both lists are sorted so the result does not depend on set iteration
        order, which varies between interpreter runs and would otherwise
        make a seeded ``random.sample`` over these lists irreproducible.
    """
    truth_ids = set()
    lie_ids = set()

    for name in data:
        parts = name.split('_')
        if len(parts) >= 5:
            subject_id = parts[4].replace('.pkl', '')
        elif len(parts) == 4:
            subject_id = parts[3].replace('.pkl', '')
        else:
            # Too few fields to contain a subject ID.
            continue

        if 'truth' in name:
            truth_ids.add(subject_id)
        elif 'lie' in name:
            lie_ids.add(subject_id)

    # Subjects with at least one lie recording are treated as lie subjects,
    # so they are removed from the truth group.
    return sorted(truth_ids - lie_ids), sorted(lie_ids)

# Function to randomly select 4 subjects (2 truth, 2 lie)
def select_random_subjects(truth_subjects, lie_subjects, num_each=2):
    """Randomly hold out ``num_each`` subjects from each group for testing.

    Args:
        truth_subjects: Sequence of truth subject IDs.
        lie_subjects: Sequence of lie subject IDs.
        num_each: How many subjects to draw from each group.

    Returns:
        A 4-tuple ``(test_truth, test_lie, train_truth, train_lie)``. The
        test lists come from ``random.sample``; the train lists keep the
        remaining subjects in their original order.

    Raises:
        ValueError: Propagated from ``random.sample`` when a group has fewer
            than ``num_each`` subjects.
    """
    # Draw truth first, then lie, so the random stream is consumed in a
    # fixed order.
    held_out_truth = random.sample(truth_subjects, num_each)
    held_out_lie = random.sample(lie_subjects, num_each)

    remaining_truth = list(filter(lambda sid: sid not in held_out_truth, truth_subjects))
    remaining_lie = list(filter(lambda sid: sid not in held_out_lie, lie_subjects))

    return held_out_truth, held_out_lie, remaining_truth, remaining_lie

# Load the data
# Read every pickled file in the data directory, keyed by file name.
data = load_data(data_path)

# Derive the truth / lie subject ID lists from the file names.
truth_subjects, lie_subjects = extract_subject_ids(data)

# Hold out the default number of subjects per group for testing; the
# remaining subjects form the training groups.
(
    test_truth_subjects,
    test_lie_subjects,
    train_truth_subjects,
    train_lie_subjects,
) = select_random_subjects(truth_subjects, lie_subjects)

# Function to split data into training and testing sets
def split_data(data, test_truth_subjects, test_lie_subjects):
    """Partition ``data`` into train and test dicts by subject ID.

    The subject ID is read from each underscore-separated key: the fifth
    field when the key has five or more fields, the fourth field when it has
    exactly four. Keys with fewer fields are dropped from both outputs.

    Args:
        data: Mapping from file name to loaded content.
        test_truth_subjects: Subject IDs held out from the truth group.
        test_lie_subjects: Subject IDs held out from the lie group.

    Returns:
        ``(train_data, test_data)``. An entry goes to ``test_data`` when its
        subject ID is in either held-out collection, regardless of the
        entry's own label; every other parsed entry goes to ``train_data``.
    """
    train_data, test_data = {}, {}

    for file_key, content in data.items():
        tokens = file_key.split('_')
        token_count = len(tokens)
        if token_count < 4:
            # No field that could hold a subject ID.
            continue

        id_token = tokens[4] if token_count >= 5 else tokens[3]
        subject_id = id_token.replace('.pkl', '')

        is_held_out = subject_id in test_truth_subjects or subject_id in test_lie_subjects
        bucket = test_data if is_held_out else train_data
        bucket[file_key] = content

    return train_data, test_data

# Split the data into training and testing sets
train_data, test_data = split_data(data, test_truth_subjects, test_lie_subjects)

# Output the results
print("Number of training data:", len(train_data))
print("Number of testing data:", len(test_data))
print("Test truth subjects:", test_truth_subjects)
print("Test lie subjects:", test_lie_subjects)



Number of training data: 152
Number of testing data: 56
Test truth subjects: ['09', '12']
Test lie subjects: ['13', '06']


# Model