In [26]:
import os
import numpy as np
import pandas as pd

In [27]:
def test_split(total_video_data, total_video_data_label, test_set_filepath):
    """
    Split the dataset into a training set and a test set (with labels 0 or 1),
    using pre-defined test samples from filepaths. Removes [ROI] prefix to match dict keys.

    Parameters:
        total_video_data (dict[str, np.ndarray]): filename -> video data.
        total_video_data_label (dict[str, int]): filename -> label (0 or 1).
        test_set_filepath (str): Path to txt file listing full file paths of test samples.

    Returns:
        X_train, X_test, y_train, y_test
    """
    with open(test_set_filepath, 'r') as f:
        test_filepaths = f.read().splitlines()

    # Normalize test filenames: strip path and remove '[ROI]' prefix
    test_filenames = [os.path.basename(path).replace('[ROI]', '') for path in test_filepaths]

    # Ensure all test filenames exist in data
    for fn in test_filenames:
        assert fn in total_video_data, f"Missing {fn} in total_video_data"

    # Split into test
    X_test = np.array([total_video_data[fn] for fn in test_filenames])
    y_test = np.array([total_video_data_label[fn] for fn in test_filenames])

    # Remaining = train
    train_filenames = [fn for fn in total_video_data if fn not in test_filenames]
    X_train = np.array([total_video_data[fn] for fn in train_filenames])
    y_train = np.array([total_video_data_label[fn] for fn in train_filenames])

    return X_train, X_test, y_train, y_test


In [28]:
# Load the ROI-video dataset. Each .npy file holds a pickled dict keyed by
# filename, unwrapped with .item(). allow_pickle=True executes pickle on load,
# so only point these paths at trusted files.
total_video_data = np.load('../data/roi-video/total_data.npy', allow_pickle=True).item()
total_video_data_label = np.load('../data/roi-video/total_label.npy', allow_pickle=True).item()
total_cultivar = np.load('../data/roi-video/total_cultivar.npy', allow_pickle=True).item()
print('total_video_data len:', len(total_video_data))
print('total_video_data s label len:', len(total_video_data_label))
print('total_cultivar len:', len(total_cultivar))

# Split into train/test using the fixed list of held-out sample files.
# NOTE(review): absolute, machine-specific path — move to a configurable location.
X_train, X_test, y_train, y_test = test_split(total_video_data, total_video_data_label, r'C:\Users\Mud\Desktop\2ndPaper\roi_data\size-21\test_data_filenames.txt')

# Give every sample an explicit (600, 21, 21) layout; -1 infers the sample count,
# so the resulting arrays are 4-D.
# Presumably 600 frames of a 21x21 ROI — TODO confirm axis meaning.
X_train = X_train.reshape(-1, 600, 21, 21)
X_test = X_test.reshape(-1, 600, 21, 21)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)


# Save the split. np.save does not create missing directories, so make sure
# the output folders exist first.
# NOTE(review): inputs are read from '../data' but outputs go to './data' — confirm this is intended.
os.makedirs('./data/roi-video/train', exist_ok=True)
os.makedirs('./data/roi-video/test', exist_ok=True)
np.save('./data/roi-video/train/X_train.npy', X_train)
np.save('./data/roi-video/train/y_train.npy', y_train)
np.save('./data/roi-video/test/X_test.npy', X_test)
np.save('./data/roi-video/test/y_test.npy', y_test)

total_video_data len: 2783
total_video_data s label len: 2783
total_cultivar len: 2783
X_train shape: (2223, 600, 21, 21)
X_test shape: (560, 600, 21, 21)
y_train shape: (2223,)
y_test shape: (560,)


In [29]:
# Load the mean-time-series dataset. Each .npy file holds a pickled dict keyed
# by filename, unwrapped with .item(). allow_pickle=True executes pickle on
# load, so only point these paths at trusted files.
total_time_series_data = np.load('../data/mean-time-series/total_data.npy', allow_pickle=True).item()
total_time_series_data_label = np.load('../data/mean-time-series/total_label.npy', allow_pickle=True).item()
# NOTE(review): cultivar info is read from the roi-video folder, not
# mean-time-series — confirm this is intended and not a copy-paste leftover.
total_cultivar = np.load('../data/roi-video/total_cultivar.npy', allow_pickle=True).item()
print('total_time_series_data len:', len(total_time_series_data))
print('total_time_series_data s label len:', len(total_time_series_data_label))
print('total_cultivar len:', len(total_cultivar))

# Split into train/test using the same fixed list of held-out sample files
# as the other datasets.
# NOTE(review): absolute, machine-specific path — move to a configurable location.
X_train, X_test, y_train, y_test = test_split(total_time_series_data, total_time_series_data_label, r'C:\Users\Mud\Desktop\2ndPaper\roi_data\size-21\test_data_filenames.txt')
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

# Save the split. np.save does not create missing directories, so make sure
# the output folders exist first.
# NOTE(review): inputs are read from '../data' but outputs go to './data' — confirm this is intended.
os.makedirs('./data/mean-time-series/train', exist_ok=True)
os.makedirs('./data/mean-time-series/test', exist_ok=True)
np.save('./data/mean-time-series/train/X_train.npy', X_train)
np.save('./data/mean-time-series/train/y_train.npy', y_train)
np.save('./data/mean-time-series/test/X_test.npy', X_test)
np.save('./data/mean-time-series/test/y_test.npy', y_test)

total_time_series_data len: 2783
total_time_series_data s label len: 2783
total_cultivar len: 2783
X_train shape: (2223, 600)
X_test shape: (560, 600)
y_train shape: (2223,)
y_test shape: (560,)


In [30]:
# Load the features dataset. Each .npy file holds a pickled dict keyed by
# filename, unwrapped with .item(). allow_pickle=True executes pickle on load,
# so only point these paths at trusted files.
total_features_data = np.load('../data/features/total_data.npy', allow_pickle=True).item()
total_features_data_label = np.load('../data/features/total_label.npy', allow_pickle=True).item()
total_cultivar = np.load('../data/features/total_cultivar.npy', allow_pickle=True).item()
# Add cultivar info to the features data. This mutates each per-sample record
# of the freshly loaded dict in place (a KeyError here means a sample has no
# cultivar entry).
for key in total_features_data:
    total_features_data[key]['cultivar'] = total_cultivar[key]
print('total_features_data len', len(total_features_data))
print('total_features_data s label len:', len(total_features_data_label))

# Split into train/test using the same fixed list of held-out sample files
# as the other datasets.
# NOTE(review): absolute, machine-specific path — move to a configurable location.
X_train, X_test, y_train, y_test = test_split(total_features_data, total_features_data_label, r'C:\Users\Mud\Desktop\2ndPaper\roi_data\size-21\test_data_filenames.txt')
# The printed shapes are 1-D, so X_train / X_test are presumably object arrays
# holding one record per sample — TODO confirm.
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

# Save the split. np.save does not create missing directories, so make sure
# the output folders exist first.
# NOTE(review): inputs are read from '../data' but outputs go to './data' — confirm this is intended.
os.makedirs('./data/features/train', exist_ok=True)
os.makedirs('./data/features/test', exist_ok=True)
np.save('./data/features/train/X_train.npy', X_train)
np.save('./data/features/train/y_train.npy', y_train)
np.save('./data/features/test/X_test.npy', X_test)
np.save('./data/features/test/y_test.npy', y_test)

total_features_data len 2783
total_features_data s label len: 2783
X_train shape: (2223,)
X_test shape: (560,)
y_train shape: (2223,)
y_test shape: (560,)
