In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import cv2
import imageio
import os
import pandas as pd

In [20]:
round_numbers = [5, 6]
# Root folder holding both the ROI arrays and the per-round status CSVs.
# NOTE(review): machine-specific absolute path; change it here when running elsewhere.
ROI_DATA_ROOT = 'C:/Users/Mud/Desktop/2ndPaper/roi_data'
# npy data path -> {'label': 0/1, 'cultivar': 'cultivar_name'}
total_video_map = {}

for round_number in round_numbers:
    roi_data_folder = f'{ROI_DATA_ROOT}/size-50/round_{round_number}/'
    data_status_csv = f'{ROI_DATA_ROOT}/csv/stat_all_data_extend_mean_normalization_pca_round_{round_number}.csv'

    # Load the data status and keep only the rows recorded for this round.
    data_status = pd.read_csv(data_status_csv)
    data_status_this_round = data_status[data_status['dev_stage'] == round_number]

    for index, row in data_status_this_round.iterrows():
        filename = row['filename']
        cultivar = row['cultivar']
        mortality = row['mortality']
        treatment = row['treatment']

        # Apply the selection BEFORE reading the array: rows that are filtered out
        # no longer cost a full .npy read (and a missing, unused file no longer
        # aborts the whole run).
        if cultivar != 'RIES' or not (1 <= treatment < 5):
            continue

        npy_data_path = f'{roi_data_folder}[ROI]{filename}'
        # The array is loaded only to confirm it is readable and to report its shape;
        # it is not kept in memory (the map stores paths, not data).
        video_data = np.load(npy_data_path)

        # NOTE(review): truthiness is used, so a missing (NaN) `mortality` value would
        # be labelled 1 — confirm the column has no gaps.
        # NOTE(review): generate_avi in this notebook names label 1 'alive', while
        # label 1 is assigned here when `mortality` is truthy — confirm the encoding.
        total_video_map[npy_data_path] = {'label': 1 if mortality else 0, 'cultivar': cultivar}
        print(f'Loaded {filename} with shape {video_data.shape}')

Loaded Ries5.0_B01_N01.npy with shape (600, 50, 50)
Loaded Ries5.0_B01_N02.npy with shape (600, 50, 50)
Loaded Ries5.0_B01_N03.npy with shape (600, 50, 50)
Loaded Ries5.0_B01_N05.npy with shape (600, 50, 50)
Loaded Ries5.0_B01_N06.npy with shape (600, 50, 50)
Loaded Ries5.0_B01_N07.npy with shape (600, 50, 50)
Loaded Ries5.0_B01_N08.npy with shape (600, 50, 50)
Loaded Ries5.0_B01_N09.npy with shape (600, 50, 50)
Loaded Ries5.0_B02_N01.npy with shape (600, 50, 50)
Loaded Ries5.0_B02_N02.npy with shape (600, 50, 50)
Loaded Ries5.0_B02_N03.npy with shape (600, 50, 50)
Loaded Ries5.0_B02_N04.npy with shape (600, 50, 50)
Loaded Ries5.0_B02_N05.npy with shape (600, 50, 50)
Loaded Ries5.0_B02_N06.npy with shape (600, 50, 50)
Loaded Ries5.0_B02_N07.npy with shape (600, 50, 50)
Loaded Ries5.0_B02_N08.npy with shape (600, 50, 50)
Loaded Ries5.0_B02_N09.npy with shape (600, 50, 50)
Loaded Ries5.0_B03_N01.npy with shape (600, 50, 50)
Loaded Ries5.0_B03_N02.npy with shape (600, 50, 50)
Loaded Ries5

In [21]:
# Number of videos collected into the path -> metadata map.
print('shape of total_video_map:', len(total_video_map))

shape of total_video_map: 718


In [22]:
# Cultivar-specific data.
# Hold out 20% of the videos as the test set, then split the remainder again
# into 80% train / 20% validation.
video_paths = list(total_video_map)
labels = [info['label'] for info in total_video_map.values()]

# X carries no feature columns: the video paths live in the index only.
X = pd.DataFrame(index=video_paths)
y = pd.Series(data=labels, index=video_paths)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
test_videos, test_labels = X_test, y_test
train_videos, valid_videos, train_labels, valid_labels = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

print(f'Train: {train_videos.shape} {train_labels.shape}')
print(f'Valid: {valid_videos.shape} {valid_labels.shape}')
print(f'Test: {test_videos.shape} {test_labels.shape}')

Train: (459, 0) (459,)
Valid: (115, 0) (115,)
Test: (144, 0) (144,)


In [12]:
def balanced_test_split(X, y, cultivar_infos, each_cultivar_size=70, test_size=0.2, random_state=None):
    """
    Split the dataset into a training set and a test set sampled per cultivar and per class.

    For every cultivar, up to `each_cultivar_size` samples of class 0 and up to
    `each_cultivar_size` samples of class 1 are drawn without replacement for the
    test set; all remaining samples form the training set. The test set is exactly
    1:1 only when every cultivar has at least `each_cultivar_size` samples of both
    classes; a class with fewer samples contributes all of them to the test set
    (leaving none of that cultivar/class for training).

    Parameters:
        X (pd.DataFrame): Features dataframe.
        y (pd.Series): Target labels (0 or 1), indexed like X.
        cultivar_infos (pd.Series): Cultivar information corresponding to X (same index).
        each_cultivar_size (int): Maximum number of test samples per cultivar per class.
        test_size (float): Unused; kept only so existing callers keep working.
            The test-set size is determined by `each_cultivar_size`.
        random_state (int or None): Seed for the sampling. Any integer, including 0,
            gives a reproducible draw. None draws from NumPy's global random state.

    Returns:
        X_train, X_test, y_train, y_test: Train-test splits. Test rows are ordered
        cultivar by cultivar (class 0 then class 1); training rows follow the order
        produced by `X.index.difference`, which sorts the index when it can.
    """
    # `if random_state:` would skip seeding for random_state=0, so compare against None.
    # A private RandomState yields the same draws as np.random.seed(random_state) would,
    # without re-seeding NumPy's global generator as a side effect.
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    test_indices = []

    for cultivar in cultivar_infos.unique():
        # Labels of the samples belonging to this cultivar
        cultivar_indices = cultivar_infos[cultivar_infos == cultivar].index
        y_cultivar = y.loc[cultivar_indices]

        # Class 0 is sampled before class 1 so the random stream is consumed in a fixed order.
        for class_value in (0, 1):
            # Mask the subset with a mask built from the subset itself (no implicit re-alignment).
            class_indices = y_cultivar[y_cultivar == class_value].index

            # Cap the request at what is available for this cultivar/class
            n_samples = min(len(class_indices), each_cultivar_size)

            test_indices.extend(rng.choice(class_indices, n_samples, replace=False))

    # Create the test set
    X_test = X.loc[test_indices]
    y_test = y.loc[test_indices]

    # Create the training set by excluding the test indices
    train_indices = X.index.difference(test_indices)
    X_train = X.loc[train_indices]
    y_train = y.loc[train_indices]

    return X_train, X_test, y_train, y_test


# Build the path-indexed inputs for the balanced split from the loaded video map.
video_paths = list(total_video_map)
labels = [total_video_map[video_path]['label'] for video_path in video_paths]
cultivars = [total_video_map[video_path]['cultivar'] for video_path in video_paths]

# X carries no feature columns: the video paths live in the index only.
X = pd.DataFrame(index=video_paths)
y = pd.Series(data=labels, index=video_paths)
cultivar_infos = pd.Series(data=cultivars, index=video_paths)

# Hold out up to 70 videos per cultivar and class as the test set.
X_train, X_test, y_train, y_test = balanced_test_split(
    X,
    y,
    cultivar_infos,
    each_cultivar_size=70,
    random_state=42,
)

In [13]:
# The held-out test set is used as-is; the remaining data is split
# into 80% train and 20% validation.
test_videos, test_labels = X_test, y_test
train_videos, valid_videos, train_labels, valid_labels = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

print(f'Train: {train_videos.shape} {train_labels.shape}')
print(f'Valid: {valid_videos.shape} {valid_labels.shape}')
print(f'Test: {test_videos.shape} {test_labels.shape}')

Train: (1878, 0) (1878,)
Valid: (470, 0) (470,)
Test: (560, 0) (560,)


In [23]:
# Normalize one video so that every pixel's time series starts from the same baseline.
# Input: a single video with time along axis 0 (the arrays loaded in this notebook
# are (600, 50, 50)); NOT a batch of videos.
def normalize_curves_mean_method(unnormalized_data, n_baseline_frames=30):
    """Subtract the mean of the first `n_baseline_frames` frames from every frame.

    Parameters:
        unnormalized_data (array-like): One recording with time along axis 0.
            Any number of trailing axes is accepted; the baseline is computed
            separately for each position of the trailing axes.
        n_baseline_frames (int): Number of leading frames averaged to form the
            baseline. If the recording is shorter, all available frames are used.

    Returns:
        np.ndarray: Array of the same shape as the input. The input is not modified.

    Raises:
        ValueError: If `n_baseline_frames` is smaller than 1 (the baseline would be
            the mean of an empty slice, i.e. NaN everywhere).
    """
    if n_baseline_frames < 1:
        raise ValueError('n_baseline_frames must be at least 1')

    unnormalized_data = np.asarray(unnormalized_data)
    # keepdims=True keeps the time axis (length 1) so the baseline broadcasts over all frames.
    mean = np.mean(unnormalized_data[:n_baseline_frames], axis=0, keepdims=True)
    normalized_data = unnormalized_data - mean
    assert normalized_data.shape == unnormalized_data.shape, 'Shape mismatch after normalization'
    return normalized_data

In [24]:
# Side length (pixels) of the square output frames; also part of the output folder name.
size = 21

def generate_avi(data, label, video_type):
    """Write one baseline-normalized .avi per video path in `data.index`.

    Each .npy file is loaded, passed through `normalize_curves_mean_method`, and
    written frame by frame (XVID, 30 fps, `size` x `size` pixels) to
    `./data-size-{size}/RIES/{video_type}/{alive|dead}/`.

    Parameters:
        data: Object whose `.index` yields the .npy paths to convert
            (the notebook passes path-indexed DataFrames).
        label: Series-like lookup from path to label, read with `label.loc[path]`.
        video_type (str): Name of the split sub-folder (the notebook uses
            'train', 'val' and 'test').

    Raises:
        RuntimeError: If OpenCV cannot open the output file for writing
            (for example when the XVID codec is unavailable).

    Notes:
        Every frame is min-max rescaled to 0-255 on its own, so absolute
        intensity differences between frames are not preserved in the video.
        NOTE(review): label 1 is named 'alive' here, while the loading cell assigns
        label 1 when `mortality` is truthy — confirm the intended encoding.
    """
    target_size = (size, size)  # Target size for resizing
    split_dir = f'./data-size-{size}/RIES/{video_type}'
    os.makedirs(split_dir, exist_ok=True)  # Ensure base directory exists
    fourcc = cv2.VideoWriter_fourcc(*'XVID')  # codec code is the same for every file

    for path in data.index:
        video_data = normalize_curves_mean_method(np.load(path))
        label_value = label.loc[path]
        label_str = 'alive' if label_value == 1 else 'dead'
        output_dir = f'{split_dir}/{label_str}'
        os.makedirs(output_dir, exist_ok=True)  # Ensure label-specific directory exists

        filename = f'{label_str}_{os.path.basename(path).replace(".npy", "")}.avi'
        filepath = os.path.join(output_dir, filename)

        out = cv2.VideoWriter(filepath, fourcc, 30.0, target_size)
        try:
            # A writer that failed to open silently drops every frame; fail loudly
            # instead of reporting a video that was never written.
            if not out.isOpened():
                raise RuntimeError(f'Could not open video writer for {filepath}')

            for frame in video_data:
                frame = cv2.normalize(frame, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
                frame = cv2.resize(frame, target_size)
                # The writer is opened in its default colour mode, so expand gray to 3 channels.
                frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
                out.write(frame)
        finally:
            # Always release the file handle, even if a frame fails to convert.
            out.release()

        print(f'Saved video: {filepath}')


# Export every split to its own sub-folder.
generate_avi(data=train_videos, label=train_labels, video_type='train')
generate_avi(data=valid_videos, label=valid_labels, video_type='val')
generate_avi(data=test_videos, label=test_labels, video_type='test')

Saved video: ./data-size-21/RIES/train/dead\dead_[ROI]Ries5.1F_B37_N09.avi
Saved video: ./data-size-21/RIES/train/alive\alive_[ROI]Ries5.0_B11_N04.avi
Saved video: ./data-size-21/RIES/train/alive\alive_[ROI]Ries6.0_B12_N01.avi
Saved video: ./data-size-21/RIES/train/dead\dead_[ROI]Ries5.1F_B34_N01.avi
Saved video: ./data-size-21/RIES/train/alive\alive_[ROI]Ries6.0_B16_N03.avi
Saved video: ./data-size-21/RIES/train/alive\alive_[ROI]Ries6.0_B04_N04.avi
Saved video: ./data-size-21/RIES/train/alive\alive_[ROI]Ries5.0_B23_N01.avi
Saved video: ./data-size-21/RIES/train/alive\alive_[ROI]Ries6.0_B01_N06.avi
Saved video: ./data-size-21/RIES/train/alive\alive_[ROI]Ries5.0_B19_N06.avi
Saved video: ./data-size-21/RIES/train/alive\alive_[ROI]Ries5.0_B21_N08.avi
Saved video: ./data-size-21/RIES/train/alive\alive_[ROI]Ries6.0_B02_N04.avi
Saved video: ./data-size-21/RIES/train/alive\alive_[ROI]Ries5.0_B27_N05.avi
Saved video: ./data-size-21/RIES/train/alive\alive_[ROI]Ries5.0_B25_N06.avi
Saved video: .