In [4]:
import os
from glob import glob
from pathlib import Path

# Capture the launch directory once; the next cell decides whether to chdir
# based on these two names, so they must stay module-level.
cwd = Path.cwd()
print(cwd)

# Candidate location of the prepared-data folder, directly beneath the launch directory.
prepped_data_dir = cwd.joinpath("PreppedData")

/home/fs1620/MLBD_2024_25/Research_Project/LiaDataAnalysis/PreppedData


In [6]:
# Make sure the working directory is "PreppedData" so the relative file names
# used further down resolve correctly.
if cwd.name == "PreppedData":
    # Already inside the target folder, so there is nothing to change.
    print(f"Already in {cwd.name} directory.")
elif prepped_data_dir.exists():
    os.chdir(prepped_data_dir)
    print(f"Changed working directory to: {prepped_data_dir}")
else:
    # The message names the directory that was actually looked for.
    raise FileNotFoundError(f"'PreppedData' directory not found at: {prepped_data_dir}")

Already in PreppedData directory.


In [16]:
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
import os

def generate_strat_folds(
    data = (),
    save_path='',
    n_splits=5,
    random_state=42,
    shuffle=True,
    return_folds=True
):
    """
    Generates stratified cross-validation folds and optionally saves them to a JSON file.

    Args:
        data: Mapping-like container (e.g. the object returned by ``np.load`` on
            an ``.npz`` file, or a plain dict) that is indexed with the keys
            ``'X'`` (features) and ``'y'`` (labels). Must be supplied; the empty
            default cannot be indexed by key.
        save_path (str): Path to save folds JSON. Missing parent directories are
            created. If empty, nothing is written to disk.
        n_splits (int): Number of cross-validation folds.
        random_state (int): Random seed for reproducibility. Only forwarded to
            StratifiedKFold when ``shuffle`` is True.
        shuffle (bool): Whether to shuffle before splitting.
        return_folds (bool): If True, returns folds as a list of dicts.

    Returns:
        List[dict]: One dict per fold with keys 'fold', 'train_idx' and 'val_idx'
        (index lists) if return_folds is True; otherwise None.
    """
    # Access arrays by their saved names
    X = data['X']
    y = data['y']

    print("X shape:", X.shape)
    print("y shape:", y.shape)

    # StratifiedKFold rejects a non-None random_state when shuffle is False,
    # so only pass the seed through when it can actually take effect.
    skf = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    # Convert index arrays to plain lists so the result is JSON-serialisable.
    folds = [
        {
            'fold': fold_idx,
            'train_idx': train_idx.tolist(),
            'val_idx': val_idx.tolist()
        }
        for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y))
    ]

    if save_path:
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        with open(save_path, 'w') as f:
            json.dump(folds, f, indent=2)

        print(f"Saved {n_splits} stratified folds to: {save_path}")
    else:
        # An empty path used to fail inside open(); skip persisting instead.
        print("No save_path given; folds were not written to disk.")

    if return_folds:
        return folds

In [18]:
import numpy as np

# Open the compressed archive; the arrays are read from it by key
# inside generate_strat_folds.
data = np.load('data_compressed.npz')

# Build the 5-fold stratified split and persist it next to the data.
folds = generate_strat_folds(data=data, save_path='stratfolds5.json', n_splits=5)

X shape: (21455, 862)
y shape: (21455,)
Saved 5 stratified folds to: stratfolds5.json
