# DI Automated Scripts
This notebook splits the DI dataset into training, validation, and test sets by copying each video into the folder for its split.
<br>
<br>
![UofC logo](../assets/images/uofc_logo-black.jpg)

In [2]:
#import libraries
import os
import pandas as pd
from dotenv import load_dotenv
import json
import shutil

In [23]:
records_folder = "../records" 
json_dir = "../records/JSON/all_data"
curr_dir = os.getcwd()

In [24]:
# load root directory
dotenv_path = os.path.join(curr_dir, ".env")
load_dotenv(dotenv_path)
root_path = os.getenv("ROOT_FOLDER")
input(f"Is this the right directory - {root_path}?")

''

## Run functions

In [25]:
def download_subset(json_file, download_dir):
    """Copy the videos listed in a metadata file into Train/Validation/Test folders.

    Despite the name, nothing is downloaded: each file is copied from the
    ``'local path'`` stored in its metadata entry. The split is chosen from
    the entry's ``'alias'`` ID using the fixed ID lists defined below, so
    every video with the same alias lands in the same split.

    Args:
        json_file (str): Path to the JSON file containing metadata. It is
            iterated as a sequence of dict entries, each read for the keys
            ``'local path'`` and ``'alias'``.
        download_dir (str): Directory to copy data to. The ``Train``,
            ``Validation`` and ``Test`` sub-folders are created if needed.

    Return:
        dict: Mapping of the split names ``'train'``, ``'validation'`` and
        ``'test'`` to the directories containing the subsections of data.

    Raises:
        OSError: If the metadata file cannot be opened or a listed video
            cannot be copied.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_file, "r", encoding="utf-8") as file:
        metadata = json.load(file)

    # Define paths for train, validation, and test folders
    split_dirs = {
        'train': os.path.join(download_dir, 'Train'),
        'validation': os.path.join(download_dir, 'Validation'),
        'test': os.path.join(download_dir, 'Test'),
    }

    # Create folders if they don't exist
    for folder in split_dirs.values():
        os.makedirs(folder, exist_ok=True)

    # Define alias IDs for train, validation, and test
    split_ids = {
        'train': [15, 5, 9, 11, 14, 6, 13, 18, 17, 16, 3, 1],
        'validation': [7, 4, 12],
        'test': [8, 2],
    }
    # Invert to alias ID -> split name for a single lookup per video.
    split_by_alias = {
        alias: split for split, ids in split_ids.items() for alias in ids
    }

    # Iterate through metadata and copy videos to respective folders based on alias ID
    for video_data in metadata:
        local_path = video_data.get('local path')

        # A missing or non-numeric alias must not abort the whole run.
        try:
            alias_id = int(video_data.get('alias'))
        except (TypeError, ValueError):
            print(f"Skipping entry with missing or invalid alias: {video_data}")
            continue

        split = split_by_alias.get(alias_id)
        if split is None:
            print(f"No destination folder found for video with alias {alias_id}")
            continue

        # Without a source path there is nothing to copy.
        if not local_path:
            print(f"Skipping entry with alias {alias_id}: no local path")
            continue

        destination_folder = split_dirs[split]
        destination_path = os.path.join(destination_folder, os.path.basename(local_path))
        shutil.copy(local_path, destination_path)
        print(f"Copied {local_path} to {destination_folder}")

    return split_dirs

  

# Destination for the split copy, relative to the current working directory.
# NOTE(review): this is './records/...' while records_folder above is '../records' — confirm which is intended.
download_dir = './records/DI_subset/'

# Split the videos listed in the metadata file into Train/Validation/Test.
subset_paths = download_subset(
    json_file='rgb_complete.json',
    download_dir=download_dir,
)
