# Colab Setup

In [1]:
# Mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Navigate to the directory
%cd /content/drive/My Drive/Coding/idea/modules/testscr/

/content/drive/My Drive/Coding/idea/modules/testscr


# General Setup

In [3]:
# Import importlib to reload modules and sys and os to add the path for other imports
import importlib
import sys
import os

# Append the parent directory to the path to import the necessary modules
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Utils import setup
from utils import selfutil
importlib.reload(selfutil)

#from utils.createdatautil import observe_datadir, get_datagov, validate_dir
from utils.selfutil import get_vocab, set_seed

# Other imports
import torch
import pandas as pd
from IPython.display import display

# Threads for parallel processing
THREADS = 1

# Set random seed for reproducibility
set_seed(0)

# Observig Directories

In [5]:
def observe_datadir(data_dirs):
    """
    Compare file counts across multiple data directories and flag rows with differing counts.
    Args:
        data_dirs (list): List of directory paths to compare.
    """
    # Dictionary to store subdirectory counts for each directory
    results = {}

    for data_dir in data_dirs:
        # Extract the last level of the data directory path
        dir_name = os.path.basename(os.path.normpath(data_dir))

        # Initialize a dictionary to store subdirectory names and file counts
        subdir_file_counts = {}

        # Loop through the subdirectories and count the files
        for subdir, _, files in os.walk(data_dir):
            # Skip the root directory itself
            if subdir != data_dir:
                subdir_name = os.path.basename(subdir)  # Get the subdirectory name
                subdir_file_counts[subdir_name] = len(files)  # Count the number of files

        # Store the results in the main dictionary
        results[dir_name] = subdir_file_counts

    # Get the union of all subdirectory names across all directories
    all_subdirs = set()
    for subdir_counts in results.values():
        all_subdirs.update(subdir_counts.keys())

    # Custom sort function for subdirectories
    def custom_sort_key(subdir_name):
        # Check if the name starts with train, val, or test
        prefixes = ("train_", "val_", "test_")
        if not subdir_name.startswith(prefixes):
            # Directories without train/val/test prefixes come first
            return (0, subdir_name)
        else:
            # Split prefix and suffix
            prefix, suffix = subdir_name.split("_", 1)
            prefix_order = {"train": 1, "val": 2, "test": 3}
            return (1, suffix, prefix_order.get(prefix, 4))

    # Sort the subdirectories using the custom sort key
    sorted_subdirs = sorted(all_subdirs, key=custom_sort_key)

    # Initialize a DataFrame with all subdirectory names and directory names as columns
    df = pd.DataFrame(
        columns=[os.path.basename(os.path.normpath(d)) for d in data_dirs]
    )
    df["Subdirectory"] = sorted_subdirs
    df.set_index("Subdirectory", inplace=True)

    # Populate the DataFrame with file counts, filling missing values with zero
    for dir_name, subdir_counts in results.items():
        df[dir_name] = df.index.map(subdir_counts).fillna(0).astype(int)

    # Add the "diff" column to flag rows with differing counts
    df["diff"] = df.nunique(axis=1).apply(lambda x: "Y" if x > 1 else "")

    # Set pandas options to display all rows and columns
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)

    # Display the DataFrame
    display(df)

# Observe a list of directories
observe_datadir(['../data/', '../l_data/', '../n_data/'])

Unnamed: 0_level_0,data,l_data,n_data,diff
Subdirectory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
all_test,0,0,0,
all_train,0,0,0,
all_val,0,0,0,
datagov,0,0,2000,Y
enron,624,624,624,
enron_clean,478,448,400,Y
manual,0,0,50,Y
temp,0,0,0,
train_big,800,800,0,Y
val_big,100,100,0,Y


# Data.gov file collection

## Retrieve all the links in json

In [6]:
import os
import requests
import asyncio
import aiohttp
from tqdm import tqdm
from bs4 import BeautifulSoup
import json
import time

async def get_datagov_links(data_dir, max_size_mb=100, timeout=10):
    # Ensure the directory exists
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    # File to save the validated links
    links_file = os.path.join(data_dir, "validated_links.json")

    # Initialize lists
    all_links = []

    # Fetch all links from pages 1 to 10 (adjusted for testing)
    for page_number in tqdm(range(1, 431), desc="Getting Links"):
        while True:  # Keep retrying for the current page if 403 is encountered
            try:
                response = requests.get(
                    base_url := f"https://catalog.data.gov/dataset/?res_format=EXCEL&_res_format_limit=0&_bureauCode_limit=0&page={page_number}",
                    timeout=timeout,
                )
                response.raise_for_status()  # Raise exception for HTTP errors
                soup = BeautifulSoup(response.content, "html.parser")
                links = [
                    link["href"]
                    for link in soup.find_all("a", href=True)
                    if ".xls" in link["href"].lower()
                ]
                all_links.extend(links)
                break  # Exit the retry loop once successful

            except requests.exceptions.HTTPError as e:
                # Retry only for 403 errors
                if response.status_code == 403:
                    print(f"Page {page_number}: 403 PING")
                    time.sleep(60)
                else:
                    print(f"Page {page_number}: {e}")
                    return  # Exit for other HTTP errors

            except requests.RequestException as e:
                print(f"Page {page_number}: {e}")
                return

            except Exception as ec:
                print(f"Page {page_number}: {ec}")
                return

    # Print total number of links found
    print(f"\n\nInitial Links: {len(all_links)}")

    # Validation logic
    validated_links = []

    async with aiohttp.ClientSession() as session:
        sem = asyncio.Semaphore(10)  # Limit concurrency to 10

        async def validate_link(url):
            """Validate a single link."""
            try:
                async with sem:
                    async with session.head(
                        url, timeout=timeout, allow_redirects=True
                    ) as response:
                        if (
                            response.status == 200
                            and "Content-Length" in response.headers
                        ):
                            file_size_mb = int(response.headers["Content-Length"]) / (
                                1024 * 1024
                            )
                            if file_size_mb <= max_size_mb:
                                return url
                        elif response.status == 403:
                            # Allow restricted header links
                            return url
            except asyncio.TimeoutError:
                pass
            except Exception:
                pass
            return None

        # Validate all links with a progress bar
        for result in tqdm(
            asyncio.as_completed([validate_link(link) for link in all_links]),
            total=len(all_links),
            desc="Validating Links",
        ):
            valid_link = await result
            if valid_link:
                validated_links.append(valid_link)

    # Update all_links to only contain validated links
    all_links = validated_links

    # Print validated total
    print(f"\nValidated Links: {len(all_links)}")

    # Write the validated links to a JSON file
    with open(links_file, "w") as f:
        json.dump(all_links, f, indent=4)

    print(f"Validated links saved to {links_file}")


In [7]:
# # Set the directory to store the JSON file
# data_dir = "../n_data/"

# # Call the function with asyncio
# await get_datagov_links(data_dir, max_size_mb=100, timeout=10)


## Download all the files in one folder

In [8]:
import os
import json
import aiohttp
import asyncio
from tqdm import tqdm

async def get_datagov(
    json_file="../n_data/validated_links.json",
    data_dir="../n_data/datagov/",
    max_size_mb=2,
    timeout=900,
    max_downloads=None  # New parameter to limit the number of downloads
):
    """
    Downloads files from a list of validated links stored in a JSON file.

    Args:
        json_file (str): Path to the JSON file containing links.
        data_dir (str): Directory to save the downloaded files.
        max_size_mb (int): Maximum file size allowed (in MB).
        timeout (int): Timeout value for requests in seconds.
        max_downloads (int, optional): Maximum number of links to process. Defaults to None (no limit).
    """
    # Check if JSON file is valid
    if not os.path.isfile(json_file):
        raise FileNotFoundError(f"JSON file '{json_file}' does not exist.")

    # Check if the saving directory exists
    if not os.path.isdir(data_dir):
        raise FileNotFoundError(f"Directory '{data_dir}' does not exist.")

    # Load the links from the JSON file
    with open(json_file, "r") as f:
        all_links = json.load(f)

    # Apply the max_downloads limit if specified
    if max_downloads is not None:
        all_links = all_links[:max_downloads]
        print(f"Processing first {max_downloads} links out of {len(all_links)} total links.")

    # Initialize counters
    download_count = 0

    async with aiohttp.ClientSession() as session:
        tasks = []
        with tqdm(total=len(all_links), desc="Downloading Files") as pbar:

            async def download_file(url):
                """Download a single file from the given URL."""
                nonlocal download_count  # Access the counter in the outer scope
                try:
                    # Check file size using HEAD request before downloading
                    async with session.head(
                        url, timeout=timeout, allow_redirects=True
                    ) as response:
                        if (
                            response.status == 200
                            and "Content-Length" in response.headers
                        ):
                            file_size_mb = int(response.headers["Content-Length"]) / (
                                1024 * 1024
                            )
                            if file_size_mb > max_size_mb:
                                # Skip files larger than the maximum size
                                pbar.update(1)
                                return
                        elif response.status == 403:
                            # Allow downloads if headers are restricted
                            pass
                        else:
                            # Skip other HTTP errors
                            pbar.update(1)
                            return

                    # Combine directory and filename
                    filename = os.path.join(data_dir, url.split("/")[-1])

                    # Make the GET request to download the file
                    async with session.get(
                        url, timeout=timeout, allow_redirects=True
                    ) as response:
                        if response.status == 200 or response.status == 403:
                            with open(filename, "wb") as f:
                                f.write(await response.read())
                            download_count += 1
                        pbar.update(1)

                except asyncio.TimeoutError:
                    pbar.update(1)
                except Exception:
                    pbar.update(1)

            # Add tasks for concurrent downloads
            for url in all_links:
                tasks.append(asyncio.create_task(download_file(url)))

            # Limit concurrency with aiohttp Semaphore
            sem = asyncio.Semaphore(5)

            async def limited_task(task):
                async with sem:
                    return await task

            await asyncio.gather(*[limited_task(task) for task in tasks])

    print(f"Total files downloaded: {download_count}")

In [None]:
# Example usage
await get_datagov(
    json_file="../n_data/validated_links.json",
    data_dir="../n_data/datagov/",
    max_size_mb=2,
    timeout=900,
    max_downloads=3000  # Process up to 1000 links
)

# Observe a list of directories
observe_datadir(['../data/', '../l_data/', '../n_data/'])

Processing first 3000 links out of 3000 total links.


Downloading Files: 100%|██████████| 3000/3000 [03:14<00:00, 15.42it/s]

Total files downloaded: 2682





Unnamed: 0_level_0,data,l_data,n_data,diff
Subdirectory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
datagov,0,0,2479,Y
enron,624,624,624,
enron_clean,478,448,624,Y
manual,0,0,50,Y
temp,0,0,0,
train_big,800,800,0,Y
val_big,100,100,0,Y
test_big,100,100,0,Y
train_manual,40,40,0,Y
val_manual,5,5,0,Y


# Validate Directories

Function to validate each directory as in clean it up make sure all files are of a particular size and that process_spreadsheet doesnt take as much time


## Create vocab object

To be used to check process_spreadsheet so just set it up

In [9]:
# Utils import setup
from utils import selfutil
importlib.reload(selfutil)

#from utils.createdatautil import observe_datadir, get_datagov, validate_dir
from utils.selfutil import get_vocab, set_seed

# Directory containing spreadsheets for vocab training
vocab_dir = '../n_data/enron/'

# Create the vocabulary object
spreadsheet_vocab = get_vocab(vocab_dir, vocab_size=150000, space=True, case='both', threads=THREADS)

Getting Vocab: 100%|██████████| 622/622 [00:24<00:00, 25.62it/s] 


624(P) = 622(G) + 2(E)
Unique Tokens: 32484
Vocab Size: 32488


In [10]:
import time
import random
import warnings
import os
from tqdm import tqdm
from joblib import Parallel, delayed

# Utils import setup
from utils import selfutil, parseutil
import importlib
importlib.reload(selfutil)
importlib.reload(parseutil)

from utils.selfutil import get_vocab, get_fileList
from utils.parseutil import process_spreadsheet


def validate_dir(data_dir, vocab, rows=100, cols=100, filesize=2, timeout=2, numfiles=100, threads=4):
    """
    Validate files in a directory based on multiple conditions:
    - Supported file extensions
    - File size limits
    - Processing time limits
    - Warnings during processing
    - Exceptions during processing
    - Ensures no more than numfiles are valid for processing.

    Args:
        data_dir (str): Path to the directory to validate.
        vocab (object): Vocabulary object for processing spreadsheets.
        rows (int): Maximum allowed rows in the spreadsheet.
        cols (int): Maximum allowed columns in the spreadsheet.
        filesize (int): Maximum allowed file size in MB.
        timeout (int): Maximum allowed processing time in seconds.
        numfiles (int): Maximum number of files to process.
        threads (int): Number of threads for parallel processing.
    """
    # If directory doesn't exist, raise an error
    if not os.path.exists(data_dir):
        raise ValueError(f'DNE: {data_dir}')

    # Get valid and invalid files
    file_paths, failed_files = get_fileList(data_dir)

    # Total files in directory
    total_files = len(file_paths) + len(failed_files)

    # Initialize error counter starting from failed files due to invalid extensions
    ec = len(failed_files)

    # Remove files with invalid extensions immediately
    print("\nDeleting files with invalid extensions...")
    for file_path in tqdm(failed_files, desc="Deleting Invalid Extension Files", unit="file"):
        try:
            os.remove(file_path)
        except Exception as e:
            print(f"Error deleting {file_path}: {e}")

    # Define a helper function to process each file
    def process_file(file_path):
        nonlocal ec  # Access the error counter defined outside
        try:
            # Check file size
            if (os.path.getsize(file_path) / (1024 * 1024)) > filesize:
                ec += 1
                os.remove(file_path)
                return None  # File removed due to size limit

            # Process file and handle warnings
            start_time = time.time()
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always")
                x_tok, y_tok = process_spreadsheet(file_path, vocab=vocab, max_rows=rows, max_cols=cols)

                # Check for specific warnings
                for warning in w:
                    if "OLE2 inconsistency: SSCS size is 0 but SSAT size is non-zero" in str(warning.message):
                        ec += 1
                        os.remove(file_path)
                        return None  # File removed due to specific warning

                    if issubclass(warning.category, UserWarning):
                        ec += 1
                        os.remove(file_path)
                        return None  # File removed due to general warning

            # Check processing time
            if time.time() - start_time > timeout:
                ec += 1
                os.remove(file_path)
                return None  # File removed due to timeout

        except Exception:
            # Handle exceptions and remove file
            ec += 1
            os.remove(file_path)
            return None  # File removed due to exception

        return None  # File passed all checks and remains in the directory

    # Process files in parallel
    print("\nStarting parallel validation...")
    parallel = Parallel(n_jobs=max(1, os.cpu_count()//threads), backend="loky")
    _ = parallel(delayed(process_file)(file_path) for file_path in tqdm(file_paths, desc="Validating Files", unit="file"))

    # Count remaining files in the directory
    remaining_files = [f for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))]
    remaining_files_count = len(remaining_files)

    # Adjust the file count to match numfiles if necessary
    if remaining_files_count < numfiles:
        print("\nLess than required files present.")
    elif remaining_files_count > numfiles:
        # Randomly remove files until remaining_files_count == numfiles
        files_to_remove = random.sample(remaining_files, remaining_files_count - numfiles)
        print(f"\nRemoving {len(files_to_remove)} files to reach the required file count of {numfiles}...")
        for file_path in tqdm(files_to_remove, desc="Removing Extra Files", unit="file"):
            full_path = os.path.join(data_dir, file_path)
            try:
                os.remove(full_path)
            except Exception as e:
                print(f"Error deleting {full_path}: {e}")
        remaining_files_count = numfiles  # Update the remaining files count after deletion
    else:
        print("\nCorrect number of files present.")

    # Print final status
    print(f'\nTotal files processed: {total_files}')
    print(f'Total files remaining: {remaining_files_count}')
    print(f'Total files deleted due to errors: {ec}')


## Validate Enron First

Clean up the directory for basically filesize and timeout right now no editing of number of files

In [None]:
# Define the folder and parameters
data_dir = "../n_data/enron_clean/"
rows = 100
cols = 100
filesize = 2  # File size in MB
timeout = 10  # Timeout in seconds
numfiles = 400  # Maximum number of files to process
threads = 2  # Number of threads

# Call the function
validate_dir(data_dir, spreadsheet_vocab, rows=rows, cols=cols, filesize=filesize, timeout=timeout, numfiles=numfiles, threads=threads)

# Observe a list of directories
observe_datadir(['../data/', '../l_data/', '../n_data/'])


Deleting files with invalid extensions...


Deleting Invalid Extension Files: 100%|██████████| 2/2 [00:00<00:00, 450.03file/s]



Starting parallel validation...


Validating Files: 100%|██████████| 622/622 [00:50<00:00, 12.29file/s]



Removing 47 files to reach the required file count of 400...


Removing Extra Files: 100%|██████████| 47/47 [00:00<00:00, 702.16file/s]


Total files processed: 624
Total files remaining: 400
Total files deleted due to errors: 2





Unnamed: 0_level_0,data,l_data,n_data,diff
Subdirectory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
datagov,0,0,2479,Y
enron,624,624,624,
enron_clean,478,448,400,Y
manual,0,0,50,Y
temp,0,0,0,
train_big,800,800,0,Y
val_big,100,100,0,Y
test_big,100,100,0,Y
train_manual,40,40,0,Y
val_manual,5,5,0,Y


## Validate datagov

In [None]:
# Define the folder and parameters
data_dir = "../n_data/datagov/"
rows = 100
cols = 100
filesize = 2  # File size in MB
timeout = 10  # Timeout in seconds
numfiles = 2000  # Maximum number of files to process
threads = 2  # Number of threads

# Call the function
validate_dir(data_dir, spreadsheet_vocab, rows=rows, cols=cols, filesize=filesize, timeout=timeout, numfiles=numfiles, threads=threads)

# Observe a list of directories
observe_datadir(['../data/', '../l_data/', '../n_data/'])


Deleting files with invalid extensions...


Deleting Invalid Extension Files: 100%|██████████| 19/19 [00:00<00:00, 523.54file/s]



Starting parallel validation...


Validating Files: 100%|██████████| 2460/2460 [09:02<00:00,  4.53file/s]



Removing 300 files to reach the required file count of 2000...


Removing Extra Files: 100%|██████████| 300/300 [00:00<00:00, 643.66file/s]



Total files processed: 2479
Total files remaining: 2000
Total files deleted due to errors: 19


Unnamed: 0_level_0,data,l_data,n_data,diff
Subdirectory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
datagov,0,0,2000,Y
enron,624,624,624,
enron_clean,478,448,400,Y
manual,0,0,50,Y
temp,0,0,0,
train_big,800,800,0,Y
val_big,100,100,0,Y
test_big,100,100,0,Y
train_manual,40,40,0,Y
val_manual,5,5,0,Y


# Combine the 3 to create All folder in 80-10-10

## Split enron_clean and datagov into 3 folders and proportion

In [11]:
import os
import shutil
from tqdm import tqdm
import random

def split_and_copy_files(source_dir, target_base_dir, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Splits files from a source directory into train, validation, and test folders
    with an 80-10-10 split and copies them to respective target directories.

    Args:
        source_dir (str): Path to the source directory containing the files.
        target_base_dir (str): Path to the base directory for all_train, all_val, and all_test.
        train_ratio (float): Ratio of files to allocate for training.
        val_ratio (float): Ratio of files to allocate for validation.
        test_ratio (float): Ratio of files to allocate for testing.
    """
    # Sanity check for directory existence
    if not os.path.exists(source_dir):
        raise ValueError(f"Source directory '{source_dir}' does not exist.")
    if not os.path.exists(target_base_dir):
        os.makedirs(target_base_dir)

    # Target directories
    train_dir = os.path.join(target_base_dir, 'all_train')
    val_dir = os.path.join(target_base_dir, 'all_val')
    test_dir = os.path.join(target_base_dir, 'all_test')

    # Create target directories if they don't exist
    for dir_path in [train_dir, val_dir, test_dir]:
        os.makedirs(dir_path, exist_ok=True)

    # Get the list of all files
    files = [f for f in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, f))]
    random.shuffle(files)  # Shuffle files to ensure randomness

    # Determine split sizes
    total_files = len(files)
    train_count = int(total_files * train_ratio)
    val_count = int(total_files * val_ratio)
    test_count = total_files - train_count - val_count

    # Split files
    train_files = files[:train_count]
    val_files = files[train_count:train_count + val_count]
    test_files = files[train_count + val_count:]

    # Copy files to respective directories
    print(f"Copying {train_count} files to {train_dir}")
    for file in tqdm(train_files, desc="Copying to all_train"):
        shutil.copy(os.path.join(source_dir, file), os.path.join(train_dir, file))

    print(f"Copying {val_count} files to {val_dir}")
    for file in tqdm(val_files, desc="Copying to all_val"):
        shutil.copy(os.path.join(source_dir, file), os.path.join(val_dir, file))

    print(f"Copying {test_count} files to {test_dir}")
    for file in tqdm(test_files, desc="Copying to all_test"):
        shutil.copy(os.path.join(source_dir, file), os.path.join(test_dir, file))

    print(f"Files successfully split and copied from '{source_dir}' to '{target_base_dir}'.")

# Paths
base_dir = "../n_data/"
source_dirs = {
    "enron_clean": os.path.join(base_dir, "enron_clean"),
    "datagov": os.path.join(base_dir, "datagov")
}
target_base_dir = os.path.join(base_dir, "")

# Split files for each source directory
for source_name, source_path in source_dirs.items():
    print(f"\nProcessing '{source_name}'...")
    split_and_copy_files(source_path, target_base_dir, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1)

# Observe a list of directories
observe_datadir(['../data/', '../l_data/', '../n_data/'])


Processing 'enron_clean'...
Copying 320 files to ../n_data/all_train


Copying to all_train: 100%|██████████| 320/320 [00:23<00:00, 13.76it/s]


Copying 40 files to ../n_data/all_val


Copying to all_val: 100%|██████████| 40/40 [00:02<00:00, 16.00it/s]


Copying 40 files to ../n_data/all_test


Copying to all_test: 100%|██████████| 40/40 [00:02<00:00, 16.65it/s]


Files successfully split and copied from '../n_data/enron_clean' to '../n_data/'.

Processing 'datagov'...
Copying 1600 files to ../n_data/all_train


Copying to all_train: 100%|██████████| 1600/1600 [01:53<00:00, 14.13it/s]


Copying 200 files to ../n_data/all_val


Copying to all_val: 100%|██████████| 200/200 [00:07<00:00, 27.96it/s]


Copying 200 files to ../n_data/all_test


Copying to all_test: 100%|██████████| 200/200 [00:10<00:00, 18.98it/s]

Files successfully split and copied from '../n_data/datagov' to '../n_data/'.





## Copy manual in proportions now

In [13]:
import os
import shutil
from tqdm import tqdm

def copy_manual_to_all(manual_base_dir, target_base_dir):
    """
    Copies files from manual_train, manual_val, and manual_test folders into
    all_train, all_val, and all_test respectively.

    Args:
        manual_base_dir (str): Base directory containing manual_* folders.
        target_base_dir (str): Base directory containing all_* folders.
    """
    folder_map = {
        "manual_train": "all_train",
        "manual_val": "all_val",
        "manual_test": "all_test"
    }

    # Loop through the folder map and copy files
    for manual_folder, target_folder in folder_map.items():
        source_dir = os.path.join(manual_base_dir, manual_folder)
        target_dir = os.path.join(target_base_dir, target_folder)

        # Check if source and target directories exist
        if not os.path.isdir(source_dir):
            print(f"Source directory '{source_dir}' does not exist. Skipping...")
            continue
        os.makedirs(target_dir, exist_ok=True)

        # Get list of files and copy with a progress bar
        files = [f for f in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, f))]
        print(f"Copying files from {manual_folder} to {target_folder}...")
        for file in tqdm(files, desc=f"Copying to {target_folder}", unit="file"):
            shutil.copy(os.path.join(source_dir, file), os.path.join(target_dir, file))

# Define base directories
manual_base_dir = "../n_data/"
target_base_dir = "../n_data/"

# Call the function to copy files
copy_manual_to_all(manual_base_dir, target_base_dir)

# Observe a list of directories
observe_datadir(['../data/', '../l_data/', '../n_data/'])

Copying files from manual_train to all_train...


Copying to all_train: 100%|██████████| 40/40 [00:31<00:00,  1.26file/s]


Copying files from manual_val to all_val...


Copying to all_val: 100%|██████████| 5/5 [00:04<00:00,  1.21file/s]


Copying files from manual_test to all_test...


Copying to all_test: 100%|██████████| 5/5 [00:04<00:00,  1.19file/s]


Unnamed: 0_level_0,data,l_data,n_data,diff
Subdirectory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
all_test,0,0,245,Y
all_train,0,0,1960,Y
all_val,0,0,245,Y
datagov,0,0,2000,Y
enron,624,624,624,
enron_clean,478,448,400,Y
manual_test,0,0,5,Y
manual_train,0,0,40,Y
manual_val,0,0,5,Y
temp,0,0,0,


# Create base clean 2k folders

In [16]:
import os
import shutil
import random
from tqdm import tqdm

def create_2k_folders(all_dir, target_base):
    """
    Create 2k_train, 2k_val, 2k_test folders by randomly copying files
    from all_train, all_val, all_test.

    Args:
        all_dir (str): Base directory containing all_train, all_val, all_test.
        target_base (str): Base directory to create 2k_train, 2k_val, 2k_test.
    """
    # Define source and target folder mappings
    splits = {
        "train": {"source": os.path.join(all_dir, "all_train"), "count": 1600},
        "val": {"source": os.path.join(all_dir, "all_val"), "count": 200},
        "test": {"source": os.path.join(all_dir, "all_test"), "count": 200},
    }

    targets = {
        "train": os.path.join(target_base, "2k_train"),
        "val": os.path.join(target_base, "2k_val"),
        "test": os.path.join(target_base, "2k_test"),
    }

    # Ensure target directories exist
    for split, target_dir in targets.items():
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
            print(f"Created: {target_dir}")

    # Randomly select and copy files
    for split, details in splits.items():
        source_dir = details["source"]
        target_dir = targets[split]
        num_files = details["count"]

        # List all files in source directory
        files = [f for f in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, f))]
        if len(files) < num_files:
            raise ValueError(f"Not enough files in {source_dir} to copy {num_files} files")

        # Randomly select files
        selected_files = random.sample(files, num_files)

        # Copy files with progress bar
        print(f"Copying {num_files} files from {source_dir} to {target_dir}")
        for file in tqdm(selected_files, desc=f"{split.capitalize()} Copy", unit="file"):
            shutil.copy(os.path.join(source_dir, file), os.path.join(target_dir, file))

    print("\nFile copying complete!")

# Example usage
all_directory = "../n_data/"  # Base directory containing all_train, all_val, all_test
output_base = "../n_data/"    # Base directory where 2k_ folders will be created

create_2k_folders(all_directory, output_base)
observe_datadir(['../data/', '../l_data/', '../n_data/'])

Created: ../n_data/2k_train
Created: ../n_data/2k_val
Created: ../n_data/2k_test
Copying 1600 files from ../n_data/all_train to ../n_data/2k_train


Train Copy: 100%|██████████| 1600/1600 [00:27<00:00, 57.14file/s] 


Copying 200 files from ../n_data/all_val to ../n_data/2k_val


Val Copy: 100%|██████████| 200/200 [00:01<00:00, 101.71file/s]


Copying 200 files from ../n_data/all_test to ../n_data/2k_test


Test Copy: 100%|██████████| 200/200 [00:02<00:00, 99.91file/s] 



File copying complete!


Unnamed: 0_level_0,data,l_data,n_data,diff
Subdirectory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2k_test,0,0,200,Y
2k_train,0,0,1600,Y
2k_val,0,0,200,Y
all_test,0,0,245,Y
all_train,0,0,1960,Y
all_val,0,0,245,Y
datagov,0,0,2000,Y
enron,624,624,624,
enron_clean,478,448,400,Y
manual_test,0,0,5,Y


# Make split folders now

In [17]:
import os
import shutil
import random
from tqdm import tqdm

def create_progressive_splits(base_dir, splits):
    """
    Create progressively smaller folder splits based on a 0.5 random sampling ratio.

    Args:
        base_dir (str): Base directory containing the initial 2k_ folders.
        splits (list): List of split sizes to create. Example: ["1k", "500", "250", "100", "50"].
    """
    # Define starting folder names and ratios
    current_split = "2k"
    split_files = {
        "1k": [800, 100, 100],
        "500": [400, 50, 50],
        "250": [200, 25, 25],
        "100": [80, 10, 10],
        "50": [40, 5, 5]
    }

    # Helper function to copy random files
    def copy_random_files(src_dir, dest_dir, num_files):
        all_files = [f for f in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, f))]
        if len(all_files) < num_files:
            raise ValueError(f"Not enough files in {src_dir} to copy {num_files}")
        selected_files = random.sample(all_files, num_files)

        for file in tqdm(selected_files, desc=f"Copying to {os.path.basename(dest_dir)}", leave=False):
            shutil.copy2(os.path.join(src_dir, file), os.path.join(dest_dir, file))

    # Loop through splits to create folders
    for split in tqdm(splits, desc="Creating Splits"):
        print(f"\nCreating {split}_ folders from {current_split}_ folders...")

        # Define source and destination folders
        src_train = os.path.join(base_dir, f"{current_split}_train")
        src_val = os.path.join(base_dir, f"{current_split}_val")
        src_test = os.path.join(base_dir, f"{current_split}_test")

        dest_train = os.path.join(base_dir, f"{split}_train")
        dest_val = os.path.join(base_dir, f"{split}_val")
        dest_test = os.path.join(base_dir, f"{split}_test")

        # Create destination folders with progress bars
        for folder in [dest_train, dest_val, dest_test]:
            os.makedirs(folder, exist_ok=True)
            tqdm.write(f"Created folder: {folder}")

        # Copy files with progress bars
        copy_random_files(src_train, dest_train, split_files[split][0])
        copy_random_files(src_val, dest_val, split_files[split][1])
        copy_random_files(src_test, dest_test, split_files[split][2])

        current_split = split  # Update for the next iteration

    print("\nProgressive splits created successfully!")

# Example usage
base_directory = "../n_data/"
splits_to_create = ["1k", "500", "250", "100", "50"]

create_progressive_splits(base_directory, splits_to_create)
observe_datadir(['../data/', '../l_data/', '../n_data/'])

Creating Splits:   0%|          | 0/5 [00:00<?, ?it/s]


Creating 1k_ folders from 2k_ folders...
Created folder: ../n_data/1k_train
Created folder: ../n_data/1k_val
Created folder: ../n_data/1k_test



Copying to 1k_train:   0%|          | 0/800 [00:00<?, ?it/s][A
Copying to 1k_train:   1%|          | 8/800 [00:00<00:10, 76.16it/s][A
Copying to 1k_train:   2%|▏         | 17/800 [00:00<00:20, 38.34it/s][A
Copying to 1k_train:   3%|▎         | 22/800 [00:10<09:09,  1.41it/s][A
Copying to 1k_train:   4%|▍         | 33/800 [00:11<04:30,  2.84it/s][A
Copying to 1k_train:   6%|▌         | 44/800 [00:11<02:37,  4.80it/s][A
Copying to 1k_train:   7%|▋         | 56/800 [00:11<01:36,  7.73it/s][A
Copying to 1k_train:   8%|▊         | 66/800 [00:11<01:07, 10.90it/s][A
Copying to 1k_train:  10%|▉         | 76/800 [00:11<00:48, 15.01it/s][A
Copying to 1k_train:  11%|█         | 86/800 [00:11<00:35, 20.31it/s][A
Copying to 1k_train:  12%|█▏        | 98/800 [00:11<00:24, 28.42it/s][A
Copying to 1k_train:  14%|█▎        | 108/800 [00:11<00:19, 35.88it/s][A
Copying to 1k_train:  15%|█▌        | 120/800 [00:11<00:14, 46.73it/s][A
Copying to 1k_train:  16%|█▋        | 131/800 [00:11<00:11


Creating 500_ folders from 1k_ folders...
Created folder: ../n_data/500_train
Created folder: ../n_data/500_val
Created folder: ../n_data/500_test



Copying to 500_train:   0%|          | 0/400 [00:00<?, ?it/s][A
Copying to 500_train:   2%|▏         | 9/400 [00:00<00:20, 19.50it/s][A
Copying to 500_train:   3%|▎         | 11/400 [00:03<02:55,  2.21it/s][A
Copying to 500_train:   5%|▍         | 19/400 [00:03<01:15,  5.06it/s][A
Copying to 500_train:   7%|▋         | 29/400 [00:04<00:38,  9.74it/s][A
Copying to 500_train:  10%|█         | 41/400 [00:04<00:21, 16.97it/s][A
Copying to 500_train:  13%|█▎        | 53/400 [00:04<00:13, 25.78it/s][A
Copying to 500_train:  16%|█▌        | 64/400 [00:04<00:09, 34.65it/s][A
Copying to 500_train:  19%|█▉        | 76/400 [00:04<00:07, 45.87it/s][A
Copying to 500_train:  22%|██▏       | 87/400 [00:04<00:05, 56.00it/s][A
Copying to 500_train:  25%|██▍       | 99/400 [00:04<00:04, 67.62it/s][A
Copying to 500_train:  28%|██▊       | 110/400 [00:04<00:03, 72.82it/s][A
Copying to 500_train:  30%|███       | 121/400 [00:04<00:03, 79.18it/s][A
Copying to 500_train:  33%|███▎      | 132/40


Creating 250_ folders from 500_ folders...
Created folder: ../n_data/250_train
Created folder: ../n_data/250_val
Created folder: ../n_data/250_test



Copying to 250_train:   0%|          | 0/200 [00:00<?, ?it/s][A
Copying to 250_train:   2%|▎         | 5/200 [00:00<00:19,  9.93it/s][A
Copying to 250_train:   3%|▎         | 6/200 [00:00<00:36,  5.36it/s][A
Copying to 250_train:   9%|▉         | 18/200 [00:01<00:08, 22.05it/s][A
Copying to 250_train:  14%|█▍        | 28/200 [00:01<00:04, 35.37it/s][A
Copying to 250_train:  20%|█▉        | 39/200 [00:01<00:03, 49.96it/s][A
Copying to 250_train:  26%|██▌       | 51/200 [00:01<00:02, 64.75it/s][A
Copying to 250_train:  32%|███▏      | 63/200 [00:01<00:01, 76.83it/s][A
Copying to 250_train:  38%|███▊      | 75/200 [00:01<00:01, 87.35it/s][A
Copying to 250_train:  44%|████▎     | 87/200 [00:01<00:01, 94.95it/s][A
Copying to 250_train:  50%|████▉     | 99/200 [00:01<00:01, 100.01it/s][A
Copying to 250_train:  56%|█████▌    | 111/200 [00:01<00:00, 101.15it/s][A
Copying to 250_train:  61%|██████    | 122/200 [00:02<00:00, 102.91it/s][A
Copying to 250_train:  67%|██████▋   | 134/


Creating 100_ folders from 250_ folders...
Created folder: ../n_data/100_train
Created folder: ../n_data/100_val
Created folder: ../n_data/100_test



Copying to 100_train:   0%|          | 0/80 [00:00<?, ?it/s][A
Copying to 100_train:   5%|▌         | 4/80 [00:00<00:06, 12.61it/s][A
Copying to 100_train:  18%|█▊        | 14/80 [00:00<00:01, 39.59it/s][A
Copying to 100_train:  32%|███▎      | 26/80 [00:00<00:00, 63.03it/s][A
Copying to 100_train:  45%|████▌     | 36/80 [00:00<00:00, 73.71it/s][A
Copying to 100_train:  59%|█████▉    | 47/80 [00:00<00:00, 82.71it/s][A
Copying to 100_train:  72%|███████▎  | 58/80 [00:00<00:00, 90.26it/s][A
Copying to 100_train:  88%|████████▊ | 70/80 [00:00<00:00, 91.96it/s][A
                                                                     [A
Copying to 100_val:   0%|          | 0/10 [00:00<?, ?it/s][A
Copying to 100_val:  90%|█████████ | 9/10 [00:00<00:00, 84.37it/s][A
                                                                  [A
Copying to 100_test:   0%|          | 0/10 [00:00<?, ?it/s][A
Copying to 100_test:  90%|█████████ | 9/10 [00:00<00:00, 89.62it/s][A
Creating Splits:


Creating 50_ folders from 100_ folders...
Created folder: ../n_data/50_train
Created folder: ../n_data/50_val
Created folder: ../n_data/50_test



Copying to 50_train:   0%|          | 0/40 [00:00<?, ?it/s][A
Copying to 50_train:  28%|██▊       | 11/40 [00:00<00:00, 104.37it/s][A
Copying to 50_train:  55%|█████▌    | 22/40 [00:00<00:00, 107.59it/s][A
Copying to 50_train:  82%|████████▎ | 33/40 [00:00<00:00, 105.14it/s][A
                                                                     [A
Copying to 50_val:   0%|          | 0/5 [00:00<?, ?it/s][A
                                                        [A
Copying to 50_test:   0%|          | 0/5 [00:00<?, ?it/s][A
Creating Splits: 100%|██████████| 5/5 [00:35<00:00,  7.09s/it]



Progressive splits created successfully!


Unnamed: 0_level_0,data,l_data,n_data,diff
Subdirectory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100_test,0,0,10,Y
100_train,0,0,80,Y
100_val,0,0,10,Y
1k_test,0,0,100,Y
1k_train,0,0,800,Y
1k_val,0,0,100,Y
250_test,0,0,25,Y
250_train,0,0,200,Y
250_val,0,0,25,Y
2k_test,0,0,200,Y


In [26]:
# Observe renamed directory now
observe_datadir(['../data/'])

Unnamed: 0_level_0,data,diff
Subdirectory,Unnamed: 1_level_1,Unnamed: 2_level_1
0_datagov,2000,
0_enron,624,
0_enron_clean,400,
100_test,10,
100_train,80,
100_val,10,
1k_test,100,
1k_train,800,
1k_val,100,
250_test,25,
