# Shuffle the data

1) Load all CSV files from each dataset (warranted and unwarranted) and combine them into one DataFrame per dataset.
2) Shuffle each DataFrame individually.
3) Calculate the number of unwarranted rows to distribute per new CSV.
4) Create new CSV files by iteratively taking slices from the shuffled DataFrames.
5) Ensure that each new CSV has a proportional amount of warranted and unwarranted data.
6) Save the new CSV files.

In [2]:
import os
import pandas as pd
from sklearn.utils import shuffle


# Directories used by this cell: the two input folders whose CSV files are
# loaded by load_and_shuffle_csvs, and the folder the mixed CSVs are written to.
# These are absolute, machine-specific Windows paths; every use goes through
# os.path.join, so the trailing separator is harmless.
warranted_dir = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\warranted_data\\'
unwarranted_dir = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\cleaned_data\\unwarranted_data\\'
output_dir = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\'

# Function to load and shuffle CSVs from a directory into a single DataFrame
def load_and_shuffle_csvs(directory, random_state=None):
    """Read every ``.csv`` file in *directory* and return the rows shuffled.

    Args:
        directory: Folder to scan (non-recursively) for files whose name
            ends with ``.csv``.
        random_state: Optional seed (or NumPy random state) forwarded to
            ``DataFrame.sample`` so the shuffle can be reproduced. ``None``
            (the default) gives a different order on every call.

    Returns:
        A single DataFrame holding the rows of all CSV files in random
        order. Rows keep the labels assigned by the concatenation
        (``0..n-1``), so the index itself appears shuffled.

    Raises:
        ValueError: If *directory* contains no ``.csv`` files.
    """
    # os.listdir returns names in arbitrary order; sorting makes the
    # concatenation (and therefore a seeded shuffle) reproducible.
    csv_paths = sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith('.csv')
    )
    if not csv_paths:
        # pd.concat would raise an opaque "No objects to concatenate".
        raise ValueError(f'No CSV files found in "{directory}"')

    combined_df = pd.concat(
        [pd.read_csv(path) for path in csv_paths],
        ignore_index=True,
    )
    # frac=1 draws every row exactly once, i.e. a full permutation.
    return combined_df.sample(frac=1, random_state=random_state)

# Load and shuffle warranted and unwarranted datasets
warranted_df = load_and_shuffle_csvs(warranted_dir)
unwarranted_df = load_and_shuffle_csvs(unwarranted_dir)

# Calculate the number of unwarranted rows to distribute per new CSV
num_output_csvs = 1
unwarranted_per_csv = len(unwarranted_df) // num_output_csvs

# Create the destination up front so to_csv does not fail on a missing folder
os.makedirs(output_dir, exist_ok=True)

# Create and save new CSV files with mixed data
for i in range(num_output_csvs):
    # Row window for this output file, expressed in unwarranted-row counts
    start_idx = i * unwarranted_per_csv
    end_idx = start_idx + unwarranted_per_csv

    # The warranted slice deliberately reuses the unwarranted window, so each
    # file gets at most as many warranted rows as unwarranted rows.
    # NOTE(review): if warranted_df is shorter than the window this slice is
    # silently truncated (or empty); if it is longer, the surplus rows are
    # never written. Confirm this is the intended balancing.
    warranted_slice = warranted_df.iloc[start_idx:end_idx]

    # end_idx is at most num_output_csvs * (len // num_output_csvs), which
    # never exceeds len(unwarranted_df), so no wrap-around is needed here.
    unwarranted_slice = unwarranted_df.iloc[start_idx:end_idx]

    # Combine the slices and shuffle so the two sources are interleaved
    mixed_df = shuffle(pd.concat([warranted_slice, unwarranted_slice], ignore_index=True))

    # Save to a new CSV file (files are numbered from 1)
    output_file = os.path.join(output_dir, f'mixed_data_{i+1}.csv')
    mixed_df.to_csv(output_file, index=False)
    print(f'Saved shuffled and mixed data to "{output_file}"')


  del sys.path[0]


Saved shuffled and mixed data to "C:\Users\ericb\Desktop\Research\542_Project\train_test_data\mixed_data_1.csv"


# Split into train and test

In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Directory containing the data
# NOTE(review): `dir` shadows the builtin of the same name; the name is kept
# because other cells may reference it.
dir = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\'

# Percentage of data to be used for testing (e.g., 0.2 for 20%)
test_size = 0.2

# Prefixes of the files this cell writes into the same directory. They are
# skipped on input so that re-running the cell does not split its own outputs
# again (which would create train_train_*.csv, test_train_*.csv, ...).
split_prefixes = ('train_', 'test_')

# Iterate over each source CSV file in the directory (sorted for a stable order)
for filename in sorted(os.listdir(dir)):
    if not filename.endswith('.csv') or filename.startswith(split_prefixes):
        continue

    file_path = os.path.join(dir, filename)
    df = pd.read_csv(file_path)

    # Split the data into training and testing sets (random, unseeded split)
    train_df, test_df = train_test_split(df, test_size=test_size)

    # Save the subsets next to the source file as train_<name> / test_<name>
    train_df.to_csv(os.path.join(dir, f'train_{filename}'), index=False)
    test_df.to_csv(os.path.join(dir, f'test_{filename}'), index=False)

    print(f"Data from {filename} split into training and test sets and saved.")


Data from mixed_data_1.csv split into training and test sets and saved.
