# Shuffle the data

1) Load all CSV files from each dataset (warranted and unwarranted) into its own DataFrame.
2) Shuffle each DataFrame individually.
3) Calculate the number of unwarranted rows to distribute per new CSV.
4) Create new CSV files by iteratively taking slices from the shuffled DataFrames.
5) Ensure that each new CSV has a proportional amount of warranted and unwarranted data.
6) Save the new CSV files.

In [2]:
import os
import pandas as pd
from sklearn.utils import shuffle


# Common parent folder; the input folders and the output location all live under it.
_data_root = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\'

# Folders the warranted / unwarranted CSV files are read from
warranted_dir = _data_root + 'warranted_pre_shuffle\\'
unwarranted_dir = _data_root + 'unwarranted_pre_shuffle\\'

# Folder that receives the generated CSV files (the parent folder itself)
output_dir = _data_root

# Function to load and shuffle CSVs from a directory into a single DataFrame
def load_and_shuffle_csvs(directory, random_state=None):
    """Read every ``.csv`` file in ``directory`` and return all rows in random order.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive). Only entries whose name ends in
        ``.csv`` are read.
    random_state : int, optional
        Seed passed to ``DataFrame.sample``. Leave as ``None`` for a different
        order on every call; pass an integer for a reproducible shuffle.

    Returns
    -------
    pandas.DataFrame
        The concatenated rows of all CSV files, shuffled. The index keeps the
        pre-shuffle row positions (in permuted order), so callers that need a
        clean 0..n-1 index should reset it.

    Raises
    ------
    ValueError
        If ``directory`` contains no ``.csv`` files.
    """
    # os.listdir returns entries in arbitrary order; sorting fixes the
    # concatenation order so a seeded shuffle gives the same result every run.
    csv_paths = sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith('.csv')
    )
    if not csv_paths:
        # pd.concat([]) would raise a ValueError that never mentions the folder.
        raise ValueError(f'No CSV files found in "{directory}"')

    combined_df = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

    # sample(frac=1) draws every row exactly once, i.e. a full row permutation.
    return combined_df.sample(frac=1, random_state=random_state)

# Load and shuffle warranted and unwarranted datasets
warranted_df = load_and_shuffle_csvs(warranted_dir)
unwarranted_df = load_and_shuffle_csvs(unwarranted_dir)

# Number of output files and the number of rows each dataset contributes to
# every file. Each dataset is divided by its OWN length, so every file keeps
# the overall warranted:unwarranted ratio. Slicing the warranted frame with the
# unwarranted count (as before) truncated or exhausted the warranted rows.
# Floor division drops up to (num_output_csvs - 1) leftover rows per dataset.
num_output_csvs = 1
warranted_per_csv = len(warranted_df) // num_output_csvs
unwarranted_per_csv = len(unwarranted_df) // num_output_csvs

# Create and save new CSV files with mixed data
for i in range(num_output_csvs):
    # Consecutive, non-overlapping positional slices of the already-shuffled
    # frames. (i + 1) * per_csv can never exceed the frame length, so no
    # wrap-around handling is needed.
    warranted_slice = warranted_df.iloc[i * warranted_per_csv:(i + 1) * warranted_per_csv]
    unwarranted_slice = unwarranted_df.iloc[i * unwarranted_per_csv:(i + 1) * unwarranted_per_csv]

    # Combine the slices and shuffle so the two datasets are interleaved
    mixed_df = shuffle(pd.concat([warranted_slice, unwarranted_slice], ignore_index=True))

    # Save to a new CSV file
    output_file = os.path.join(output_dir, f'mixed_data_{i+1}.csv')
    mixed_df.to_csv(output_file, index=False)
    print(f'Saved shuffled and mixed data to "{output_file}"')


# Split into train and test

In [10]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Directory containing the data
# NOTE(review): `dir` shadows the built-in dir(); the name is kept so any other
# cell that refers to it keeps working.
dir = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\'

# Percentage of data to be used for testing (e.g., 0.2 for 20%)
test_size = 0.2

# Fixed seed so re-running this cell reproduces the same split
random_state = 42

# Prefixes this cell puts on the files it writes
split_prefixes = ('train_', 'test_')

# Iterate over each CSV file in the directory
for filename in sorted(os.listdir(dir)):
    # The splits are written into the same folder that is being scanned, so
    # skip files produced by an earlier run; otherwise a re-run would split
    # the splits again (train_train_..., test_train_..., ...).
    if not filename.endswith('.csv') or filename.startswith(split_prefixes):
        continue

    file_path = os.path.join(dir, filename)
    df = pd.read_csv(file_path)

    # Split the data into training and testing sets
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    # Save the subsets to new CSV files
    train_df.to_csv(os.path.join(dir, f'train_{filename}'), index=False)
    test_df.to_csv(os.path.join(dir, f'test_{filename}'), index=False)

    print(f"Data from {filename} split into training and test sets and saved.")


Data from mixed_data_1.csv split into training and test sets and saved.


In [3]:
import pandas as pd

# Path of the test split to inspect.
# NOTE(review): this points into a `test\` subfolder, while the split cell
# writes test_*.csv directly into train_test_data — confirm the file was moved.
csv = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\train_test_data\\test\\test_mixed_data_1.csv'

# Read that file into memory as a DataFrame
df = pd.read_csv(filepath_or_buffer=csv)

In [4]:
# Optional diagnostics for the 'body' and 'label' columns of df, left disabled.
# Uncomment individual lines to inspect the loaded data.

# print("\nFirst 5 rows of 'body' column:")
# print(df['body'].head())

# print("\nData type of 'body' column:")
# print(df['body'].dtype)

# print("\nCheck for NaN values in 'body' column:")
# print(df['body'].isna().sum())

# print("\nSample values from 'body' column:")
# print(df['body'].sample(5))

# print("\nFirst 5 rows of 'label' column:")
# print(df['label'].head())

# print("\nData type of 'label' column:")
# print(df['label'].dtype)

# print("\nCheck for NaN values in 'label' column:")
# print(df['label'].isna().sum())

# print("\nSample values from 'label' column:")
# print(df['label'].sample(50))

# Number of rows in the loaded DataFrame
print(len(df))





57382


Unnamed: 0,filename,body,subject,comments,text_plain,text_html,text_not_managed,defects,defects_categories,number of unsubscribe links,...,email size (bytes),dkim-signature,label,UNSAFE_LINK_URL_count,UNSAFE_IMAGE_URL_count,UNSAFE_BUTTON_URL_count,LINK_URL_count,IMAGE_URL_count,BUTTON_URL_count,unsafe_to_safe_link_ratio
0,16835595.270957@public.govdelivery.com.txt,The safety and effectiveness of surgical mesh ...,FDA MedWatch - Labeling updates for BD mesh pr...,[if (gte mso 9)|(IE)]> <![endif] [if (gte mso ...,0,0,1,[{'multipart/alternative': ['StartBoundaryNotF...,"{'MultipartInvariantViolationDefect', 'StartBo...",1,...,27889,Present,0,0,0,0,17,9,5,0.0
1,1594384034.494026.144896.txt,bruce：您好！ 新《劳动合同法》《工伤保险条例》实操应对技巧与有效调岗调薪、裁员解雇、 ...,周玥彤 采用劳务派遣用工方式能否异地参保,No Comments Found in email.,1,1,0,[],set(),0,...,27479,Absent,1,1,0,0,3,0,0,1.0
2,k0udv08g2ei2un32lvblh8388577@convertkit-mail2....,"Hello Honey, Those ""mud pits"" in your yard may...",Put an end to muddy paths in your yard! pig_face,[if mso]> <![endif] [if mso]> <![endif],0,0,1,[{'multipart/alternative': ['StartBoundaryNotF...,"{'MultipartInvariantViolationDefect', 'StartBo...",2,...,8116,Present,0,0,0,0,8,4,2,0.0
3,Glx8nlcKR5iLrQZ287-TbQ@geopod-ismtpd-28.txt,"August 6, 2023 [View in Your Browser]( LINK_UR...",A cruel end for USWNT,[if !mso]><! <![endif] [if mso]> 96 <![endif] ...,0,0,1,[{'multipart/alternative': ['StartBoundaryNotF...,"{'MultipartInvariantViolationDefect', 'StartBo...",1,...,74027,Present,0,3,0,0,18,17,0,0.051724
4,0100018a8951394e-167ca078-1fe8-48e3-901e-71927...,Save now with 2 new deals for notebook. IMAGE_...,Save now with 2 new deals for notebook.,[if mso]> <![endif] [if mso]> <![endif],0,1,0,[],set(),2,...,20327,Present,0,0,0,0,8,7,0,0.0


In [39]:
# Write the DataFrame to the path held in `csv`, without the index column.
# NOTE(review): `csv` is the same path `df` was read from earlier in this
# notebook, so this overwrites that file in place rather than creating a new one.
df.to_csv(csv, index=False)