# Create Bot-IoT Dataset Subset

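This notebook creates a subset of about 300,000 rows from the Bot-IoT dataset by drawing an equal-sized random sample from each of the four `Full5pc` CSV files. The subset is balanced across source files only; no stratification by class label is performed.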

## Import Required Libraries

Import pandas and other necessary libraries for data manipulation and file I/O operations.

In [None]:
import pandas as pd
import os

## Define Configuration Variables

Set up configuration variables for the subset creation process.

In [None]:
# Configuration
TARGET_N = 300_000  # Target number of rows for the subset
RANDOM_STATE = 42  # Seed handed to DataFrame.sample, for reproducibility

# Data directory. Defaults to the original author's local checkout; set the
# BOT_IOT_DATA_DIR environment variable to point the notebook at a different
# location without editing this cell.
DATA_DIR = os.environ.get(
    "BOT_IOT_DATA_DIR",
    "/Users/nawara/Desktop/LLM-Clustering-Paper/Bot-IoT-Dataset",
)

# Path of the single merged CSV (kept for reference; the sampling cell reads FILES).
MERGED_FILE = os.path.join(DATA_DIR, "UNSW_2018_IoT_Botnet_Full_Merged.csv")

# The four individual "Full5pc" parts that the sampling cell iterates over.
FILES = [
    os.path.join(DATA_DIR, f"UNSW_2018_IoT_Botnet_Full5pc_{part}.csv")
    for part in range(1, 5)
]

print(f"Target subset size: {TARGET_N:,} rows")
print(f"Random state: {RANDOM_STATE}")

## Read and Sample Data from Multiple Files

Iterate through each CSV file, read it, and randomly sample approximately equal numbers of rows from each file. A file with fewer rows than the per-file quota is kept in full.

In [None]:
# Calculate rows to sample from each file.
# divmod keeps the remainder so the per-file quotas always add up to TARGET_N,
# even when TARGET_N is not a multiple of the number of files: the first
# `extra_rows` files each take one additional row.
if not FILES:
    raise ValueError("FILES is empty: there is nothing to sample from")

per_file, extra_rows = divmod(TARGET_N, len(FILES))
print(f"Sampling approximately {per_file:,} rows from each of {len(FILES)} files\n")

subsets = []
for file_idx, fp in enumerate(FILES):
    quota = per_file + (1 if file_idx < extra_rows else 0)
    print(f"Reading sample from {fp} ...")

    # Read CSV with low_memory=False to avoid mixed type warnings
    df_full = pd.read_csv(fp, low_memory=False)
    print(f"  Full file shape: {df_full.shape}")

    # Sample rows; a file no larger than its quota is kept whole.
    if len(df_full) > quota:
        # NOTE(review): every file is sampled with the same RANDOM_STATE;
        # confirm that reusing one seed across files is intended.
        df = df_full.sample(n=quota, random_state=RANDOM_STATE)
        print(f"  Sampled shape: {df.shape}")
    else:
        df = df_full
        print(f"  File has {len(df):,} rows (less than target {quota:,}), using all rows")

    subsets.append(df)

print(f"\nCollected {len(subsets)} subsets")

# Surface a shortfall instead of letting a smaller-than-advertised subset
# pass silently (this happens when some file has fewer rows than its quota).
total_rows = sum(len(part) for part in subsets)
if total_rows != TARGET_N:
    print(f"WARNING: collected {total_rows:,} rows, short of the {TARGET_N:,} target")

## Combine Subsets into Single DataFrame

Use pd.concat() to combine all sampled subsets into a single DataFrame.

In [None]:
# Stack the per-file samples row-wise. ignore_index=True discards the row
# labels carried over from each source file and renumbers the result from 0.
df_sub = pd.concat(objs=subsets, axis=0, ignore_index=True)
print("Combined subset created successfully!")

## Inspect Subset Data

Display the shape and columns of the combined subset to verify data integrity.

In [None]:
# Summarise the combined frame: overall shape, then row and column counts,
# then the full list of column names.
row_count = len(df_sub)
column_names = df_sub.columns.tolist()

print("Subset shape:", df_sub.shape)
print(f"\nTotal rows: {row_count:,}")
print(f"Total columns: {len(column_names)}")
print("\nColumn names:")
print(column_names)

## Save Subset to CSV

Save the combined subset DataFrame to a CSV file for further analysis.

In [None]:
# Save subset next to the source data; index=False keeps the DataFrame's
# row index out of the CSV.
out_csv = os.path.join(DATA_DIR, "bot_iot_5pc_subset_300k.csv")
df_sub.to_csv(out_csv, index=False)

# The check mark is written as a \u escape so it cannot be corrupted by an
# encoding round-trip of the notebook source.
print(f"\u2713 Subset saved to: {out_csv}")

# Report the on-disk size in mebibytes (bytes / 1024**2).
print(f"  File size: {os.path.getsize(out_csv) / (1024**2):.2f} MB")