# Dataset Preparation (WhatsApp Review)
---
### Operations:
- Step 1: Load Data
- Step 2: Filter Rows based on Column value
- Step 2.1: Delete columns (optional)
- Step 3: Split into 3 CSV files based on size (15 MB, 2 MB, 25 KB)
- Step 3.1: write any remaining data to CSV

In [7]:
import pandas as pd
import csv
import os

In [8]:
# --- Configuration ---
# Every tunable value for the notebook lives in this cell.

# Input CSV and the row filter applied in Step 2.
SOURCE_FILE = './raw/whatsapp_reviews_multilingual.csv'
FILTER_COLUMN = 'language'
FILTER_VALUE = 'id'

# NOTE(review): not referenced by any of the cells visible here; kept as-is.
output_file = './ready-to-process/whatsapp-review/whatsapp_reviews_multilingual.csv_output.csv'

# Columns removed by the optional Step 2.1.
COLUMNS_TO_DELETE = ['score', 'language', 'country']

# Output files, filled in this order, with the size (in bytes) each one
# should reach before the next file is started.
TARGETS = [
    dict(filename='data_part_1_15MB.csv', size_bytes=15 * 1024 * 1024),  # 15 MB
    dict(filename='data_part_2_2MB.csv', size_bytes=2 * 1024 * 1024),    # 2 MB
    dict(filename='data_part_3_25KB.csv', size_bytes=25 * 1024),         # 25 KB
]

# Directory that receives every output file, and the name of the file that
# collects whatever rows are left once all targets are full.
OUTPUT_FOLDER = 'ready-to-process/whatsapp-review'
LEFTOVER_FILE = 'data_part_4_remaining.csv'

In [9]:
# --- Step 1: Load from CSV ---
# Reads SOURCE_FILE into `df`. A missing file is reported and then re-raised so
# that "Run All" stops at this cell; `exit()` is an interactive-shell helper and
# is not a reliable way to abort a notebook run.
print(f"Loading data from {SOURCE_FILE}...")
try:
    df = pd.read_csv(SOURCE_FILE)
except FileNotFoundError:
    print(f"Error: The file '{SOURCE_FILE}' was not found.")
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist here.
    print(f"Data loaded. Original shape: {df.shape}")

Loading data from ./raw/whatsapp_reviews_multilingual.csv...
Data loaded. Original shape: (600478, 4)


In [10]:
# --- Step 2: Filter rows based on a column value ---
# Builds `filtered_df` from the rows of `df` whose FILTER_COLUMN equals FILTER_VALUE.

print(f"Filtering based on '{FILTER_COLUMN} = {FILTER_VALUE}' rule")
try:
    filtered_df = df[df[FILTER_COLUMN] == FILTER_VALUE]
except KeyError:
    print(f"Error: The column '{FILTER_COLUMN}' was not found in the file.")
    # Re-raise: swallowing the error would leave `filtered_df` undefined (or
    # stale from an earlier run), and the cells below would then fail later
    # or silently split the wrong data.
    raise

Filtering based on 'language = id' rule


In [11]:
# --- Step 2.1: Delete Columns (optional) ---
# This step is disabled: everything below is commented out.
# NOTE(review): Step 3 builds its header and records from `filtered_df`, not
# `df_cleaned`, so uncommenting this block on its own does not change the files
# that get written; Step 3 would also have to read `df_cleaned`.
# NOTE(review): with errors='ignore', drop() is expected to skip labels that do
# not exist, so the KeyError handler below should never fire -- confirm against
# the pandas documentation before relying on it.
# print(f"Deleting columns: {', '.join(COLUMNS_TO_DELETE)}...")
# try:
#     df_cleaned = filtered_df.drop(columns=COLUMNS_TO_DELETE, errors='ignore')
#     print(f"Cleaned shape: {df_cleaned.shape}")
# except KeyError:
#     print(f"Error: The column(s) '{COLUMNS_TO_DELETE}' was not found in the file.")

In [12]:
# --- Step 3: Split Data by Checking File Size ---
# Fills each file listed in TARGETS, in order, with consecutive rows of
# `filtered_df`. A file is closed as soon as its size is >= its target, so each
# finished file overshoots the target by at most one row. Rows that are not
# consumed here stay in `record_iterator` for Step 3.1.


class _ByteCountingTextWriter:
    """Forward text to an open text stream while tallying its encoded size.

    The csv module only needs an object with a ``write()`` method. Counting the
    encoded length of each chunk gives the number of bytes that will end up on
    disk (the file is opened with ``newline=''``, so no newline translation
    happens) without calling ``tell()`` on a text-mode file, whose result is
    documented as an opaque number and which flushes the buffer on every call.

    Args:
        stream: Text-mode file object that receives the data.
        encoding: Encoding used to measure each chunk; must match the encoding
            the stream was opened with.

    Attributes:
        bytes_written: Running total of encoded bytes passed to ``write()``.
    """

    def __init__(self, stream, encoding='utf-8'):
        self._stream = stream
        self._encoding = encoding
        self.bytes_written = 0

    def write(self, text):
        """Write ``text`` to the wrapped stream and add its byte length to the tally."""
        self.bytes_written += len(text.encode(self._encoding))
        return self._stream.write(text)


# Create the output directory
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Get the header (column names) for writing to the CSVs
header = list(filtered_df.columns)

# One dict per row, keyed by column name, as csv.DictWriter expects.
records = filtered_df.to_dict('records')

# A single shared iterator: each target file continues where the previous one
# stopped, and Step 3.1 drains whatever is left.
record_iterator = iter(records)

# Set to True when the rows run out before every target has been filled.
data_is_exhausted = False

for target in TARGETS:
    filename = os.path.join(OUTPUT_FOLDER, target['filename'])
    target_size = target['size_bytes']

    print(f"Writing to '{filename}' until it reaches {target_size / 1024 / 1024:.2f} MB...")

    current_size = 0

    try:
        # newline='' lets the csv module control line endings itself.
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            counter = _ByteCountingTextWriter(f, encoding='utf-8')
            writer = csv.DictWriter(counter, fieldnames=header)

            # The header counts towards the file size.
            writer.writeheader()
            current_size = counter.bytes_written

            # Keep writing rows until we hit the target size
            while current_size < target_size:
                # Raises StopIteration once every row has been written.
                record = next(record_iterator)
                writer.writerow(record)
                current_size = counter.bytes_written

    except StopIteration:
        # Ran out of rows: the current file is left partially filled and the
        # remaining targets are not created.
        print(f"Warning: Ran out of data while writing to {filename}.")
        data_is_exhausted = True
        break # Exit the main 'for' loop

    print(f"Finished '{filename}'. Final size: {current_size} bytes.")

Writing to 'ready-to-process/whatsapp-review/data_part_1_15MB.csv' until it reaches 15.00 MB...
Finished 'ready-to-process/whatsapp-review/data_part_1_15MB.csv'. Final size: 15728763 bytes.
Writing to 'ready-to-process/whatsapp-review/data_part_2_2MB.csv' until it reaches 2.00 MB...
Finished 'ready-to-process/whatsapp-review/data_part_2_2MB.csv'. Final size: 2097167 bytes.
Writing to 'ready-to-process/whatsapp-review/data_part_3_25KB.csv' until it reaches 0.02 MB...
Finished 'ready-to-process/whatsapp-review/data_part_3_25KB.csv'. Final size: 25708 bytes.


In [13]:
# --- Step 3.1: Save Any Leftover Data ---
# Only relevant when Step 3 filled every target without running out of rows;
# otherwise there is nothing left in the iterator to save.
if not data_is_exhausted:
    print("Saving remaining data...")

    # Drain every row that Step 3 did not consume.
    remaining_records = [row for row in record_iterator]

    if not remaining_records:
        print("No leftover data to save.")
    else:
        leftover_path = os.path.join(OUTPUT_FOLDER, LEFTOVER_FILE)

        # Same CSV settings as the target files: header first, then all rows.
        with open(leftover_path, mode='w', newline='', encoding='utf-8') as out_file:
            writer = csv.DictWriter(out_file, fieldnames=header)
            writer.writeheader()
            writer.writerows(remaining_records)

        print(f"Saved {len(remaining_records)} leftover rows to '{leftover_path}'.")

print("Data splitting complete!")

Saving remaining data...
Saved 18879 leftover rows to 'ready-to-process/whatsapp-review/data_part_4_remaining.csv'.
Data splitting complete!
