# CSV to ClassLabel to Apache Arrow Splits

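This notebook loads a CSV file with `load_dataset()`, converts the `sentiment` column to a `ClassLabel` with `.cast()`, partitions the data into train/validation/test sets with `.train_test_split()`, and stores the result on disk with `.save_to_disk()`.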

### Prerequisites and Setup


In [1]:
import pandas as pd
from datasets import load_dataset, DatasetDict, Features, ClassLabel, load_from_disk
import os
import shutil
import matplotlib.pyplot as plt
import numpy as np

# --- Configuration ---
# Input CSV and output directory for the saved splits.
CSV_FILE_NAME = "../datasets/product_reviews.csv"
FINAL_SAVE_PATH = "../product_review_splits"

# Fixed seed so the splits are reproducible.
SEED = 40

# Fraction of the full dataset held out as the test set.
TEST_SIZE = 0.1
# Fraction of the post-test remainder used for validation:
# 10% of the original data is ~11.11% of the remaining 90%.
VALID_SIZE = 0.1111

  from .autonotebook import tqdm as notebook_tqdm


### 1. Load CSV and Cast to ClassLabel

We load the CSV and immediately define and apply the ClassLabel feature to the sentiment column.

In [2]:
# Read the CSV. The loader returns a DatasetDict whose only split is "train";
# at this point 'sentiment' is still a plain string column.
initial_dataset_dict = load_dataset("csv", data_files=CSV_FILE_NAME)
raw_dataset = initial_dataset_dict["train"]

# Build the target schema from a copy of the loaded one, replacing only
# 'sentiment'. The order of `names` fixes the integer mapping:
# negative=0, neutral=1, positive=2.
sentiment_features = raw_dataset.features.copy()
sentiment_features["sentiment"] = ClassLabel(names=['negative', 'neutral', 'positive'])

# Cast the dataset to the new schema so 'sentiment' is stored as class ids.
processed_dataset = raw_dataset.cast(sentiment_features)

# Look up the first row's label once, then show it as an id and as text.
first_label_id = processed_dataset[0]['sentiment']
label_feature = processed_dataset.features['sentiment']

print("\n--- After Casting ---")
print(f"Dataset features: {processed_dataset.features}")
print(f"First example label (integer ID): {first_label_id}")
print(f"Decoded label: {label_feature.int2str(first_label_id)}")

Generating train split: 100 examples [00:00, 35025.50 examples/s]
Casting the dataset: 100%|██████████| 100/100 [00:00<00:00, 44892.48 examples/s]


--- After Casting ---
Dataset features: {'product': Value('string'), 'review': Value('string'), 'sentiment': ClassLabel(names=['negative', 'neutral', 'positive'])}
First example label (integer ID): 2
Decoded label: positive





### 2. Split into Train, Validation, and Test Sets

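We partition the data with a **two-step split**: first we hold out 10% of the rows as the test set, then we split the remaining 90% into train (80% of the original data) and validation (10% of the original data).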

In [3]:
# Hold out the test set first: TEST_SIZE of all rows land in "test",
# everything else stays in "train" and is split again below.
train_valid_splits = processed_dataset.train_test_split(test_size=TEST_SIZE, seed=SEED)
train_valid_dataset, test_dataset = train_valid_splits["train"], train_valid_splits["test"]

# Carve the validation set out of the remainder. VALID_SIZE is a fraction of
# that remainder (about 10% / 90%), which yields roughly 80/10/10 overall.
final_splits = train_valid_dataset.train_test_split(test_size=VALID_SIZE, seed=SEED)
# The "test" half of this second split is what we use as validation.
train_dataset, valid_dataset = final_splits["train"], final_splits["test"]

# NOTE(review): neither call passes stratify_by_column, so the label balance of
# each split is left to the shuffle; consider stratify_by_column="sentiment"
# if per-split class balance matters.

print("\n--- Split Sizes ---")
for split_name, split in (
    ("Train", train_dataset),
    ("Validation", valid_dataset),
    ("Test", test_dataset),
):
    print(f"{split_name}: {len(split)} examples")


--- Split Sizes ---
Train: 80 examples
Validation: 10 examples
Test: 10 examples


### 3. Save as Apache Arrow Format

We assemble the three splits into a `DatasetDict` and call `save_to_disk()`, which writes each split to disk in Apache Arrow format so that the whole `DatasetDict` can later be restored with `load_from_disk()`.

In [4]:
# Bundle the three splits under their split names.
splits_by_name = {
    "train": train_dataset,
    "validation": valid_dataset,
    "test": test_dataset,
}
final_dataset_dict = DatasetDict(splits_by_name)

# Persist every split under FINAL_SAVE_PATH with a single call.
final_dataset_dict.save_to_disk(FINAL_SAVE_PATH)

print(f"\nSuccessfully saved all splits in Apache Arrow format to: {FINAL_SAVE_PATH}")
print(f"Saved DatasetDict structure:\n{final_dataset_dict}")

# Optional cleanup, intentionally left disabled: uncomment to delete the CSV
# at CSV_FILE_NAME.
# os.remove(CSV_FILE_NAME)

Saving the dataset (1/1 shards): 100%|██████████| 80/80 [00:00<00:00, 29826.16 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10/10 [00:00<00:00, 4147.85 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10/10 [00:00<00:00, 4223.87 examples/s]


Successfully saved all splits in Apache Arrow format to: ../product_review_splits
Saved DatasetDict structure:
DatasetDict({
    train: Dataset({
        features: ['product', 'review', 'sentiment'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['product', 'review', 'sentiment'],
        num_rows: 10
    })
    test: Dataset({
        features: ['product', 'review', 'sentiment'],
        num_rows: 10
    })
})





### 4. Loading the Final Dataset

To demonstrate that the saved files retain the `ClassLabel` metadata, we load the data using `load_from_disk()`.

In [5]:
# Read the saved DatasetDict back from FINAL_SAVE_PATH.
loaded_splits = load_from_disk(FINAL_SAVE_PATH)
validation_split = loaded_splits['validation']

# Show that the 'sentiment' feature of the reloaded data is still a ClassLabel.
print("\n--- Verification: Reloaded Features ---")
print(f"Loaded validation set features:\n{validation_split.features['sentiment']}")
print(f"Loaded validation set size: {len(validation_split)}")
print("\n--- DatasetDict")
print(loaded_splits)

# Optional cleanup, intentionally left disabled: uncomment to delete the
# directory at FINAL_SAVE_PATH.
# shutil.rmtree(FINAL_SAVE_PATH)


--- Verification: Reloaded Features ---
Loaded validation set features:
ClassLabel(names=['negative', 'neutral', 'positive'])
Loaded validation set size: 10

--- DatasetDict
DatasetDict({
    train: Dataset({
        features: ['product', 'review', 'sentiment'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['product', 'review', 'sentiment'],
        num_rows: 10
    })
    test: Dataset({
        features: ['product', 'review', 'sentiment'],
        num_rows: 10
    })
})
