In [1]:
import pandas as pd
from pathlib import Path

class CSVSplitter:
    """
    Split a large CSV file into smaller chunk files, repeating the header in each.

    Chunk files are written next to the input file and are named
    ``<stem>_part<N><suffix>``, with ``N`` starting at 1.
    """

    def __init__(self, input_file: str, chunk_size: int = 10000):
        """
        Initialize the CSV splitter.

        Args:
            input_file (str): Path to the input CSV file
            chunk_size (int): Number of rows per output file (excluding header)

        Raises:
            FileNotFoundError: If the input file does not exist
            ValueError: If chunk_size is smaller than 1
        """
        self.input_file = Path(input_file)
        self.chunk_size = chunk_size

        if not self.input_file.exists():
            raise FileNotFoundError(f"Input file not found: {self.input_file}")

        # Fail fast here instead of letting pandas reject the value later,
        # in the middle of split().
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    def split(self) -> list[Path]:
        """
        Split the CSV file into chunks.

        Every cell is read as plain text with NA detection disabled, so
        values are written back as they appeared in the input (leading zeros
        are kept, integers are not turned into floats, and literal "NA" or
        empty cells are preserved).

        Returns:
            list[Path]: List of paths to the generated chunk files
        """
        chunk_paths: list[Path] = []
        stem = self.input_file.stem
        suffix = self.input_file.suffix

        # dtype=str plus disabled NA filtering prevents per-chunk type
        # inference from altering the data while it is merely being copied.
        reader = pd.read_csv(
            self.input_file,
            chunksize=self.chunk_size,
            dtype=str,
            keep_default_na=False,
            na_filter=False,
        )
        # The context manager closes the input file even if a write fails.
        with reader:
            for part_number, chunk in enumerate(reader, start=1):
                output_path = self.input_file.with_name(
                    f"{stem}_part{part_number}{suffix}"
                )
                # to_csv emits the header row by default; index=False keeps
                # the pandas row index out of the output.
                chunk.to_csv(output_path, index=False)
                chunk_paths.append(output_path)

        return chunk_paths

# Example usage: split the channel-URL export into files of at most 50,000
# data rows each, then list the files that were written.
splitter = CSVSplitter(input_file="youtube_channel_urls_web.csv", chunk_size=50000)
output_files = splitter.split()
print(f"Split into {len(output_files)} files:")
for part_path in output_files:
    print(f"- {part_path}")



Split into 4 files:
- youtube_channel_urls_web_part1.csv
- youtube_channel_urls_web_part2.csv
- youtube_channel_urls_web_part3.csv
- youtube_channel_urls_web_part4.csv
