## This script is used to clean the database. Do not run if you don't know what you are doing.

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Combine all the CSV files in a folder into a single new file. The files must share the same structure and columns.
def combine_csv_files(folder_path: str, output_file: str) -> None:
    """
    Combine all CSV files in a folder into a single CSV file.
    Files must have the same structure and columns.

    Input files are read in sorted path order so the row order of the
    result is reproducible. If the output file already exists inside the
    folder it is skipped, so re-running the function does not feed the
    previous result back in and duplicate rows.

    Args:
        folder_path (str): Path to folder containing CSV files
        output_file (str): Path for the combined output CSV file

    Raises:
        FileNotFoundError: If the folder contains no CSV files to combine
    """
    folder = Path(folder_path)
    output_path = Path(output_file).resolve()

    # glob() yields entries in arbitrary order, so sort for determinism,
    # and never treat a previous output as an input.
    csv_files = sorted(
        candidate
        for candidate in folder.glob('*.csv')
        if candidate.resolve() != output_path
    )

    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder}")

    # Read every file, then stack them with a fresh 0..n-1 index
    frames = [pd.read_csv(csv_path) for csv_path in csv_files]
    combined_df = pd.concat(frames, ignore_index=True)

    # Save combined dataframe
    combined_df.to_csv(output_file, index=False)
    print(f"Combined {len(csv_files)} files into {output_file}")

# Example usage
# combine_csv_files("path/to/csv/folder", "path/to/output.csv")



In [3]:
# Merge every CSV in the staging folder; "output.csv" is a relative path,
# so the result lands in the current working directory.
combine_csv_files(
    folder_path="/Users/yuanlu/Code/youtube-top-10000-channels/data/wait_to_combine",
    output_file="output.csv",
)

Combined 3 files into output.csv


In [34]:


class CSVSplitter:
    """
    A class to split large CSV files into smaller chunks while preserving headers.

    Chunk files are written next to the input file and named
    ``<stem>_part<N><suffix>``, with N starting at 1.
    """

    def __init__(self, input_file: str, chunk_size: int = 10000):
        """
        Initialize the CSV splitter.

        Args:
            input_file (str): Path to the input CSV file
            chunk_size (int): Number of rows per output file (excluding header)

        Raises:
            FileNotFoundError: If the input file does not exist
            ValueError: If chunk_size is None or smaller than 1
        """
        self.input_file = Path(input_file)
        self.chunk_size = chunk_size

        if not self.input_file.exists():
            raise FileNotFoundError(f"Input file not found: {self.input_file}")

        # Fail fast: with chunk_size=None pandas would return a whole
        # DataFrame instead of a chunk iterator, and values below 1 are
        # rejected by pandas only once split() is called.
        if chunk_size is None or chunk_size < 1:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size!r}"
            )

    def split(self) -> list[Path]:
        """
        Split the CSV file into chunks.

        Returns:
            list[Path]: List of paths to the generated chunk files
        """
        chunk_paths = []
        stem = self.input_file.stem
        suffix = self.input_file.suffix
        target_dir = self.input_file.parent

        # The context manager closes the underlying file handle even when
        # writing one of the chunks raises part-way through.
        with pd.read_csv(self.input_file, chunksize=self.chunk_size) as reader:
            for part_number, chunk in enumerate(reader, start=1):
                # Generate output filename
                output_path = target_dir / f"{stem}_part{part_number}{suffix}"

                # Save chunk with header
                chunk.to_csv(output_path, index=False)
                chunk_paths.append(output_path)

        return chunk_paths

# Example usage: break the pending CSV into parts of at most 14000 data rows
# each, then list the files that were written.
splitter = CSVSplitter(
    input_file="/Users/yuanlu/Code/youtube-top-10000-channels/data/wait_to_combine/handle_empty_wait_to_check.csv",
    chunk_size=14000,
)
output_files = splitter.split()

print(f"Split into {len(output_files)} files:")
for f in output_files:
    print(f"- {f}")



Split into 2 files:
- /Users/yuanlu/Code/youtube-top-10000-channels/data/wait_to_combine/handle_empty_wait_to_check_part1.csv
- /Users/yuanlu/Code/youtube-top-10000-channels/data/wait_to_combine/handle_empty_wait_to_check_part2.csv
