In [1]:
import pandas as pd
from pathlib import Path

class CSVSplitter:
    """
    Split a large CSV file into smaller chunk files, each with the header row.

    Chunk files are written next to the input file and are named
    ``<stem>_part<N><suffix>``, with N starting at 1.
    """

    def __init__(self, input_file: str, chunk_size: int = 10000):
        """
        Initialize the CSV splitter.

        Args:
            input_file (str): Path to the input CSV file
            chunk_size (int): Number of rows per output file (excluding header)

        Raises:
            FileNotFoundError: If ``input_file`` does not exist.
        """
        self.input_file = Path(input_file)
        self.chunk_size = chunk_size

        if not self.input_file.exists():
            raise FileNotFoundError(f"Input file not found: {self.input_file}")

    def split(self) -> list[Path]:
        """
        Split the CSV file into chunks.

        Every field is read as plain text with NA detection switched off, so
        values are copied through unchanged: leading zeros survive, integers
        are not turned into floats when a chunk contains blanks, and literal
        strings such as "NA" or "null" are not blanked out.

        Returns:
            list[Path]: List of paths to the generated chunk files, in order
        """
        chunk_paths = []
        source = self.input_file

        # dtype=str + na_filter=False: a splitter must not reinterpret the data.
        # With pandas' default inference each chunk is typed independently, so
        # the same column could be written differently in different parts.
        reader = pd.read_csv(
            source,
            chunksize=self.chunk_size,
            dtype=str,
            na_filter=False,
        )

        # The chunked reader holds the input file open; the context manager
        # releases it even if writing one of the parts fails.
        with reader:
            for part_number, chunk in enumerate(reader, start=1):
                # Generate output filename next to the input file
                output_path = source.parent / f"{source.stem}_part{part_number}{source.suffix}"

                # Save chunk with header, without the synthetic row index
                chunk.to_csv(output_path, index=False)
                chunk_paths.append(output_path)

        return chunk_paths

# Demo: break the channel-URL export into part files of up to 50,000 data rows each
splitter = CSVSplitter("youtube_channel_urls_web.csv", chunk_size=50000)
output_files = splitter.split()

# Report how many part files were written, then list them one per line
print(f"Split into {len(output_files)} files:")
for part_path in output_files:
    print(f"- {part_path}")



Split into 4 files:
- youtube_channel_urls_web_part1.csv
- youtube_channel_urls_web_part2.csv
- youtube_channel_urls_web_part3.csv
- youtube_channel_urls_web_part4.csv


In [15]:
import pandas as pd
from pathlib import Path

In [16]:
# We are working in a notebook. Combine all the CSV files from one folder into a single new file; they all share the same structure and columns.
class CSVCombiner:
    """
    Combine multiple CSV files with the same structure into a single file.
    """

    def __init__(self, input_dir: str, output_file: str):
        """
        Initialize the CSV combiner.

        Args:
            input_dir (str): Directory containing CSV files to combine
            output_file (str): Path for the output combined CSV file

        Raises:
            FileNotFoundError: If ``input_dir`` does not exist.
        """
        self.input_dir = Path(input_dir)
        self.output_file = Path(output_file)

        if not self.input_dir.exists():
            raise FileNotFoundError(f"Input directory not found: {self.input_dir}")

    def combine(self) -> Path:
        """
        Combine CSV files from input directory into a single file.

        Input files are processed in sorted path order so the row order of the
        result is reproducible. The output file itself is never treated as an
        input, which makes it safe to write the result into ``input_dir`` and
        to call this method more than once.

        Returns:
            Path: Path to the combined output file

        Raises:
            ValueError: If the directory contains no CSV files to combine
                (other than the output file itself).
        """
        # Compare resolved paths so relative/absolute spellings of the same
        # file are recognised as equal.
        output_resolved = self.output_file.resolve()

        # Skip a previous run's output: if it lives in input_dir it would
        # otherwise be globbed back in and every row would be duplicated.
        csv_files = sorted(
            path
            for path in self.input_dir.glob("*.csv")
            if path.resolve() != output_resolved
        )
        if not csv_files:
            raise ValueError(f"No CSV files found in {self.input_dir}")

        # Read every input and stack them with a fresh 0..n-1 index
        frames = [pd.read_csv(path) for path in csv_files]
        combined_df = pd.concat(frames, ignore_index=True)

        # Save combined dataframe without the synthetic row index
        combined_df.to_csv(self.output_file, index=False)

        return self.output_file

# Demo: merge every CSV in the wait_to_combine folder into combined_output.csv
combiner = CSVCombiner(
    input_dir="/Users/yuanlu/Code/youtube-top-10000-channels/data/wait_to_combine",
    output_file="/Users/yuanlu/Code/youtube-top-10000-channels/data/wait_to_combine/combined_output.csv",
)
output_file = combiner.combine()
print(f"Combined CSV files into: {output_file}")


Combined CSV files into: /Users/yuanlu/Code/youtube-top-10000-channels/data/wait_to_combine/combined_output.csv


In [28]:
# Load the combined CSV back into a DataFrame for inspection
df = pd.read_csv(
    "/Users/yuanlu/Code/youtube-top-10000-channels/data/wait_to_combine/combined_output.csv"
)

In [29]:
# Overall size of the loaded data
print(f"Total number of rows: {df.shape[0]}")

# Rows whose 'handle' column holds a value (i.e. is not NaN)
print(f"Number of rows where handle is not empty: {df['handle'].notna().sum()}")

Total number of rows: 155976
Number of rows where handle is not empty: 129147


In [30]:
# Export just the rows that have a handle to their own CSV file
df.dropna(subset=['handle']).to_csv(
    "/Users/yuanlu/Code/youtube-top-10000-channels/data/wait_to_combine/handle_not_empty.csv",
    index=False,
)


In [19]:
# Export the complementary rows (handle missing) to a separate CSV file
df.loc[df['handle'].isna()].to_csv(
    "/Users/yuanlu/Code/youtube-top-10000-channels/data/wait_to_combine/handle_empty.csv",
    index=False,
)



In [21]:
# Reload the rows without a handle from handle_empty.csv (this rebinds `df`)
df = pd.read_csv(
    "/Users/yuanlu/Code/youtube-top-10000-channels/data/wait_to_combine/handle_empty.csv"
)

In [23]:
# Size of the currently loaded frame
print(f"Total number of rows: {df.shape[0]}")

# Rows whose 'subscribers' column holds a value (count() ignores NaN)
print(f"Number of rows where subscribers is not empty: {df['subscribers'].count()}")



Total number of rows: 26829
Number of rows where subscribers is not empty: 26829


In [26]:
# Reduce the frame to the Youtube_Channel_URL column only, then save it with its header
df = df.loc[:, ['Youtube_Channel_URL']]
df.to_csv(
    "/Users/yuanlu/Code/youtube-top-10000-channels/data/wait_to_combine/handle_empty_wait_to_check.csv",
    index=False,
)
