In [1]:
# Import required libraries
import os
import pandas as pd
from datasets import load_dataset

class HuggingFaceDataLoader:
    """
    Load a dataset from Hugging Face, split it into row chunks, and export each chunk to Parquet.

    Attributes:
        output_dir (str): The directory where output files will be saved.
        raw_dataset: The most recently loaded Hugging Face dataset, or None
            until pull_and_chunk_dataset() has loaded one.
    """

    def __init__(self, output_dir: str) -> None:
        """
        Initialize the HuggingFaceDataLoader with an output directory.

        Args:
            output_dir (str): The directory where output files will be saved.
        """
        self.output_dir = output_dir
        self.raw_dataset = None

    @staticmethod
    def _count_chunks(total_rows: int, chunk_size: int) -> int:
        """
        Return the number of chunk files needed to hold ``total_rows`` rows.

        Uses ceiling division so that a row count that is an exact multiple of
        ``chunk_size`` does not produce a trailing empty chunk. At least one
        chunk is always reported, so an empty dataset still yields a single
        (empty) file.

        Args:
            total_rows (int): Total number of rows to be written.
            chunk_size (int): Maximum number of rows per chunk; must be positive.

        Returns:
            int: The number of chunks, never less than 1.
        """
        # -(-a // b) is integer ceiling division; it avoids float rounding.
        return max(1, -(-total_rows // chunk_size))

    def pull_and_chunk_dataset(self, dataset_link: str, split: str = "train", chunk_size: int = 500000) -> None:
        """
        Pull a dataset from Hugging Face, split it into smaller chunks, and save to the output directory.

        Each chunk is written as ``data_chunk_<i>.parquet`` (zero-based ``i``)
        inside ``output_dir``, which is created if it does not exist. The loaded
        dataset is also kept on ``self.raw_dataset``.

        Args:
            dataset_link (str): The dataset link for Hugging Face dataset.
            split (str): The dataset split to download (e.g., 'train', 'test').
            chunk_size (int): Number of rows per chunk. Adjust this size based on the file size limitations.

        Raises:
            ValueError: If chunk_size is not a positive number.
            Exception: If there's an error pulling or saving the dataset.
        """
        # Fail fast on a bad chunk size, before paying for the download.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

        try:
            # Load the dataset from Hugging Face (with a specific split)
            self.raw_dataset = load_dataset(dataset_link, split=split)

            # Convert the dataset to a pandas DataFrame
            df = self.raw_dataset.to_pandas()

            # Calculate the number of chunks based on the desired chunk size
            num_chunks = self._count_chunks(len(df), chunk_size)

            # Ensure the output directory exists
            os.makedirs(self.output_dir, exist_ok=True)

            # Split and save each chunk as a separate Parquet file
            for i in range(num_chunks):
                start = i * chunk_size
                # .iloc makes the slice explicitly positional, independent of the index labels.
                chunk_df = df.iloc[start:start + chunk_size]
                chunk_path = os.path.join(self.output_dir, f"data_chunk_{i}.parquet")
                chunk_df.to_parquet(chunk_path)
                print(f"Saved chunk {i+1}/{num_chunks} to {chunk_path}")

        except Exception as e:
            # Report the failure for notebook users, then let the caller see the original error.
            print(f"Error pulling or saving the dataset: {e}")
            raise

# Destination folder for the exported Parquet chunks
output_dir = "../data/original_data/"

# Build the loader that writes into that folder
data_loader = HuggingFaceDataLoader(output_dir)

# Download the dataset and export it in chunks of up to 500,000 rows each
data_loader.pull_and_chunk_dataset(
    dataset_link="Nan-Do/code-search-net-python",
    chunk_size=500000,
)


  from .autonotebook import tqdm as notebook_tqdm


Saved chunk 1/1 to ../data/original_data/data_chunk_0.parquet
