In [None]:
!pip install datasets

In [None]:
#!huggingface-cli login

In [None]:
#!wget -nc https://www.gutenberg.org/cache/epub/feeds/txt-files.tar.zip

In [None]:
#!unzip  txt-files.tar.zip

In [3]:
#!rm  txt-files.tar.zip

In [None]:
#!tar -xvf txt-files.tar

In [None]:
import os
import json
from datasets import load_dataset, DatasetDict

def fetch_text_files(root_dir, limit=None):
    """
    Recursively yield the paths of .txt files under ``root_dir`` in a stable order.

    Directories and file names are visited in sorted order so that the same
    tree always produces the same sequence of paths. This matters when
    ``limit`` truncates the listing: without sorting, which files make the cut
    depends on the order the filesystem happens to return entries in.

    Args:
        root_dir (str): Directory to search.
        limit (int | None): Maximum number of paths to yield. A falsy value
            (``None`` or ``0``) means "no limit".

    Yields:
        str: ``os.path.join(dirpath, filename)`` for each matching file.
    """
    count = 0
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # In top-down mode os.walk honours in-place edits to ``dirnames``,
        # so sorting here fixes the order in which subdirectories are visited.
        dirnames.sort()
        for file in sorted(filenames):
            if file.endswith(".txt"):
                yield os.path.join(dirpath, file)
                count += 1
                if limit and count >= limit:
                    return

def create_dataset_stream(root_dir, output_file, limit=32145):
    """
    Stream text files found under ``root_dir`` into a JSONL file.

    Each readable file becomes one line of the form ``{"text": <contents>}``,
    with leading/trailing whitespace stripped from the contents. Files that
    cannot be opened or decoded as UTF-8 are reported and skipped; they still
    count against ``limit`` because the limit is applied to the file listing.

    Only the *read* of each input file is guarded. A failure while writing the
    output (e.g. disk full) propagates instead of being reported as a read
    error for every remaining input file.

    Args:
        root_dir (str): Directory searched recursively for .txt files.
        output_file (str): Path of the JSONL file to create (overwritten).
        limit (int | None): Maximum number of input files to consider.

    Returns:
        int: Number of records actually written to ``output_file``.
    """
    written = 0
    skipped = 0
    with open(output_file, 'w', encoding='utf-8') as f:
        for file_path in fetch_text_files(root_dir, limit=limit):
            try:
                with open(file_path, 'r', encoding='utf-8') as txt_file:
                    text_content = txt_file.read().strip()
            except (OSError, UnicodeError) as e:
                # Best effort: an unreadable or non-UTF-8 file is skipped, not fatal.
                print(f"Error reading {file_path}: {e}")
                skipped += 1
                continue
            f.write(json.dumps({"text": text_content}) + "\n")
            written += 1
    print(f"Dataset of up to {limit} files saved to {output_file}")
    if skipped:
        print(f"Wrote {written} records; skipped {skipped} unreadable files")
    return written

def main():
    """
    Build the Gutenberg JSONL corpus, split it 90/10, and write both splits to disk.

    Steps:
      1. Collect up to ``file_limit`` .txt files from ``root_dir`` into
         ``gutenberg_dataset.jsonl`` (one ``{"text": ...}`` record per file).
      2. Load that file as a dataset (fully in memory, not streaming).
      3. Split it into train/test with a fixed seed.
      4. Write each split to its own JSONL file.
    """
    # Directory containing your .txt files
    root_dir = "./cache"
    output_file = "gutenberg_dataset.jsonl"
    file_limit = 32145
    # Output file per split; the final message is derived from these names so
    # it cannot drift from what is actually written.
    split_files = {
        "train": "train_dataset.jsonl",
        "test": "test_dataset.jsonl",
    }

    # Process files and save as JSONL
    create_dataset_stream(root_dir, output_file, limit=file_limit)

    # Load the dataset into memory (not streaming)
    dataset = load_dataset("json", data_files=output_file, split="train")

    # Split into train and test subsets (indexable by "train" / "test")
    splits = dataset.train_test_split(test_size=0.1, seed=42)

    # Save the split datasets to JSONL files
    for split_name, path in split_files.items():
        with open(path, "w", encoding="utf-8") as out_file:
            for entry in splits[split_name]:
                out_file.write(json.dumps(entry) + "\n")

    print(f"Datasets saved as '{split_files['train']}' and '{split_files['test']}'.")

if __name__ == "__main__":
    main()


Dataset of up to 32145 files saved to gutenberg_dataset.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
from datasets import load_dataset, DatasetDict
from huggingface_hub import HfApi

def upload_to_huggingface(dataset_path, repo_name, repo_type="dataset", train_file="train_dataset.jsonl", test_file="test_dataset.jsonl", private=False):
    """
    Upload local train/test JSONL files to the Hugging Face Hub as one dataset.

    Args:
        dataset_path (str): The directory containing the dataset files.
        repo_name (str): The name of the Hugging Face repository.
        repo_type (str): Accepted for backward compatibility but not used:
            the upload goes through ``DatasetDict.push_to_hub``, which is not
            given a repository type here.
        train_file (str): Name of the training dataset file (JSONL format).
        test_file (str): Name of the testing dataset file (JSONL format).
        private (bool): Forwarded to ``push_to_hub``; defaults to ``False``
            (public), matching the previous hard-coded behaviour.
    """
    # Local import keeps this cell runnable on its own in a fresh kernel.
    import os

    # Load the train and test datasets. Each file is loaded on its own, so the
    # loader exposes it under the default "train" split name in both cases.
    train_data = load_dataset("json", data_files=os.path.join(dataset_path, train_file), split="train")
    test_data = load_dataset("json", data_files=os.path.join(dataset_path, test_file), split="train")

    # Combine into a DatasetDict
    dataset_dict = DatasetDict({
        "train": train_data,
        "test": test_data,
    })

    # Push the dataset to Hugging Face
    dataset_dict.push_to_hub(repo_name, private=private)
    print(f"Dataset successfully uploaded to Hugging Face Hub under repository '{repo_name}'.")

if __name__ == "__main__":
    # Folder that holds the train/test JSONL files (adjust if they live elsewhere)
    dataset_path = "./"
    # Destination repository on the Hub (adjust to your own namespace/name)
    repo_name = "ewre324/gutenberg32145"

    # Publish both splits
    upload_to_huggingface(dataset_path=dataset_path, repo_name=repo_name)


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]




Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset successfully uploaded to Hugging Face Hub under repository 'ewre324/gutenberg2145'.
