In [1]:
from huggingface_hub import login
# Start the Hugging Face Hub login flow (in this notebook it renders the
# widget shown in the cell output). Presumably needed so the later
# push_to_hub call is authenticated — TODO confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import os
import fitz  # PyMuPDF
import json
import re
import random
from datasets import DatasetDict, Dataset

In [3]:
# ----------- Extract Text from PDFs -----------
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        str: The "text"-mode extraction of each page, in page order,
        with a newline between consecutive pages.
    """
    # Use the document as a context manager so the underlying file handle is
    # released even if extraction raises; previously it was never closed.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

def extract_text_from_folder(folder_path):
    """Extract and concatenate the text of every PDF directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        str: The text of each PDF, joined by newlines, in sorted file-name
        order. An empty string if the folder contains no PDFs.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # combined text (and everything derived from it) deterministic.
    # The extension check is case-insensitive so "REPORT.PDF" is not skipped.
    pdf_names = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(".pdf")
    )
    return "\n".join(
        extract_text_from_pdf(os.path.join(folder_path, name))
        for name in pdf_names
    )


In [4]:
# ----------- Create Instruction-Based Dataset -----------
def split_into_paragraphs(text, min_length=50):
    """Split text on blank lines and keep only the substantial paragraphs.

    Args:
        text: The raw text to split.
        min_length: A paragraph is kept only if its stripped length is
            strictly greater than this many characters. Defaults to 50,
            which filters out headings, page numbers and similar fragments.

    Returns:
        list[str]: The stripped paragraphs, in their original order.
    """
    # A paragraph break is a newline, optional whitespace, then another newline.
    stripped = (chunk.strip() for chunk in re.split(r"\n\s*\n", text))
    return [para for para in stripped if len(para) > min_length]

def create_instruction_format(
    paragraphs,
    instruction="Summarize the following historical passage:",
    max_words=50,
):
    """Wrap each paragraph in an instruction/input/output record.

    The "output" is only a placeholder summary: the first ``max_words``
    whitespace-separated words of the paragraph, joined by single spaces.

    Args:
        paragraphs: Iterable of paragraph strings.
        instruction: Instruction text stored in every record.
        max_words: Maximum number of words kept in the placeholder summary.

    Returns:
        list[dict]: One dict per paragraph with the keys "instruction",
        "input" and "output".
    """
    dataset = []
    for para in paragraphs:
        words = para.split()
        summary = " ".join(words[:max_words])
        # Only mark the summary as truncated when words were actually cut;
        # previously "..." was appended even to paragraphs that fit entirely.
        if len(words) > max_words:
            summary += "..."
        dataset.append({
            "instruction": instruction,
            "input": para,
            "output": summary,  # Placeholder summary
        })
    return dataset

In [5]:
# ----------- Split into Train/Test (80-20) -----------
def split_train_test(data, train_ratio=0.8, seed=None):
    """Randomly partition records into a train list and a test list.

    Args:
        data: Sequence of records. It is not modified.
        train_ratio: Fraction of records placed in the train list; the
            train size is ``int(len(data) * train_ratio)``.
        seed: Optional seed for a reproducible split. When None, the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call still controls the result.

    Returns:
        tuple[list, list]: ``(train_data, test_data)``.
    """
    # Shuffle a copy: shuffling in place would silently reorder the
    # caller's list as a side effect.
    shuffled = list(data)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    split_idx = int(len(shuffled) * train_ratio)
    return shuffled[:split_idx], shuffled[split_idx:]



In [6]:
# ----------- Save as JSON -----------
def save_as_json(data, output_path):
    """Write ``data`` to ``output_path`` as indented, UTF-8 encoded JSON.

    Args:
        data: Any JSON-serialisable object.
        output_path: Destination file path; an existing file is overwritten.
    """
    # Pin the encoding instead of relying on the platform default, and keep
    # non-ASCII characters literal so the file stays human-readable rather
    # than being escaped to \uXXXX sequences.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


In [7]:
# ----------- Upload to Hugging Face Datasets -----------
def upload_to_huggingface(train_json, test_json, dataset_name):
    """Load the train/test JSON files as datasets and push them to the Hub.

    Args:
        train_json: Path of the JSON file loaded as the "train" split.
        test_json: Path of the JSON file loaded as the "test" split.
        dataset_name: Repository name passed to ``push_to_hub``.
    """
    # Load each split from disk, train first, then test.
    train_split = Dataset.from_json(train_json)
    test_split = Dataset.from_json(test_json)

    # Bundle both splits and publish them under the given name.
    bundle = DatasetDict({"train": train_split, "test": test_split})
    bundle.push_to_hub(dataset_name)


In [9]:
# ----------- Main Execution -----------
folder_path = "C:/Users/Admin/Desktop/pdf_folder"  # Change this to your folder
train_json = "history_train.json"
test_json = "history_test.json"
huggingface_dataset_name = "gauri-sharan/history-class8-dataset"
RANDOM_SEED = 42  # Fixed so the train/test split is the same on every run

# Processing pipeline: PDFs -> raw text -> paragraphs -> instruction records
text_data = extract_text_from_folder(folder_path)
paragraphs = split_into_paragraphs(text_data)
instruction_data = create_instruction_format(paragraphs)

# Fail early instead of writing and uploading an empty dataset, e.g. when the
# folder holds no PDFs or they contain no extractable text.
if not instruction_data:
    raise ValueError(
        f"No usable paragraphs were extracted from the PDFs in {folder_path!r}"
    )

# Split data (seed the global RNG first so the shuffle is reproducible)
random.seed(RANDOM_SEED)
train_data, test_data = split_train_test(instruction_data)

# Save train & test sets
save_as_json(train_data, train_json)
save_as_json(test_data, test_json)

# Upload dataset
upload_to_huggingface(train_json, test_json, huggingface_dataset_name)

print("Dataset processing and upload completed! 🚀")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset processing and upload completed! 🚀
