In [None]:
# Colab-only setup: mount Google Drive at /content/drive. The input and output
# paths used by the following cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import os
import random
import re
from pathlib import Path
from random import shuffle

# ------------ CONFIG ------------
# Root of the raw corpus: one sub-directory per genre, each holding *.txt books.
RAW_ROOT = Path("/content/drive/MyDrive/raw")
# Destination JSONL file (one JSON object per line).
OUT_PATH = "/content/drive/MyDrive/novel_paragraphs_12k_self.jsonl"

# Each name is used both as the sub-directory under RAW_ROOT and as the
# source/target style label written into every row.
GENRES = [
    "romance",
    "goth",
    "detective_mystery",
    "fantasy",
    "science_fiction",
    "comedy",
]

# Upper bound on rows collected per genre; a genre with too little text
# yields fewer rows.
TARGET_PER_GENRE = 2000  # 6 * 2000 = 12000
# Fractions passed to pick_middle_band(). The band is taken over the list of
# paragraphs that survived the word-count filter, so it approximates (rather
# than exactly equals) the 20%-80% span of the book.
MIDDLE_LOW = 0.20        # keep paragraphs between 20% and ...
MIDDLE_HIGH = 0.80       # ... 80% of the book

def word_count(s: str) -> int:
    """Return the number of whitespace-separated tokens in ``s``."""
    tokens = s.split()
    return len(tokens)

# Inclusive word-count bounds applied by split_into_paragraphs(): a paragraph
# is kept only when MIN_WORDS <= word count <= MAX_WORDS.
MIN_WORDS = 80
MAX_WORDS = 220

# -------------------------------

# Start-of-text marker line, e.g.
#   *** START OF THE PROJECT GUTENBERG EBOOK TITLE ***
# Accepts THIS/THE, optional spacing after the asterisks and a missing closing
# "***"; the match extends to the end of the marker line.
_GUTENBERG_START_RE = re.compile(
    r"\*\*\*[ \t]*START OF (?:THIS|THE) PROJECT GUTENBERG EBOOK[^\n]*",
    re.IGNORECASE,
)

# End-of-text markers. The text is cut at whichever one occurs FIRST, so the
# "End of the Project Gutenberg EBook of ..." closing line that often precedes
# the "*** END OF ..." marker does not leak into the result.
_GUTENBERG_END_RES = (
    # *** END OF THIS/THE PROJECT GUTENBERG EBOOK ...
    re.compile(
        r"\*\*\*[ \t]*END OF (?:THIS|THE) PROJECT GUTENBERG EBOOK",
        re.IGNORECASE,
    ),
    # The original soft footer, matched anywhere in the text.
    re.compile(r"End of the Project Gutenberg EBook", re.IGNORECASE),
    # Closing lines such as "End of Project Gutenberg's Title, by Author";
    # anchored to the start of a line to avoid matching running prose.
    re.compile(
        r"^[ \t]*End of (?:the )?Project Gutenberg(?:'s)?\b",
        re.IGNORECASE | re.MULTILINE,
    ),
)


def strip_gutenberg_boiler(text: str) -> str:
    """
    Remove common Project Gutenberg header/footer boilerplate.

    Everything up to and including the "*** START OF THIS/THE PROJECT
    GUTENBERG EBOOK ..." line is dropped, and everything from the earliest
    recognised end marker onward is dropped. The matching is heuristic: text
    without recognisable markers is returned unchanged apart from stripping.

    Args:
        text: Full contents of a book file.

    Returns:
        The text between the markers with leading/trailing whitespace removed.
    """
    header = _GUTENBERG_START_RE.search(text)
    if header:
        text = text[header.end():]

    # Search for the footer only after the header is gone, then cut at the
    # earliest marker found.
    footer_starts = [
        found.start()
        for found in (pattern.search(text) for pattern in _GUTENBERG_END_RES)
        if found
    ]
    if footer_starts:
        text = text[:min(footer_starts)]

    return text.strip()

def split_into_paragraphs(text: str, min_words=None, max_words=None):
    """
    Split raw text into paragraphs (separated by blank lines) and keep only
    those whose word count lies within an inclusive range.

    Args:
        text: Raw book text; any of \\r\\n, \\r or \\n line endings is accepted.
        min_words: Smallest word count to keep. Defaults to the module-level
            MIN_WORDS when omitted.
        max_words: Largest word count to keep. Defaults to the module-level
            MAX_WORDS when omitted.

    Returns:
        list[str]: the kept paragraphs in document order, each with runs of
        whitespace collapsed to single spaces and no leading/trailing space.
    """
    # Resolve defaults at call time so changes to the config constants are
    # picked up without redefining this function.
    lo = MIN_WORDS if min_words is None else min_words
    hi = MAX_WORDS if max_words is None else max_words

    # normalize newlines
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    paras = []
    # split on blank lines (lines that are empty or whitespace-only)
    for chunk in re.split(r"\n\s*\n", text):
        # One whitespace split yields both the word count (same rule as
        # word_count()) and the whitespace-normalised paragraph.
        words = chunk.split()
        if lo <= len(words) <= hi:
            paras.append(" ".join(words))

    return paras

def pick_middle_band(paras, low=0.2, high=0.8):
    """
    Return the slice of ``paras`` lying between two fractional positions.

    paras: list[str]
    low / high: fractions of len(paras); each is truncated to an integer
        index, and the slice [low_index:high_index] is returned.

    If the computed band is empty (or inverted), the whole list is returned
    instead. An empty input gives an empty list.
    """
    total = len(paras)
    if total == 0:
        return []

    first, last = int(total * low), int(total * high)
    if last <= first:
        # band collapsed -> fall back to everything
        first, last = 0, total

    return paras[first:last]

# Build the "self" dataset: for every genre, sample up to TARGET_PER_GENRE
# paragraphs from the middle band of its books and emit one row per paragraph
# in which the output is the unchanged input paragraph.

# A dedicated, seeded RNG plus a sorted file listing make the sample
# reproducible across runs (glob results come back in arbitrary order) without
# touching the global random state.
SHUFFLE_SEED = 42
rng = random.Random(SHUFFLE_SEED)

all_rows = []

for genre in GENRES:
    genre_dir = RAW_ROOT / genre
    txt_files = sorted(genre_dir.glob("*.txt"))
    if not txt_files:
        print(f"[WARN] No txt files for genre {genre} in {genre_dir}")
        continue

    rng.shuffle(txt_files)
    collected = 0
    print(f"\n=== Processing genre: {genre} ===")

    for txt_path in txt_files:
        if collected >= TARGET_PER_GENRE:
            break

        # errors="ignore" silently drops bytes that are not valid UTF-8.
        text = txt_path.read_text(encoding="utf-8", errors="ignore")
        text = strip_gutenberg_boiler(text)
        paras = split_into_paragraphs(text)
        if not paras:
            continue

        # pick_middle_band() returns the whole list when the band would be
        # empty, so the result is never empty for a non-empty `paras`.
        middle_paras = pick_middle_band(paras, MIDDLE_LOW, MIDDLE_HIGH)
        rng.shuffle(middle_paras)

        for para in middle_paras:
            row = {
                "mode": "self",
                "source_style": genre,
                "target_style": genre,
                "input": f"Rewrite this paragraph in {genre} style:\n{para}",
                "output": para,
                "raw_content": para
            }
            all_rows.append(row)
            collected += 1
            if collected >= TARGET_PER_GENRE:
                break

    if collected < TARGET_PER_GENRE:
        # Surface an unbalanced dataset instead of letting it pass unnoticed.
        print(f"[WARN] Only {collected}/{TARGET_PER_GENRE} rows available for genre {genre}")
    print(f"→ collected {collected} rows for {genre}")

# write JSONL
with open(OUT_PATH, "w", encoding="utf-8") as f:
    for r in all_rows:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

print(f"\n✅ Done. Wrote {len(all_rows)} rows to {OUT_PATH}")



=== Processing genre: romance ===
→ collected 2000 rows for romance

=== Processing genre: goth ===
→ collected 2000 rows for goth

=== Processing genre: detective_mystery ===
→ collected 2000 rows for detective_mystery

=== Processing genre: fantasy ===
→ collected 2000 rows for fantasy

=== Processing genre: science_fiction ===
→ collected 2000 rows for science_fiction

=== Processing genre: comedy ===
→ collected 2000 rows for comedy

✅ Done. Wrote 12000 rows to /content/drive/MyDrive/novel_paragraphs_12k_self.jsonl


In [None]:
import json

# Sanity check: reload the generated JSONL file and show its first record.
path = '/content/drive/MyDrive/novel_paragraphs_12k_self.jsonl'

with open(path, 'r', encoding='utf-8') as f:
    # one JSON object per line
    data = [json.loads(line) for line in f]

print(f"Loaded {len(data)} rows.")
print("Example row:")
print(json.dumps(data[0], indent=2, ensure_ascii=False))

Loaded 12000 rows.
Example row:
{
  "mode": "self",
  "source_style": "romance",
  "target_style": "romance",
  "input": "Rewrite this paragraph in romance style:\nThe young man saw at once that his hostess was not in the room; then, with surprise, he discovered another lady standing by the fire. This lady, who was long, lean and loosely put together, was clad in raiment intricately looped and fringed, with plaids and stripes and bands of plain colour disposed in a design to which the clue seemed missing. Her hair, which had tried to turn white and only succeeded in fading, was surmounted by a Spanish comb and black lace scarf, and silk mittens, visibly darned, covered her rheumatic hands.",
  "output": "The young man saw at once that his hostess was not in the room; then, with surprise, he discovered another lady standing by the fire. This lady, who was long, lean and loosely put together, was clad in raiment intricately looped and fringed, with plaids and stripes and bands of plain c