In [None]:
# Mount Google Drive at /content/drive; the later cells read and write
# their JSONL/CSV files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
# ================== ONE-CELL STYLE-TRANSFER PIPELINE ==================
# 1. load JSONL
# 2. sample 30% from every source_style (see SAMPLE_FRAC)
# 3. assign target_style (round-robin, not equal to source)
# 4. call OpenAI (GPT-4o) to rewrite
# 5. append transfer rows
# 6. write merged JSONL
# ======================================================================

# Install or upgrade the openai and pandas packages via pip (quiet output).
!pip install --quiet --upgrade openai pandas

import json, random
from pathlib import Path
import pandas as pd
from openai import OpenAI
import os


# Paths on the mounted Drive: the JSONL file to read and the merged JSONL file to write.
INPUT_FILE = "/content/drive/MyDrive/novel_paragraphs_12k_self.jsonl"
OUTPUT_FILE = "/content/drive/MyDrive/novel_paragraphs_with_transfer.jsonl"

# Read the API key from the environment instead of embedding it in the notebook,
# so the secret is never stored in the saved file. Fail fast when it is absent:
# otherwise every request below would fail and be swallowed by the per-row
# error handler, producing an output file with no transfer rows.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Set it in the environment "
        "(e.g. os.environ['OPENAI_API_KEY'] = getpass.getpass()) before running this cell."
    )
client = OpenAI(api_key=_api_key)

SAMPLE_FRAC = 0.30   # fraction of rows sampled from each source_style (30%)

# -------------------------------------------------
# 1) LOAD JSONL
# -------------------------------------------------
# Parse every non-blank line of the input file as one JSON record.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    stripped_lines = (raw.strip() for raw in f)
    rows = [json.loads(text) for text in stripped_lines if text]

df = pd.DataFrame(rows)
print(" Loaded rows:", len(df))

# The per-style sampling further down groups on this column, so stop early without it.
if "source_style" not in df.columns:
    raise ValueError("Dataset must have a 'source_style' column like in your screenshot.")

# Distinct non-null source styles, in sorted order.
styles = sorted(df["source_style"].dropna().unique())
print(" Styles found:", styles)

# -------------------------------------------------
# 2) SAMPLE 30% FROM EACH SOURCE STYLE
# -------------------------------------------------
# For each source_style group, draw SAMPLE_FRAC of its rows (never fewer than one),
# using a fixed random_state so the same rows are drawn on every run.
sample_parts = [
    group.sample(n=max(1, int(len(group) * SAMPLE_FRAC)), random_state=42)
    for _, group in df.groupby("source_style")
]

# Stack the per-style samples into one frame with a fresh 0..n-1 index.
sampled_df = pd.concat(sample_parts, ignore_index=True)
print(" Sampled for transfer:", len(sampled_df))

# -------------------------------------------------
# 3) ASSIGN TARGET STYLES UNIFORMLY (ROUND-ROBIN)
# -------------------------------------------------
def pick_target_style(src_style, all_styles, idx):
    """Pick a target style for job number ``idx`` that differs from ``src_style``.

    The round-robin runs over the styles *other than* the source, so each
    remaining style is chosen equally often as ``idx`` increases. Cycling over
    the full list and stepping forward on a collision would instead give the
    style that follows the source a double share.

    Args:
        src_style: Style of the paragraph being transferred.
        all_styles: Sequence of all available styles.
        idx: Non-negative job index used to cycle through the candidates.

    Returns:
        An element of ``all_styles`` that is not equal to ``src_style``.

    Raises:
        ValueError: If ``all_styles`` contains no style other than ``src_style``.
    """
    candidates = [style for style in all_styles if style != src_style]
    if not candidates:
        raise ValueError(
            f"No target style available: {src_style!r} is the only style in {list(all_styles)!r}"
        )
    return candidates[idx % len(candidates)]

# Pair every sampled row with a target style; the row's position in the
# re-indexed frame is the round-robin counter passed to pick_target_style.
transfer_jobs = [
    (record, pick_target_style(record["source_style"], styles, position))
    for position, record in sampled_df.reset_index(drop=True).iterrows()
]

print(" Transfer jobs prepared:", len(transfer_jobs))

# -------------------------------------------------
# 4) OPENAI CALL – REWRITE (GPT-4o)
# -------------------------------------------------
def rewrite_passage(paragraph, src_style, tgt_style):
    """Ask GPT-4o to rewrite ``paragraph`` in ``tgt_style`` and return the text.

    The request is sent through the module-level ``client``.

    Args:
        paragraph: Text to rewrite. A falsy value is replaced by a fixed
            placeholder sentence before the prompt is built.
        src_style: Genre label of the original paragraph (shown in the prompt).
        tgt_style: Genre label the paragraph should be rewritten into.

    Returns:
        The model's reply with leading and trailing whitespace stripped.

    Raises:
        RuntimeError: If the response has no choices or no text content.
        Exception: Any error raised by the client call propagates to the caller.

    Note:
        ``max_tokens=450`` caps the reply length, so rewrites of long
        paragraphs may come back cut off.
    """
    if not paragraph:
        paragraph = "The evening was quiet, and the characters were waiting for the next event to unfold."

    prompt = f"""
You are a literary rewriting assistant that performs *genre transfer* on narrative paragraphs.

Your task: rewrite the following paragraph in the **{tgt_style}** style.

Rules:
1. Preserve all **character names, relationships, and identities** exactly as they appear.
2. Keep all **main events, actions, and logical sequence** unchanged.
3. Adjust only the **tone, narration, vocabulary, atmosphere, and imagery** to fit the new genre.
4. Do **not** add new events or characters.
5. Do **not** summarize or explain — return a full rewritten passage in the new style.
6. The rewritten paragraph should feel naturally written in the {tgt_style} genre.

Example of what to change:
- Romance → focus on emotions, tenderness, and sensory detail.
- Gothic → emphasize mystery, darkness, and psychological unease.
- Detective → focus on observation, logic, and suspense.
- Fantasy → include wonder, magical realism, or mythic tone.
- Science fiction → emphasize technology, futurism, or speculative ideas.
- Comedy → inject humor, irony, and wit.

---
Source genre: {src_style}
Target genre: {tgt_style}

Paragraph:
\"\"\"{paragraph}\"\"\"
"""

    resp = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a literary style transfer model."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=450,
    )

    # The reply text can be absent (e.g. a refusal or filtered response); raise a
    # descriptive error instead of failing on `.strip()` of None or on `[0]`.
    content = resp.choices[0].message.content if resp.choices else None
    if content is None:
        raise RuntimeError(
            f"Model returned no text for {src_style!r} -> {tgt_style!r} rewrite"
        )
    return content.strip()

# -------------------------------------------------
# 5) RUN ALL TRANSFERS
# -------------------------------------------------
def _first_text(record, keys=("raw_content", "output", "input")):
    """Return the first value among ``keys`` that is a non-blank string, else ""."""
    for key in keys:
        value = record.get(key)
        if isinstance(value, str) and value.strip():
            return value
    return ""


new_rows = []
for i, (row, tgt_style) in enumerate(transfer_jobs, 1):
    # Rows come from a DataFrame, where a missing field is NaN -- a truthy float.
    # A plain `a or b or c` chain would stop at NaN, send "nan" to the model and
    # write NaN into the output JSON, so pick the first real string instead.
    paragraph = _first_text(row)
    src_style = row["source_style"]

    # With no source text there is nothing to transfer; skipping avoids storing
    # a rewrite of placeholder text next to an empty raw_content.
    if not paragraph:
        print(f" Skipping {i}: no paragraph text")
        continue

    try:
        rewritten = rewrite_passage(paragraph, src_style, tgt_style)
    except Exception as e:
        # Best-effort: report the failure and move on so one bad request
        # does not abort the whole run.
        print(f" Error at {i}: {e}")
        continue

    # Transfer record uses the same keys as the rows displayed later in the notebook.
    new_obj = {
        "mode": "transfer",
        "source_style": src_style,
        "target_style": tgt_style,
        "input": f"Rewrite this paragraph in {tgt_style} style: {paragraph}",
        "output": rewritten,
        "raw_content": paragraph,
    }
    new_rows.append(new_obj)

    # Progress line every tenth job.
    if i % 10 == 0:
        print(f" rewritten {i}/{len(transfer_jobs)}")

print(" Transfer rows created:", len(new_rows))

# -------------------------------------------------
# 6) MERGE + SAVE
# -------------------------------------------------
# Write the original records first, then the new transfer records,
# one JSON object per line, keeping non-ASCII characters unescaped.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(record, ensure_ascii=False) + "\n"
        for record in [*rows, *new_rows]
    )

print(" Done. Merged dataset saved to:", OUTPUT_FILE)
# ================== END CELL ==================


In [None]:
import json

import pandas as pd

file_path = "/content/drive/MyDrive/novel_paragraphs_final.jsonl"

# Read one JSON object per line. The encoding is explicit so decoding does not
# depend on the platform default, and blank lines are skipped because
# json.loads would raise on them (same handling as the loader in the pipeline cell).
data = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            data.append(json.loads(line))

print(f"Loaded {len(data)} rows")
print(data[0])  # see the first entry

df = pd.DataFrame(data)

# Split the rows by their "mode" value.
df_1 = df[df["mode"] == "transfer"]
df_2 = df[df["mode"] == "self"]

# display the first few rows
df_1.head()



Loaded 15575 rows
{'mode': 'self', 'source_style': 'romance', 'target_style': 'romance', 'input': 'Rewrite this paragraph in romance style:\nThe young man saw at once that his hostess was not in the room; then, with surprise, he discovered another lady standing by the fire. This lady, who was long, lean and loosely put together, was clad in raiment intricately looped and fringed, with plaids and stripes and bands of plain colour disposed in a design to which the clue seemed missing. Her hair, which had tried to turn white and only succeeded in fading, was surmounted by a Spanish comb and black lace scarf, and silk mittens, visibly darned, covered her rheumatic hands.', 'output': 'The young man saw at once that his hostess was not in the room; then, with surprise, he discovered another lady standing by the fire. This lady, who was long, lean and loosely put together, was clad in raiment intricately looped and fringed, with plaids and stripes and bands of plain colour disposed in a desig

Unnamed: 0,mode,source_style,target_style,input,output,raw_content
12000,transfer,comedy,detective_mystery,Rewrite this paragraph in detective_mystery st...,"In the dim, flickering light of the neglected ...","""No, no, no! What yer got? what yer got? Gie m..."
12001,transfer,comedy,detective_mystery,Rewrite this paragraph in detective_mystery st...,The tension in the room thickened as Gowing bu...,Goodness knows I felt humiliated enough at thi...
12002,transfer,comedy,fantasy,Rewrite this paragraph in fantasy style: At Dr...,"At the ancient city of Dresden, we were forewa...",At Dresden they advised us not to talk German ...
12003,transfer,comedy,goth,Rewrite this paragraph in goth style: But ther...,"In the shadowed recesses of our minds, both Ha...",But there! I daresay both Harris and I would h...
12004,transfer,comedy,romance,Rewrite this paragraph in romance style: Georg...,George has been remarkably devoted to his dear...,George has been very attentive to this aunt of...


In [None]:


# Export the transfer rows and the self rows to separate CSV files, without the index column.
for frame, csv_path in (
    (df_1, "/content/drive/MyDrive/novel_paragraphs_transfer.csv"),
    (df_2, "/content/drive/MyDrive/novel_paragraphs_self.csv"),
):
    frame.to_csv(csv_path, index=False)


In [None]:
# Count the lines of the JSONL file that contain the literal text "mode": "transfer".
!grep -c '"mode": "transfer"' "/content/drive/MyDrive/novel_paragraphs_final.jsonl"


3575


In [None]:
# === Keep 6,000 self rows (balanced by genre) + keep all transfer rows unchanged ===
# Inputs/outputs
INPUT_PATH = "/content/drive/MyDrive/novel_paragraphs_final.jsonl"   # change if needed
OUTPUT_JSONL = "/content/drive/MyDrive/novel_paragraphs_6kself_plus_transfer.jsonl"
OUTPUT_CSV   = "/content/drive/MyDrive/novel_paragraphs_6kself_plus_transfer.csv"

import json
from pathlib import Path
import pandas as pd

# Sampling configuration
N_SELF_TOTAL = 6000   # total number of self rows to keep, split evenly across genres
SEED = 42             # fixed seed so the sampled subset is the same on every run

# -------- Load --------
# Read INPUT_PATH here instead of relying on a `df` left over from an earlier
# cell, so this cell also works after Restart & Run All.
records = []
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            records.append(json.loads(line))
df = pd.DataFrame(records)

# -------- Column names --------
mode_col = "mode"
genre_col = "source_style"
missing_cols = [c for c in (mode_col, genre_col) if c not in df.columns]
if missing_cols:
    raise ValueError(f"Dataset is missing required column(s): {missing_cols}")

print(f"Using mode column:  {mode_col}")
print(f"Using genre column: {genre_col}")

# -------- Split self vs transfer --------
df_self = df[df[mode_col] == "self"].copy()
df_transfer = df[df[mode_col] == "transfer"].copy()

print(f"Total rows: {len(df)} | self: {len(df_self)} | transfer: {len(df_transfer)}")

# -------- Sample self rows, balanced by genre --------
n_genres = df_self[genre_col].nunique()
if n_genres == 0:
    raise ValueError("No 'self' rows with a genre value found; nothing to sample.")
per_genre = N_SELF_TOTAL // n_genres
df_self_6k = pd.concat(
    [
        # Take the whole group when a genre has fewer rows than its quota.
        group.sample(n=min(per_genre, len(group)), random_state=SEED)
        for _, group in df_self.groupby(genre_col)
    ],
    ignore_index=True,
)

# -------- Combine with all transfer rows --------
df_out = pd.concat([df_self_6k, df_transfer], ignore_index=True)

# -------- Save --------
df_out.to_json(OUTPUT_JSONL, orient="records", lines=True, force_ascii=False)
df_out.to_csv(OUTPUT_CSV, index=False)

# -------- Verification --------
print("\n=== Verification ===")
print("Counts by mode:")
print(df_out[mode_col].value_counts())

print("\nSelf rows per genre (post-sampling):")
print(df_out[df_out[mode_col] == "self"][genre_col].value_counts().sort_index())

print("\nTransfer rows per genre (unchanged):")
print(df_out[df_out[mode_col] == "transfer"][genre_col].value_counts().sort_index())

print("\nSaved to:")
print("JSONL:", OUTPUT_JSONL)
print("CSV:  ", OUTPUT_CSV)


Using mode column:  mode
Using genre column: source_style
Total rows: 15575 | self: 12000 | transfer: 3575

=== Verification ===
Counts by mode:
mode
self        6000
transfer    3575
Name: count, dtype: int64

Self rows per genre (post-sampling):
source_style
comedy               1000
detective_mystery    1000
fantasy              1000
goth                 1000
romance              1000
science_fiction      1000
Name: count, dtype: int64

Transfer rows per genre (unchanged):
source_style
comedy               600
detective_mystery    600
fantasy              600
goth                 600
romance              600
science_fiction      575
Name: count, dtype: int64

Saved to:
JSONL: /content/drive/MyDrive/novel_paragraphs_6kself_plus_transfer.jsonl
CSV:   /content/drive/MyDrive/novel_paragraphs_6kself_plus_transfer.csv
