In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

from sentence_transformers import SentenceTransformer


In [None]:
# === 1. Load embeddings (precomputed MPNet) ===
EMB_PATH = "/content/drive/MyDrive/genre_classification_mpnet/genre_embeddings.npy"
X = np.load(EMB_PATH)
print("Embeddings shape:", X.shape)

# === 2. Rebuild labels from the same filtered DF ===
CSV_PATH = "/content/drive/MyDrive/novel_paragraphs_6kself_plus_transfer.csv"
TEXT_COL = "raw_content"
LABEL_COL = "source_style"

df = pd.read_csv(CSV_PATH)
df = df.dropna(subset=[TEXT_COL, LABEL_COL]).reset_index(drop=True)

print("DF shape after dropna:", df.shape)
assert df.shape[0] == X.shape[0], "DF rows and embedding rows must match!"

y_raw = df[LABEL_COL].tolist()
print("Number of labels:", len(y_raw))

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)
num_classes = len(label_encoder.classes_)
print("Classes:", label_encoder.classes_)

# === 3. Train/test split ===
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# === 4. Dataset + DataLoaders ===
class EmbeddingDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = EmbeddingDataset(X_train, y_train)
test_ds  = EmbeddingDataset(X_test,  y_test)

batch_size = 64
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

emb_dim = X.shape[1]   # e.g. 768

# === 5. RNN classifier on embeddings ===
class RNNOnEmbeddings(nn.Module):
    def __init__(self, emb_dim, hidden_dim, num_classes, num_layers=1, bidirectional=False):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )
        self.bidirectional = bidirectional
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), num_classes)

    def forward(self, x):
        # x: (batch, emb_dim) -> (batch, seq_len=1, emb_dim)
        x_seq = x.unsqueeze(1)
        outputs, (h_n, c_n) = self.lstm(x_seq)

        if self.bidirectional:
            h_forward = h_n[-2, :, :]
            h_backward = h_n[-1, :, :]
            h = torch.cat((h_forward, h_backward), dim=1)
        else:
            h = h_n[-1, :, :]

        logits = self.fc(h)
        return logits

hidden_dim = 256
model_rnn = RNNOnEmbeddings(
    emb_dim=emb_dim,
    hidden_dim=hidden_dim,
    num_classes=num_classes,
    num_layers=1,
    bidirectional=False,
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_rnn.parameters(), lr=1e-3)

num_epochs = 20  

for epoch in range(1, num_epochs + 1):
    model_rnn.train()
    total_loss = 0.0

    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        logits = model_rnn(batch_X)
        loss = criterion(logits, batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * batch_X.size(0)

    avg_loss = total_loss / len(train_ds)
    print(f"Epoch {epoch}/{num_epochs} - Train loss: {avg_loss:.4f}")

# Quick check on test set 
model_rnn.eval()
all_preds, all_true = [], []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(device)
        logits = model_rnn(batch_X)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_true.extend(batch_y.numpy())

print("\n=== RNN on Embeddings - Classification Report ===\n")
print(classification_report(all_true, all_preds, target_names=label_encoder.classes_))

print("\n=== Confusion Matrix ===\n")
print(confusion_matrix(all_true, all_preds))


Embeddings shape: (9575, 768)
DF shape after dropna: (9575, 6)
Number of labels: 9575
Classes: ['comedy' 'detective_mystery' 'fantasy' 'goth' 'romance' 'science_fiction']
Train shape: (7660, 768) Test shape: (1915, 768)
Using device: cuda
Epoch 1/20 - Train loss: 1.4717
Epoch 2/20 - Train loss: 1.0087
Epoch 3/20 - Train loss: 0.8574
Epoch 4/20 - Train loss: 0.7720
Epoch 5/20 - Train loss: 0.7194
Epoch 6/20 - Train loss: 0.6777
Epoch 7/20 - Train loss: 0.6466
Epoch 8/20 - Train loss: 0.6220
Epoch 9/20 - Train loss: 0.5966
Epoch 10/20 - Train loss: 0.5759
Epoch 11/20 - Train loss: 0.5561
Epoch 12/20 - Train loss: 0.5389
Epoch 13/20 - Train loss: 0.5217
Epoch 14/20 - Train loss: 0.5087
Epoch 15/20 - Train loss: 0.4912
Epoch 16/20 - Train loss: 0.4748
Epoch 17/20 - Train loss: 0.4580
Epoch 18/20 - Train loss: 0.4394
Epoch 19/20 - Train loss: 0.4204
Epoch 20/20 - Train loss: 0.4069

=== RNN on Embeddings - Classification Report ===

                   precision    recall  f1-score   support

In [4]:
torch.save(model_rnn.state_dict(), "/content/drive/MyDrive/genre_rnn_mpnet.pt")
import joblib
joblib.dump(label_encoder, "/content/drive/MyDrive/genre_label_encoder.joblib")


['/content/drive/MyDrive/genre_label_encoder.joblib']

In [5]:
st_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
def predict_styles_for_texts(text_list):
    """
    text_list: list of strings
    returns: list of predicted style labels
    """
    model_rnn.eval()
    with torch.no_grad():
        emb = st_model.encode(
            text_list,
            batch_size=64,
            convert_to_tensor=True,
            normalize_embeddings=True,
        ).to(device)

        logits = model_rnn(emb)
        preds_idx = torch.argmax(logits, dim=1).cpu().numpy()

    return label_encoder.inverse_transform(preds_idx)


In [7]:
import numpy as np
import torch

def predict_style_proba_for_texts(texts, model, embedder, device, label_encoder):
    model.eval()
    all_probs = []

    with torch.no_grad():
        for txt in texts:
            emb = embedder.encode(str(txt))              # (emb_dim,)
            emb = torch.tensor(emb, dtype=torch.float32).unsqueeze(0).to(device)  # (1, emb_dim)
            logits = model(emb)                          # (1, num_classes)
            probs = torch.softmax(logits, dim=1).cpu().numpy().flatten()
            all_probs.append(probs)

    return np.vstack(all_probs)  # (N, num_classes)


In [9]:
def add_style_match_to_csv(
    csv_path,
    gen_col,
    pred_label_col="rnn_pred_style",
    score_col="rnn_style_match",
    model=model_rnn,
    embedder= st_model,
    label_encoder=label_encoder,
    device=device,
):
    print(f"\n=== Processing {csv_path} ===")
    df = pd.read_csv(csv_path)

    # only score non-empty outputs
    mask = df[gen_col].notna() & df[gen_col].astype(str).str.strip().ne("")
    texts = df.loc[mask, gen_col].astype(str).tolist()

    # 1) predict probability distribution for generated outputs
    probs = predict_style_proba_for_texts(
        texts=texts,
        model=model,
        embedder=embedder,
        device=device,
        label_encoder=label_encoder,
    )  # shape (N, num_classes)

    # 2) top-1 predicted labels from probs
    top1_idx = probs.argmax(axis=1)
    pred_labels = label_encoder.inverse_transform(top1_idx)

    # write predictions into df
    df.loc[mask, pred_label_col] = pred_labels

    # 3) rank-based score:
    #    1.0 if target_style is rank 1
    #    0.5 if rank 2
    #    0.25 if rank 3
    #    0   otherwise
    df[score_col] = 0.0

    masked_indices = df.index[mask]
    for i, row_idx in enumerate(masked_indices):
        target_style = df.at[row_idx, "target_style"]

        if pd.isna(target_style):
            continue

        target_idx = label_encoder.transform([target_style])[0]
        ranked = np.argsort(probs[i])[::-1]  # descending order of prob

        if ranked[0] == target_idx:
            score = 1.0
        elif len(ranked) > 1 and ranked[1] == target_idx:
            score = 0.5
        elif len(ranked) > 2 and ranked[2] == target_idx:
            score = 0.25
        else:
            score = 0.0

        df.at[row_idx, score_col] = score

    # 4) save back to same file
    df.to_csv(csv_path, index=False)

    print(f"Saved with columns: {pred_label_col}, {score_col}")
    print("Mean style score:", df[score_col].mean())
    return df


In [10]:
# 1) GPT-4
df_gpt4 = add_style_match_to_csv(
    "/content/drive/MyDrive/test_dataset_mode_transfer_cleaned_with_sim.csv",
    gen_col="output_gpt4",
)

# 2) LoRA
df_lora = add_style_match_to_csv(
    "/content/drive/MyDrive/new_test_with_lora_with_sim.csv",
    gen_col="output",
)

# 3) Qwen base
df_qwen = add_style_match_to_csv(
    "/content/drive/MyDrive/test_eval_qwen_base_with_sim.csv",
    gen_col="output_qwen_base",
)



=== Processing /content/drive/MyDrive/test_dataset_mode_transfer_cleaned_with_sim.csv ===
Saved with columns: rnn_pred_style, rnn_style_match
Mean style score: 0.3802083333333333

=== Processing /content/drive/MyDrive/new_test_with_lora_with_sim.csv ===
Saved with columns: rnn_pred_style, rnn_style_match
Mean style score: 0.2591145833333333

=== Processing /content/drive/MyDrive/test_eval_qwen_base_with_sim.csv ===
Saved with columns: rnn_pred_style, rnn_style_match
Mean style score: 0.24739583333333334


In [11]:
# 1) GPT-4
df_gpt4 = add_style_match_to_csv(
    "/content/drive/MyDrive/eval_style_pairs_with_outputs_gpt4_with_sim.csv",
    gen_col="output_gpt4",
)

# 2) LoRA
df_lora = add_style_match_to_csv(
    "/content/drive/MyDrive/eval_with_lora_with_sim.csv",
    gen_col="output",
)

# 3) Qwen base
df_qwen = add_style_match_to_csv(
    "/content/drive/MyDrive/external_genre_validation_400_with_tragets_qwen_base_with_sim.csv",
    gen_col="output_qwen_base",
)


=== Processing /content/drive/MyDrive/eval_style_pairs_with_outputs_gpt4_with_sim.csv ===
Saved with columns: rnn_pred_style, rnn_style_match
Mean style score: 0.48375

=== Processing /content/drive/MyDrive/eval_with_lora_with_sim.csv ===
Saved with columns: rnn_pred_style, rnn_style_match
Mean style score: 0.30875

=== Processing /content/drive/MyDrive/external_genre_validation_400_with_tragets_qwen_base_with_sim.csv ===
Saved with columns: rnn_pred_style, rnn_style_match
Mean style score: 0.28875


Spacy NER

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm


In [None]:
import spacy

# Load only NER to keep it fast
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "lemmatizer", "textcat"])


In [None]:
import pandas as pd

def entity_retention_score(orig_text: str, gen_text: str) -> float:
    """
    Compute spaCy named-entity retention:
    (# entities from orig that appear in gen) / (# entities in orig)

    Returns 1.0 if original has no entities.
    """
    if not isinstance(orig_text, str) or not isinstance(gen_text, str):
        return 0.0

    orig_doc = nlp(orig_text)
    gen_lower = gen_text.lower()

    orig_entities = [ent.text.strip() for ent in orig_doc.ents if ent.text.strip()]
    if len(orig_entities) == 0:
        return 1.0  # nothing to preserve

    kept = 0
    for ent in orig_entities:
        if ent.lower() in gen_lower:
            kept += 1

    return kept / len(orig_entities)


In [None]:
def add_spacy_retention_to_csv(
    csv_path,
    orig_col,
    gen_col,
    new_col="spacy_entity_retention"
):
    print(f"\n=== Processing {csv_path} ===")
    df = pd.read_csv(csv_path)

    # Only compute where we have both texts
    mask = df[orig_col].notna() & df[gen_col].notna()

    scores = []
    for orig, gen in zip(df.loc[mask, orig_col], df.loc[mask, gen_col]):
        scores.append(entity_retention_score(str(orig), str(gen)))

    df.loc[mask, new_col] = scores

    # Optional: fill others with NaN or 1.0
    # df[new_col] = df[new_col].fillna(1.0)

    df.to_csv(csv_path, index=False)
    print(f"Saved {new_col}. Mean = {df[new_col].mean():.3f}")
    return df


In [None]:
# 1) GPT-4 eval
df_gpt4 = add_spacy_retention_to_csv(
    csv_path="/content/drive/MyDrive/eval_style_pairs_with_outputs_gpt4_with_sim.csv",
    orig_col="raw_content",
    gen_col="output_gpt4",
    new_col="spacy_entity_retention"
)

# 2) LoRA eval
df_lora = add_spacy_retention_to_csv(
    csv_path="/content/drive/MyDrive/eval_with_lora_with_sim.csv",
    orig_col="raw_content",
    gen_col="output",
    new_col="spacy_entity_retention"
)

# 3) Qwen base eval
df_qwen = add_spacy_retention_to_csv(
    csv_path="/content/drive/MyDrive/external_genre_validation_400_with_tragets_qwen_base_with_sim.csv",
    orig_col="raw_content",
    gen_col="output_qwen_base",
    new_col="spacy_entity_retention"
)



=== Processing /content/drive/MyDrive/eval_style_pairs_with_outputs_gpt4_with_sim.csv ===
Saved spacy_entity_retention. Mean = 0.640

=== Processing /content/drive/MyDrive/eval_with_lora_with_sim.csv ===
Saved spacy_entity_retention. Mean = 0.450

=== Processing /content/drive/MyDrive/external_genre_validation_400_with_tragets_qwen_base_with_sim.csv ===
Saved spacy_entity_retention. Mean = 0.474


In [None]:
# 1) GPT-4 eval
df_gpt4 = add_spacy_retention_to_csv(
    csv_path="/content/drive/MyDrive/test_dataset_mode_transfer_cleaned_with_sim.csv",
    orig_col="raw_content",
    gen_col="output_gpt4",
    new_col="spacy_entity_retention"
)

# 2) LoRA eval
df_lora = add_spacy_retention_to_csv(
    csv_path="/content/drive/MyDrive/new_test_with_lora_with_sim.csv",
    orig_col="raw_content",
    gen_col="output",
    new_col="spacy_entity_retention"
)

# 3) Qwen base eval
df_qwen = add_spacy_retention_to_csv(
    csv_path="/content/drive/MyDrive/test_eval_qwen_base_with_sim.csv",
    orig_col="raw_content",
    gen_col="output_qwen_base",
    new_col="spacy_entity_retention"
)


=== Processing /content/drive/MyDrive/test_dataset_mode_transfer_cleaned_with_sim.csv ===
Saved spacy_entity_retention. Mean = 0.725

=== Processing /content/drive/MyDrive/new_test_with_lora_with_sim.csv ===
Saved spacy_entity_retention. Mean = 0.587

=== Processing /content/drive/MyDrive/test_eval_qwen_base_with_sim.csv ===
Saved spacy_entity_retention. Mean = 0.572
