In [None]:
# Mount Google Drive; the dataset and the save directory both live under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import pickle

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# ------------------------------------------------------------
# 1. Configuration -- adjust these values for your own run
# ------------------------------------------------------------

# Input CSV on Drive.
CSV_PATH = "/content/drive/MyDrive/novel_paragraphs_6kself_plus_transfer.csv"

# Column holding the text to classify (e.g. "raw_content" or "output").
TEXT_COL = "raw_content"

# Column holding the genre label (e.g. "style", "source_style", "target_style").
LABEL_COL = "source_style"

# Root folder on Drive for everything this notebook saves; created if missing.
BASE_SAVE_DIR = "/content/drive/MyDrive/genre_classification_mpnet"
os.makedirs(BASE_SAVE_DIR, exist_ok=True)

# Individual artifact locations, all inside BASE_SAVE_DIR.
MODEL_SAVE_PATH = os.path.join(BASE_SAVE_DIR, "mpnet_model")
EMB_SAVE_PATH = os.path.join(BASE_SAVE_DIR, "genre_embeddings.npy")
LBL_SAVE_PATH = os.path.join(BASE_SAVE_DIR, "genre_labels.csv")
CLF_SAVE_PATH = os.path.join(BASE_SAVE_DIR, "genre_classifier.pkl")

print("Base save dir:", BASE_SAVE_DIR)

Base save dir: /content/drive/MyDrive/genre_classification_mpnet


In [None]:
# ------------------------------------------------------------
# 2. Load dataset
# ------------------------------------------------------------
df = pd.read_csv(CSV_PATH)
print("Loaded dataset with shape:", df.shape)

# A row is only usable when both the text and its label are present.
required_cols = [TEXT_COL, LABEL_COL]
df = df.dropna(subset=required_cols)
df = df.reset_index(drop=True)
print("After dropping NA, shape:", df.shape)

# Plain Python lists: `texts` feeds the encoder, `labels` feeds the split.
texts, labels = df[TEXT_COL].tolist(), df[LABEL_COL].tolist()

print("Number of examples:", len(texts))
print("Unique labels (genres):", sorted(set(labels)))

Loaded dataset with shape: (9575, 6)
After dropping NA, shape: (9575, 6)
Number of examples: 9575
Unique labels (genres): ['comedy', 'detective_mystery', 'fantasy', 'goth', 'romance', 'science_fiction']


In [None]:
# ------------------------------------------------------------
# 3. Load SentenceTransformer model
# ------------------------------------------------------------
model_name = "sentence-transformers/all-mpnet-base-v2"
print("Loading model:", model_name)
model = SentenceTransformer(model_name)

# ------------------------------------------------------------
# 4. Compute embeddings
# ------------------------------------------------------------
# Encode every text once; the classifiers below train on these fixed vectors.
print("Encoding texts to embeddings...")
embeddings = model.encode(
    texts,
    show_progress_bar=True,
    batch_size=32,
    normalize_embeddings=True,  # L2-normalised vectors suit a linear classifier
    convert_to_numpy=True,      # return a numpy array
)

print("Embeddings shape:", embeddings.shape)

Loading model: sentence-transformers/all-mpnet-base-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding texts to embeddings...


Batches:   0%|          | 0/300 [00:00<?, ?it/s]

Embeddings shape: (9575, 768)


In [None]:
# ------------------------------------------------------------
# 5. Train / test split
# ------------------------------------------------------------
# Hold out 20% of the examples; stratifying keeps the genre mix the same in
# both parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    labels,
    stratify=labels,
    random_state=42,
    test_size=0.2,
)

print("Train size:", len(X_train), "Test size:", len(X_test))

Train size: 7660 Test size: 1915


In [None]:
# ============================
# 6. Train classifier
# ============================
# Multinomial (softmax) logistic regression on the sentence embeddings.
#
# `multi_class="multinomial"` is intentionally not passed: the parameter is
# deprecated since scikit-learn 1.5, and the default lbfgs solver already fits
# a multinomial model whenever there are three or more classes.
# `n_jobs` is dropped as well: it only parallelised the one-vs-rest scheme, so
# it had no effect on this multinomial fit.
clf = LogisticRegression(max_iter=1000)

print("Training Logistic Regression classifier...")
clf.fit(X_train, y_train)
print("Training done.")

Training Logistic Regression classifier...




Training done.


In [None]:
# ------------------------------------------------------------
# 7. Evaluate classifier
# ------------------------------------------------------------
# Score the held-out split: per-genre precision/recall/F1 plus the raw
# confusion counts.
y_pred = clf.predict(X_test)

report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("\n=== Classification Report ===\n")
print(report_text)
print("\n=== Confusion Matrix ===\n")
print(confusion)


=== Classification Report ===

                   precision    recall  f1-score   support

           comedy       0.72      0.75      0.73       320
detective_mystery       0.72      0.74      0.73       320
          fantasy       0.76      0.72      0.74       320
             goth       0.70      0.72      0.71       320
          romance       0.72      0.69      0.71       320
  science_fiction       0.79      0.77      0.78       315

         accuracy                           0.73      1915
        macro avg       0.73      0.73      0.73      1915
     weighted avg       0.73      0.73      0.73      1915


=== Confusion Matrix ===

[[239  19  18  11  17  16]
 [ 11 237  13  29  18  12]
 [ 19  16 230  18  18  19]
 [ 15  25  11 232  24  13]
 [ 35  13  21  23 222   6]
 [ 15  18  10  20   9 243]]


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# ===== Dataset wrappers =====
class EmbeddingDataset(Dataset):
    """In-memory dataset pairing precomputed embedding vectors with class indices.

    Args:
        X: Array-like of embeddings, one row per sample; stored as float32.
        y: Array-like of integer class indices, one per sample; stored as int64.

    Raises:
        ValueError: If X and y do not hold the same number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
        # Fail fast: a length mismatch would otherwise surface as an IndexError
        # mid-epoch, or silently ignore the surplus labels.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the (embedding, label) pair at position idx.

        The embedding is a 1-D vector of size emb_dim; the model adds the
        sequence axis itself.
        """
        return self.X[idx], self.y[idx]

# ===== Integer-encode the genre labels =====
# y_train / y_test hold genre names (strings), but torch.long tensors and
# CrossEntropyLoss need class indices. The encoder is fitted on the full label
# list so label_encoder.classes_ (index -> genre name) covers every genre.
label_encoder = LabelEncoder().fit(labels)
y_train_enc = label_encoder.transform(y_train)
y_test_enc = label_encoder.transform(y_test)
num_classes = len(label_encoder.classes_)

# Seed torch so batch shuffling and the weight initialisation that follows
# are repeatable across kernel restarts.
torch.manual_seed(42)

train_ds = EmbeddingDataset(X_train, y_train_enc)
test_ds  = EmbeddingDataset(X_test,  y_test_enc)

batch_size = 64
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Width of one embedding vector (e.g. 768), taken from the training matrix.
emb_dim = X_train.shape[1]

# ===== RNN classifier over embeddings =====
class RNNOnEmbeddings(nn.Module):
    """LSTM followed by a linear layer, applied to one embedding per example.

    Each input vector is presented to the LSTM as a sequence of length one;
    the final hidden state of the top LSTM layer is projected to class logits.

    Args:
        emb_dim: Size of each input embedding vector.
        hidden_dim: Size of the LSTM hidden state (per direction).
        num_classes: Number of output logits.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM also runs in the reverse direction.
    """

    def __init__(self, emb_dim, hidden_dim, num_classes, num_layers=1, bidirectional=False):
        super().__init__()
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(
            emb_dim,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True,
        )
        # A bidirectional LSTM yields one hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, num_classes)

    def forward(self, x):
        """Map a (batch, emb_dim) tensor to (batch, num_classes) logits."""
        # Insert a length-1 sequence axis: (batch, emb_dim) -> (batch, 1, emb_dim).
        _, (h_n, _) = self.lstm(x.unsqueeze(1))

        if not self.bidirectional:
            # Last entry of h_n is the top layer's final hidden state.
            last_hidden = h_n[-1]
        else:
            # Last two entries are the top layer's forward and backward states.
            last_hidden = torch.cat((h_n[-2], h_n[-1]), dim=1)

        return self.fc(last_hidden)

# Size of the LSTM hidden state.
hidden_dim = 256

# Build the network, then move its parameters to the selected device.
model_rnn = RNNOnEmbeddings(
    emb_dim,
    hidden_dim,
    num_classes,
    num_layers=1,
    bidirectional=False,  # may be set to True, though every sequence here has length 1
)
model_rnn = model_rnn.to(device)

# Adam over all model parameters; cross-entropy computed on the raw logits.
optimizer = torch.optim.Adam(model_rnn.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()


Using device: cpu


In [None]:
num_epochs = 20  # adjust as you like

# NOTE: model_rnn and optimizer are created outside this cell, so re-running
# it continues training from the current weights rather than starting over.
for epoch in range(1, num_epochs + 1):
    model_rnn.train()
    total_loss = 0.0

    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        # Forward pass and loss for this mini-batch.
        logits = model_rnn(batch_X)
        loss = criterion(logits, batch_y)

        # Clear stale gradients, backpropagate, take one optimizer step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean; weight it by the batch size so
        # the epoch figure below is a per-sample average.
        total_loss += batch_X.size(0) * loss.item()

    avg_loss = total_loss / len(train_ds)
    print(f"Epoch {epoch}/{num_epochs} - Train loss: {avg_loss:.4f}")


Epoch 1/20 - Train loss: 0.5219
Epoch 2/20 - Train loss: 0.5131
Epoch 3/20 - Train loss: 0.4872
Epoch 4/20 - Train loss: 0.4720
Epoch 5/20 - Train loss: 0.4563
Epoch 6/20 - Train loss: 0.4408
Epoch 7/20 - Train loss: 0.4186
Epoch 8/20 - Train loss: 0.4033
Epoch 9/20 - Train loss: 0.3856
Epoch 10/20 - Train loss: 0.3687
Epoch 11/20 - Train loss: 0.3514
Epoch 12/20 - Train loss: 0.3268
Epoch 13/20 - Train loss: 0.3134
Epoch 14/20 - Train loss: 0.2943
Epoch 15/20 - Train loss: 0.2795
Epoch 16/20 - Train loss: 0.2627
Epoch 17/20 - Train loss: 0.2508
Epoch 18/20 - Train loss: 0.2348
Epoch 19/20 - Train loss: 0.2179
Epoch 20/20 - Train loss: 0.2038


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the trained network on the held-out split.
model_rnn.eval()  # switch the module to evaluation mode
all_preds, all_true = [], []

with torch.no_grad():  # gradients are not needed while scoring
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(device)
        logits = model_rnn(batch_X)
        # Predicted class = index of the largest logit; bring it back to the CPU.
        preds = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_true.extend(batch_y.numpy())  # batch_y was never moved off the CPU

# label_encoder.classes_ turns the integer class indices back into genre names.
rnn_report = classification_report(all_true, all_preds, target_names=label_encoder.classes_)
rnn_confusion = confusion_matrix(all_true, all_preds)

print("\n=== RNN on Embeddings - Classification Report ===\n")
print(rnn_report)
print("\n=== Confusion Matrix ===\n")
print(rnn_confusion)



=== RNN on Embeddings - Classification Report ===

                   precision    recall  f1-score   support

           comedy       0.86      0.88      0.87       320
detective_mystery       0.86      0.90      0.88       320
          fantasy       0.88      0.84      0.86       320
             goth       0.80      0.86      0.83       320
          romance       0.90      0.81      0.85       320
  science_fiction       0.92      0.90      0.91       315

         accuracy                           0.87      1915
        macro avg       0.87      0.87      0.87      1915
     weighted avg       0.87      0.87      0.87      1915


=== Confusion Matrix ===

[[282  11   6   9   5   7]
 [  6 287   7  11   6   3]
 [  8   5 269  18  12   8]
 [  9  16   8 275   6   6]
 [ 17   9  11  21 260   2]
 [  7   7   5  11   1 284]]
