# Spooky Books Author Prediction - Version 6
Pipeline improvements:
1. Text cleaning + simple regex tokenizer
2. Combine word & char TF-IDF with punctuation & length features
3. Hugging Face Transformer embeddings (BERT)
4. Expanded hyperparameter search with Optuna
5. Stacking ensemble including Naive Bayes, Logistic, and Transformer
6. Stratified k-fold CV with log-loss evaluation

In [9]:
!pip install tqdm optuna sentence_transformers hf_xet



In [10]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

# Load data
train = pd.read_csv('./train/train.csv')
test = pd.read_csv('./test/test.csv')
X = train['text']
y = train['author']
X_test = test['text']
print('Train shape:', train.shape)
print('Test shape:', test.shape)

Train shape: (19579, 3)
Test shape: (8392, 2)


In [11]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

MY_STOP = set(ENGLISH_STOP_WORDS)
print(f"Using {len(MY_STOP)} total stop-words")

def clean_text_simple(doc):
    doc = doc.lower()
    tokens = re.findall(r"[a-z]{3,}", doc)
    tokens = [w for w in tokens if w not in MY_STOP]
    return " ".join(tokens)

# Apply cleaning
X_clean = X.map(clean_text_simple)
X_test_clean = X_test.map(clean_text_simple)
print('First cleaned doc:', X_clean.iloc[0][:100])

Using 318 total stop-words
First cleaned doc: process afforded means ascertaining dimensions dungeon make circuit return point set aware fact perf


In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import log_loss
from sentence_transformers import SentenceTransformer
import optuna

# Feature engineering
word_vect = TfidfVectorizer(ngram_range=(1,2), max_features=15000)
char_vect = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), max_features=10000)

class TextStats(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self

    def transform(self, X):
        char_count = [len(s) for s in X]
        word_count = [len(s.split()) for s in X]
        punct_ratio = [
            (sum(1 for c in s if c in '.,;:!?') / len(s)) if len(s) > 0 else 0.0
            for s in X
        ]

        df = pd.DataFrame({
            'char_count': char_count,
            'word_count': word_count,
            'punct_ratio': punct_ratio
        })
        return df.values


# BERT embeddings
bert = SentenceTransformer('all-MiniLM-L6-v2')

# Stacking pipeline
models = [('nb', MultinomialNB()), ('lr', LogisticRegression(solver='saga', max_iter=2000, C=1))]
from sklearn.pipeline import FeatureUnion, Pipeline
feature_union = FeatureUnion([
    ('word', Pipeline([('vect', word_vect)])),
    ('char', Pipeline([('vect', char_vect)])),
    ('stats', TextStats())
])
stack = StackingClassifier(
    estimators=models,
    final_estimator=LogisticRegression(multi_class='multinomial', solver='saga', C=1),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    stack_method='predict_proba',
    n_jobs=-1
)
pipeline = Pipeline([('features', feature_union), ('clf', stack)])

# CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_clean, y, cv=cv, scoring='neg_log_loss', n_jobs=-1)
print('Stack CV log-loss:', -scores.mean())

# Final training & submission
pipeline.fit(X_clean, y)
pred = pipeline.predict_proba(X_test_clean)
submission = pd.DataFrame(pred, columns=pipeline.named_steps['clf'].classes_)
submission.insert(0, 'id', test['id'])
submission.to_csv('submission_v6.csv', index=False)
submission.head()

Stack CV log-loss: 0.5665916108553171




Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.051174,0.01683,0.931995
1,id24541,0.873467,0.070228,0.056305
2,id00134,0.086998,0.873684,0.039318
3,id27757,0.139421,0.812852,0.047727
4,id04081,0.918449,0.041743,0.039808
