# Spooky Books Author Prediction - Version 8
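Stacked ensemble for three-class author prediction, built on a shared word- and character-level TF-IDF feature union. Three base learners (logistic regression, multinomial naive Bayes, and a calibrated linear SVC) are tuned with randomized hyperparameter search under stratified 5-fold cross-validation, then combined by a logistic-regression meta-learner. All scores are multiclass log-loss (lower is better).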

## 1. Import Libraries

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
from scipy.stats import loguniform
from scipy.sparse import hstack

from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import log_loss

## 2. Load Data

In [2]:
# Read the training and test CSVs; both paths are relative to the
# notebook's working directory.
train = pd.read_csv(filepath_or_buffer='./train/train.csv')
test = pd.read_csv(filepath_or_buffer='./test/test.csv')

# Model inputs are the 'text' columns; the target is the 'author' column.
X_text = train['text']
y = train['author']
X_test_text = test['text']

# Quick sanity check: (rows, columns) of each frame.
print('Train shape:', train.shape)
print('Test shape:', test.shape)

Train shape: (19579, 3)
Test shape: (8392, 2)


## 3. Shared TF-IDF Feature Union

In [3]:
# Feature extractor used by every base learner: a word-level and a
# character-level TF-IDF block, concatenated side by side by FeatureUnion.
# NOTE(review): stop_words='english' drops common function words from the
# word view; these may carry authorship signal — confirm this is intended.
preprocessor = FeatureUnion(transformer_list=[
    (
        'word',
        TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 2),    # unigrams and bigrams
            min_df=3,              # ignore terms seen in fewer than 3 documents
            max_df=0.85,           # ignore terms seen in more than 85% of documents
            max_features=12000,
        ),
    ),
    (
        'char',
        TfidfVectorizer(
            analyzer='char_wb',    # character n-grams taken inside word boundaries
            ngram_range=(3, 5),
            max_features=8000,
        ),
    ),
])

# Both names refer to the same FeatureUnion object.
tfidf_union = preprocessor

## 4. Define Base Learner Pipelines

In [4]:
# Three base learners, each a two-step pipeline: TF-IDF features ('tfidf')
# followed by a probabilistic classifier ('clf'). The step names are relied on
# by the 'clf__...' keys of the hyperparameter search space.
#
# Every pipeline receives its own unfitted clone of `preprocessor`. Sharing one
# FeatureUnion instance would let a direct `.fit()` on one pipeline overwrite
# the vocabulary/IDF state seen by the other two.
base_pipelines = {
    # Multinomial logistic regression.
    # * `multi_class` is intentionally not passed: the default already selects
    #   the multinomial loss for the 'saga' solver with more than two classes,
    #   and the argument is deprecated in recent scikit-learn releases.
    # * `n_jobs` is intentionally not passed: it only parallelises
    #   one-vs-rest fits, so it has no effect here; parallelism is supplied
    #   by the cross-validation / search wrappers instead.
    'lr': Pipeline([
        ('tfidf', clone(preprocessor)),
        ('clf', LogisticRegression(
            solver='saga', max_iter=2000, random_state=42
        ))
    ]),

    # Multinomial naive Bayes on the non-negative TF-IDF weights.
    'nb': Pipeline([
        ('tfidf', clone(preprocessor)),
        ('clf', MultinomialNB())
    ]),

    # LinearSVC has no predict_proba, so it is wrapped in a 3-fold sigmoid
    # calibrator to produce the probabilities needed for log-loss scoring
    # and for stacking on predict_proba.
    'svc': Pipeline([
        ('tfidf', clone(preprocessor)),
        ('clf', CalibratedClassifierCV(
            LinearSVC(max_iter=2000, random_state=42),
            cv=3, method='sigmoid'
        ))
    ])
}

## 5. Randomized Hyperparameter Search

In [5]:
# Search space per base learner, keyed by the pipeline names in `base_pipelines`.
# Continuous spaces are scipy distributions; `nb` uses a discrete grid of 5 values.
# NOTE(review): the recorded best alpha for `nb` is 0.1, the lower edge of this
# grid — consider extending the range towards smaller values.
param_dists = {
    'lr': {'clf__C': loguniform(1e-2, 1e2)},
    'nb': {'clf__alpha': np.linspace(0.1, 1.0, 5)},
    'svc': {'clf__estimator__C': loguniform(1e-2, 1e2)}
}

# One shared, seeded splitter so every model is scored on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Maps model name -> best pipeline, refit on all of (X_text, y) by the search.
searches = {}
for name, pipe in base_pipelines.items():
    space = param_dists[name]
    n_iter = 8 if name == 'lr' else 6

    # A space made only of finite lists/arrays (nothing with an `.rvs` sampler)
    # cannot yield more distinct candidates than its grid size. Asking for more
    # makes RandomizedSearchCV warn and clip, so cap the budget explicitly.
    if not any(hasattr(values, 'rvs') for values in space.values()):
        grid_size = int(np.prod([len(values) for values in space.values()]))
        n_iter = min(n_iter, grid_size)

    rs = RandomizedSearchCV(
        pipe, space, n_iter=n_iter,
        scoring='neg_log_loss', cv=cv, n_jobs=-1, random_state=42
    )
    print(f"Tuning {name}...")
    rs.fit(X_text, y)
    # best_score_ is the negated log-loss; flip the sign for reporting.
    print(f"{name} best params: {rs.best_params_}, log-loss={-rs.best_score_:.4f}\n")
    searches[name] = rs.best_estimator_

Tuning lr...




lr best params: {'clf__C': np.float64(2.481040974867813)}, log-loss=0.4212

Tuning nb...




nb best params: {'clf__alpha': np.float64(0.1)}, log-loss=0.4468

Tuning svc...
svc best params: {'clf__estimator__C': np.float64(0.31489116479568624)}, log-loss=0.4309



## 6. Stacked Ensemble & Evaluation

In [6]:
# `searches` maps model name -> tuned pipeline, which is exactly the
# (name, estimator) pair list that StackingClassifier expects.
estimators = list(searches.items())

# Meta-learner: logistic regression trained on the base learners' out-of-fold
# class probabilities (inner 3-fold stratified split).
# `multi_class` is intentionally not passed: the default already selects the
# multinomial loss for 'saga' with more than two classes, and the argument is
# deprecated in recent scikit-learn releases.
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(solver='saga', random_state=42),
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    stack_method='predict_proba', n_jobs=-1
)

# Outer 5-fold estimate of the ensemble's log-loss, on the same `cv` splitter
# used for tuning. The scorer returns negated log-loss, hence the sign flip.
# NOTE(review): base-learner hyperparameters were selected on all of
# (X_text, y), so this estimate is likely slightly optimistic.
stack_loss = -cross_val_score(
    stack, X_text, y, cv=cv, scoring='neg_log_loss', n_jobs=-1
).mean()
print(f"Stacking CV log-loss: {stack_loss:.4f}")

Stacking CV log-loss: 0.4044


## 7. Final Training & Submission

In [7]:
# Resolve the output location first and make sure its directory exists, so a
# missing folder cannot fail the cell after the expensive fit has finished.
submission_path = Path('./sample_submission/spooky_books_author_v8.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)

# Refit the full stack on all training data and score the test sentences.
stack.fit(X_text, y)
probs = stack.predict_proba(X_test_text)

# Probability columns are labelled from the fitted estimator's class order.
submission = pd.DataFrame(probs, columns=stack.classes_)
# Attach ids positionally: rows of `probs` follow the row order of `test`, and
# passing an array avoids index alignment introducing NaN ids.
submission.insert(0, 'id', test['id'].to_numpy())
# Fix the column order explicitly.
submission = submission[['id', 'EAP', 'HPL', 'MWS']]

submission.to_csv(submission_path, index=False)
print("Saved: spooky_books_author_v8.csv")



Saved: spooky_books_author_v8.csv
