# Spooky Books Author Prediction - v10
Advanced TF-IDF + engineered features, calibrated models, and optimized voting weights via CV.

## 1. Import Libraries

In [16]:
import pandas as pd
import numpy as np
from scipy.stats import loguniform
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
from scipy.sparse import hstack
import re

## 2. Load & Encode Data

In [17]:
train = pd.read_csv('./train/train.csv')
test  = pd.read_csv('./test/test.csv')
X_text = train['text']
y = train['author']
X_test_text = test['text']

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_enc = le.fit_transform(y)
print('Classes:', le.classes_)
print('Train shape:', train.shape, 'Test shape:', test.shape)

Classes: ['EAP' 'HPL' 'MWS']
Train shape: (19579, 3) Test shape: (8392, 2)


## 3. Feature Engineering Functions

In [18]:
def text_features(texts):
    df = pd.DataFrame()
    df['char_count'] = texts.str.len()
    df['word_count'] = texts.str.split().apply(len)
    df['avg_word_len'] = df['char_count'] / df['word_count']
    df['punct_count'] = texts.apply(lambda x: len(re.findall(r'[.,!?;]', x)))
    return df.fillna(0)

# Compute engineered features
feat_train = text_features(X_text)
feat_test = text_features(X_test_text)

## 4. TF-IDF + Engineered Features

In [19]:
# TF-IDF vectorizers with sublinear_tf and smooth_idf
word_tf = TfidfVectorizer(
    ngram_range=(1,2), max_features=15000, stop_words='english',
    sublinear_tf=True, smooth_idf=True
)
char_tf = TfidfVectorizer(
    analyzer='char_wb', ngram_range=(3,5), max_features=5000,
    sublinear_tf=True, smooth_idf=True
)

# Transform text
X_word = word_tf.fit_transform(X_text)
X_char = char_tf.fit_transform(X_text)
X_tfidf = hstack([X_word, X_char])

# Combine with engineered numeric features
from scipy.sparse import csr_matrix
X_num = csr_matrix(feat_train.values)
X = hstack([X_tfidf, X_num])

# Test data
X_test_tfidf = hstack([word_tf.transform(X_test_text), char_tf.transform(X_test_text)])
X_test = hstack([X_test_tfidf, csr_matrix(feat_test.values)])

## 5. Model Tuning with Calibration

In [20]:
# Logistic Regression
lr = LogisticRegression(solver='saga', multi_class='multinomial', max_iter=2000, random_state=42, n_jobs=-1)
lr_cal = CalibratedClassifierCV(lr, cv=3, method='sigmoid')
lr_search = RandomizedSearchCV(
    lr, {'C': loguniform(1e-2, 1e2)}, n_iter=10,
    scoring='neg_log_loss', cv=5, n_jobs=-1, random_state=42
)
lr_search.fit(X, y_enc)
print('LR best C:', lr_search.best_params_)

# Multinomial NB
nb = MultinomialNB()
nb_cal = CalibratedClassifierCV(nb, cv=3, method='sigmoid')
nb_search = RandomizedSearchCV(
    nb, {'alpha': np.linspace(0.1, 1.0, 10)}, n_iter=10,
    scoring='neg_log_loss', cv=5, n_jobs=-1, random_state=42
)
nb_search.fit(X, y_enc)
print('NB best alpha:', nb_search.best_params_)



LR best C: {'C': np.float64(63.512210106407046)}
NB best alpha: {'alpha': np.float64(0.1)}


## 6. Optimized Weighted Voting Ensemble

In [21]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds = np.zeros((X_test.shape[0], len(le.classes_)))
val_losses = []

# Determine optimal weights via grid search over two weights
best = {'loss': np.inf, 'w_lr': None}
w_range = np.linspace(0, 1, 21)
for w in w_range:
    temp_losses = []
    for tr, val in skf.split(X, y_enc):
        p1 = lr_search.predict_proba(X[val])
        p2 = nb_search.predict_proba(X[val])
        loss = log_loss(y_enc[val], w*p1 + (1-w)*p2)
        temp_losses.append(loss)
    avg = np.mean(temp_losses)
    if avg < best['loss']:
        best = {'loss': avg, 'w_lr': w}
w_lr = best['w_lr']; w_nb = 1 - w_lr
print(f"Optimal weights -> LR: {w_lr:.2f}, NB: {w_nb:.2f}, Log Loss: {best['loss']:.4f}")

# Generate test predictions
for name, model, w in [('lr', lr_search, w_lr), ('nb', nb_search, w_nb)]:
    test_preds += w * model.predict_proba(X_test)


Optimal weights -> LR: 0.00, NB: 1.00, Log Loss: 0.2476


## 7. Submission

In [22]:
submission = pd.DataFrame(test_preds, columns=le.classes_)
submission.insert(0, 'id', test['id'])
submission.to_csv('./sample_submission/submission_v10.csv', index=False)
print('Saved: submission_v10.csv')

Saved: submission_v10.csv
