# Spooky Books Author Prediction - Version 7
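This notebook builds a stacked author classifier. Word-level and character-level TF-IDF features are concatenated and reduced with truncated SVD; calibrated logistic-regression and linear-SVM base models are tuned on those features with a randomized search over `C`; and a logistic-regression meta-learner combines their predicted probabilities. All reported scores are 5-fold stratified cross-validated log-loss (lower is better).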

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from scipy.stats import loguniform
from sklearn.metrics import log_loss

## 1. Load Data

In [8]:
# Read the training and test CSVs from paths relative to the notebook.
train = pd.read_csv('./train/train.csv')
test = pd.read_csv('./test/test.csv')

# Model input is the `text` column; the label to predict is the `author` column.
X = train['text']
y = train['author']
X_test = test['text']

# Quick sanity check: (rows, columns) of each frame.
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

Train shape: (19579, 3)
Test shape: (8392, 2)


## 2. Build Combined TF-IDF + SVD Feature Extractor

In [9]:
# Word-level view: unigrams and bigrams with English stop words removed.
# Terms seen in fewer than 3 documents or in more than 85% of documents are
# dropped, and the vocabulary is capped at 10,000 entries.
word_tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.85,
    max_features=10000,
)

# Character-level view: 3- to 5-character n-grams built only from text inside
# word boundaries ('char_wb'), capped at 5,000 entries.
char_tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    max_features=5000,
)

# Concatenate both TF-IDF matrices column-wise.
features = FeatureUnion([
    ('word', word_tfidf),
    ('char', char_tfidf),
])

# Project the combined sparse features onto 300 SVD components.
preprocessor = Pipeline(steps=[
    ('tfidf_union', features),
    ('svd', TruncatedSVD(n_components=300, random_state=42)),
])

## 3. Define and Calibrate Base Estimators

In [10]:
# Two base models on top of the shared TF-IDF + SVD features. Each classifier is
# wrapped in CalibratedClassifierCV (3 internal folds) so that it exposes
# calibrated class probabilities via predict_proba.
#
# NOTE: both pipelines hold a reference to the *same* `preprocessor` object.
# Fit them only through utilities that clone the estimator first (as the
# hyperparameter search below does); fitting `base_lr` and `base_svc` directly
# would make them overwrite each other's fitted preprocessing state.

# Logistic regression with isotonic calibration.
# `multi_class='multinomial'` is intentionally not passed: the argument is
# deprecated since scikit-learn 1.5, and the default already uses the
# multinomial loss for the 'saga' solver with more than two classes.
# `n_jobs` is also not passed: per the scikit-learn docs it only parallelizes
# over classes in one-vs-rest mode, so it has no effect here.
base_lr = Pipeline([
    ('prep', preprocessor),
    ('clf', CalibratedClassifierCV(
        LogisticRegression(
            solver='saga',
            max_iter=2000,
            random_state=42
        ),
        cv=3, method='isotonic'
    ))
])

# Linear SVM with sigmoid (Platt) calibration; LinearSVC has no predict_proba
# of its own, so the calibration wrapper is what provides probabilities.
base_svc = Pipeline([
    ('prep', preprocessor),
    ('clf', CalibratedClassifierCV(
        LinearSVC(
            max_iter=2000,
            random_state=42
        ),
        cv=3, method='sigmoid'
    ))
])

## 4. Randomized Hyperparameter Search

In [11]:
# Outer evaluation splitter: 5 stratified, shuffled folds with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Settings common to both searches: score by negative log-loss on the outer
# folds, use all cores, and fix the sampling seed.
shared_search_args = dict(
    scoring='neg_log_loss',
    cv=cv,
    n_jobs=-1,
    random_state=42,
)

# Logistic regression: 8 draws of C, log-uniform on [1e-2, 1e2].
# `clf__estimator__C` addresses C on the classifier wrapped by the
# calibration step of the pipeline.
search_lr = RandomizedSearchCV(
    estimator=base_lr,
    param_distributions={'clf__estimator__C': loguniform(1e-2, 1e2)},
    n_iter=8,
    **shared_search_args,
)

# Linear SVM: 6 draws of C from the same range.
search_svc = RandomizedSearchCV(
    estimator=base_svc,
    param_distributions={'clf__estimator__C': loguniform(1e-2, 1e2)},
    n_iter=6,
    **shared_search_args,
)

search_lr.fit(X, y)
search_svc.fit(X, y)

# best_score_ is the negated log-loss, so flip the sign for reporting.
print('LR best C:', search_lr.best_params_, 'CV log-loss:', -search_lr.best_score_)
print('SVC best C:', search_svc.best_params_, 'CV log-loss:', -search_svc.best_score_)



LR best C: {'clf__estimator__C': np.float64(2.481040974867813)} CV log-loss: 0.6437798058950017
SVC best C: {'clf__estimator__C': np.float64(63.512210106407046)} CV log-loss: 0.6341337500525179


## 5. Light Stacking Ensemble

In [12]:
# Base learners for the stack: the refit best pipelines from the two searches.
estimators = [
    ('lr', search_lr.best_estimator_),
    ('svc', search_svc.best_estimator_)
]

# The meta-learner is a logistic regression trained on the base models'
# out-of-fold class probabilities (3 inner stratified folds).
# `multi_class='multinomial'` is intentionally not passed: the argument is
# deprecated since scikit-learn 1.5, and the default already uses the
# multinomial loss for the 'saga' solver with more than two classes.
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(
        solver='saga',
        random_state=42
    ),
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    stack_method='predict_proba',
    n_jobs=-1
)

# Estimate the ensemble's log-loss on the same 5 outer folds used for the
# base-model searches; the scorer returns negated loss, hence the sign flip.
stack_loss = -cross_val_score(
    stack, X, y, cv=cv, scoring='neg_log_loss', n_jobs=-1
).mean()
print('Stacking CV log-loss:', stack_loss)

# Refit on the full training set for the final test-set predictions.
stack.fit(X, y)

Stacking CV log-loss: 0.629548310804949




## 6. Predict and Create Submission

In [14]:
# Class probabilities for every test row; columns follow stack.classes_.
y_pred = stack.predict_proba(X_test)
submission = pd.DataFrame(y_pred, columns=stack.classes_)

# Insert ids by position, not by index label. `submission` has a fresh
# 0..n-1 index, so passing the Series itself would align on labels and yield
# NaN or misordered ids if `test` ever carries a non-default index
# (e.g. after filtering or sorting).
submission.insert(0, 'id', test['id'].to_numpy())

# Fix the column order written to the submission file.
submission = submission[['id','EAP','HPL','MWS']]
submission.to_csv('./sample_submission/submission_v7.csv', index=False)
submission.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.042691,0.027124,0.930186
1,id24541,0.882359,0.063515,0.054127
2,id00134,0.038655,0.943285,0.01806
3,id27757,0.454769,0.48515,0.060081
4,id04081,0.678239,0.07737,0.244391
