# Spooky Books Author Prediction - Version 5
Pipeline ringan tanpa stacking berat:
1. TF-IDF (unigram + bigram)
2. Feature selection opsional (di notebook ini hanya berupa pembatasan kosakata lewat `max_features=5000` pada TF-IDF)
3. Dua model efisien: Logistic Regression & MultinomialNB
4. Voting ensemble sederhana
5. Cross-validation & evaluasi log-loss
6. Buat submission

## 1. Import Libraries

In [20]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

## 2. Load Data

In [21]:
# Read the competition CSVs from paths relative to the notebook's working directory.
train = pd.read_csv('./train/train.csv')
test = pd.read_csv('./test/test.csv')

# Model input is the 'text' column; the target is the 'author' column.
X = train['text']
y = train['author']
X_test = test['text']

print('Train:', train.shape, 'Test:', test.shape)

Train: (19579, 3) Test: (8392, 2)


## 3. Build Base Pipelines

In [22]:
# Single source of truth for the TF-IDF settings so both base pipelines are
# guaranteed to see the same feature space (unigrams + bigrams, vocabulary
# capped at 5000 terms, English stop words removed).
TFIDF_PARAMS = dict(max_features=5000, ngram_range=(1, 2), stop_words='english')

# Logistic regression on TF-IDF features.
# `multi_class='multinomial'` is intentionally not passed: the parameter is
# deprecated since scikit-learn 1.5, and with solver='saga' the default
# already fits a multinomial model for multiclass targets.
pipe_lr = Pipeline([
    ('tfidf', TfidfVectorizer(**TFIDF_PARAMS)),
    ('clf', LogisticRegression(solver='saga', max_iter=500, random_state=42)),
])

# Multinomial Naive Bayes on the same TF-IDF configuration. Each pipeline gets
# its own vectorizer instance, so nothing fitted is shared between them.
pipe_nb = Pipeline([
    ('tfidf', TfidfVectorizer(**TFIDF_PARAMS)),
    ('clf', MultinomialNB()),
])

## 4. Ensemble Voting

In [23]:
# Combine both base pipelines; voting='soft' blends their predicted class
# probabilities rather than their hard labels.
base_estimators = [('lr', pipe_lr), ('nb', pipe_nb)]
ensemble = VotingClassifier(estimators=base_estimators, voting='soft', n_jobs=-1)

## 5. Cross-Validation Log-loss

In [24]:
# Stratified 5-fold split, shuffled with a fixed seed so the folds are repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The 'neg_log_loss' scorer returns negated log-loss; flip the sign back so
# `scores` holds plain per-fold log-loss values (lower is better).
neg_log_loss = cross_val_score(
    ensemble, X, y,
    scoring='neg_log_loss',
    cv=cv,
    n_jobs=-1,
)
scores = -neg_log_loss
print(f'CV Log-loss: {scores.mean():.4f} ± {scores.std():.4f}')

CV Log-loss: 0.5941 ± 0.0055


## 6. Train & Predict


In [25]:
# Refit the ensemble on the full training set (fit returns the estimator
# itself), then produce per-class probabilities for the test texts.
y_pred = ensemble.fit(X, y).predict_proba(X_test)

## 7. Create Submission

In [26]:
# One probability column per class, in the order the fitted ensemble reports
# them, with the test ids as the first column.
submission = pd.DataFrame(y_pred, columns=ensemble.classes_)
submission.insert(0, 'id', test['id'])

# Create the output directory up front: to_csv does not create missing parent
# directories and would otherwise fail only after the model has been trained.
submission_path = Path('./sample_submission/submission_v5.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

submission.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.296994,0.091558,0.611448
1,id24541,0.787731,0.121854,0.090416
2,id00134,0.472462,0.457572,0.069966
3,id27757,0.64189,0.310674,0.047436
4,id04081,0.738948,0.169696,0.091356
