# Spooky Books Author Prediction - Version 4

Perbaikan pipeline dengan:
1. Cross-validation dan stratifikasi
2. Extended TF-IDF (word & char n-grams)
3. Grid search untuk kedua model
4. Stacked ensemble dengan logistic meta-model
5. Evaluasi log-loss CV

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss

## 2. Load Data

In [2]:
# Read the training and test CSVs from their folders next to the notebook.
train, test = (pd.read_csv(path) for path in ('./train/train.csv', './test/test.csv'))

# The raw sentences are the model input; the author label is the target.
X, y = train['text'], train['author']
X_test = test['text']

# Report (rows, columns) of each frame as a quick sanity check.
for label, frame in (('Train shape:', train), ('Test shape:', test)):
    print(label, frame.shape)

Train shape: (19579, 3)
Test shape: (8392, 2)


## 3. Define TF-IDF Pipelines

In [3]:
# Word n-grams pipeline: unigram + bigram TF-IDF features feeding a
# logistic regression that outputs class probabilities.
word_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=12000,       # keep only the 12k most frequent terms
        ngram_range=(1,2),
        stop_words='english',
        min_df=4,                 # drop terms seen in fewer than 4 documents
        max_df=0.85               # drop terms seen in more than 85% of documents
    )),
    # `multi_class` is deliberately not passed: it is deprecated since
    # scikit-learn 1.5 and scheduled for removal. With the 'saga' solver on a
    # target with more than two classes the default already fits a
    # multinomial model, so the fitted model is the same as with
    # multi_class='multinomial'.
    ('clf', LogisticRegression(solver='saga', random_state=42, max_iter=2000))
])

# Char n-grams pipeline: 3- to 5-character TF-IDF features taken inside word
# boundaries ('char_wb') feeding a linear SVM.
char_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=8000,
        analyzer='char_wb',
        ngram_range=(3,5)
    )),
    # LinearSVC exposes decision scores only (no predict_proba), so it needs a
    # calibration wrapper before probabilities can be taken from it.
    ('clf', LinearSVC(random_state=42, max_iter=2000))
])

## 4. Hyperparameter Tuning

In [4]:
# Candidate values of C (inverse regularisation strength) for the `clf` step
# of each pipeline.
param_grid_word = {'clf__C': [0.1, 1, 10]}
param_grid_char = {'clf__C': [0.5, 1.0]}

# A single shuffled, seeded, stratified 5-fold splitter shared by both searches.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Word model: selected on log-loss.
grid_word = GridSearchCV(
    estimator=word_pipeline,
    param_grid=param_grid_word,
    scoring='neg_log_loss',
    cv=cv,
    n_jobs=-1,
).fit(X, y)

# Char model: selected on accuracy, because LinearSVC has no predict_proba and
# therefore cannot be scored with log-loss directly.
# NOTE(review): this differs from the log-loss criterion used for the word
# model and for the stacked ensemble -- confirm that this is intended.
grid_char = GridSearchCV(
    estimator=char_pipeline,
    param_grid=param_grid_char,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
).fit(X, y)

# 'neg_log_loss' is a "greater is better" score, so the sign is flipped to
# report the actual log-loss.
print('Word best params:', grid_word.best_params_)
print('Word CV log-loss:', -grid_word.best_score_)
print('Char best params:', grid_char.best_params_)
print('Char CV accuracy:', grid_char.best_score_)



Word best params: {'clf__C': 10}
Word CV log-loss: 0.523305066328412
Char best params: {'clf__C': 1.0}
Char CV accuracy: 0.80821289219197


## 5. Stacking Ensemble

In [5]:
# Base learners for the stack: the tuned word pipeline, and the tuned char
# pipeline wrapped in a calibrator so that it can provide predict_proba
# (LinearSVC alone only exposes decision scores).
estimators = [
    ('word', grid_word.best_estimator_),
    ('char', CalibratedClassifierCV(grid_char.best_estimator_, cv=3))
]

# The meta-learner is a logistic regression trained on the base learners'
# out-of-fold class probabilities.
# `multi_class` is deliberately not passed: it is deprecated since
# scikit-learn 1.5 and scheduled for removal. With the 'saga' solver on a
# target with more than two classes the default already fits a multinomial
# model, so the fitted model is the same as with multi_class='multinomial'.
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(solver='saga', random_state=42),
    cv=cv,
    stack_method='predict_proba',
    n_jobs=-1
)

# CV log-loss for stacking ('neg_log_loss' is negated back to a plain log-loss)
scores = cross_val_score(stack, X, y, cv=cv, scoring='neg_log_loss', n_jobs=-1)
print('Stacking CV log-loss:', -scores.mean())

# Train on full data
stack.fit(X, y)

Stacking CV log-loss: 0.42942359390950946




## 6. Predict and Submit

In [6]:
# Class probabilities for every test sentence; columns follow stack.classes_.
y_pred = stack.predict_proba(X_test)

# Label the probability columns with the class names, attach the test ids
# (aligned on the row index) and fix the column order to id, EAP, HPL, MWS.
submission = (
    pd.DataFrame(y_pred, columns=stack.classes_)
    .assign(id=test['id'])
    .loc[:, ['id', 'EAP', 'HPL', 'MWS']]
)

# Write the submission file without the index column, then preview it.
submission.to_csv('./sample_submission/submission_v4.csv', index=False)
submission.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.031637,0.011828,0.956535
1,id24541,0.96279,0.018454,0.018755
2,id00134,0.0559,0.929401,0.014699
3,id27757,0.552792,0.404292,0.042916
4,id04081,0.936603,0.024782,0.038615
