In [1]:
# tfidf_glove840_optimized.py
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from scipy.sparse import hstack, csr_matrix
from tqdm import tqdm

# === ✅ Load data ===
# Competition CSVs, read from the Kaggle input mount.
train = pd.read_csv('/kaggle/input/identify-the-author/train/train.csv')
test = pd.read_csv('/kaggle/input/identify-the-author/test/test.csv')

# === ✅ Encode labels ===
# Turn the author names into integer class ids. `le.classes_` keeps the
# original names, which are reused as submission column headers further down.
le = LabelEncoder().fit(train['author'])
y = le.transform(train['author'])

# === 🔍 Build vocab from train + test ===
def build_vocab(texts):
    """Return the set of distinct lower-cased, whitespace-delimited tokens.

    Parameters
    ----------
    texts : iterable of str
        Documents to scan.

    Returns
    -------
    set of str
        Every token yielded by ``doc.lower().split()`` across all documents.
        Punctuation is not stripped, so ``"house,"`` and ``"house"`` are
        distinct entries.
    """
    return {token for doc in texts for token in doc.lower().split()}

# Pool train and test texts so the vocabulary covers tokens from both splits.
all_texts = pd.concat([train['text'], test['text']])
full_vocab = build_vocab(all_texts)

# === 📥 Load filtered GloVe.840B.300d ===
def load_filtered_glove(path, vocab, dim=300):
    """Load only the GloVe vectors whose token appears in ``vocab``.

    The file is expected to hold one entry per line: a token followed by
    ``dim`` whitespace-separated float components.

    Parameters
    ----------
    path : str
        Path to the GloVe text file (read as UTF-8).
    vocab : container of str
        Tokens to keep; membership is tested with ``in``.
    dim : int, default 300
        Expected number of vector components per line.

    Returns
    -------
    dict of str -> numpy.ndarray
        Maps each kept token to its ``float32`` vector of length ``dim``.
        Blank lines, lines with the wrong number of fields, and lines whose
        components are not parseable as floats are skipped.
    """
    glove = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="🔄 Loading filtered GloVe"):
            # Split from the right: the last `dim` fields are the vector and
            # whatever precedes them is the token. This keeps tokens that
            # themselves contain spaces intact instead of mis-reading their
            # second word as the first vector component.
            parts = line.strip().rsplit(None, dim)
            if len(parts) != dim + 1:
                continue  # Blank or malformed line (previously an IndexError on blank lines)

            word = parts[0]
            if word not in vocab:
                continue

            try:
                glove[word] = np.asarray(parts[1:], dtype='float32')
            except ValueError:
                continue  # Skip lines with non-numeric components
    return glove


# Location of the GloVe 840B / 300d text file on Kaggle.
# Replace the dataset folder with the name of the dataset attached to your notebook.
glove_path = '/kaggle/input/glove-vectorize/glove.840B.300d.txt'
glove = load_filtered_glove(path=glove_path, vocab=full_vocab, dim=300)

# === 🧠 Convert texts to average GloVe vectors ===
def text_to_glove(texts, glove, dim=300):
    """Embed each document as the mean of its tokens' GloVe vectors.

    Parameters
    ----------
    texts : iterable of str
        Documents; each is lower-cased and split on whitespace.
    glove : dict of str -> numpy.ndarray
        Token-to-vector lookup; tokens missing from it are ignored.
    dim : int, default 300
        Vector length, used for the all-zero fallback row and for the shape
        of an empty result.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_documents, dim)``. A document with no known token
        gets an all-zero row. An empty ``texts`` yields shape ``(0, dim)``.
    """
    vectors = []
    for doc in texts:
        vecs = [glove[tok] for tok in doc.lower().split() if tok in glove]
        if vecs:
            vectors.append(np.mean(vecs, axis=0))
        else:
            # float32 fallback: a float64 zero row would silently upcast the
            # whole matrix as soon as one document has no known token.
            vectors.append(np.zeros(dim, dtype='float32'))

    if not vectors:
        # Keep the result 2-D so it can still be stacked with other features.
        return np.zeros((0, dim), dtype='float32')
    return np.array(vectors)

# Dense averaged-GloVe features, one row per document.
X_glove = text_to_glove(train['text'], glove, dim=300)
X_test_glove = text_to_glove(test['text'], glove, dim=300)

# === 🧾 TF-IDF vectorizers (word + char_wb) ===
# Word uni/bi-grams plus character 3- to 6-grams taken inside word boundaries.
# Each vectorizer keeps at most 25k features that occur in at least 2 documents.
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_features=25000,
    sublinear_tf=True,
    norm='l2'
)

char_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 6),
    min_df=2,
    max_features=25000,
    sublinear_tf=True
)

vectorizer = FeatureUnion([
    ('word', word_vectorizer),
    ('char', char_vectorizer)
])

# The vocabulary / IDF weights are fitted on the training texts only.
X_tfidf = vectorizer.fit_transform(train['text'])
X_test_tfidf = vectorizer.transform(test['text'])

# === 🔀 Stack TF-IDF + GloVe
# Request CSR explicitly: the fold loop below row-indexes these matrices, and
# without `format` hstack only promises "an appropriate" sparse format, which
# may be COO (not indexable).
X_combined = hstack([X_tfidf, csr_matrix(X_glove)], format='csr')
X_test_combined = hstack([X_test_tfidf, csr_matrix(X_test_glove)], format='csr')

# === 🔁 Stratified CV + Train LogisticRegression
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros((X_combined.shape[0], len(le.classes_)))    # out-of-fold probabilities
test_preds = np.zeros((X_test_combined.shape[0], len(le.classes_)))  # fold-averaged test probabilities

for fold, (train_idx, val_idx) in enumerate(skf.split(X_combined, y)):
    print(f"\n🌀 Fold {fold+1}")
    X_tr, X_val = X_combined[train_idx], X_combined[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    # `multi_class` is intentionally not passed: with the saga solver
    # scikit-learn already fits a multinomial model by default, and the
    # parameter is deprecated in recent releases.
    # `random_state` pins saga's data shuffling so reruns give the same scores.
    clf = LogisticRegression(
        max_iter=2000,
        C=2.0,
        solver='saga',
        random_state=42,
        n_jobs=-1
    )
    clf.fit(X_tr, y_tr)

    oof_preds[val_idx] = clf.predict_proba(X_val)
    fold_loss = log_loss(y_val, oof_preds[val_idx])
    print(f"📉 Fold {fold+1} Log Loss: {fold_loss:.4f}")

    # Each fold model contributes an equal share of the test prediction.
    test_preds += clf.predict_proba(X_test_combined) / skf.n_splits

# === 📉 Final Validation Log Loss
overall_loss = log_loss(y, oof_preds)
print(f"\n✅ Overall Validation Log Loss: {overall_loss:.5f}")

# Save LR preds for blending (taken before the clipping applied for the submission file)
lr_oof = oof_preds
lr_test = test_preds


# === 📤 Submission
# Keep probabilities strictly inside (0, 1) so no entry is exactly 0 or 1.
eps = 1e-15
test_preds = np.clip(test_preds, eps, 1 - eps)
sub = pd.DataFrame(test_preds, columns=le.classes_)
sub.insert(0, 'id', test['id'])
sub.to_csv("submission_tfidf_glove840.csv", index=False)
print("\n🚀 submission_tfidf_glove840.csv ready!")
sub.head()


🔄 Loading filtered GloVe: 2196018it [01:44, 21045.77it/s]



🌀 Fold 1
📉 Fold 1 Log Loss: 0.4232

🌀 Fold 2
📉 Fold 2 Log Loss: 0.4112

🌀 Fold 3
📉 Fold 3 Log Loss: 0.4112

🌀 Fold 4
📉 Fold 4 Log Loss: 0.4134

🌀 Fold 5
📉 Fold 5 Log Loss: 0.4262

✅ Overall Validation Log Loss: 0.41703

🚀 submission_tfidf_glove840.csv ready!


Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.06134,0.014535,0.924124
1,id24541,0.887753,0.080363,0.031884
2,id00134,0.007526,0.989299,0.003175
3,id27757,0.630342,0.337847,0.031811
4,id04081,0.79794,0.07052,0.131541


In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import numpy as np
import pandas as pd

# Reuse the label encoder fitted earlier: one probability column per author.
num_classes = len(le.classes_)
nb_oof = np.zeros((X_tfidf.shape[0], num_classes))        # out-of-fold probabilities
nb_test = np.zeros((X_test_tfidf.shape[0], num_classes))  # fold-averaged test probabilities

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_tfidf, y)):
    print(f"\n🔁 NB Fold {fold+1}")
    # alpha is the additive smoothing strength
    clf = MultinomialNB(alpha=0.5).fit(X_tfidf[train_idx], y[train_idx])

    val_proba = clf.predict_proba(X_tfidf[val_idx])
    test_proba = clf.predict_proba(X_test_tfidf)

    nb_oof[val_idx] = val_proba
    nb_test += test_proba / skf.n_splits

# Score the out-of-fold predictions against the true labels
nb_loss = log_loss(y, nb_oof)
print(f"\n✅ Naive Bayes Log Loss: {nb_loss:.5f}")

# Write the submission, with probabilities kept strictly inside (0, 1)
nb_test = nb_test.clip(1e-15, 1 - 1e-15)
sub = pd.DataFrame(nb_test, columns=le.classes_)
sub.insert(0, 'id', test['id'])
sub.to_csv("submission_nb_baseline.csv", index=False)
print("\n🚀 submission_nb_baseline.csv ready!")
sub.head()



🔁 NB Fold 1

🔁 NB Fold 2

🔁 NB Fold 3

🔁 NB Fold 4

🔁 NB Fold 5

✅ Naive Bayes Log Loss: 0.45343

🚀 submission_nb_baseline.csv ready!


Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.013409,0.001262,0.985329
1,id24541,0.990975,0.005684,0.003341
2,id00134,0.00488,0.995003,0.000117
3,id27757,0.185099,0.814522,0.000378
4,id04081,0.953932,0.01671,0.029359


In [3]:
# Scan the LR weight of a two-model blend on the out-of-fold predictions.
# np.linspace yields exactly 17 points from 0.10 to 0.90 (step 0.05); np.arange
# with a fractional step can gain or lose its end point to rounding error.
best_w, best_loss = None, np.inf
for w in np.linspace(0.10, 0.90, 17):
    blend = w * lr_oof + (1 - w) * nb_oof
    loss = log_loss(y, blend)
    print(f"LR {w:.2f} + NB {1-w:.2f} → Log Loss: {loss:.5f}")
    if loss < best_loss:
        best_w, best_loss = w, loss

# Report the winner explicitly instead of leaving it to be read off the list.
print(f"\nBest blend: LR {best_w:.2f} + NB {1-best_w:.2f} → Log Loss: {best_loss:.5f}")


LR 0.10 + NB 0.90 → Log Loss: 0.42671
LR 0.15 + NB 0.85 → Log Loss: 0.41879
LR 0.20 + NB 0.80 → Log Loss: 0.41259
LR 0.25 + NB 0.75 → Log Loss: 0.40765
LR 0.30 + NB 0.70 → Log Loss: 0.40373
LR 0.35 + NB 0.65 → Log Loss: 0.40065
LR 0.40 + NB 0.60 → Log Loss: 0.39831
LR 0.45 + NB 0.55 → Log Loss: 0.39662
LR 0.50 + NB 0.50 → Log Loss: 0.39553
LR 0.55 + NB 0.45 → Log Loss: 0.39500
LR 0.60 + NB 0.40 → Log Loss: 0.39501
LR 0.65 + NB 0.35 → Log Loss: 0.39555
LR 0.70 + NB 0.30 → Log Loss: 0.39660
LR 0.75 + NB 0.25 → Log Loss: 0.39819
LR 0.80 + NB 0.20 → Log Loss: 0.40035
LR 0.85 + NB 0.15 → Log Loss: 0.40312
LR 0.90 + NB 0.10 → Log Loss: 0.40659


In [4]:
# === 🔀 Blend predictions
# Fixed 55 / 45 LR / NB weighting, applied identically to the out-of-fold
# and the test probabilities.
w_lr, w_nb = 0.55, 0.45
blend_oof = w_lr * lr_oof + w_nb * nb_oof
blend_test = w_lr * lr_test + w_nb * nb_test

# === 📉 Final Log Loss
final_loss = log_loss(y, blend_oof)
print(f"\n✅ Blended LogLoss (LR + NB): {final_loss:.5f}")

# === 📤 Save submission
# Probabilities are kept strictly inside (0, 1) before writing.
eps = 1e-15
blend_test = blend_test.clip(eps, 1 - eps)
sub = pd.DataFrame(blend_test, columns=le.classes_)
sub.insert(0, 'id', test['id'])
sub.to_csv("submission_blended_lr_nb.csv", index=False)
print("\n🚀 submission_blended_lr_nb.csv ready!")
sub.head()



✅ Blended LogLoss (LR + NB): 0.39500

🚀 submission_blended_lr_nb.csv ready!


Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.039771,0.008562,0.951667
1,id24541,0.934203,0.046757,0.01904
2,id00134,0.006335,0.991866,0.001799
3,id27757,0.429983,0.552351,0.017666
4,id04081,0.868136,0.046305,0.085559


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# === 🔤 Char TF-IDF
# Plain character 3- to 6-grams (analyzer='char', so n-grams may span word
# boundaries), limited to 20k features seen in at least 2 documents.
char_vectorizer_nb = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 6),
    min_df=2,
    max_features=20000,
    sublinear_tf=True,
)

X_char = char_vectorizer_nb.fit_transform(train['text'])
X_test_char = char_vectorizer_nb.transform(test['text'])

# === 🧠 Char-NB predictions
# Buffers share shape and dtype with the word-NB ones so they blend directly.
nb_char_oof = np.zeros(nb_oof.shape, dtype=nb_oof.dtype)
nb_char_test = np.zeros(nb_test.shape, dtype=nb_test.dtype)

# `skf` is the splitter created for the word-NB run, reused here.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_char, y)):
    print(f"\n🔁 NB-CHAR Fold {fold+1}")
    clf = MultinomialNB(alpha=0.5).fit(X_char[train_idx], y[train_idx])

    val_proba = clf.predict_proba(X_char[val_idx])
    test_proba = clf.predict_proba(X_test_char)

    nb_char_oof[val_idx] = val_proba
    nb_char_test += test_proba / skf.n_splits

# === Evaluate Char-NB alone (optional)
char_nb_loss = log_loss(y, nb_char_oof)
print(f"\n✅ Char-NB Log Loss: {char_nb_loss:.5f}")



🔁 NB-CHAR Fold 1

🔁 NB-CHAR Fold 2

🔁 NB-CHAR Fold 3

🔁 NB-CHAR Fold 4

🔁 NB-CHAR Fold 5

✅ Char-NB Log Loss: 0.57801


In [6]:
# === 🔀 3-way blend
# Weights for LR, word-NB and char-NB, in that order. The same weights are
# applied to the out-of-fold and the test probabilities.
blend_weights = (0.6, 0.3, 0.1)
blend_oof = sum(
    wt * preds for wt, preds in zip(blend_weights, (lr_oof, nb_oof, nb_char_oof))
)
blend_test = sum(
    wt * preds for wt, preds in zip(blend_weights, (lr_test, nb_test, nb_char_test))
)

final_loss = log_loss(y, blend_oof)
print(f"\n✅ 3-Way Blended Log Loss (LR + NB-word + NB-char): {final_loss:.5f}")



✅ 3-Way Blended Log Loss (LR + NB-word + NB-char): 0.40498


In [7]:
# Write the 3-way blend submission; probabilities are kept strictly inside (0, 1).
blend_test = blend_test.clip(1e-15, 1 - 1e-15)
sub = pd.DataFrame(blend_test, columns=le.classes_)
sub.insert(0, 'id', test['id'])
sub.to_csv("submission_blend_3way.csv", index=False)
print("\n🚀 submission_blend_3way.csv ready!")
sub.head()



🚀 submission_blend_3way.csv ready!


Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.045071,0.010775,0.944154
1,id24541,0.926431,0.052242,0.021327
2,id00134,0.008966,0.988891,0.002143
3,id27757,0.464261,0.514975,0.020763
4,id04081,0.82631,0.057367,0.116322


In [8]:
# Small grid over the LR (w1) and word-NB (w2) weights; char-NB takes the
# remainder (w3). Combinations whose remainder would be negative are skipped.
lr_grid = [0.6, 0.65, 0.7]
nb_grid = [0.2, 0.25, 0.3]
for w1 in lr_grid:
    for w2 in nb_grid:
        w3 = 1.0 - w1 - w2
        if w3 >= 0:
            blend = w1 * lr_oof + w2 * nb_oof + w3 * nb_char_oof
            loss = log_loss(y, blend)
            print(f"LR: {w1:.2f}, NB-word: {w2:.2f}, NB-char: {w3:.2f} → LogLoss: {loss:.5f}")


LR: 0.60, NB-word: 0.20, NB-char: 0.20 → LogLoss: 0.41713
LR: 0.60, NB-word: 0.25, NB-char: 0.15 → LogLoss: 0.41081
LR: 0.60, NB-word: 0.30, NB-char: 0.10 → LogLoss: 0.40498
LR: 0.65, NB-word: 0.20, NB-char: 0.15 → LogLoss: 0.41192
LR: 0.65, NB-word: 0.25, NB-char: 0.10 → LogLoss: 0.40589
LR: 0.65, NB-word: 0.30, NB-char: 0.05 → LogLoss: 0.40038
LR: 0.70, NB-word: 0.20, NB-char: 0.10 → LogLoss: 0.40731
LR: 0.70, NB-word: 0.25, NB-char: 0.05 → LogLoss: 0.40162
LR: 0.70, NB-word: 0.30, NB-char: 0.00 → LogLoss: 0.39660
