In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
print("Loading processed train data...")
df = pd.read_csv("../data/processed/train_processed.csv")

print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

Loading processed train data...
Shape: (7558, 235)
Columns: ['id', 'target', 'clean_text', 'url_count', 'typo_count', 'hashtag_count', 'has_url', 'has_hashtag', 'has_typos', 'eda_char_count', 'eda_word_count', 'location_encoded', 'keyword_clean_accident', 'keyword_clean_aftershock', 'keyword_clean_airplane accident', 'keyword_clean_ambulance', 'keyword_clean_annihilated', 'keyword_clean_annihilation', 'keyword_clean_apocalypse', 'keyword_clean_armageddon', 'keyword_clean_army', 'keyword_clean_arson', 'keyword_clean_arsonist', 'keyword_clean_attack', 'keyword_clean_attacked', 'keyword_clean_avalanche', 'keyword_clean_battle', 'keyword_clean_bioterror', 'keyword_clean_bioterrorism', 'keyword_clean_blaze', 'keyword_clean_blazing', 'keyword_clean_bleeding', 'keyword_clean_blew up', 'keyword_clean_blight', 'keyword_clean_blizzard', 'keyword_clean_blood', 'keyword_clean_bloody', 'keyword_clean_blown up', 'keyword_clean_body bag', 'keyword_clean_body bagging', 'keyword_clean_body bags', 'keyw

In [None]:
X = df.drop(columns=["id", "target", "clean_text"])  
text_col = df["clean_text"]
y = df["target"]

print("\nTarget distribution:")
print(y.value_counts(normalize=True))


Target distribution:
target
0    0.570654
1    0.429346
Name: proportion, dtype: float64


In [None]:
X_train, X_val, text_train, text_val, y_train, y_val = train_test_split(
    X, text_col, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("\nTrain size:", len(X_train), f"({len(X_train)/len(df):.1%})")
print("Val size:  ", len(X_val), f"({len(X_val)/len(df):.1%})")


Train size: 6046 (80.0%)
Val size:   1512 (20.0%)


In [None]:
tfidf = TfidfVectorizer(
    max_features=8000,          
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    stop_words="english"
)

In [None]:
text_train = text_train.fillna("")
text_val   = text_val.fillna("")

text_train = text_train.astype(str)
text_val   = text_val.astype(str)

print("NaN in text_train after fix:", text_train.isna().sum())
print("Empty strings in text_train:", (text_train == "").sum())

NaN in text_train after fix: 0
Empty strings in text_train: 2


In [None]:
X_text_train = tfidf.fit_transform(text_train)
X_text_val   = tfidf.transform(text_val)

X_train_full = np.hstack([X_text_train.toarray(), X_train.values]) # type: ignore
X_val_full   = np.hstack([X_text_val.toarray(),   X_val.values]) # type: ignore

print("\nFinal train shape (TF-IDF + features):", X_train_full.shape)
print("Final val shape:  ", X_val_full.shape)


Final train shape (TF-IDF + features): (6046, 8172)
Final val shape:   (1512, 8172)


In [None]:
model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",    
    random_state=42,
    solver="lbfgs"
)

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train_full, y_train, cv=cv, scoring="f1")

print("\n5-fold CV F1 scores:", cv_scores)
print(f"Mean CV F1: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


5-fold CV F1 scores: [0.79296875 0.78035218 0.80494767 0.79619048 0.77744209]
Mean CV F1: 0.7904 ± 0.0102


In [None]:
model.fit(X_train_full, y_train)
y_pred_val = model.predict(X_val_full)

print("\nValidation performance:")
print(classification_report(y_val, y_pred_val))
print("Accuracy:", accuracy_score(y_val, y_pred_val))
print("F1-score:", f1_score(y_val, y_pred_val))


Validation performance:
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       863
           1       0.78      0.80      0.79       649

    accuracy                           0.82      1512
   macro avg       0.82      0.82      0.82      1512
weighted avg       0.82      0.82      0.82      1512

Accuracy: 0.8194444444444444
F1-score: 0.7923954372623574


In [None]:
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred_val))


Confusion Matrix:
[[718 145]
 [128 521]]


In [None]:
feature_names = [f"tfidf_{i}" for i in range(X_text_train.shape[1])] + X_train.columns.tolist()
coefs = pd.Series(model.coef_[0], index=feature_names).sort_values(ascending=False)

print("\nTop 20 most important features (by LogReg coef):")
print(coefs.head(20))

print("\nTop 20 least important (negative coef):")
print(coefs.tail(20))


Top 20 most important features (by LogReg coef):
location_encoded                  11.326431
tfidf_3342                         2.405330
tfidf_3854                         1.977922
tfidf_4753                         1.858071
keyword_clean_derailment           1.749504
keyword_clean_oil spill            1.743526
keyword_clean_nuclear disaster     1.727558
tfidf_5857                         1.713618
tfidf_6690                         1.706972
keyword_clean_wild fires           1.702742
keyword_clean_typhoon              1.693881
tfidf_7159                         1.691379
keyword_clean_suicide bombing      1.670635
keyword_clean_debris               1.667691
keyword_clean_outbreak             1.651908
keyword_clean_mass murder          1.581646
tfidf_1756                         1.573247
tfidf_1208                         1.565838
keyword_clean_forest fires         1.541609
keyword_clean_wreckage             1.503441
dtype: float64

Top 20 least important (negative coef):
keyword_clean_