# 3) Hızlı Baseline (önerilir): TF-IDF + Lojistik Regresyon

Bu, "derine" inmeden önce sağlam bir referans metrik verir.


In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


## Veriyi Yükle


In [2]:
# Temizlenmiş veriyi oku
df = pd.read_csv("../data/cleaned_data.csv")
print(f"Veri boyutu: {df.shape}")
df.head()


Veri boyutu: (47837, 2)


Unnamed: 0,text,label
0,connection with icon icon dear please setup ic...,Hardware
1,work experience user work experience user hi w...,Access
2,requesting for meeting requesting meeting hi p...,Hardware
3,reset passwords for external accounts re expir...,Access
4,mail verification warning hi has got attached ...,Miscellaneous


## Dengesizlik Kontrolü + train/valid/test Bölümü


In [3]:
# Stratified split
X_train, X_tmp, y_train, y_tmp = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, random_state=42, stratify=y_tmp
)

print(len(X_train), len(X_val), len(X_test))


38269 4784 4784


## Class Weights (dengesiz sınıflar için)


In [4]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weights = {c:w for c,w in zip(classes, class_weights)}
print(f"Class weights: {class_weights}")


Class weights: {'Access': np.float64(0.8392324561403509), 'Administrative rights': np.float64(3.3974609375), 'HR Support': np.float64(0.5478269583142464), 'Hardware': np.float64(0.4391466997154136), 'Internal Project': np.float64(2.8221976401179942), 'Miscellaneous': np.float64(0.8469591005665722), 'Purchase': np.float64(2.427004058853374), 'Storage': np.float64(2.1528465346534653)}


## Pipeline: TF-IDF + Logistic Regression


In [5]:
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(min_df=3, max_df=0.9, ngram_range=(1,2))),
    ("clf", LogisticRegression(max_iter=200, n_jobs=-1))
])

pipe.fit(X_train, y_train)
pred = pipe.predict(X_val)
print(classification_report(y_val, pred, zero_division=0))


                       precision    recall  f1-score   support

               Access       0.92      0.88      0.90       712
Administrative rights       0.93      0.62      0.75       176
           HR Support       0.86      0.88      0.87      1092
             Hardware       0.79      0.90      0.84      1362
     Internal Project       0.96      0.76      0.85       212
        Miscellaneous       0.84      0.82      0.83       706
             Purchase       0.99      0.83      0.90       246
              Storage       0.95      0.82      0.88       278

             accuracy                           0.86      4784
            macro avg       0.90      0.82      0.85      4784
         weighted avg       0.86      0.86      0.86      4784



## Test Seti ile Değerlendirme


In [6]:
pred_test = pipe.predict(X_test)
print("\n=== TEST SETİ SONUÇLARI ===")
print(classification_report(y_test, pred_test, zero_division=0))



=== TEST SETİ SONUÇLARI ===
                       precision    recall  f1-score   support

               Access       0.92      0.86      0.89       713
Administrative rights       0.90      0.61      0.73       176
           HR Support       0.87      0.88      0.87      1091
             Hardware       0.79      0.91      0.85      1362
     Internal Project       0.93      0.78      0.85       212
        Miscellaneous       0.84      0.83      0.83       706
             Purchase       0.99      0.88      0.93       247
              Storage       0.97      0.82      0.89       277

             accuracy                           0.86      4784
            macro avg       0.90      0.82      0.85      4784
         weighted avg       0.87      0.86      0.86      4784



## Modeli Kaydet


In [7]:
import pickle

# Modeli kaydet
with open("../models/baseline_tfidf_logreg.pkl", "wb") as f:
    pickle.dump(pipe, f)
    
print("Baseline model kaydedildi!")


Baseline model kaydedildi!


**Not:** Eğer bu baseline zaten yüksekse, LSTM yerine Linear SVM ya da BERT de düşünebiliriz. Ama projenin planına sadık kalıp LSTM kuracağız.
