In [7]:
import pandas as pd
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# ----------------------
# Data locations
# ----------------------
REAL_DIR = r"C:/store/git/km-stat-activity/data/real"
FAKE_DIR = r"C:/store/git/km-stat-activity/processed/fake"

# Real files: km_stat_1_processed.csv .. km_stat_10_processed.csv
# (sorted as plain strings, i.e. lexicographically).
real_files = glob.glob(os.path.join(REAL_DIR, "km_stat_*_processed.csv"))
real_files.sort()

# Fake files: every "*-processed.csv" inside a "profile_guid=*" sub-directory.
fake_files = glob.glob(
    os.path.join(FAKE_DIR, "profile_guid=*", "*-processed.csv")
)
fake_files.sort()

# Report how many files each pattern matched (messages are Turkish for
# "total real files" / "total fake files").
print(f"Toplam Gerçek Dosya: {len(real_files)}")
print(f"Toplam Sahte Dosya: {len(fake_files)}")

# ----------------------
# Read the files and label them
# ----------------------
def load_and_label(files, label):
    """Read every CSV in *files* and return them as one labelled DataFrame.

    Args:
        files: Iterable of CSV sources; each item is passed unchanged to
            ``pandas.read_csv``.
        label: Value written to the ``label`` column of every row.

    Returns:
        A single DataFrame holding the rows of all inputs in order, with a
        fresh 0..n-1 index and a ``label`` column set to *label*.

    Raises:
        ValueError: If *files* is empty. ``pandas.concat`` would raise the
            same exception type with the opaque message "No objects to
            concatenate"; the explicit check points at the likely cause.
    """
    frames = [pd.read_csv(path).assign(label=label) for path in files]
    if not frames:
        raise ValueError(
            f"No input files to load for label {label!r}; "
            "check the data directory and the glob pattern."
        )
    return pd.concat(frames, ignore_index=True)

# Load each class with its target value: 1 for the real files, 0 for the
# fake files.
real_df = load_and_label(real_files, label=1)
fake_df = load_and_label(fake_files, label=0)

# ----------------------
# Combine all data
# ----------------------
# Stack both classes row-wise and renumber the index from 0.
df_all = pd.concat((real_df, fake_df), axis=0, ignore_index=True)

# ----------------------
# Feature selection
# ----------------------
# Columns of df_all that are used as model inputs; every other column
# (apart from 'label', the target) is ignored.
FEATURE_COLUMNS = [
    'x_direction_changes',
    'y_direction_changes',
    'min_x',
    'min_y',
    'max_x',
    'max_y',
    'bbox_area',
    'avg_speed',
    'avg_acceleration',
    'mouse_idle_ratio',
    'movement_entropy',
    'linearity',
]

# Feature matrix and target vector.
X = df_all[FEATURE_COLUMNS]
y = df_all['label']

# ----------------------
# Train/test split
# ----------------------
# Hold out 20% of the rows for testing. stratify=y keeps the label
# proportions the same in both parts; random_state fixes the shuffle.
# NOTE(review): the split is made per row, while load_and_label concatenates
# all rows of every CSV. Rows from the same source file can therefore end up
# on both sides of the split. If rows within one file are correlated, this
# would inflate the test scores - consider a group-aware split keyed on the
# source file. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# ----------------------
# StandardScaler (z-score)
# ----------------------
# The scaling statistics are learned from the training rows only and then
# applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ----------------------
# Keep the 8 best features with SelectKBest
# ----------------------
# Features are scored with f_classif on the training rows only; the same
# selection is then applied to the test rows.
k_best = SelectKBest(score_func=f_classif, k=8).fit(X_train_scaled, y_train)
X_train_k = k_best.transform(X_train_scaled)
X_test_k = k_best.transform(X_test_scaled)

# The scaled arrays keep the column order of FEATURE_COLUMNS, so the boolean
# support mask lines up with that list.
selected_features = [
    name for name, kept in zip(FEATURE_COLUMNS, k_best.get_support()) if kept
]
print("Seçilen özellikler:", selected_features)

# ----------------------
# Logistic Regression
# ----------------------
# Fit on the selected training features, then predict the held-out rows.
# max_iter is set to 1000 iterations.
model_lr = LogisticRegression(max_iter=1000).fit(X_train_k, y_train)
y_pred = model_lr.predict(X_test_k)

# ----------------------
# Evaluation
# ----------------------
# Score-based metrics; precision and recall use the default positive
# class, label 1.
accuracy_lr = accuracy_score(y_test, y_pred)
precision_lr = precision_score(y_test, y_pred)
recall_lr = recall_score(y_test, y_pred)

# Confusion matrix: rows are true labels, columns are predicted labels, in
# sorted label order. Row 0 therefore describes the rows whose true label
# is 0: cm_lr[0, 0] are true negatives and cm_lr[0, 1] false positives.
cm_lr = confusion_matrix(y_test, y_pred)

# Specificity = TN / (TN + FP), the share of label-0 rows predicted as 0.
specificity_lr = cm_lr[0, 0] / (cm_lr[0, 0] + cm_lr[0, 1])

print("Logistic Regression Confusion Matrix:\n", cm_lr)
print(f"Accuracy   : {accuracy_lr:.4f}")
print(f"Precision  : {precision_lr:.4f}")
print(f"Recall     : {recall_lr:.4f}")
print(f"Specificity: {specificity_lr:.4f}")


Toplam Gerçek Dosya: 10
Toplam Sahte Dosya: 1675
Seçilen özellikler: ['x_direction_changes', 'y_direction_changes', 'min_y', 'max_x', 'max_y', 'avg_speed', 'mouse_idle_ratio', 'movement_entropy']
Logistic Regression Confusion Matrix:
 [[180900      0]
 [     7 180265]]
Accuracy   : 1.0000
Precision  : 1.0000
Recall     : 1.0000
Specificity: 1.0000
