# In [None]:

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Load the activity log from the CSV file.
df = pd.read_csv("real_activity_log.csv")

# Parse the timestamps; values that cannot be parsed are coerced to NaT and
# the corresponding rows are dropped.
df["start_date_time"] = pd.to_datetime(df["start_date_time"], errors="coerce")
df = df[df["start_date_time"].notna()]
# Truncate every timestamp down to the start of its hour. The lowercase "h"
# alias is used because the uppercase "H" alias is deprecated in recent
# pandas releases.
df["hour"] = df["start_date_time"].dt.floor("h")

# Build key_ratio as key_down divided by (key_up + 1); the +1 keeps the
# denominator non-zero when key_up is 0.
df["key_ratio"] = df["key_down"] / (df["key_up"] + 1)

# max_key_count çıkar
def parse_key_counts(s):
    """Return the largest integer in a brace-wrapped, comma-separated string.

    Leading and trailing ``{`` / ``}`` characters are stripped, the remainder
    is split on commas, and every piece is converted with ``int``.

    Args:
        s: Text such as ``"{1,2,3}"``.

    Returns:
        The maximum parsed integer, or 0 when ``s`` is not a string (for
        example a missing value) or any piece is not a valid integer.
    """
    try:
        numbers = [int(token) for token in s.strip("{}").split(",")]
    except (AttributeError, TypeError, ValueError):
        # Best-effort parsing: missing or malformed values count as 0. Only
        # parsing-related errors are caught, so KeyboardInterrupt and
        # SystemExit are not swallowed the way a bare ``except`` would.
        return 0
    return max(numbers, default=0)

# Derive max_key_count from the raw key_codes column.
df["max_key_count"] = df["key_codes"].apply(parse_key_counts)

# Keep only the clustering features and discard rows with missing values.
feature_columns = ["key_down", "key_ratio", "max_key_count"]
df_clean = df[feature_columns].dropna()

# Standardize the features before clustering.
scaler = StandardScaler()
X = scaler.fit_transform(df_clean)

# Partition the scaled rows into three clusters with k-means.
kmeans = KMeans(n_clusters=3, random_state=42)
labels = kmeans.fit_predict(X)

# Reduce to two dimensions with PCA for plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Scatter plot of the 2-D projection, colored by cluster label.
plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=labels,
    cmap="Set1",
    alpha=0.6,
)
plt.title("K-means Clustering - Keyboard Behavior")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.grid(True)
handles, legend_labels = scatter.legend_elements()
plt.legend(handles, legend_labels, title="Cluster")
plt.tight_layout()
plt.show()

# Show the mean of every feature per cluster, rounded to two decimals.
df_clean["cluster"] = labels
cluster_means = df_clean.groupby("cluster").mean().round(2)
print(cluster_means)
