# In [None]:
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch, helpers
from pandas import json_normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# --- CONFIG ---
# Elasticsearch endpoint URL. Despite its name, `client` is a plain string;
# the actual client object is `es`, created below.
client = "https://my-elasticsearch-project-eb24cb.es.eu-west-1.aws.elastic.cloud:443"
# NOTE(review): this API key is hard-coded in source, so anyone who can read
# the file can use it. Consider rotating it and loading it from an environment
# variable or a secret store instead.
API_KEY_1 = "WDBOaHlwY0JEVV9IeldsV1RhNE46TEpWdkpKcy1JTlNlQ1gzQWJCMEhVdw=="
# Index that fetch_data() searches.
INDEX_READ = ".ds-metrics-system.process-default-2025.07.02-000001"
# Index that push_to_elasticsearch() writes documents to.
INDEX_TARGET = "anomaly"

# --- CONNECT TO ELASTICSEARCH ---
# Module-level client shared by fetch_data() and push_to_elasticsearch().
# It is constructed as soon as this module runs, with certificate
# verification enabled.
es = Elasticsearch(hosts=[client], api_key=API_KEY_1, verify_certs=True)

# --- FETCH DATA ---
def fetch_data(size=5000):
    """Fetch the most recent process-metric documents as a flat DataFrame.

    Runs a single ``match_all`` search against ``INDEX_READ``, sorted by
    ``@timestamp`` descending and restricted to the listed ``_source``
    fields, then flattens each hit so nested objects become dot-separated
    columns (for example ``host.name``).

    Args:
        size: Maximum number of hits requested from the search call.

    Returns:
        A ``pandas.DataFrame`` with one row per hit. Only fields that are
        actually present in the returned documents become columns, and the
        frame is empty when the search returns no hits.
    """
    query = {
        "size": size,
        "query": {"match_all": {}},
        "sort": [{"@timestamp": {"order": "desc"}}],
        "_source": [
            "@timestamp", "host.name",
            "system.process.cpu.total.value", "system.process.memory.size",
            "process.cpu.pct", "event.duration",
            "process.name", "process.memory", "host.os.name", "process.state"
        ]
    }
    res = es.search(index=INDEX_READ, body=query)
    docs = [hit['_source'] for hit in res['hits']['hits']]
    # Flatten the raw documents directly. Building a DataFrame first and
    # round-tripping through to_dict() fills missing top-level objects with
    # NaN, which json_normalize then keeps as spurious all-NaN parent columns
    # (e.g. a bare "system" column next to "system.process...").
    return json_normalize(docs)

# --- LABEL ANOMALIES BASED ON Z-SCORE + IQR ---
def label_anomalies(df, feature_cols, z_thresh=3.0):
    """Add a binary ``anomaly_label`` column that marks statistical outliers.

    A row is labelled 1 when, for at least one feature, its value has an
    absolute z-score above ``z_thresh`` or lies more than 1.5 * IQR outside
    the first/third quartile; otherwise it is labelled 0. Names in
    ``feature_cols`` that are not columns of ``df`` are ignored.

    Args:
        df: Frame to label. It is modified in place: every feature column
            that is used is cast to float and ``anomaly_label`` is
            (re)created.
        feature_cols: Column names to test for outliers.
        z_thresh: Absolute z-score above which a value counts as an outlier.

    Returns:
        The same ``df`` object that was passed in.
    """
    df['anomaly_label'] = 0
    usable = [name for name in feature_cols if name in df.columns]

    for name in usable:
        df[name] = df[name].astype(float)
        values = df[name]

        # Rule 1: distance from the column mean in standard deviations.
        centred = values - values.mean()
        beyond_z = (centred / values.std()).abs() > z_thresh

        # Rule 2: fences placed 1.5 * IQR outside the quartiles.
        q_low = values.quantile(0.25)
        q_high = values.quantile(0.75)
        spread = q_high - q_low
        below_fence = values < q_low - 1.5 * spread
        above_fence = values > q_high + 1.5 * spread

        # A row stays flagged once any feature has flagged it.
        flagged = (beyond_z | (below_fence | above_fence)).astype(int)
        df['anomaly_label'] = df['anomaly_label'] | flagged

    return df

# --- COMPARE MULTIPLE MODELS ---
def compare_models(X, y, df_original, folds=10):
    """Cross-validate six classifiers, print a report, return RF/DT predictions.

    Each model is evaluated with shuffled stratified k-fold cross-validation
    (fixed seed 42). Accuracy, precision, recall and F1 are averaged over
    the folds, and one confusion matrix per model is built from the pooled
    out-of-fold predictions. The report is printed to stdout.

    Args:
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Label series aligned with ``X``; also selected with ``.iloc``.
        df_original: Unused. Kept so existing callers keep working.
        folds: Number of stratified folds.

    Returns:
        A ``(rf_preds, dt_preds)`` tuple. Each element is the prediction of
        the Random Forest / Decision Tree after refitting on all of ``X``
        and ``y`` and predicting that same ``X``, i.e. in-sample predictions
        rather than out-of-fold ones.
    """
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    models = {
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Naive Bayes": GaussianNB(),
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        # Only predict() is used here, so probability estimates are left off:
        # enabling them makes every fit run an extra internal cross-validation
        # without changing the predicted labels.
        "SVM": SVC(kernel='rbf', random_state=42),
        "KNN": KNeighborsClassifier()
    }
    results = {}
    confusion_matrices = {}

    for name, model in models.items():
        fold_scores = {"Accuracy": [], "Precision": [], "Recall": [], "F1 Score": []}
        all_preds, all_true = [], []

        for train_idx, test_idx in skf.split(X, y):
            # fit() retrains from scratch, so one instance serves every fold.
            model.fit(X.iloc[train_idx], y.iloc[train_idx])
            y_test = y.iloc[test_idx]
            y_pred = model.predict(X.iloc[test_idx])

            # zero_division=0 scores a fold with no positive predictions or
            # labels as 0 instead of emitting a warning.
            fold_scores["Accuracy"].append(accuracy_score(y_test, y_pred))
            fold_scores["Precision"].append(precision_score(y_test, y_pred, zero_division=0))
            fold_scores["Recall"].append(recall_score(y_test, y_pred, zero_division=0))
            fold_scores["F1 Score"].append(f1_score(y_test, y_pred, zero_division=0))

            all_preds.extend(y_pred)
            all_true.extend(y_test)

        results[name] = {metric: np.mean(scores) for metric, scores in fold_scores.items()}
        confusion_matrices[name] = confusion_matrix(all_true, all_preds)

    print(f"\nModel Comparison (Average of {folds} folds):")
    print("{:<20} {:>8} {:>10} {:>10} {:>10}".format("Model", "Accuracy", "Precision", "Recall", "F1 Score"))
    for name, metrics in results.items():
        print(
            f"{name:<20} {metrics['Accuracy']:8.4f} {metrics['Precision']:10.4f} "
            f"{metrics['Recall']:10.4f} {metrics['F1 Score']:10.4f}"
        )
        print(f"Confusion Matrix ({name}):\n{confusion_matrices[name]}\n")

    # Refit the two tree models on the full data set and predict every row.
    rf_preds = models['Random Forest'].fit(X, y).predict(X)
    dt_preds = models['Decision Tree'].fit(X, y).predict(X)
    return rf_preds, dt_preds

# --- PUSH TO ELASTICSEARCH ---
def push_to_elasticsearch(df, rf_preds, dt_preds):
    """Index one document per row of ``df`` into ``INDEX_TARGET``.

    Each document carries the row's timestamp, host/process descriptors,
    the numeric features, the ``anomaly_label`` and the two model
    predictions. Rows that cannot be converted are reported on stdout and
    skipped; the remaining documents are sent in one bulk request.

    Args:
        df: Frame holding the rows to index. Must contain an ``@timestamp``
            column; other missing columns fall back to ``0`` or ``''``.
        rf_preds: Random Forest predictions, one per row, in row order.
        dt_preds: Decision Tree predictions, one per row, in row order.

    Returns:
        None. A summary line is printed instead.
    """
    # Predictions are matched to rows by position, not by index label, so the
    # pairing stays correct even when the caller has not reset df's index.
    rf_preds = np.asarray(rf_preds)
    dt_preds = np.asarray(dt_preds)

    actions = []
    for pos, (i, row) in enumerate(df.iterrows()):
        try:
            ts = row['@timestamp']
            doc = {
                '_index': INDEX_TARGET,
                '_source': {
                    '@timestamp': None if pd.isnull(ts) else pd.to_datetime(ts).isoformat(),
                    'agent_name': str(row.get('host.name', '')),
                    'system.process.cpu.total.value': float(row.get('system.process.cpu.total.value', 0)),
                    'system.process.memory.size': float(row.get('system.process.memory.size', 0)),
                    'process.cpu.pct': float(row.get('process.cpu.pct', 0)),
                    'event.duration': float(row.get('event.duration', 0)),
                    'process.name': str(row.get('process.name', '')),
                    'host.os.name': str(row.get('host.os.name', '')),
                    'process.state': str(row.get('process.state', '')),
                    'process.memory': str(row.get('process.memory', '')),
                    'anomaly_label': int(row.get('anomaly_label', 0)),
                    'rf_pred': int(rf_preds[pos]),
                    'dt_pred': int(dt_preds[pos])
                }
            }
            actions.append(doc)
        except Exception as e:
            # Best effort: one bad row must not stop the rest from indexing.
            print(f"[SKIPPED] Row {i} failed: {e}")

    if actions:
        success, _ = helpers.bulk(es, actions)
        print(f" Indexed {success} documents to '{INDEX_TARGET}'")
    else:
        print("! No valid documents to push.")


# --- MAIN PIPELINE ---
if __name__ == "__main__":
    # End-to-end run: fetch -> label -> encode -> compare models -> index.
    print("Fetching data...")
    df = fetch_data(size=5000)

    features = [
        "system.process.cpu.total.value",
        "system.process.memory.size",
        "event.duration",
        "process.cpu.pct"
    ]
    # Only fields present in the fetched documents become columns, while
    # dropna(subset=...) and df[...] raise KeyError for unknown names. Keep
    # just the features that exist, as the later steps already do.
    missing = [col for col in features if col not in df.columns]
    if missing:
        print(f"[WARN] Skipping features missing from the data: {missing}")
    features = [col for col in features if col in df.columns]
    if not features:
        raise SystemExit("No usable feature columns were returned; nothing to do.")

    df.dropna(subset=features, inplace=True)
    if df.empty:
        raise SystemExit("No rows left after dropping incomplete records; nothing to do.")

    print("Creating anomaly labels...")
    df = label_anomalies(df, features, z_thresh=3.0)

    print("Encoding categorical features...")
    label_encoders = {}
    for col in ['host.os.name', 'process.name', 'process.state']:
        if col in df.columns:
            le = LabelEncoder()
            # astype(str) turns missing values into the literal "nan" so the
            # encoder sees a uniform string column.
            df[f'{col}_encoded'] = le.fit_transform(df[col].astype(str))
            label_encoders[col] = le

    encoded_features = features + [f'{col}_encoded' for col in label_encoders]
    X = df[encoded_features]
    y = df['anomaly_label']

    # Several of the compared classifiers refuse to fit on a single class.
    if y.nunique() < 2:
        raise SystemExit("Only one class present in anomaly_label; cannot train classifiers.")

    print("Comparing all 6 model performances...")
    rf_preds, dt_preds = compare_models(X, y, df)

    print("Pushing RF and DT predictions to Elasticsearch...")
    # Reset the index so row labels line up with the positional predictions.
    push_to_elasticsearch(df.reset_index(drop=True), rf_preds, dt_preds)

    print("\n[✓] Done.")
