In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
import faiss  # pip install faiss-cpu
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold



In [None]:
# Per-window CSV exports that are combined into one file.
files = [
    "output/subset_window_1754898203.csv",
    "output/window_1_1756028067.csv",
    "output/window_3_1756030271.csv",
    "output/window_5_1756032959.csv",
    "output/window_10_1754904064.csv",
    "output/window_15_1756035248.csv",
]

# Read every export, stack the rows under a fresh 0..n-1 index, and write the
# result back to disk without the index column.
merged_df = pd.concat(map(pd.read_csv, files), ignore_index=True)
merged_df.to_csv("output/merged_selected.csv", index=False)

In [None]:
# Feature column names: every base column is followed by its
# _mean / _median / _std / _min / _max variants, in that order.
FEATURE_COLUMNS = [
    f"{base}{suffix}"
    for base in (
        'src2dst_avg_pkt_size',
        'src2dst_pps',
        'dst2src_avg_pkt_size',
        'dst2src_pps',
        'pkt_dir_ratio',
    )
    for suffix in ('', '_mean', '_median', '_std', '_min', '_max')
]

In [None]:


def _majority_label(labels):
    """Return the most frequent value in ``labels`` (ties go to the smallest value)."""
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]


def vectordb(df, feature_columns=None, k=5, test_size=0.2, random_state=42):
    """Evaluate a FAISS-backed k-NN classifier on ``df`` and print its scores.

    Each row's ``ping`` value is mapped to an integer class through the
    project-local ``rank`` module (``rank_qoe_3_classes`` followed by
    ``rank_str_to_int_mapping_3_classes``). The rows are split into stratified
    train/test sets, standardized, indexed with an exact L2 FAISS index, and the
    test rows are classified by majority vote over their nearest training rows.

    The input frame is never modified. Rows whose features are NaN/inf (or
    overflow float32) are excluded before the split, because scaling and
    distance search are undefined for them.

    Args:
        df: DataFrame containing a ``ping`` column and every feature column.
        feature_columns: Feature column names. ``None`` uses the built-in list
            of 30 packet-size / pps / direction-ratio columns.
        k: Number of neighbours that vote; capped at the training-set size.
        test_size: Fraction (or count) of rows held out, as understood by
            ``train_test_split``.
        random_state: Seed for the train/test split.

    Returns:
        A dict with ``accuracy``, ``macro_f1``, ``n_rows_used`` and
        ``n_rows_dropped`` on success, or ``None`` when any step fails. Failures
        are reported on stdout instead of being raised (best-effort behaviour).
    """
    if feature_columns is None:
        feature_columns = [
            'src2dst_avg_pkt_size', 'src2dst_avg_pkt_size_mean', 'src2dst_avg_pkt_size_median',
            'src2dst_avg_pkt_size_std', 'src2dst_avg_pkt_size_min', 'src2dst_avg_pkt_size_max',
            'src2dst_pps', 'src2dst_pps_mean', 'src2dst_pps_median',
            'src2dst_pps_std', 'src2dst_pps_min', 'src2dst_pps_max',
            'dst2src_avg_pkt_size', 'dst2src_avg_pkt_size_mean', 'dst2src_avg_pkt_size_median',
            'dst2src_avg_pkt_size_std', 'dst2src_avg_pkt_size_min', 'dst2src_avg_pkt_size_max',
            'dst2src_pps', 'dst2src_pps_mean', 'dst2src_pps_median',
            'dst2src_pps_std', 'dst2src_pps_min', 'dst2src_pps_max',
            'pkt_dir_ratio', 'pkt_dir_ratio_mean', 'pkt_dir_ratio_median',
            'pkt_dir_ratio_std', 'pkt_dir_ratio_min', 'pkt_dir_ratio_max']
    try:
        print("מספר שורות:", len(df))

        if k < 1:
            raise ValueError(f"k must be >= 1, got {k}")

        # Project-local helpers; imported here so a missing module is reported
        # through the same handler as every other failure.
        from rank import rank_qoe_3_classes, rank_str_to_int_mapping_3_classes

        # 1) Labels and features. The labels live in their own array so the
        #    caller's DataFrame does not gain a 'y_qoe_by_ping' column.
        y = (
            df['ping']
            .apply(lambda p: rank_str_to_int_mapping_3_classes[rank_qoe_3_classes(p)])
            .values.astype('int64')
        )
        X = df[list(feature_columns)].values.astype('float32')

        # Checked after the float32 cast so values too large for float32
        # (which become inf) are caught as well.
        finite_rows = np.isfinite(X).all(axis=1)
        n_dropped = int((~finite_rows).sum())
        if n_dropped:
            print(f"  הוסרו {n_dropped} שורות עם ערכי NaN/inf בעמודות המאפיינים")
            X, y = X[finite_rows], y[finite_rows]

        # 2) Train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )

        # 3) Scaling (very important for kNN/FAISS)
        scaler = StandardScaler()
        X_train_sc = np.ascontiguousarray(scaler.fit_transform(X_train), dtype='float32')
        X_test_sc = np.ascontiguousarray(scaler.transform(X_test), dtype='float32')

        # 4) Build the FAISS index (L2)
        d = X_train_sc.shape[1]
        index = faiss.IndexFlatL2(d)  # simple exact index; for large data switch to HNSW/IVF
        index.add(X_train_sc)

        # 5) k-nearest-neighbour search and majority vote.
        #    Asking for more neighbours than indexed vectors makes FAISS pad the
        #    result with -1, which would silently index the last training label.
        k_eff = min(k, len(X_train_sc))
        _, neighbor_idx = index.search(X_test_sc, k_eff)  # row i: indices into X_train_sc
        y_pred = np.array([_majority_label(row) for row in y_train[neighbor_idx]])

        # 6) Evaluation
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='macro')
        print(f"  FAISS-kNN → Accuracy: {acc:.3f} | Macro-F1: {f1:.3f}")
        print("מספר שורות:", len(df))
        return {
            'accuracy': float(acc),
            'macro_f1': float(f1),
            'n_rows_used': int(len(y)),
            'n_rows_dropped': n_dropped,
        }
    except Exception as e:
        print(f"   שגיאה: {e}")
        return None



In [None]:

# Load the window_10 export and evaluate only on rows where every feature
# column and the 'ping' column are present.
df = pd.read_csv("output/window_10_1754904064.csv")

t_df = df.dropna(subset=[*FEATURE_COLUMNS, 'ping']).copy()
print("dropna")
vectordb(t_df)

# Alternative 1 (disabled): replace every missing value with 0.
# t_df = df.fillna(0).copy()
# print("fillna")
# vectordb(t_df)

# Alternative 2 (disabled): median imputation.
# NOTE: as written, the imputed array X is never passed on; vectordb still
# receives the un-imputed t_df.
# t_df = df.copy()
# imp = SimpleImputer(strategy="median")   # or "mean", or "most_frequent"
# X = imp.fit_transform(t_df)
# print("median")
# vectordb(t_df)

dropna
מספר שורות: 131004
  FAISS-kNN → Accuracy: 0.883 | Macro-F1: 0.883
מספר שורות: 131004
fillna
מספר שורות: 169016
   שגיאה: Input X contains infinity or a value too large for dtype('float32').


ValueError: Input X contains infinity or a value too large for dtype('float64').