# Alex's Algorithm #1

Input: a matrix of real-valued points in a D-dimensional Euclidean (metric) space, aimed at clustering use cases (still to be thought through). This could be applied to other use cases as well, e.g. anomaly detection, by not starting with the filtering of the outlier elements.

Parameters:

K-anonymity level: K-anonymity

Epsilon

First, filter out outlier rows from the data set. For this purpose one can use e.g. LOF or ABOD, or apply column-wise winsorization to cut off long-tail distribution behavior (https://en.wikipedia.org/wiki/Local_outlier_factor , https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD2008.pdf , https://en.wikipedia.org/wiki/Winsorizing). /Or, to align with other use cases such as anomaly detection, do not remove these elements./

Run hierarchical clustering on the Euclidean space data and save the dendrogram.
(To keep things simple, we would advise single or centroid linkage; note that the code below uses Ward linkage.)


If we denote the root as level 0 and the level of the leaves as n, then i \in \{1,...,n\} denotes the i'th level, and j denotes the j'th cluster at the i'th level, where j \in \{1,...,i+1\} (level i is reached after i binary splits of the root, so it holds i+1 clusters). With this notation we define c_{i,j} as the size of the j'th cluster at the i'th level. Now find the deepest level i^{*} at which every cluster still has at least k members:


    i^{*} = \max \{\, i : \min_{j} c_{i,j} \ge k \,\}.


Split the data into these clusters.


For each cluster:


For each column feature:


        Calculate the centroid/mean value of the column within the cluster and impute it to the original data points of that cluster, plus White Noise (the White Noise should be different per cluster, as defined below)
        where White Noise = Laplace(0, (max(c_{i,j}[d]) - min(c_{i,j}[d])) / epsilon)
        where c_{i,j}[d] denotes the set of values of feature/column "d" over the rows of the j'th cluster at the i'th level in the dendrogram


For categorical/non-numeric variables, impute the cluster mode. /In case we need a common metric, the subscriber must normalize the space (e.g. with a Min-Max scaler) and should use a normalized Hamming or Levenshtein distance for them./

***

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Model selection
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import LocalOutlierFactor

# Model hyper parameter tuning
from sklearn import metrics
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import sklearn.neighbors

import os

import warnings  
warnings.filterwarnings('ignore')

import numpy
import math

from scipy.spatial import distance
from scipy.cluster import hierarchy


In [2]:
def getNewValue(col, epsilon):
    """Draw one Laplace-perturbed replacement value for a column.

    The sample is centred on the column mean; its scale is the column's value
    range divided by ``epsilon``, so a larger ``epsilon`` means less noise and
    a constant column is returned unchanged.

    Parameters
    ----------
    col : array-like of numbers
        The values of one feature (e.g. a pandas Series for one cluster).
    epsilon : number
        Strictly positive divisor applied to the value range.

    Returns
    -------
    float
        One draw from ``Laplace(mean(col), (max(col) - min(col)) / epsilon)``.

    Raises
    ------
    ValueError
        If ``epsilon`` is not strictly positive, or if ``col`` is empty.
    """
    if epsilon <= 0:
        # Zero would divide by zero and a negative value yields a negative
        # scale; fail early with a clear message instead.
        raise ValueError("epsilon must be strictly positive")

    values = np.asarray(col, dtype=float)
    center = np.mean(values)
    # ndarray reductions run in native code, unlike the builtin max()/min(),
    # which walk the column element by element.
    spread = values.max() - values.min()
    return np.random.laplace(center, spread / epsilon)

In [4]:
def get_clusters(k, tree, clusters):
    """Cut a dendrogram into clusters and append them to ``clusters``.

    A node is split further only while both of its children hold at least
    ``k`` observations; otherwise the node itself becomes a cluster and the
    result of its ``pre_order()`` call is appended to ``clusters``.

    Parameters
    ----------
    k : minimum observation count required of both children for a split.
    tree : node exposing ``is_leaf``, ``get_left``, ``get_right``,
        ``get_count`` and ``pre_order``.
    clusters : list extended in place; nothing is returned.

    Raises
    ------
    Exception
        If the traversal reaches a leaf (a cluster of a single element).
    """
    # Explicit stack instead of recursion; the right child is pushed first so
    # that the left subtree is still handled before the right one.
    pending = [tree]
    while pending:
        node = pending.pop()
        if node.is_leaf():
            raise Exception("Cluster contains only 1 element")
        children = (node.get_left(), node.get_right())
        if any(child.get_count() < k for child in children):
            clusters.append(node.pre_order())
            continue
        pending.extend(reversed(children))

In [5]:
def main_transform(df, k, epsilon, lof=False):
    """Replace every row of ``df`` by a noisy centroid of its cluster.

    Steps: standardise the columns, optionally drop the rows rejected by
    ``filter_lof``, build a Ward-linkage dendrogram, cut it with
    ``get_clusters``, draw one ``getNewValue`` sample per (cluster, column),
    give that sample to every row of the cluster, and finally undo the
    standardisation.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is passed as-is to ``StandardScaler``.
    k : forwarded to ``get_clusters`` as the minimum child size for a split.
    epsilon : forwarded to ``getNewValue``.
    lof : when true, ``filter_lof`` is applied to the scaled data first.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``df`` with a fresh default index; one row per row
        that was clustered.

    Side effects
    ------------
    Prints the size of the largest cluster.
    """
    scaler = StandardScaler()
    scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    if lof:
        scaled = filter_lof(scaled)

    tree = hierarchy.to_tree(hierarchy.linkage(scaled, 'ward'))
    clusters = []
    get_clusters(k, tree, clusters)

    # Dendrogram leaf ids are row positions of the clustered matrix, so the
    # result is filled positionally. A float array with one block assignment
    # per cluster replaces the former object-dtype frame that was written one
    # row at a time.
    anonymised = np.empty(scaled.shape, dtype=float)
    for members in clusters:
        noisy = scaled.iloc[members].apply(getNewValue, args=(epsilon,))
        anonymised[members, :] = noisy.to_numpy(dtype=float)

    print("Max cluster size:", max((len(members) for members in clusters), default=0))
    return pd.DataFrame(scaler.inverse_transform(anonymised), columns=df.columns)

In [6]:
def filter_lof(df, k=20):
    """Return the rows of ``df`` that ``LocalOutlierFactor`` labels as positive.

    Parameters
    ----------
    df : pandas.DataFrame handed to ``LocalOutlierFactor.fit_predict``.
    k : value used for the detector's ``n_neighbors``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows whose label is greater than zero,
        without the scratch ``_lof`` column and with the index renumbered
        from zero. ``df`` itself is left untouched.
    """
    detector = LocalOutlierFactor(n_neighbors=k)
    # The labels live in a scratch "_lof" column of a copy until filtering is done.
    labelled = df.assign(_lof=detector.fit_predict(df))
    kept = labelled[labelled["_lof"].gt(0)]
    return kept.drop(columns="_lof").reset_index(drop=True)

## main_transform(df, K, Epsilon)

### Without removing outliers

#### K = 10, epsilon = 10

main_transform(df[0:200], 10, 10)  # k=10, epsilon=10 on the df[0:200] slice; lof keeps its default (False)

### Removing outliers with LOF

#### K = 3

#### epsilon = 10

main_transform(filtered, 3, 10)  # k=3, epsilon=10, lof=False; presumably `filtered` was LOF-filtered beforehand (not shown here) - confirm

#### Diff

filtered - main_transform(filtered, 3, 10)  # per-cell change; NOTE(review): this call draws fresh Laplace noise, so it need not match the cell above

## Batch process files

In [7]:
# CSV paths, relative to a dataset directory, that process_files iterates over.
files = [
    "benign_traffic.csv",
    "gafgyt_attacks/combo.csv",
    "gafgyt_attacks/junk.csv",
    "gafgyt_attacks/scan.csv",
    "gafgyt_attacks/tcp.csv",
    "gafgyt_attacks/udp.csv",
    "mirai_attacks/ack.csv",
    "mirai_attacks/scan.csv",
    "mirai_attacks/syn.csv",
    "mirai_attacks/udp.csv",
    "mirai_attacks/udpplain.csv",
]

In [9]:
def process_files(idir, odir, k, epsilon, lof=False):
    """Anonymise every CSV named in the module-level ``files`` list.

    Each file is read from ``idir``, down-sampled to half of its rows with a
    fixed seed, passed through ``main_transform`` and written to the same
    relative path under ``odir``.

    Parameters
    ----------
    idir : directory containing the input CSV files.
    odir : directory receiving the transformed CSV files; it is created if
        missing and reused if it already exists.
    k, epsilon, lof : forwarded unchanged to ``main_transform``.

    Side effects
    ------------
    Prints the run parameters and each file name, creates directories and
    writes CSV files (existing files at the same paths are overwritten).
    """
    print(odir, k, epsilon, lof)
    # exist_ok lets a cell be re-run (e.g. after an interrupted sweep)
    # without failing on the directories left by the previous attempt.
    os.makedirs(odir, exist_ok=True)

    for filename in files:
        print(filename)
        target = os.path.join(odir, filename)
        # Derive the sub-directory from the file path rather than hard-coding
        # the attack-family folder names.
        os.makedirs(os.path.dirname(target), exist_ok=True)

        df = pd.read_csv(os.path.join(idir, filename))
        # Fixed random_state keeps the 50% sample reproducible between runs.
        df = df.sample(frac=0.5, replace=False, random_state=42).reset_index(drop=True)
        df2 = main_transform(df, k, epsilon, lof)
        df2.to_csv(target, index=False)
        

### Baby Monitor

# Sweep the "baby" dataset over every (epsilon, k) pair, first without and
# then with LOF outlier removal; the output folder name encodes the settings.
for use_lof in (False, True):
    suffix = "_lof" if use_lof else ""
    for eps in (1, 5, 10):
        for k_level in (10, 20, 50):
            out_dir = "baby_k{}_e{}{}".format(k_level, eps, suffix)
            process_files("baby", out_dir, k_level, eps, use_lof)

### SimpleHome_XCS7_1003_WHT_Security_Camera

In [13]:
# Input directory used by the process_files sweep below. The security-camera
# path is kept commented out; the baby-monitor path is the one in effect.
#src = "/data/dataprivacy/detection_of_IoT_botnet_attacks/SimpleHome_XCS7_1003_WHT_Security_Camera"

src = "/data/dataprivacy/detection_of_IoT_botnet_attacks/Philips_B120N10_Baby_Monitor"


In [None]:
print(src)
# LOF-filtered sweep over every (epsilon, k) pair for the dataset in `src`.
for eps in (1, 5, 10):
    for k_level in (10, 20, 50):
        process_files(src, "k{}_e{}_lof".format(k_level, eps), k_level, eps, True)

/data/dataprivacy/detection_of_IoT_botnet_attacks/Philips_B120N10_Baby_Monitor
k10_e1_lof 10 1 True
benign_traffic.csv
