# Alex's Algorithm #2

Input: a matrix of points in a D-dimensional Euclidean space (a real-valued metric space). The algorithm was thought through for clustering use cases, but it could be applied to other use cases as well, e.g. anomaly detection, in which case one does not start by filtering out the outlier elements.

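Parameters:

K: the K-anonymity level (the number of nearest neighbours that form each K-sphere)

Epsilon: the divisor of the Laplace noise scale (a larger epsilon means less noise)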

First, filter the outlier rows out of the data set. One can use e.g. LOF or ABOD for this purpose, or apply column-wise winsorization to cut off long-tail distribution behavior (https://en.wikipedia.org/wiki/Local_outlier_factor , https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD2008.pdf , https://en.wikipedia.org/wiki/Winsorizing). /Alternatively, to stay aligned with other use cases such as anomaly detection, do not remove these elements./

For each row in the dataset:

    Find its K nearest neighbours (together with the row itself they form the K-sphere)
    For each column/feature /denoted with d/:
        Calculate the mean value of column d over the K-sphere and impute mean + White Noise in place of the original column value (the White Noise must be drawn independently for every point)
        where White Noise = Laplace(0, (max(K-sphere[d]) - min(K-sphere[d])) / epsilon)
        where K-sphere[d] denotes the set of values of feature/column "d" over the rows of the K-sphere (the original row is included)
For categorical/non-numeric variables, impute the K-sphere mode.
Another alternative could be to write down the neighbourhood graph derived from the K-spheres and swap random feature values between neighbouring rows, if the use case is not anomaly detection.
For categorical/non-numeric variables, impute the cluster mode. /In case we need a common metric, the subscriber must normalize the space (e.g. with a Min-Max scaler) and should use a normalized Hamming or Levenshtein distance for them./
***

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Model selection
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import LocalOutlierFactor

# Model hyper parameter tuning
from sklearn import metrics
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import sklearn.neighbors

import os

import warnings  
warnings.filterwarnings('ignore')

import numpy
import math


In [2]:
def getKSphere(df, k, index, tree):
    """Return the K-sphere of one row: the row itself plus its k nearest neighbours.

    Parameters
    ----------
    df : pandas.DataFrame
        The data the tree was built from, in the same row order.
    k : int
        Number of neighbours to fetch in addition to the query row.
    index : int
        Zero-based position of the query row in ``df``.
    tree : object
        Neighbour index exposing ``query(X, k, return_distance=False)`` and
        returning row positions (main_transform passes a sklearn ``BallTree``).

    Returns
    -------
    pandas.DataFrame
        The ``k + 1`` rows of ``df`` found at the positions the tree returned.
    """
    # Ask for k + 1 hits: the query point is itself stored in the tree and is
    # normally returned as its own nearest neighbour.
    positions = tree.query(df.iloc[index:index + 1], k + 1, return_distance=False)
    # The tree answers with row positions, not index labels, so select with
    # .iloc; .loc only picked the same rows when df had a default RangeIndex.
    return df.iloc[positions[0]]

In [3]:
def getNewValue(col, epsilon):
    """Draw the perturbed replacement value for one column of a K-sphere.

    The value is sampled from a Laplace distribution centred on the column
    mean, with scale ``(max - min) / epsilon``.

    Parameters
    ----------
    col : pandas.Series or array-like of numbers
        Values of one feature over the rows of a K-sphere.
    epsilon : float
        Divisor of the noise scale; must be strictly positive. Larger values
        give less noise.

    Returns
    -------
    float
        The column mean plus Laplace noise. When all values in ``col`` are
        equal the scale is 0 and the mean itself is returned.

    Raises
    ------
    ValueError
        If ``epsilon`` is not strictly positive.
    """
    # Without this check epsilon == 0 produced an inf/NaN scale (and thus
    # inf/NaN output) instead of an error.
    if not epsilon > 0:
        raise ValueError("epsilon must be > 0, got {!r}".format(epsilon))
    center = np.mean(col)
    spread = np.max(col) - np.min(col)
    return np.random.laplace(center, spread / epsilon)

In [4]:
def processLine(df, k, epsilon, index, tree):
    """Build the perturbed version of a single row.

    Fetches the row's K-sphere with getKSphere, then replaces every column by
    a value that getNewValue draws from that column of the K-sphere.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set the tree was built from.
    k : int
        Number of neighbours in the K-sphere (besides the row itself).
    epsilon : float
        Forwarded to getNewValue for every column.
    index : int
        Row to perturb, as understood by getKSphere.
    tree : object
        Neighbour index forwarded to getKSphere.

    Returns
    -------
    pandas.Series
        One perturbed value per column of ``df``.
    """
    neighbourhood = getKSphere(df, k, index, tree)
    extra_args = (epsilon,)
    return neighbourhood.apply(getNewValue, args=extra_args)


In [5]:
def main_transform(df, k, epsilon, lof=False):
    """Perturb every row of ``df`` with K-sphere means plus Laplace noise.

    The data is standardised with a StandardScaler, optionally stripped of
    LOF outliers, and each remaining row is replaced by the output of
    processLine. The result is mapped back to the original scale.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric input data.
    k : int
        Number of nearest neighbours per K-sphere.
    epsilon : float
        Noise divisor forwarded to processLine.
    lof : bool, default False
        When true, rows flagged by filter_lof are removed before perturbing,
        so the result can have fewer rows than ``df``.

    Returns
    -------
    pandas.DataFrame
        Perturbed data with the columns of ``df`` and a default integer index.

    Side effects
    ------------
    Prints the row label of every 50th processed row as a progress indicator.
    """
    scaler = StandardScaler()
    standardized = scaler.fit_transform(df)
    scaledDf = pd.DataFrame(standardized)
    if lof:
        scaledDf = filter_lof(scaledDf)

    # One tree over the whole (possibly filtered) data set, shared by all rows.
    tree = sklearn.neighbors.BallTree(scaledDf, metric="euclidean")

    perturbed_rows = []
    for row_label in scaledDf.index.values:
        perturbed_rows.append(processLine(scaledDf, k, epsilon, row_label, tree))
        if row_label % 50 == 0:
            print(row_label)

    # Undo the standardisation so the output is on the scale of the input.
    restored = scaler.inverse_transform(perturbed_rows)
    return pd.DataFrame(restored, columns=df.columns)

In [6]:
def filter_lof(df, k=20):
    """Drop the rows that Local Outlier Factor labels as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; it is not modified.
    k : int, default 20
        ``n_neighbors`` passed to ``LocalOutlierFactor``.

    Returns
    -------
    pandas.DataFrame
        The rows with a positive LOF label, with the index reset to 0..n-1.
    """
    labels = LocalOutlierFactor(n_neighbors=k).fit_predict(df)
    # Rows with a positive label are kept. Filtering with a boolean mask
    # avoids copying the whole frame and adding a temporary "_lof" column,
    # which would overwrite and then drop a caller column of that name.
    return df[labels > 0].reset_index(drop=True)

## main_transform(df, K, Epsilon)

### Without removing outliers

#### K = 10, epsilon = 10

# Perturb the first 200 rows of `df` with k=10 and epsilon=10; `lof` is left
# at its default (False), so no outlier filtering takes place.
main_transform(df[0:200], 10, 10)

### Removing outliers with LOF

#### K = 3

#### epsilon = 10

# Perturb `filtered` with k=3 and epsilon=10.
# NOTE(review): `filtered` is not defined in this part of the notebook;
# presumably it holds data already cleaned with LOF - confirm.
main_transform(filtered, 3, 10)

#### Diff

# Difference between the input and a freshly perturbed copy of it (the noise
# is random, so this differs from run to run). The subtraction aligns on
# index and column labels, and main_transform returns a default integer index.
filtered - main_transform(filtered, 3, 10)

## Batch process files

In [7]:
# Relative paths of the CSV files that process_files reads from its input
# directory and writes back under its output directory.
files = [
    "benign_traffic.csv",
    "gafgyt_attacks/combo.csv",
    "gafgyt_attacks/junk.csv",
    "gafgyt_attacks/scan.csv",
    "gafgyt_attacks/tcp.csv",
    "gafgyt_attacks/udp.csv",
    "mirai_attacks/ack.csv",
    "mirai_attacks/scan.csv",
    "mirai_attacks/syn.csv",
    "mirai_attacks/udp.csv",
    "mirai_attacks/udpplain.csv",
]

In [12]:
def process_files(idir, odir, k, epsilon, lof=False):
    """Perturb every CSV named in the module-level ``files`` list.

    Each file is read from ``idir``, half of its rows are sampled without
    replacement (fixed ``random_state=42``), the sample is passed through
    main_transform, and the result is written to the same relative path
    under ``odir`` without the index column.

    Parameters
    ----------
    idir : str
        Directory containing the input files.
    odir : str
        Directory receiving the output files; created if missing, together
        with its ``mirai_attacks`` and ``gafgyt_attacks`` sub-directories.
        Existing output files are overwritten.
    k : int
        Forwarded to main_transform.
    epsilon : float
        Forwarded to main_transform.
    lof : bool, default False
        Forwarded to main_transform.
    """
    # exist_ok=True lets an interrupted or repeated run reuse the output
    # folders instead of failing with FileExistsError before doing any work.
    os.makedirs(odir, exist_ok=True)
    for attack_family in ("mirai_attacks", "gafgyt_attacks"):
        os.makedirs(os.path.join(odir, attack_family), exist_ok=True)

    for filename in files:
        source = pd.read_csv(os.path.join(idir, filename))
        sampled = source.sample(frac=0.5, replace=False, random_state=42).reset_index(drop=True)
        perturbed = main_transform(sampled, k, epsilon, lof)
        perturbed.to_csv(os.path.join(odir, filename), index=False)
        

### Baby Monitor

# Baby monitor: every (output folder, K, epsilon, LOF filtering) combination,
# processed in the order listed - first without, then with outlier removal.
baby_runs = [
    ("baby_k10_e1", 10, 1, False),
    ("baby_k20_e1", 20, 1, False),
    ("baby_k50_e1", 50, 1, False),
    ("baby_k10_e5", 10, 5, False),
    ("baby_k20_e5", 20, 5, False),
    ("baby_k50_e5", 50, 5, False),
    ("baby_k10_e10", 10, 10, False),
    ("baby_k20_e10", 20, 10, False),
    ("baby_k50_e10", 50, 10, False),
    ("baby_k10_e1_lof", 10, 1, True),
    ("baby_k20_e1_lof", 20, 1, True),
    ("baby_k50_e1_lof", 50, 1, True),
    ("baby_k10_e5_lof", 10, 5, True),
    ("baby_k20_e5_lof", 20, 5, True),
    ("baby_k50_e5_lof", 50, 5, True),
    ("baby_k10_e10_lof", 10, 10, True),
    ("baby_k20_e10_lof", 20, 10, True),
    ("baby_k50_e10_lof", 50, 10, True),
]
for run_dir, run_k, run_eps, run_lof in baby_runs:
    process_files("baby", run_dir, run_k, run_eps, run_lof)

### SimpleHome_XCS7_1003_WHT_Security_Camera

In [13]:
# Passed as the input directory (idir) to the process_files calls in the next cell.
src = "/data/dataprivacy/detection_of_IoT_botnet_attacks/SimpleHome_XCS7_1003_WHT_Security_Camera"
# NOTE(review): dst is not referenced by any call visible in this part of the
# notebook - confirm whether the outputs were meant to go under it.
dst = "SimpleHome_XCS7_1003_WHT_Security_Camera/"

In [None]:
# Security camera: LOF-filtered runs only, one per (output folder, K, epsilon).
# NOTE(review): the output folders are relative to the current working
# directory; the `dst` variable assigned in the previous cell is not used
# here - confirm that this is intended.
camera_runs = [
    ("k10_e1_lof", 10, 1),
    ("k20_e1_lof", 20, 1),
    ("k50_e1_lof", 50, 1),
    ("k10_e5_lof", 10, 5),
    ("k20_e5_lof", 20, 5),
    ("k50_e5_lof", 50, 5),
    ("k10_e10_lof", 10, 10),
    ("k20_e10_lof", 20, 10),
    ("k50_e10_lof", 50, 10),
]
for run_dir, run_k, run_eps in camera_runs:
    process_files(src, run_dir, run_k, run_eps, True)

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550
4600
4650
4700
4750
4800
4850
4900
4950
5000
5050
5100
5150
5200
5250
5300
5350
5400
5450
5500
5550
5600
5650
5700
5750
5800
5850
5900
5950
6000
6050
6100
6150
6200
6250
6300
6350
6400
6450
6500
6550
6600
6650
