#Clustering Using K-Means and DBSCAN Algorithms

In [7]:
#libraries
import numpy as np
import os
import re
import h5py
import socket
import struct
from sklearn.preprocessing import normalize, MinMaxScaler 

In [None]:
# Placeholder path, left empty here. Presumably the directory of web-server
# log files to hand to get_prevectors()/create_secrepo() - TODO confirm.
data_path = ""

def get_prevectors(data_path):
    """Tally request types and response codes per client across log files.

    Every entry returned by ``os.listdir(data_path)`` is opened as a text
    file and scanned line by line. The first ``LOG_REGEX`` match on a line
    supplies ``(ip, request_type, response_code)``; the ip is passed through
    ``ip2int`` and used as the key of the result.

    Args:
        data_path: Directory whose entries are the log files to read.

    Returns:
        dict mapping each converted ip to
        ``{"requests": {request_type: count}, "responses": {response_code: count}}``.

    Lines that raise ``IndexError`` while being processed (for example when
    ``LOG_REGEX`` finds no match) are skipped.
    """
    prevectors = {}
    for entry in os.listdir(data_path):
        with open(os.path.join(data_path, entry), "r") as log_file:
            for line in log_file:
                try:
                    ip, request_type, response_code = LOG_REGEX.findall(line)[0]
                    ip = ip2int(ip)
                    # Create the per-ip counters the first time an ip is seen.
                    per_ip = prevectors.setdefault(ip, {"requests": {}, "responses": {}})
                    request_counts = per_ip["requests"]
                    request_counts[request_type] = request_counts.get(request_type, 0) + 1
                    response_counts = per_ip["responses"]
                    response_counts[response_code] = response_counts.get(response_code, 0) + 1
                except IndexError:
                    continue
    return prevectors


def convert_prevectors_to_vectors(prevectors):
    """Turn per-ip request/response tallies into a dense feature matrix.

    Each row holds, in order, the counts of the request types listed in
    ``request_types`` followed by the counts of the status codes listed in
    ``response_codes``. Anything not listed is ignored; missing entries
    stay at 0.

    Args:
        prevectors: dict mapping an ip key to
            ``{"requests": {type: count}, "responses": {code: count}}``.
            Response codes may be keyed either as ints or as their decimal
            strings; both are counted.

    Returns:
        ``(ips, vectors)`` where ``ips`` is the list of keys in iteration
        order and ``vectors`` is a float32 array of shape
        ``(len(prevectors), 18)`` whose row ``i`` belongs to ``ips[i]``.
    """
    request_types = [
        "GET",
        "POST",
        "HEAD",
        "OPTIONS",
        "PUT",
        "TRACE"
    ]
    # NOTE(review): 403 is listed twice, so two columns carry the same count.
    # The duplicate is kept so the vector width (18 columns) does not change
    # for code or files that rely on it - confirm whether another code was meant.
    response_codes = [
        200,
        404,
        403,
        304,
        301,
        206,
        418,
        416,
        403,
        405,
        503,
        500,
    ]

    n_request_features = len(request_types)
    vectors = np.zeros(
        (len(prevectors), n_request_features + len(response_codes)),
        dtype=np.float32,
    )
    ips = []

    for index, (ip, counts) in enumerate(prevectors.items()):
        ips.append(ip)
        requests = counts["requests"]
        responses = counts["responses"]
        for ri, request_type in enumerate(request_types):
            vectors[index, ri] = requests.get(request_type, 0)
        for ri, code in enumerate(response_codes):
            # Read from the *responses* tally (the original indexed
            # v["requests"] here). Codes parsed out of log text are usually
            # strings, so accept both the int and the string form of the key.
            vectors[index, n_request_features + ri] = (
                responses.get(code, 0) + responses.get(str(code), 0)
            )

    return ips, vectors


def create_secrepo(data_path):
    """Build min-max scaled per-ip feature vectors from the logs in ``data_path``.

    Parses the logs with ``get_prevectors``, converts the tallies to a
    matrix with ``convert_prevectors_to_vectors`` and rescales every feature
    column with ``MinMaxScaler``.

    Args:
        data_path: Directory of log files, forwarded to ``get_prevectors``.

    Returns:
        ``(ips, vectors)``: the ip keys and the scaled feature matrix, row
        ``i`` of ``vectors`` belonging to ``ips[i]``. The original version
        returned nothing, so the computed data was lost.
    """
    prevectors = get_prevectors(data_path)
    ips, vectors = convert_prevectors_to_vectors(prevectors)
    if len(ips) > 0:
        # Instantiate the scaler; the original bound the MinMaxScaler class
        # itself and never applied it. Skipped when there are no rows because
        # the scaler cannot be fitted on an empty matrix.
        scaler = MinMaxScaler()
        vectors = scaler.fit_transform(vectors)
    return ips, vectors


In [9]:
#Step 2: Graphing our Vectors
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
import h5py


def visualize(vectors):
    """Show a 3D scatter plot of ``vectors`` reduced to three PCA components.

    Args:
        vectors: 2D array-like of samples accepted by ``PCA.fit_transform``.

    Prints the shape of the projected data and opens a matplotlib window
    (blocks in ``plt.show()``); returns nothing.
    """
    pca = PCA(n_components=3)
    projected_vectors = pca.fit_transform(vectors)
    # Call form of print works on both Python 2 and Python 3.
    print(projected_vectors.shape)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    # Draw on the 3D axes created above rather than relying on pyplot's
    # implicit "current axes".
    ax.scatter(
        projected_vectors[:, 0],
        projected_vectors[:, 1],
        zs=projected_vectors[:, 2],
        s=200,
    )
    plt.show()

# NOTE(review): `projected_vectors` is only assigned as a local inside
# visualize(); nothing in the visible code defines it at module level, so this
# call would raise NameError. Presumably the un-projected feature matrix
# (e.g. the "vectors" dataset of the HDF5 file) was meant - TODO confirm.
visualize(projected_vectors)

In [None]:
#Step 3: First pass clustering with K-means
import h5py
from sklearn.cluster import KMeans, DBSCAN
import numpy as np
from collections import Counter


if __name__ == "__main__":
    # Script entry point: read vectors from an HDF5 file, cluster them with
    # K-means or DBSCAN, report the size of every label and write vectors,
    # labels and notes to the output HDF5 file.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--cluster_method", choices=["kmeans", "dbscan"], default="kmeans")
    parser.add_argument('-n', "--number_clusters", type=int, default=2)
    parser.add_argument('-e', '--epsilon', type=float, default=6)
    parser.add_argument('-m', '--number_points', type=int, default=1)
    parser.add_argument("-i", "--vectors", required=True, help="HDF5 file containing the vectors")
    parser.add_argument("-o", "--output", required=True, help="Output HDF5 containing the vectors")
    args = parser.parse_args()

    cluster_method = args.cluster_method
    path = args.vectors
    output_path = args.output

    # Everything is read into memory and the file closed before writing, so
    # the input and output paths may be the same file.
    with h5py.File(path, "r") as f:
        vectors = f["vectors"][:]
        ips = f["notes"][:]

    # argparse `choices` restricts cluster_method to these two values.
    if cluster_method == "kmeans":
        model = KMeans(n_clusters=args.number_clusters)
    elif cluster_method == "dbscan":
        model = DBSCAN(eps=args.epsilon, min_samples=args.number_points)
    clusters = model.fit_predict(vectors)

    counter = Counter(clusters.tolist())
    for key in sorted(counter):
        # Single-argument call form prints identically on Python 2 and 3.
        print("Label {0} has {1} samples".format(key, counter[key]))

    # create new hdf5 with clusters added
    with h5py.File(output_path, "w") as f:
        f.create_dataset("vectors", shape=vectors.shape, data=vectors)
        f.create_dataset("cluster", shape=(vectors.shape[0],), data=clusters, dtype=np.int32)
        f.create_dataset("notes", shape=(vectors.shape[0],), data=np.array(ips))


In [10]:
#Step 4: Validating our Clusters Statistically
import h5py
import socket
import struct
from sklearn.metrics import pairwise_distances, silhouette_samples, silhouette_score
import numpy as np


def int2ip(addr):
    """Return the dotted-quad IPv4 string for the unsigned 32-bit integer ``addr``."""
    # "!I" packs the value as 4 bytes in network (big-endian) order.
    packed = struct.pack("!I", addr)
    return socket.inet_ntoa(packed)


if __name__ == "__main__":
    # Script entry point: print descriptive statistics for the vectors of a
    # clustered HDF5 file, then per-label distance and silhouette figures.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--vectors", required=True, help="HDF5 file containing the vectors")
    # Accepted for command-line compatibility; not used below.
    parser.add_argument("-l", "--label", required=False, default=None)
    args = parser.parse_args()

    path = args.vectors

    with h5py.File(path, "r") as f:
        vectors = f["vectors"][:]
        clusters = f["cluster"][:]

    # Single pre-formatted argument per print call so the output is the same
    # on Python 2 and Python 3.
    print("Vectors shape: {0}".format(vectors.shape))
    print("Minimum feature value: {0}".format(vectors.min()))
    print("Mean feature value: {0}".format(vectors.mean()))
    print("Max feature value: {0}".format(vectors.max()))
    print("Percentage of null values: {0}".format(
        100.0 * (float((vectors == 0).sum()) / (vectors.shape[0] * vectors.shape[1]))
    ))
    print("")

    # Full pairwise matrix: it includes each vector's distance to itself.
    vector_distances = pairwise_distances(vectors)
    print("Minimum distance between vectors: {0}".format(vector_distances.min()))
    print("Mean distance between vectors: {0}".format(vector_distances.mean()))
    print("Maximum distance between vectors: {0}".format(vector_distances.max()))
    print("")

    silhouette_scores = silhouette_samples(vectors, clusters)
    centroid_distances = []

    labels = sorted(set(clusters.tolist()))
    print("Number of labels: {0}".format(len(labels)))
    for label in labels:
        in_label = clusters == label
        n_vects = vectors[in_label, :]
        centroid = n_vects.mean(0)
        # Distance of every member of this label to the label's centroid.
        centroid_distances.extend(pairwise_distances(centroid.reshape(1, -1), n_vects).tolist()[0])
        distances = pairwise_distances(n_vects)
        scores = silhouette_scores[in_label]

        print("Number of items in label {0}: {1}  ({2}%) (avg dist: {3}) (avg silhouette: {4})".format(
            label,
            n_vects.shape[0],
            (100.0 * n_vects.shape[0]) / vectors.shape[0],
            distances.mean(),
            scores.mean()
        ))
    print("")

    centroid_distances = np.array(centroid_distances)
    print("Minimum label centroid distance: {0}".format(centroid_distances.min()))
    print("Mean label centroid distance: {0}".format(centroid_distances.mean()))
    print("Max label centroid distance: {0}".format(centroid_distances.max()))
    # The overall silhouette score is the mean of the per-sample scores
    # computed above. The original passed the distance matrix to
    # silhouette_score without metric="precomputed", which scored the
    # distances as if they were feature vectors.
    print("Overall Silhouette Score {0}".format(silhouette_scores.mean()))

In [None]:
#Step 5: Inspecting our Clusters
import h5py
import socket
import struct


def int2ip(addr):
    """Convert an unsigned 32-bit integer into its dotted-quad IPv4 text form."""
    # Pack as 4 network-order bytes, then let the socket module render them.
    network_bytes = struct.pack("!I", addr)
    return socket.inet_ntoa(network_bytes)


if __name__ == "__main__":
    # Script entry point: list "<cluster id> <ip>" for every sample of a
    # clustered HDF5 file, or only for the label given with -l/--label.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--vectors", required=True, help="HDF5 file containing the vectors")
    parser.add_argument("-l", "--label", type=int, required=False, default=None)
    args = parser.parse_args()

    path = args.vectors

    with h5py.File(path, "r") as f:
        vectors = f["vectors"][:]
        ips = f["notes"][:]
        clusters = f["cluster"][:]

    # No label requested: walk every label in ascending order.
    if args.label is None:
        selected_labels = sorted(set(clusters.tolist()))
    else:
        selected_labels = [args.label]

    for cluster_id in selected_labels:
        for ip in ips[clusters == cluster_id]:
            # One pre-formatted argument keeps the output identical on
            # Python 2 and Python 3.
            print("{0} {1}".format(cluster_id, int2ip(ip)))

In [None]:
#Step 6: Modifying K to optimize Cluster Results
python cluster_vectors.py -c kmeans -n 12 -i secrepo.h5 -o secrepo.h5

In [None]:
#Step 7: Repeating our Inspection and Validation Procedures
python label_notes.py -i secrepo.h5 | grep 70.32.104.50

#Step 8: Validate the cluster using Silhouette Scoring
python stats_vectors.py -i secrepo.h5

In [None]:
#Step 9: Our next step is to see what IP addresses have been doing by tracking their activity in web server logs 
python label_notes.py -i secrepo.h5 -l <label>

#use grep to search through the logs and display the entries in which the IP address appears
grep -ar 70.32.104.50 datasets/....

In [None]:
#Cluster Analysis with DBscan
#Step 1: to generate clusters we'll run the script
python cluster_vectors.py -c dbscan -e 0.5 -m 2 -i secrepo.h5 -o secrepo.h5

#Step 2: apply new hyperparameters with increased Eps setting from 5 to 6, producing fewer clusters
python cluster_vectors.py -c dbscan -e 6 -m 5 -i secrepo.h5 -o secrepo.h5

#Step 3: Skip cluster inspection and validation and jump ahead to begin investigating the behavior of suspect samples. List samples
python label_notes.py -i secrepo.h5 -l -1

#Step 4: find out what the IPs have been doing using grep
grep -ar 192.187.126.162 datasets/....