Analytics
===
*Bothound project*

# Initialization

In [16]:
# initialization
import numpy as np
import sklearn
from sklearn.cluster import KMeans
from sklearn import preprocessing
from scipy.spatial.distance import cdist,pdist
from scipy.signal import argrelextrema
%matplotlib inline
from pylab import *
from numpy import *
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from scipy.stats import itemfreq

# bokeh
from collections import OrderedDict
import pandas as pd
import bokeh.plotting as bk
bk.output_notebook()
from bokeh.charts import Bar
from bokeh.charts import Histogram

# Enable the notebook code-folding extension. Run it once.
# Two code paths are tried because the entry point differs between
# `notebook` releases: if the EnableNBExtensionApp instance exposes an
# `enable_nbextension` method it is used, otherwise the module-level
# `enable_nbextension` function is imported and called instead.
ext_require_path = 'usability/codefolding/main'
from notebook.nbextensions import EnableNBExtensionApp
if hasattr(EnableNBExtensionApp(), 'enable_nbextension'):
    EnableNBExtensionApp().enable_nbextension(ext_require_path)
else:
    from notebook.nbextensions import enable_nbextension
    enable_nbextension('notebook', ext_require_path)

import yaml
from bothound_tools import BothoundTools

# Base colors as [R, G, B] triples with channels in the 0-255 range.
# get_palette() cycles through this list (index modulo its length) and
# rescales each channel to 0-1.
color_set = [
    [0, 0, 255],      #Blue
    [255, 0, 0],      #Red
    [0, 255, 0],      #Green
    [255, 255, 0],    #Yellow
    [255, 0, 255],    #Magenta
    [255, 128, 128],  #Pink
    [128, 128, 128],  #Gray
    [128, 0, 0],      #Brown
    [255, 128, 0],    #Orange
]

# Load the Bothound configuration and connect to the database.
# The `with` block closes the file handle as soon as parsing is done, and
# safe_load is used so the YAML file can only produce plain data types
# (yaml.load without an explicit Loader can construct arbitrary objects).
with open("../conf/bothound.yaml", "r") as stream:
    conf = yaml.safe_load(stream)
tools = BothoundTools(conf)
tools.connect_to_db()

def get_palette(N=5):
    """Return N colors as [r, g, b, 1] lists with r, g, b scaled to 0-1.

    Colors are taken from the module-level color_set and repeat once the
    requested count exceeds the number of base colors.
    """
    def _to_rgba(rgb):
        # Rescale the three 0-255 channels and append the constant 4th entry.
        return [rgb[0] / 255.0, rgb[1] / 255.0, rgb[2] / 255.0, 1]

    return [_to_rgba(color_set[index % len(color_set)]) for index in range(N)]
palette = get_palette(80)

def plot_costs(costs, num_clusters, title):
    """Plot the clustering cost curve and circle the chosen cluster count.

    costs        -- sequence where costs[k - 1] is the cost for k clusters
    num_clusters -- the selected number of clusters (1-based); the point
                    (num_clusters, costs[num_clusters - 1]) is highlighted
    title        -- title of the figure
    """
    # x axis: 1 .. len(costs), one tick per candidate cluster count
    cluster_counts = list(range(1, len(costs) + 1))

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(cluster_counts, costs, 'b*-')
    # hollow red circle around the selected point of the curve
    ax.plot(num_clusters, costs[num_clusters-1], marker='o', markersize=14,
        markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within sum of squeres')
    plt.title(title)
    
def plot_clusters(clusters, num_clusters,title="Histogram"):
    """Print and plot how many items fall into each cluster.

    clusters     -- iterable of integer cluster labels; negative labels
                    are ignored
    num_clusters -- number of bars / expected number of distinct labels
    title        -- title of the bar chart
    """
    sizes = [0] * num_clusters
    for label in clusters:
        if label < 0:
            continue
        if label >= num_clusters:
            # out-of-range label: report it (the update below then fails)
            print(label)
        sizes[label] += 1
    print(sizes)

    # x position of every bar: cluster index shifted by half a unit
    left = [position - 0.5 for position in range(len(sizes))]

    fig = plt.figure(figsize=(12,8))
    plt.title(title)
    ax = fig.add_subplot(111)
    ax.bar(left, sizes, color=palette)
    
    
def get_clustering_model(X, num_clusters):
    """Cluster X with k-means and return the cluster label of every row.

    A histogram of the resulting cluster sizes is plotted as a side effect
    (via plot_clusters).
    """
    kmeans = KMeans(
        n_clusters=num_clusters,
        precompute_distances=True,
        max_iter=500,
        n_init=30,
    )
    # fit() returns the estimator itself, so the calls can be chained
    labels = kmeans.fit(X).predict(X)
    plot_clusters(labels, num_clusters)
    return labels

def get_best_clustering_model(X, max_number_of_clusters, title):
    """Fit k-means for k = 1..max_number_of_clusters and pick one model.

    The model is chosen with a heuristic on the discrete derivatives of the
    cost (inertia) curve; when the heuristic finds nothing, 4 clusters are
    used. The pick is clamped to max_number_of_clusters so that small
    ranges no longer fail with an IndexError.

    Side effects: prints the cluster sizes of every fitted model and plots
    the cost curve and the size histogram of the selected model.

    Returns a tuple (clusters, n_clusters, cost):
      clusters   -- labels predicted by the selected model for X
      n_clusters -- number of clusters of the selected model
      cost       -- list of inertia values, cost[k - 1] belongs to k clusters

    Raises ValueError if max_number_of_clusters is smaller than 1.
    """
    if max_number_of_clusters < 1:
        raise ValueError("max_number_of_clusters must be at least 1")

    cost = []
    kms = []
    # calculate all the clustering and cost
    for no_of_clusters in range(1, max_number_of_clusters + 1):
        km = KMeans(n_clusters=no_of_clusters, precompute_distances = True, max_iter = 500, n_init = 30)
        km.fit(X)
        kms.append(km)

        # report the size of every cluster of this model
        sizes = [0]*no_of_clusters
        for i in km.predict(X):
            if(i >= no_of_clusters):
                print(i)
            sizes[i] = sizes[i]+1
        print(sizes)

        cost.append(km.inertia_)

    # first and second discrete derivatives of the cost curve
    derivative1 = [cost[i+1]-cost[i] for i in range(len(cost)-1)]
    derivative2 = [derivative1[i+1]-derivative1[i] for i in range(len(derivative1)-1)]

    # relative minima of the argsort permutation of the second derivative
    max2 = argrelextrema(np.argsort(derivative2), np.less)
    num_clusters = 4
    if(len(max2[0]) > 0):
        num_clusters = max2[0][0] + 3
    else:
        # fall back to the third derivative
        derivative3 = [derivative2[i+1]-derivative2[i] for i in range(len(derivative2)-1)]

        max3 = argrelextrema(np.argsort(derivative3), np.greater)
        if(len(max3[0]) > 0):
            num_clusters = max3[0][0] + 4

    # Never select more clusters than were actually fitted (the default of
    # 4 is out of range when max_number_of_clusters < 4).
    if num_clusters > len(kms):
        num_clusters = len(kms)

    model = kms[num_clusters-1]

    # plot costs
    plot_costs(cost, model.n_clusters, "Cost of k-Means." + title)

    clusters = model.predict(X)
    plot_clusters(clusters, model.n_clusters, title)
    return clusters, model.n_clusters, cost


import plotly
plotly.offline.init_notebook_mode() # run at the start of every notebook

from plotly.plotly import iplot
from plotly.graph_objs import Scatter3d, Data, Marker
import plotly.graph_objs as go

def plot3(feature_indexes, X, clusters, selected_cluster, title = "Cluster"):
    """Show an interactive 3D scatter plot with one trace per cluster.

    feature_indexes  -- three column indexes of X used as x, y and z; the
                        axis titles are looked up in the global `features`
    X                -- 2D array of samples
    clusters         -- array of cluster labels, one per row of X
    selected_cluster -- accepted but not used by this function
    title            -- prefix of the trace names; the trace of label 0 is
                        always named "All traffic"
    """
    x_index = feature_indexes[0]
    y_index = feature_indexes[1]
    z_index = feature_indexes[2]

    clusters_plot = []
    for label in range(0, max(clusters)+1):
        members = X[clusters == label,: ]
        color = palette[label]
        if label == 0:
            trace_name = "All traffic"
        else:
            trace_name = "{} {}".format(title, label)
        clusters_plot.append(Scatter3d(
            x=members[:,x_index],
            y=members[:,y_index],
            z=members[:,z_index],
            mode='markers',
            name=trace_name,
            marker=dict(
                color='rgb({}, {}, {})'.format(color[0]*255,color[1]*255,color[2]*255 ),
                size=12,
                line=dict(
                    color='rgb(204, 204, 204)',
                    width=0.0
                ),
                opacity=0.2
            )
        ))

    bk_color = "rgb(224, 224, 224)"

    def axis_style(feature_index):
        # Shared look of the three axes: grey background, white grid lines.
        return dict(
            title=features[feature_index],
            showbackground=True,
            backgroundcolor=bk_color,
            gridcolor="rgb(255, 255, 255)",
            zerolinecolor="rgb(255, 255, 255)",
        )

    legend_style = dict(
        font=dict(
            family='sans-serif',
            size=16,
            color='#000'
        ),
        bgcolor='#E2E2E2',
        bordercolor='#FFFFFF',
        borderwidth=2
    )
    layout = go.Layout(
        margin=dict(l=0, r=0, b=0,t=60),
        title='',
        height=1000,
        width=1000,
        legend=legend_style,
        scene=go.Scene(
            xaxis=axis_style(x_index),
            yaxis=axis_style(y_index),
            zaxis=axis_style(z_index),
        ),
    )
    fig = go.Figure(data=Data(clusters_plot), layout=layout)
    plotly.offline.iplot(fig)


def plot_intersection(clusters, num_clusters, id_incident, ips, id_incident2, cluster2 = -1):
    """Plot, per cluster, how many IPs are shared with another incident.

    clusters     -- cluster label of every entry of `ips`
    num_clusters -- number of clusters to show (labels 0..num_clusters-1)
    id_incident  -- id of the incident the clusters belong to (label only)
    ips          -- IPs, aligned with `clusters`
    id_incident2 -- incident whose IPs are fetched with tools.get_ips
    cluster2     -- cluster argument forwarded to tools.get_ips

    Each bar is stacked from the number of distinct cluster IPs that are
    unique to this incident and the number also present in the other one.
    """
    clusters_np = np.array(clusters)
    ips_np = np.array(ips)
    ips2 = set(tools.get_ips(id_incident2, cluster2))

    unique_counts = []
    intersections = []
    for cluster in range(0, num_clusters):
        cluster_ips = set(ips_np[clusters_np == cluster])
        intersection = len(ips2.intersection(cluster_ips))
        intersections.append(intersection)
        unique_counts.append(len(cluster_ips)-intersection)

    # Long-format table: first the "unique" rows, then the "intersection"
    # rows, one row per cluster in each group.
    cluster_ids = list(range(0, num_clusters))
    d = {}
    d["Cluster"] = cluster_ids + cluster_ids
    d["Incident"] = (
        ["Unique from incident {}".format(id_incident)] * num_clusters
        + ["Intersection with incident {}".format(id_incident2)] * num_clusters
    )
    d["data"] = unique_counts + intersections

    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='Incident',legend='top_right',
          title = "Intersection. Incident {} vs. Incident {} (cluster={})".format(id_incident, id_incident2, cluster2) ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)

def plot_countries(clusters, num_clusters, sessions, num_countries = 10):
    """Plot the most frequent countries of the sessions, stacked per cluster.

    clusters      -- cluster label of every session
    num_clusters  -- number of clusters to show (labels 0..num_clusters-1)
    sessions      -- sequence of session records with an 'id_country' key
    num_countries -- how many of the most frequent countries to include
    """
    countries = tools.get_countries()
    ids = np.array([session['id_country'] for session in sessions])

    # never ask for more countries than are known
    if num_countries > len(countries):
        num_countries = len(countries)

    # overall ranking: rows of (country id, count), most frequent first
    ranking = sorted(itemfreq(ids), key=lambda row: row[1], reverse=True)
    best_countries = ranking[0:num_countries]
    # country record of every ranked id
    codes = [
        next(item for item in countries if item["id"] == best[0])
        for best in best_countries
    ]

    # frequency table of the country ids inside every cluster
    clusters_np = np.array(clusters)
    freqs = [itemfreq(ids[clusters_np == cluster]) for cluster in range(0, num_clusters)]

    d = {"Cluster": [], "Country": [], "data": []}
    for best, code in zip(best_countries, codes):
        for cluster in range(0, num_clusters):
            d["Cluster"].append(cluster)
            d["Country"].append(code["name"])
            for entry in freqs[cluster]:
                if entry[0] == best[0]:
                    d["data"].append(entry[1])
                    break
            else:
                # country does not occur in this cluster
                d["data"].append(0)

    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='Country',legend='top_right',
          title = "Countries" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)
    
def plot_ban(clusters, num_clusters, sessions):
    """Plot served vs. banned sessions per cluster and print the numbers.

    clusters     -- cluster label of every session
    num_clusters -- number of clusters to show (labels 0..num_clusters-1)
    sessions     -- sequence of session records with a 'ban' key; a value
                    of 1 is counted as banned

    After showing the chart, the list of banned counts and the list of
    banned percentages (two decimals) are printed.
    """
    clusters_np = np.array(clusters)
    bans = np.array([session['ban'] for session in sessions])

    served = []
    banned = []
    percentage = []
    for cluster in range(0, num_clusters):
        cluster_total = bans[clusters_np == cluster]
        total_count = cluster_total.shape[0]
        banned_count = cluster_total[cluster_total==1].shape[0]
        banned.append(banned_count)
        if total_count == 0:
            share = 0
        else:
            # percentage of banned sessions, limited to two decimals
            share = float("{0:.2f}".format(banned_count*100.0/total_count))
        percentage.append(share)
        served.append(total_count-banned_count)

    # Long-format table: all "Served" rows first, then all "Banned" rows.
    cluster_ids = list(range(0, num_clusters))
    d = {}
    d["Cluster"] = cluster_ids + cluster_ids
    d["Ban"] = ["Served"] * num_clusters + ["Banned"] * num_clusters
    d["data"] = served + banned

    df=pd.DataFrame(d)
    chart=Bar(df,label='Cluster',values='data',stack='Ban',legend='top_right',
          title = "Banjax Ban" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(chart)
    print(banned)
    print(percentage)
    

def get_countries(incidents, num_countries = 25):
    """Return the most frequent countries among the attack sessions.

    incidents     -- sequence of dicts with a 'sessions' list; only
                     sessions with attack > 0 are counted
    num_countries -- maximum number of countries to return

    Returns a list of dicts {"count", "name", "id"}, most frequent first.
    """
    countries = tools.get_countries()
    ids = np.array([
        session['id_country']
        for incident in incidents
        for session in incident['sessions']
        if session['attack'] > 0
    ])

    # never ask for more countries than are known
    if num_countries > len(countries):
        num_countries = len(countries)

    # rows of (country id, count), most frequent first
    ranking = sorted(itemfreq(ids), key=lambda row: row[1], reverse=True)

    codes = []
    for best in ranking[0:num_countries]:
        country = next(item for item in countries if item["id"] == best[0])
        codes.append({ "count": best[1], "name": country['name'], "id": country["id"]})
    return codes

def get_countries_count(id_incidents, attack):
    """Count distinct (IP, country) rows per country for attack sessions.

    id_incidents -- iterable of incident ids to include
    attack       -- attack id to filter on; any value <= 0 selects all
                    sessions with attack > 0

    Returns a list of [id_country, count] pairs sorted by count, largest
    first. An empty `id_incidents` returns [] without querying the
    database (it used to produce the malformed clause "in)").
    """
    id_list = ",".join("{}".format(incident_id) for incident_id in id_incidents)
    if not id_list:
        return []

    # NOTE(review): the ids are interpolated into the SQL text; they are
    # expected to come from trusted notebook configuration, not user input.
    sql_where = "where id_incident in (" + id_list + ")"
    if(attack > 0):
        sql_where += " and attack = {}".format(attack)
    else:
        sql_where += " and attack > 0"

    tools.cur.execute("select distinctrow IP, id_country from sessions " + sql_where)

    # every fetched row is one distinct (IP, id_country) combination
    counts = {}
    for row in tools.cur.fetchall():
        country_id = row["id_country"]
        counts[country_id] = counts.get(country_id, 0) + 1

    res = [[country_id, count] for country_id, count in counts.items()]
    return sorted(res, key=lambda pair: pair[1], reverse=True)
    
    
def plot_attack_countries(id_incidents, num_countries = 10):
    """Plot the number of IPs per country, stacked for every attack.

    id_incidents  -- incident ids passed to tools.get_attacks and
                     get_countries_count
    num_countries -- how many top countries are taken overall and from
                     every single attack

    The shown countries are the overall top `num_countries` plus the top
    `num_countries` of every attack. For each attack, a country's value is
    its count if it is in that attack's top `num_countries`, otherwise 0;
    all remaining countries of the attack are summed into "Other".
    """
    attacks = tools.get_attacks(id_incidents)

    names = {}
    for country in tools.get_countries():
        names[country["id"]] = country["name"]

    # a negative request behaves like "no countries", as range() did before
    limit = num_countries if num_countries > 0 else 0

    all_counts = get_countries_count(id_incidents, -1)
    best_countries = [row[0] for row in all_counts[:limit]]

    # per-attack rankings; extend the shown countries by each attack's top
    freqs = []
    for attack in attacks:
        cur_countries = get_countries_count(id_incidents, attack["id"])
        for row in cur_countries[:limit]:
            if row[0] not in best_countries:
                best_countries.append(row[0])
        freqs.append(cur_countries)

    d = {}
    d["Attack"] = []
    d["Country"] = []
    d["data"] = []

    for country_id in best_countries:
        for attack, counts in zip(attacks, freqs):
            d["Attack"].append(attack["id"])
            d["Country"].append(names[country_id])
            # Slicing (instead of indexing 0..num_countries-1) keeps this
            # safe when the attack has fewer countries than requested.
            for row in counts[:limit]:
                if row[0] == country_id:
                    d["data"].append(row[1])
                    break
            else:
                d["data"].append(0)

    for attack, counts in zip(attacks, freqs):
        d["Attack"].append(attack["id"])
        d["Country"].append("Other")
        # "Other" is everything past the attack's top countries; summing
        # the top entries here would double-count the bars added above.
        other = 0
        for row in counts[limit:]:
            other += row[1]
        d["data"].append(other)

    df=pd.DataFrame(d)
    p=Bar(df,label='Attack',values='data',stack='Country',legend='top_center',
          title = "Countries" ,
         ylabel = "#IP", plot_width=1000, plot_height=600)
    bk.show(p)
            
def plot_incident_countries(incidents, num_countries = 10):
    """Plot the most frequent attack countries, stacked per incident.

    incidents     -- sequence of dicts with 'id' and 'sessions'; only
                     sessions with attack > 0 are counted
    num_countries -- how many of the most frequent countries to include
    """
    countries = tools.get_countries()

    def attack_country_ids(incident):
        # country ids of the incident's sessions that belong to an attack
        return [s['id_country'] for s in incident['sessions'] if s['attack'] > 0]

    ids_by_incident = [attack_country_ids(incident) for incident in incidents]
    ids = np.array([country_id for chunk in ids_by_incident for country_id in chunk])

    # never ask for more countries than are known
    if num_countries > len(countries):
        num_countries = len(countries)

    # overall ranking: rows of (country id, count), most frequent first
    ranking = sorted(itemfreq(ids), key=lambda row: row[1], reverse=True)
    best_countries = ranking[0:num_countries]
    codes = [
        next(item for item in countries if item["id"] == best[0])
        for best in best_countries
    ]

    # frequency table of the country ids of every incident
    freqs = [itemfreq(chunk) for chunk in ids_by_incident]

    d = {"Incident": [], "Country": [], "data": []}
    for best, code in zip(best_countries, codes):
        for incident, incident_freq in zip(incidents, freqs):
            d["Incident"].append(incident["id"])
            d["Country"].append(code["name"])
            for entry in incident_freq:
                if entry[0] == best[0]:
                    d["data"].append(entry[1])
                    break
            else:
                # country does not occur in this incident
                d["data"].append(0)

    df=pd.DataFrame(d)
    p=Bar(df,label='Incident',values='data',stack='Country',legend='top_right',
          title = "Countries" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)

def bar_plot(data, x_label, y_label, title):
    """Show a stacked bar chart built from a list of per-bar value dicts.

    data    -- list of dicts {"x": <bar label>, "values": {<legend>: <number>}}
    x_label -- name of the column holding the bar labels; also used as the
               grouping column of the chart
    y_label -- label of the y axis
    title   -- chart title

    A legend entry missing from a bar's "values" is plotted as 0.
    """
    # union of all legend entries over all bars
    keys = set(key for entry in data for key in entry["values"])

    d = {}
    d[x_label] = []
    d["legend"] = []
    d["data"] = []

    for entry in data:
        for key in keys:
            d[x_label].append(entry["x"])
            d["legend"].append(key)
            d["data"].append(entry["values"].get(key, 0))
    df=pd.DataFrame(d)
    # group by the x_label column (it was hard-coded to 'Incident', which
    # only worked when the caller happened to pass that very label)
    p=Bar(df,label=x_label,values='data',stack="legend",legend='top_right',
        title = title,
        ylabel = y_label, plot_width=1000, plot_height=1000)
    bk.show(p)




JupyterApp._config_dir_default is deprecated: use @default decorator instead.


EnableNBExtensionApp._config_file_name_default is deprecated: use @default decorator instead.


JupyterApp._data_dir_default is deprecated: use @default decorator instead.


JupyterApp._jupyter_path_default is deprecated: use @default decorator instead.


JupyterApp._log_level_default is deprecated: use @default decorator instead.


JupyterApp._runtime_dir_default is deprecated: use @default decorator instead.


ConfigManager._config_dir_default is deprecated: use @default decorator instead.


ConfigManager._config_dir_default is deprecated: use @default decorator instead.



# Configuration

In [12]:
# Ids of the incidents analysed by the cells below
id_incidents = [
    29, 30, 31, 32, 33, 34,
    42,
]


# Read Data

In [13]:
# Reading from Database: one dict per incident with its id, its sessions
# and its incident record (first row returned by tools.get_incident).
# The loop variable is not called `id`, so the builtin id() stays usable in
# the rest of the notebook.
incidents = []
for incident_id in id_incidents:
    print("Indicent {} loading...".format(incident_id))
    incident = {}
    incident["id"] = incident_id
    incident["sessions"] = tools.get_sessions(incident_id)
    incident["incident"] = tools.get_incident(incident_id)[0]
    incidents.append(incident)
    print("total sessions {}".format(len(incident['sessions'])))
print("Done.")

Indicent 29 loading...
total sessions 22914
Indicent 30 loading...
total sessions 12352
Indicent 31 loading...
total sessions 7758
Indicent 32 loading...
total sessions 3916
Indicent 33 loading...
total sessions 9670
Indicent 34 loading...
total sessions 8760
Indicent 42 loading...
total sessions 12352
Done.


# Attacks Summary

In [14]:
# Per-incident summary, followed by the number of IPs of every attack
tools.incidents_summary(id_incidents)
attacks = tools.get_attacks(id_incidents)
for attack in attacks:
    print("Attack {} = {} ips".format(attack["id"], attack["count"]))

Incident 29, num IPs = 14790, num Bots = 13013
Incident 30, num IPs = 10963, num Bots = 9023
Incident 31, num IPs = 5948, num Bots = 3243
Incident 32, num IPs = 3887, num Bots = 2748
Incident 33, num IPs = 9670, num Bots = 8844
Incident 34, num IPs = 7878, num Bots = 7151
Incident 42, num IPs = 10963, num Bots = 9023
Attack 1 = 13857 ips
Attack 2 = 8913 ips
Attack 4 = 2589 ips
Attack 5 = 772 ips
Attack 6 = 971 ips
Attack 7 = 11746 ips


# Countries by attack

In [17]:
# Stacked bar chart of IP counts per attack, using the top 5 countries
# overall and of every attack.
plot_attack_countries(id_incidents, num_countries = 5)


Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead


Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead




Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._iopub_socket_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.



# Countries by Incident

In [18]:
# Stacked bar chart of attack sessions per incident for the 5 most
# frequent countries.
plot_incident_countries(incidents, num_countries = 5)
# Last expression of the cell: displays the ranked list of countries
# (up to 25 by default) with their session counts.
get_countries(incidents)



Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead


Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead




Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._iopub_socket_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.



[{'count': 12085, 'id': 11L, 'name': 'Russian Federation'},
 {'count': 12058, 'id': 2L, 'name': 'United States'},
 {'count': 5196, 'id': 6L, 'name': 'Ukraine'},
 {'count': 3577, 'id': 53L, 'name': 'China'},
 {'count': 2995, 'id': 8L, 'name': 'Germany'},
 {'count': 2584, 'id': 15L, 'name': 'United Kingdom'},
 {'count': 1817, 'id': 20L, 'name': 'France'},
 {'count': 1684, 'id': 5L, 'name': 'Netherlands'},
 {'count': 1517, 'id': 75L, 'name': 'Lithuania'},
 {'count': 980, 'id': 40L, 'name': 'Switzerland'},
 {'count': 880, 'id': 145L, 'name': 'Gibraltar'},
 {'count': 866, 'id': 13L, 'name': 'Turkey'},
 {'count': 844, 'id': 36L, 'name': 'Japan'},
 {'count': 817, 'id': 17L, 'name': None},
 {'count': 655, 'id': 18L, 'name': 'Poland'},
 {'count': 638, 'id': 42L, 'name': 'Canada'},
 {'count': 595, 'id': 88L, 'name': 'Singapore'},
 {'count': 540, 'id': 29L, 'name': 'Spain'},
 {'count': 535, 'id': 4L, 'name': 'Italy'},
 {'count': 519, 'id': 60L, 'name': 'Australia'},
 {'count': 481, 'id': 49L, 'na

# User Agents

In [21]:
def plot_user_agents(id_incidents):
    """Plot the ten most frequent user agents of every incident's attacks.

    For each incident id the user agents of sessions with attack > 0 are
    counted in the database; the first ten rows of the descending result
    become one stacked bar. An incident without rows gets the placeholder
    values {"1": 0}.
    """
    query_template = (
        "select "
        "count(user_agents.ua) as ua_count,"
        "user_agents.ua,"
        "user_agents.device_family "
        "from sessions, session_user_agent, user_agents "
        "where sessions.id = session_user_agent.id_session "
        "and user_agents.id = session_user_agent.id_user_agent "
        "and sessions.id_incident = {} "
        "and sessions.attack >0 "
        "group by user_agents.ua "
        "order by ua_count desc"
    )

    data = []
    for incident_id in id_incidents:
        tools.cur.execute(query_template.format(incident_id))

        top_agents = {}
        for position, row in enumerate(tools.cur.fetchall()):
            if position >= 10:
                # rows arrive sorted by count, so the rest is the long tail
                break
            top_agents[row["ua"]] = row["ua_count"]

        data.append({"x": incident_id, "values": top_agents if top_agents else {"1": 0}})

    bar_plot(data, "Incident", "UA portion", "User Agents distribution")

In [22]:
# Stacked bar chart of the most frequent attack user agents per incident
plot_user_agents(id_incidents)


Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead


Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead




Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._iopub_socket_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.



# Hit rate

In [23]:
# Hit rate (requests per minute) of attack sessions vs. all other sessions
hit_rate = []
hit_rate_ua = []
for incident in incidents:
    for s in incident['sessions']:
        interval = s['request_interval']
        # sessions with an interval of exactly 1800 are left out
        # NOTE(review): presumably a placeholder/default value - confirm
        if interval == 1800:
            continue
        # a zero interval is kept as a rate of 0 to avoid dividing by zero
        rate = 60.0/interval if interval != 0 else interval
        if s['attack'] > 0:
            hit_rate_ua.append(rate)
        else:
            hit_rate.append(rate)

# both box plots share the same point styling
box_style = dict(boxpoints='all', jitter=0.5, pointpos=-1.8)
trace_other = go.Box(y=hit_rate, name='Others', **box_style)
trace_ua = go.Box(y=hit_rate_ua, name='Bots IPs', **box_style)

data = Data([trace_other, trace_ua])

x_axis = go.XAxis(
    showgrid=True,
    showline=True,
    ticks=''
)
y_axis = go.YAxis(
    showline=True,
    ticks='',
    zeroline=True,
    range=[0,300],
    title="Hit rate/minute"
)
layout = go.Layout(
    showlegend=False,
    height=900,
    title='Hit rate of bots',
    xaxis=x_axis,
    yaxis=y_axis
)

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)


# Attacks Scatter Plot

In [24]:
# Feature matrix over all sessions of all loaded incidents.
# `features` is also read by plot3() for its axis titles.
features = [
    "request_interval", #1
    "ua_change_rate",#2
    "html2image_ratio",#3
    "variance_request_interval",#4
    "payload_average",#5
    "error_rate",#6
    "request_depth",#7
    "request_depth_std",#8
    "session_length",#9
    "percentage_cons_requests",#10
]

values = []
incident_indexes = []
for incident in incidents:
    for s in incident['sessions']:
        # one row per session: the feature values followed by the attack id
        values.append([s[f] for f in features] + [s['attack']])
        # plot label: 0 for sessions with attack == 0, 1 for everything else
        incident_indexes.append(0 if s['attack'] == 0 else 1)

X = np.array(values)
incident_indexes = np.array(incident_indexes)
X.shape

(77722, 11)

In [34]:
# 3D scatter of variance_request_interval (x), html2image_ratio (y) and
# error_rate (z); label 0 is drawn as "All traffic", label 1 as the attack.
plot3([3,2,5], X, incident_indexes, -1, "Attack ")

# Attack metrics

In [26]:
# Prints session length, html/image ratio, payload average and hit rate
# for every botnet (see the recorded output below).
tools.calculate_attack_metrics(id_incidents)


__________ Botnet 1:
Session length = 314.650094103 sec
Html/image ratio = 0.00169306775412
Payload average = 521.96217064
Hit rate = 0.0444084189924 /minute

__________ Botnet 2:
Session length = 429.990438791 sec
Html/image ratio = 0.0
Payload average = 447.652381765
Hit rate = 0.0522009696864 /minute

__________ Botnet 4:
Session length = 2971.2889329 sec
Html/image ratio = 0.0
Payload average = 8217.80737806
Hit rate = 1.71743123255 /minute

__________ Botnet 5:
Session length = 3587.52279635 sec
Html/image ratio = 0.0
Payload average = 10221.064843
Hit rate = 0.477025810295 /minute

__________ Botnet 6:
Session length = 583.943670503 sec
Html/image ratio = 0.00922240489707
Payload average = 31317.7716535
Hit rate = 0.499720161233 /minute

__________ Botnet 7:
Session length = 2665.97238953 sec
Html/image ratio = 0.0157430477532
Payload average = 15572.3757529
Hit rate = 0.308518248903 /minute


# Attack similarity

In [27]:
# Attack similarities: compare one attack of one incident with the attacks
# of the listed incidents; the recorded output lists one distance per
# (incident, attack) pair.
tools.calculate_distances(
    id_incident = 29, # incident to explore
    id_attack = 1, # attack to explore
    id_incidents = [29,30,31,32,33,34,36,37,39,40,42], # incidents to compare with
    features = [] # specify the features. Use all features if empty
)

#######################  Distance calculator
Target indicent =  29
Target attack =  1
Target cluster index  1 =  -1
Target cluster index  2 =  -1
Incidents =  [29, 30, 31, 32, 33, 34, 36, 37, 39, 40, 42]
Features =  ['request_interval', 'ua_change_rate', 'html2image_ratio', 'variance_request_interval', 'payload_average', 'error_rate', 'request_depth', 'request_depth_std', 'session_length', 'percentage_cons_requests']
{'distance': 32805560077915908.0, 'incident': 40, 'attack': 1}
{'distance': 539312381463058.12, 'incident': 33, 'attack': 7}
{'distance': 478525733852771.94, 'incident': 32, 'attack': 4}
{'distance': 393205301076794.56, 'incident': 31, 'attack': 4}
{'distance': 272005641705060.47, 'incident': 31, 'attack': 6}
{'distance': 71539077165256.312, 'incident': 31, 'attack': 5}
{'distance': 55247098931752.656, 'incident': 34, 'attack': 7}
{'distance': 29492956062758.895, 'incident': 39, 'attack': 1}
{'distance': 27672586525911.398, 'incident': 37, 'attack': 1}
{'distance': 2081710

# Common IPs

In [33]:
# Common IPs with other attacks: for every incident in `incidents2` the
# recorded output reports the number of identical IPs and their share of
# the attack's and of the incident's IPs.
tools.calculate_common_ips(
    incidents1 = [33], # incidents to explore
    id_attack = -1, # attack to explore(use -1 for all attacks)
    incidents2 = [34,36,37,39,40] # incidents to compare with
)

Intersection with incidents:
[34, 36, 37, 39, 40]

Num IPs in the attack 8844:

__________ Incident 39:
Num IPs in the incident 155:
# identical   IPs: 93
% of attack   IPs: 1.05%
% of incident IPs: 60.00%

__________ Incident 34:
Num IPs in the incident 7151:
# identical   IPs: 4249
% of attack   IPs: 48.04%
% of incident IPs: 59.42%

__________ Incident 40:
Num IPs in the incident 164:
# identical   IPs: 86
% of attack   IPs: 0.97%
% of incident IPs: 52.44%

__________ Incident 36:
Num IPs in the incident 111:
# identical   IPs: 7
% of attack   IPs: 0.08%
% of incident IPs: 6.31%

__________ Incident 37:
Num IPs in the incident 2720:
# identical   IPs: 138
% of attack   IPs: 1.56%
% of incident IPs: 5.07%
