Analytics
===
*Bothound project*

# Initialization

In [69]:
# initialization
import numpy as np
import sklearn
from sklearn.cluster import KMeans
from sklearn import preprocessing
from scipy.spatial.distance import cdist,pdist
from scipy.signal import argrelextrema
%matplotlib inline
from pylab import *
from numpy import *
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from scipy.stats import itemfreq

# boeh
from collections import OrderedDict
import pandas as pd
import bokeh.plotting as bk
bk.output_notebook()
from bokeh.charts import Bar
from bokeh.charts import Histogram

# enabling folding extension. Run it once.
import notebook
E = notebook.nbextensions.EnableNBExtensionApp()
E.enable_nbextension('usability/codefolding/main')

import yaml
from bothound_tools import BothoundTools

color_set = [
    [0, 0, 255],      #Blue
    [255, 0, 0],      #Red
    [0, 255, 0],      #Green
    [255, 255, 0],    #Yellow
    [255, 0, 255],    #Magenta
    [255, 128, 128],  #Pink
    [128, 128, 128],  #Gray
    [128, 0, 0],      #Brown
    [255, 128, 0],    #Orange
]

stram = open("../conf/bothound.yaml", "r")
conf = yaml.load(stram)
tools = BothoundTools(conf)
tools.connect_to_db()

def get_palette(N=5):
    result = []
    for x in range(N):
        s = color_set[x % len(color_set)]
        result.append([s[0]/255.0,s[1]/255.0,s[2]/255.0,1])
    return result
palette = get_palette(80)

def plot_costs(costs, num_clusters, title):
    KK = range(1,len(costs)+1)

    # elbow curve
    kIdx = num_clusters      
    clr = cm.spectral( np.linspace(0,1,10) ).tolist()
    mrk = 'os^p<dvh8>+x.'

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(KK, costs, 'b*-')
    ax.plot(num_clusters, costs[num_clusters-1], marker='o', markersize=14, 
        markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
    #ax.set_ylim((0,100))
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within sum of squeres')
    plt.title(title)
    
def plot_clusters(clusters, num_clusters,title="Histogram"):
    sizes = [0]*num_clusters
    for i in clusters: 
        if(i >= 0) :
            if (i >= num_clusters):
                print i
            sizes[i] = sizes[i]+1
    print (sizes)
    
      #plot histogramm
    left = [] 
    for i in range(len(sizes)):
        left.append(i-0.5)
    fig = plt.figure(figsize=(12,8))
    plt.title(title)
    ax = fig.add_subplot(111)
    ax.bar(left,sizes, color = palette)            
    
    
def get_clustering_model(X, num_clusters):
    model = KMeans(n_clusters=num_clusters, precompute_distances = True, max_iter = 500, n_init = 30)
    model.fit(X)
    
    clusters = model.predict(X)
    plot_clusters(clusters, num_clusters)
    return clusters

def get_best_clustering_model(X, max_number_of_clusters, title):
    cost = []
    KK = range(1,max_number_of_clusters+1)
    kms = []
    # calculate all the clustering and cost
    for no_of_clusters in KK:
        km = KMeans(n_clusters=no_of_clusters, precompute_distances = True, max_iter = 500, n_init = 30)
        km.fit(X)
        kms.append(km)

        sizes = [0]*no_of_clusters
        for i in km.predict(X): 
            if(i >= no_of_clusters):
                print i
            sizes[i] = sizes[i]+1
        print (sizes)

        cost.append(km.inertia_)

    # calculate first derivative
    derivative1 = [cost[i+1]-cost[i] for i in range(len(cost)-1)]
    #print "d1", derivative1

    # calculate second derivative
    derivative2 = [derivative1[i+1]-derivative1[i] for i in range(len(derivative1)-1)]
    #print "d2", derivative2

    max2 = argrelextrema(np.argsort(derivative2), np.less) 
    num_clusters = 4 
    #print "max2", max2
    if(len(max2[0]) > 0):
        num_clusters = max2[0][0] + 3
    else:
        # calculate third derivative
        derivative3 = [derivative2[i+1]-derivative2[i] for i in range(len(derivative2)-1)]
        #print derivative3

        max3 = argrelextrema(np.argsort(derivative3), np.greater) 
        if(len(max3[0]) > 0):
            num_clusters = max3[0][0] + 4 

    model = kms[num_clusters-1]
    
    # plot costs
    plot_costs(cost, model.n_clusters, "Cost of k-Means." + title)

    clusters = model.predict(X)
    plot_clusters(clusters, model.n_clusters, title)
    return clusters, model.n_clusters, cost


import plotly
plotly.offline.init_notebook_mode() # run at the start of every notebook

from plotly.plotly import iplot
from plotly.graph_objs import Scatter3d, Data, Marker
import plotly.graph_objs as go

def plot3(feature_indexes, X, clusters, selected_cluster, title = "Cluster"):
    clusters_plot = []
    num_clusters = max(clusters)+1
    for i in range(0, num_clusters):
        d = X[clusters == i,: ]
        cluster = Scatter3d(
            x=d[:,feature_indexes[0]],
            y=d[:,feature_indexes[1]],
            z=d[:,feature_indexes[2]],
            mode='markers',
            name = "All traffic" if i == 0 else "{} {}".format(title, i),
            marker=dict(
                color='rgb({}, {}, {})'.format(palette[i][0]*255,palette[i][1]*255,palette[i][2]*255 ),
                size=12,
                line=dict(
                    color='rgb(204, 204, 204)',
                    width=0.0
                ),
                opacity=0.2
            )
        )
        clusters_plot.append(cluster)

    data = Data(clusters_plot)
    bk_color = "rgb(224, 224, 224)"
    layout = go.Layout(
        margin=dict(l=0, r=0, b=0,t=60),
        title='', 
        height = 1000,
        width = 1000,
        legend=dict(
            #x=0,
            #y=1,
            #traceorder='normal',
            font=dict(
                family='sans-serif',
                size=16,
                color='#000'
            ),
            bgcolor='#E2E2E2',
            bordercolor='#FFFFFF',
            borderwidth=2
        ),
        scene=go.Scene(
            xaxis=dict(
                title = features[feature_indexes[0]],
    showbackground=True, # (!) show axis background
    backgroundcolor=bk_color, # set background color to grey
    gridcolor="rgb(255, 255, 255)",       # set grid line color
    zerolinecolor="rgb(255, 255, 255)",   # set zero grid line color
           ),
            yaxis=dict(
                 title = features[feature_indexes[1]],
    showbackground=True, # (!) show axis background
    backgroundcolor=bk_color, # set background color to grey
    gridcolor="rgb(255, 255, 255)",       # set grid line color
    zerolinecolor="rgb(255, 255, 255)",   # set zero grid line color
            ),
            zaxis=dict(
                 title = features[feature_indexes[2]],
    showbackground=True, # (!) show axis background
    backgroundcolor=bk_color, # set background color to grey
    gridcolor="rgb(255, 255, 255)",       # set grid line color
    zerolinecolor="rgb(255, 255, 255)",   # set zero grid line color
            )
        ),
    )
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig)


def plot_intersection(clusters, num_clusters, id_incident, ips, id_incident2, cluster2 = -1):
    clusters_np = np.array(clusters)
    ips_np = np.array(ips)
    ips2 = set(tools.get_ips(id_incident2, cluster2))
    d = {}
    d["Cluster"] = []
    d["Incident"] = []
    d["data"] = []
    percentages = []
    intersections = []
    
    for cluster in range(0, num_clusters):
        d["Cluster"].append(cluster)
        d["Incident"].append("Unique from incident {}".format(id_incident))
        cluster_ips = set(ips_np[clusters_np == cluster])
        intersection = len(ips2.intersection(cluster_ips))
        intersections.append(intersection)
        d["data"].append(len(cluster_ips)-intersection)
        if(len(cluster_ips) == 0):
            percentages.append(0)
        else:
            percentages.append(intersection*100.0/len(cluster_ips))
        
    for cluster in range(0, num_clusters):
        d["Cluster"].append(cluster)
        d["Incident"].append("Intersection with incident {}".format(id_incident2))
        d["data"].append(intersections[cluster])
        
    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='Incident',legend='top_right', 
          title = "Intersection. Incident {} vs. Incident {} (cluster={})".format(id_incident, id_incident2, cluster2) ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)   

def plot_countries(clusters, num_clusters, sessions, num_countries = 10):
    countries = tools.get_countries()
    ids = np.array([s['id_country'] for s in sessions])
    #first find the best countries 
    if(num_countries > len(countries)):
        num_countries = len(countries)
      
    # find the most ccountries count in total
    freq = itemfreq(ids)
    sorted_countries = sorted(freq, key=lambda k: k[1], reverse=True) 
    best_countries = sorted_countries[0:num_countries]
    codes = []
    for i in range(0,len(best_countries)):
        c = (item for item in countries if item["id"] == best_countries[i][0]).next()
        codes.append(c)

    # calculate best countries count per cluster
    clusters_np = np.array(clusters)
    d = {}
    d["Cluster"] = []
    d["Country"] = []
    d["data"] = []
    freqs= []
    for cluster in range(0, num_clusters):
        ids_cluster = ids[clusters_np == cluster]
        freqs.append(itemfreq(ids_cluster))
        
    for i in range(0,len(best_countries)):
        for cluster in range(0, num_clusters):
            d["Cluster"].append(cluster)
            d["Country"].append(codes[i]["name"])
            exists = False
            for f in freqs[cluster]:
                if (f[0] == best_countries[i][0]):
                    d["data"].append(f[1])
                    exists = True
                    break
            if (not exists) :
                d["data"].append(0)

    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='Country',legend='top_right', 
          title = "Countries" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)   
    
def plot_ban(clusters, num_clusters, sessions):
    clusters_np = np.array(clusters)
    bans = np.array([s['ban'] for s in sessions])
    d = {}
    d["Cluster"] = []
    d["Ban"] = []
    d["data"] = []
    banned = []
    percentage = []
    
    for cluster in range(0, num_clusters):
        d["Cluster"].append(cluster)
        d["Ban"].append("Served")
        cluster_total = bans[clusters_np == cluster]
        cluster_banned = cluster_total[cluster_total==1]
        banned.append(cluster_banned.shape[0])
        if (cluster_total.shape[0] == 0):
            p = 0
        else:
            p = float("{0:.2f}".format(cluster_banned.shape[0]*100.0/cluster_total.shape[0]))
        percentage.append(p)
        d["data"].append(cluster_total.shape[0]-cluster_banned.shape[0])
          
    for cluster in range(0, num_clusters):
        d["Cluster"].append(cluster)
        d["Ban"].append("Banned")
        d["data"].append(banned[cluster])

    df=pd.DataFrame(d)
    p=Bar(df,label='Cluster',values='data',stack='Ban',legend='top_right', 
          title = "Banjax Ban" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)   
    print banned
    print percentage
    

def get_countries(incidents, num_countries = 25):
    countries = tools.get_countries()
    ids = []
    for incident in incidents:
        for s in incident['sessions']:
            if(s['attack'] > 0) :
                ids.append(s['id_country'])
    ids = np.array(ids)
    #first find the best countries 
    if(num_countries > len(countries)):
        num_countries = len(countries)
      
    # find the most ccountries count in total
    freq = itemfreq(ids)
    sorted_countries = sorted(freq, key=lambda k: k[1], reverse=True) 
    best_countries = sorted_countries[0:num_countries]
    codes = []
    for i in range(0,len(best_countries)):
        c = (item for item in countries if item["id"] == best_countries[i][0]).next()
        codes.append({ "count": best_countries[i][1], "name": c['name'], "id": c["id"]})
        #codes.append([best_countries[i][1], c['name']])
    #for c in codes:
    #    print c
    return codes

def plot_attack_countries(id_incidents, num_countries = 10):
    
    attacks = tools.get_attacks(id_incidents)
    
    countries = tools.get_countries()
    
    sql_where = "where id_incident in ("
    for id in id_incidents:
        sql_where = sql_where + "{},".format(id)
    sql_where = sql_where[:-1]
    sql_where += ")"
    
    
    tools.cur.execute("select distinct id_country, count(id_country) as count from sessions " + sql_where +
    " group by id_country order by count desc limit {}".format(num_countries))
    best_countries = tools.cur.fetchall()

    codes = []
    for i in range(0,len(best_countries)):
        c = (item for item in countries if item["id"] == best_countries[i]["id_country"]).next()
        codes.append(c)

    # calculate best countries count per incident
    d = {}
    d["Attack"] = []
    d["Country"] = []
    d["data"] = []
    freqs= []
    for attack in attacks:
        tools.cur.execute("select distinct id_country, count(id_country) as count from sessions " + sql_where +
        " and attack={} group by id_country order by count desc".format(attack["id"]))
        freqs.append(tools.cur.fetchall())
        
    for i in range(0,len(best_countries)):
        for index_attack in range(0, len(attacks)):
            d["Attack"].append(attacks[index_attack]["id"])
            d["Country"].append(codes[i]["name"])
            exists = False
            for f in freqs[index_attack]:
                if (f["id_country"] == best_countries[i]["id_country"]):
                    d["data"].append(f["count"])
                    exists = True
                    break
            if (not exists) :
                d["data"].append(0)

    df=pd.DataFrame(d)
    p=Bar(df,label='Attack',values='data',stack='Country',legend='top_right', 
          title = "Countries" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)  
    
    for i in range(0, len(attacks)):
        print "____________________ Attack {} _____________________".format(attacks[i]["id"])
        for c in range(0, len(freqs[i])):
            if c > 10:
                continue;
            country = (item for item in countries if item["id"] == freqs[i][c]["id_country"]).next()
            print "{} : {}".format(country["name"], freqs[i][c]["count"])

def plot_incident_countries(incidents, num_countries = 10):
    countries = tools.get_countries()
    ids = []
    for incident in incidents:
        for s in incident['sessions']:
            if(s['attack'] > 0) :
                ids.append(s['id_country'])
    ids = np.array(ids)
    #first find the best countries 
    if(num_countries > len(countries)):
        num_countries = len(countries)
      
    # find the most ccountries count in total
    freq = itemfreq(ids)
    sorted_countries = sorted(freq, key=lambda k: k[1], reverse=True) 
    best_countries = sorted_countries[0:num_countries]
    codes = []
    for i in range(0,len(best_countries)):
        c = (item for item in countries if item["id"] == best_countries[i][0]).next()
        codes.append(c)

    # calculate best countries count per incident
    d = {}
    d["Incident"] = []
    d["Country"] = []
    d["data"] = []
    freqs= []
    for incident in incidents:
        ids_incident = []
        for s in incident['sessions']:
            if(s['attack'] > 0) :
                ids_incident.append(s['id_country'])
        freqs.append(itemfreq(ids_incident))
        
    for i in range(0,len(best_countries)):
        for index_incident in range(0, len(incidents)):
            d["Incident"].append(incidents[index_incident]["id"])
            d["Country"].append(codes[i]["name"])
            exists = False
            for f in freqs[index_incident]:
                if (f[0] == best_countries[i][0]):
                    d["data"].append(f[1])
                    exists = True
                    break
            if (not exists) :
                d["data"].append(0)

    df=pd.DataFrame(d)
    p=Bar(df,label='Incident',values='data',stack='Country',legend='top_right', 
          title = "Countries" ,
         ylabel = "IP sessions", plot_width=1000, plot_height=600)
    bk.show(p)   

def bar_plot(data, x_label, y_label, title):
   
    keys = []
    for d in data:
        keys = keys + d["values"].keys()
    keys = set(keys)

    d = {}
    d[x_label] = []
    d["legend"] = []
    d["data"] = []
    
    for x in data:    
        for key in keys:
            d[x_label].append(x["x"])
            d["legend"].append(key)
            d["data"].append(x["values"][key] if key in x["values"] else 0)
    df=pd.DataFrame(d)
    p=Bar(df,label='Incident',values='data',stack="legend",legend='top_right', 
        title = title,
        ylabel = y_label, plot_width=1000, plot_height=1000)
    bk.show(p)   




JupyterApp._config_dir_default is deprecated: use @default decorator instead.


EnableNBExtensionApp._config_file_name_default is deprecated: use @default decorator instead.


JupyterApp._data_dir_default is deprecated: use @default decorator instead.


JupyterApp._jupyter_path_default is deprecated: use @default decorator instead.


JupyterApp._log_level_default is deprecated: use @default decorator instead.


JupyterApp._runtime_dir_default is deprecated: use @default decorator instead.


ConfigManager._config_dir_default is deprecated: use @default decorator instead.


ConfigManager._config_dir_default is deprecated: use @default decorator instead.



# Configuration

In [70]:
#id_incidents = [27,19] # Kotsubynske
#id_incidents = [24,25,26,19,27] # Kotsubynske

id_incidents = [29,30,31,32,33,34,42]
#id_incidents = [34]

# Read Data

In [71]:
# Reading from Database
incidents = []
for id in id_incidents:
    print "Indicent", id, "loading..."
    incident = {}
    incident["id"] = id
    incident["sessions"] = tools.get_sessions(id)
    incident["incident"] = tools.get_incident(id)[0]
    incidents.append(incident)
    print "total sessions", len(incident['sessions'])
print "Done."

Indicent 29 loading...
total sessions 22914
Indicent 30 loading...
total sessions 12352
Indicent 31 loading...
total sessions 7758
Indicent 32 loading...
total sessions 3916
Indicent 33 loading...
total sessions 9670
Indicent 34 loading...
total sessions 8760
Indicent 42 loading...
total sessions 12352
Done.


# Countries by attack

In [None]:
plot_attack_countries(id_incidents)



Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead


Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead



# Countries by Incident

In [27]:
plot_incident_countries(incidents)
get_countries(incidents)



Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead


Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead




Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._iopub_socket_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.



[{'count': 12085, 'id': 11L, 'name': 'Russian Federation'},
 {'count': 12058, 'id': 2L, 'name': 'United States'},
 {'count': 5196, 'id': 6L, 'name': 'Ukraine'},
 {'count': 3577, 'id': 53L, 'name': 'China'},
 {'count': 2995, 'id': 8L, 'name': 'Germany'},
 {'count': 2584, 'id': 15L, 'name': 'United Kingdom'},
 {'count': 1817, 'id': 20L, 'name': 'France'},
 {'count': 1684, 'id': 5L, 'name': 'Netherlands'},
 {'count': 1517, 'id': 75L, 'name': 'Lithuania'},
 {'count': 980, 'id': 40L, 'name': 'Switzerland'},
 {'count': 880, 'id': 145L, 'name': 'Gibraltar'},
 {'count': 866, 'id': 13L, 'name': 'Turkey'},
 {'count': 844, 'id': 36L, 'name': 'Japan'},
 {'count': 817, 'id': 17L, 'name': None},
 {'count': 655, 'id': 18L, 'name': 'Poland'},
 {'count': 638, 'id': 42L, 'name': 'Canada'},
 {'count': 595, 'id': 88L, 'name': 'Singapore'},
 {'count': 540, 'id': 29L, 'name': 'Spain'},
 {'count': 535, 'id': 4L, 'name': 'Italy'},
 {'count': 519, 'id': 60L, 'name': 'Australia'},
 {'count': 481, 'id': 49L, 'na

# User Agents

In [218]:
        def plot_user_agents(id_incidents):
    data = []

    for id in id_incidents:
        v = {}
        sql = "select " \
        "count(user_agents.ua) as ua_count,"\
        "user_agents.ua,"\
        "user_agents.device_family "\
        "from sessions, session_user_agent, user_agents "\
        "where sessions.id = session_user_agent.id_session "\
        "and user_agents.id = session_user_agent.id_user_agent "\
        "and sessions.id_incident = {} "\
        "and sessions.attack >0 "\
        "group by user_agents.ua "\
        "order by ua_count desc".format(id)
        print sql
        tools.cur.execute(sql)
        count = 0
        for elem in tools.cur.fetchall():
            count = count + 1
            if(count > 10):
                break
            v[elem["ua"]] = elem["ua_count"]
        
        data.append({"x":id, "values" : v if count > 0 else {"1":0} })
    
    print data

    bar_plot(data, "Incident", "UA portion", "User Agents distribution")

In [213]:
plot_user_agents(id_incidents)

select count(user_agents.ua) as ua_count,user_agents.ua,user_agents.device_family from sessions, session_user_agent, user_agents where sessions.id = session_user_agent.id_session and user_agents.id = session_user_agent.id_user_agent and sessions.id_incident = 37 and sessions.attack >0 group by user_agents.ua order by ua_count desc
select count(user_agents.ua) as ua_count,user_agents.ua,user_agents.device_family from sessions, session_user_agent, user_agents where sessions.id = session_user_agent.id_session and user_agents.id = session_user_agent.id_user_agent and sessions.id_incident = 42 and sessions.attack >0 group by user_agents.ua order by ua_count desc
[{'x': 37, 'values': {'1': 0}}, {'x': 42, 'values': {'1': 0}}]



Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead


Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead




Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._iopub_socket_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.



# Hit rate

In [220]:
# hit rate
hit_rate = []
hit_rate_ua = []
#ua = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"
for incident in incidents:
    for s in incident['sessions']:
        v = s['request_interval']
        if(v == 1800):
            continue
        if(v != 0):
            v = 60.0/v
        #if(s['ua'] == ua) :
        if(s['attack'] == 1) :
            hit_rate_ua.append(v)
        else:
            hit_rate.append(v)
            
trace_other = go.Box(
    y = hit_rate,
    boxpoints='all',
    jitter=0.5,
    name='Others',
    pointpos=-1.8
)

trace_ua = {}
trace_ua = go.Box(
    y = hit_rate_ua,
    boxpoints='all',
    jitter=0.5,
    name='Bots IPs',
    pointpos=-1.8
)

data = Data([trace_other, trace_ua])
layout = go.Layout(
    showlegend=False,
    height = 900,
    title='Hit rate of bots',
    xaxis=go.XAxis(
        showgrid=True,
        showline=True,
        ticks=''
    ),
    yaxis=go.YAxis(
        showline=True,
        ticks='',
        zeroline=True,
        range = [0,300],
        title = "Hit rate/minute"
    )
)

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)


# Total Attacks Scatter Plot

In [221]:
# Total attack
features = [
    "request_interval", #1
    "ua_change_rate",#2
    "html2image_ratio",#3
    "variance_request_interval",#4
    #"payload_average",#5
    "error_rate",#6
    #"request_depth",#7
    "request_depth_std",#8
    "session_length",#9
    #"percentage_cons_requests",#10
]

values = []
incident_indexes = []
i = 1
for incident in incidents:
    for s in incident['sessions']:
        
        row = []
        for f in features:
            row.append(s[f])
            
        row.append(s['attack'])
        if(s['attack'] == 0):
            incident_indexes.append(0) 
        else:
            incident_indexes.append(1) 
        
        values.append(row)
    i = i + 1
    
X = np.array(values)
incident_indexes = np.array(incident_indexes)
X.shape

(15618, 8)

In [222]:
plot3([3,2,5], X, incident_indexes, -1, "Attack ")

# Response histogram

In [223]:
#kotsubynske
data = []
data.append({"x" : 1, "values" : {"200" : 121638, "404" : 31436, "503" : 7112}})
data.append({"x" : 2, "values" : {"200" : 35853 , "403" : 138}})
data.append({"x" : 3, "values" : {"200" : 23472  , "403" : 531, "404" : 25168 }})
data.append({"x" : 4, "values" : {"200" : 16677   , "503" : 22792 , "404" : 1495610  }})
data.append({"x" : 5, "values" : {"200" : 347919    , "403" : 1570  , "404" : 42429   }})

bar_plot(data, "Incident", "Hits", "Responce code distribution")


Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead


Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead




Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._iopub_socket_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.



# Spider devices in bdsmovement

In [224]:
data = []
data.append({"x" : 1, "values" : {"Other" : 712, " Spider" : 12}})
data.append({"x" : 2, "values" : {"Other" : 673, " Spider" : 11}})
data.append({"x" : 3, "values" : {"Other" : 2289, " Spider" : 2229}})
data.append({"x" : 4, "values" : {"Other" : 12974, " Spider" : 12869}})
data.append({"x" : 5, "values" : {"Other" : 5251, " Spider" : 5014}})
data.append({"x" : 6, "values" : {"Other" : 3074, " Spider" : 3019}})
for d in data:
    d["values"]["Other"] = d["values"]["Other"] - d["values"][" Spider"]

print 23131 * 100.0 / (2289+12974+5251+3074)

bar_plot(data, "Incident", "Number of IPs", "Portion of Spider devices")

98.0625741903



Setting a fixed font size value as a string '10pt' is deprecated, set with value('10pt') or ['10pt'] instead


Setting a fixed font size value as a string '14pt' is deprecated, set with value('14pt') or ['14pt'] instead




Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._iopub_socket_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.


Comm._kernel_default is deprecated: use @default decorator instead.


Comm._comm_id_default is deprecated: use @default decorator instead.


Comm._session_default is deprecated: use @default decorator instead.


Comm._topic_default is deprecated: use @default decorator instead.



# Intersection BTSELEM vs BDSMOVEMENT

In [225]:
data = []
data.append({"x" : 1, "values" : {"Bdsmovement.org" : 713, " Identical to Btselem.org" : 12}})
data.append({"x" : 2, "values" : {"Bdsmovement.org" : 673, " Identical to Btselem.org" : 11}})
data.append({"x" : 3, "values" : {"Bdsmovement.org" : 2289, " Identical to Btselem.org" : 2229}})
data.append({"x" : 4, "values" : {"Bdsmovement.org" : 12974, " Identical to Btselem.org" : 12869}})
data.append({"x" : 5, "values" : {"Bdsmovement.org" : 5251, " Identical to Btselem.org" : 5014}})
data.append({"x" : 6, "values" : {"Bdsmovement.org" : 3074, " Identical to Btselem.orgr" : 3019}})
for d in data:
    d["values"]["Bdsmovement.org"] = d["values"]["Bdsmovement.org"] - d["values"][" Identical to Btselem.org"]


bar_plot(data, "Incident", "Number of IPs", "Bdsmovement.org incidents with highlited identical IPs with Btselem.org incident")

KeyError: ' Identical to Btselem.org'