# Exploratory data analysis with labeled data
Now that we have the labels for our data, we can do some initial EDA to see if there is something different between the hackers and the valid users.

## Setup

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sqlite3

# NOTE: using a sqlite3 connection as a context manager only commits/rolls back
# the transaction -- it does NOT close the connection -- so we close it
# explicitly once both queries have run.
conn = sqlite3.connect('logs/logs.db')
try:
    # Rows of the `logs` table dated 2018, indexed by their parsed `datetime`.
    # SQL string literals use single quotes: SQLite treats double-quoted tokens
    # as identifiers and only falls back to strings as a legacy quirk.
    logs_2018 = pd.read_sql(
        "SELECT * FROM logs WHERE datetime BETWEEN '2018-01-01' AND '2019-01-01';",
        conn, parse_dates=['datetime'], index_col='datetime'
    )
    # Rows of the `attacks` table that started in 2018, plus derived columns:
    #   - duration: elapsed time between `start` and `end`
    #   - start_floor / end_ceil: the attack window widened outward to whole minutes
    hackers_2018 = pd.read_sql(
        "SELECT * FROM attacks WHERE start BETWEEN '2018-01-01' AND '2019-01-01';",
        conn, parse_dates=['start', 'end']
    ).assign(
        duration=lambda x: x.end - x.start,
        start_floor=lambda x: x.start.dt.floor('min'),
        end_ceil=lambda x: x.end.dt.ceil('min')
    )
finally:
    conn.close()
hackers_2018.head()

This function will tell us, for each datetime, whether it falls within a period of hacker activity:

In [None]:
def check_if_hacker(datetimes, hackers, resolution='1min'):
    """
    Check whether a hacker attempted a log in during that time.

    Parameters:
        - datetimes: The datetimes to check for hackers
        - hackers: The dataframe indicating when the attacks started and stopped.
          Must provide `start_floor` and `end_ceil` columns; every `resolution`
          step from `start_floor` through `end_ceil` counts as hacker activity.
        - resolution: The granularity of the datetime. Default is 1 minute.

    Returns:
        `pandas.Series` of Booleans, one per entry in `datetimes`. An entry is
        `True` only if it exactly equals one of the generated attack timestamps,
        so `datetimes` should be at the same resolution. All `False` when
        `hackers` has no rows.
    """
    # one series of timestamps per attack window
    attack_times = [
        pd.date_range(start, end, freq=resolution).to_series()
        for start, end in zip(hackers.start_floor, hackers.end_ceil)
    ]
    if not attack_times:
        # no attacks at all: nothing can match (and pd.concat rejects an empty list)
        return datetimes.isin([])
    # concatenate once rather than growing a series inside a loop
    return datetimes.isin(pd.concat(attack_times))

Let's label our data for Q1 so we can look for a separation boundary:

In [None]:
# Per-minute summary of failed log in attempts during Q1 2018:
#   - usernames_with_failures: number of distinct usernames with a failure
#   - failures: total number of failed attempts
users_with_failures = (
    logs_2018.loc['2018-Q1']
    .assign(failures=lambda df: 1 - df.success)  # presumably success is 0/1 -- TODO confirm
    .query('failures > 0')
    .resample('1min')
    .agg({'username': 'nunique', 'failures': 'sum'})
    .dropna()
    .rename(columns={'username': 'usernames_with_failures'})
)

# Flag each minute according to whether it matches hacker activity; the labels
# come back positionally aligned, so attach them by value rather than by index.
labels = check_if_hacker(
    users_with_failures.reset_index().datetime, hackers_2018
)
users_with_failures['flag'] = labels[:len(users_with_failures)].values
users_with_failures.head()

Since we have the labels, we can draw a sample boundary that would separate most of the hackers from the valid users:

In [None]:
# Scatter of distinct usernames with failures vs. total failures per minute,
# colored by whether the minute was flagged as hacker activity.
ax = sns.scatterplot(
    x=users_with_failures.usernames_with_failures,
    y=users_with_failures.failures,
    alpha=0.25,
    hue=users_with_failures.flag
)
ax.set_ylim(-4, None)

# illustrative straight-line boundary through (-2, 15) and (5, -2)
ax.plot([-2, 5], [15, -2], 'r--', label='sample boundary')

# Sort the legend entries by label text. Dedicated names are used here so the
# `labels` Series computed when flagging the data is not overwritten; the sort
# key is the label only, since the handles themselves cannot be compared.
legend_handles, legend_labels = ax.get_legend_handles_labels()
legend_labels, legend_handles = zip(
    *sorted(zip(legend_labels, legend_handles), key=lambda entry: entry[0])
)
ax.legend(legend_handles, legend_labels, title='flag')
ax.set_title('Usernames with failures on minute resolution');

<hr>
<div style="overflow: hidden; margin-bottom: 10px;">
    <div style="float: left;">
        <a href="./2-unsupervised_anomaly_detection.ipynb">
            <button>&#8592; Previous Notebook</button>
        </a>
    </div>
    <div style="float: right;">
        <a href="./4-supervised_anomaly_detection.ipynb">
            <button>Next Notebook &#8594;</button>
        </a>
    </div>
</div>
<hr>