# Unsupervised anomaly detection with One-Class SVM

## Setup

In [None]:
import numpy as np
import pandas as pd

import sqlite3

# Read every 2018 log entry into a dataframe indexed by the parsed `datetime`
# column.
#
# `with sqlite3.connect(...)` only scopes the transaction (commit/rollback);
# it does not close the connection, so we close it explicitly once the query
# has been read.
#
# The date bounds use single quotes because SQLite treats double-quoted
# tokens as identifiers and only falls back to strings for compatibility.
# NOTE(review): BETWEEN is inclusive on both ends, so a row stamped exactly
# '2019-01-01' would also match -- confirm that is intended.
conn = sqlite3.connect('../../ch_11/logs/logs.db')
try:
    logs_2018 = pd.read_sql(
        """
        SELECT * 
        FROM logs 
        WHERE datetime BETWEEN '2018-01-01' AND '2019-01-01';
        """, 
        conn, parse_dates=['datetime'], index_col='datetime'
    )
finally:
    conn.close()
logs_2018.head()

The `get_X()` function from the chapter:

In [None]:
def get_X(log, day):
    """
    Build per-minute failed-login features for the requested time slice.

    Rows where `1 - success` is positive are kept, grouped into 1-minute
    bins, and summarized as the number of distinct usernames per bin. The
    day of week and hour of each bin are then one-hot encoded.

    Parameters:
        - log: The logs dataframe; it is sliced with `.loc[day]` and
               resampled, and its `username` and `success` columns are
               read (assumes a datetime index and 0/1-like `success`
               values -- confirm against the data)
        - day: A day or single value we can use as a datetime index slice

    Returns: 
        A `pandas.DataFrame` object with a `usernames_with_failures`
        column plus dummy columns for `day_of_week` and `hour`
    """
    # flag each attempt in the slice, then keep only the failed ones
    attempts = log.loc[day].assign(
        failures=lambda frame: 1 - frame['success']
    )
    failed_attempts = attempts.loc[attempts['failures'] > 0]

    # summarize the failed attempts minute by minute
    per_minute = failed_attempts.resample('1min').agg(
        {'username': 'nunique', 'failures': 'sum'}
    ).dropna()

    # only the distinct-username count is kept as a feature
    features = per_minute.rename(
        columns={'username': 'usernames_with_failures'}
    ).drop(columns=['failures'])

    # add the calendar fields of each bin so they can be one-hot encoded
    features = features.assign(
        day_of_week=features.index.dayofweek,
        hour=features.index.hour
    )
    return pd.get_dummies(features, columns=['day_of_week', 'hour'])

Get January 2018 data:

In [None]:
# Build the feature matrix from the January 2018 slice of the logs and
# show which feature columns were produced.
X = get_X(log=logs_2018, day='2018-01')
X.columns

## One-Class SVM
Train the model (a pipeline that standardizes the features and then fits a One-Class SVM):

In [None]:
from sklearn.svm import OneClassSVM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features, then fit a One-Class SVM with its default
# hyperparameters on the full feature matrix.
one_class_svm_pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('svm', OneClassSVM()),
])
# trailing semicolon keeps the fitted pipeline's repr out of the cell output
one_class_svm_pipeline.fit(X);

Get predictions:

In [None]:
# Score the same data the pipeline was fitted on, then count the labels:
# a prediction of -1 is reported as an outlier, anything else as an inlier.
preds = one_class_svm_pipeline.predict(X)
pd.Series(
    np.where(preds != -1, 'inlier', 'outlier')
).value_counts()