### Algorithm implemented :
The following algorithm would be implemented in this Unsupervised Anomaly Detection on the retail stores dataset: 
Markov Chain

#### Use for sequential anomalies (ordered)
We need discretize the data points in define states for markov chain. We will just take 'value' to define state for this example and define 5 levels of values of Markdowns (very low, low, average, high, very high)/(VL, L, A, H, VH). Markov chain will calculate the probability of sequence like (VL, L, L, A, A, L, A). If the probability is very weak we consider the sequence as an anomaly.

In [1]:
# libraries
#%matplotlib notebook

import pandas as pd
import numpy as np

import matplotlib
import seaborn
import matplotlib.dates as md
from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.covariance import EllipticEnvelope
from pyemma import msm
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

In [2]:
# some function for later

# return Series of distance between each point and his distance with the closest centroid
def getDistanceByPoint(data, model):
    distance = pd.Series()
    for i in range(0,len(data)):
        Xa = np.array(data.loc[i])
        Xb = model.cluster_centers_[model.labels_[i]-1]
        distance.set_value(i, np.linalg.norm(Xa-Xb))
    return distance

In [3]:
# train markov model to get transition matrix
def getTransitionMatrix (df):
    df = np.array(df)
    model = msm.estimate_markov_model(df, 1)
    return model.transition_matrix

In [4]:
def markovAnomaly(df, windows_size, threshold):
    transition_matrix = getTransitionMatrix(df)
    real_threshold = threshold**windows_size
    df_anomaly = []
    for j in range(0, len(df)):
        if (j < windows_size):
            df_anomaly.append(0)
        else:
            sequence = df[j-windows_size:j]
            sequence = sequence.reset_index(drop=True)
            df_anomaly.append((sequence, real_threshold, transition_matrix))
    return df_anomaly

In [5]:
# load the master_dataset.xls
df = pd.read_excel('master_dataset.xlsx', sheetname='Sheet1')

In [6]:
# check the markdowns mean
print(df['MarkDown1'].mean())
print(df['MarkDown2'].mean())
print(df['MarkDown3'].mean())
print(df['MarkDown4'].mean())
print(df['MarkDown5'].mean())

8887.617797313967
6107.224317460187
928.7852197802101
3130.1765555554607
4544.031686202577


#### Markov Chains for Markdown 1 sales

In [7]:
# definition of the different state
x1 = (df['MarkDown1'] <=4444).astype(int)
x2= ((df['MarkDown1'] > 4444) & (df['MarkDown1']<=8888)).astype(int)
x3 = ((df['MarkDown1'] > 8888) & (df['MarkDown1']<=13332)).astype(int)
x4 = ((df['MarkDown1'] > 13332) & (df['MarkDown1']<=17776)).astype(int)
x5 = (df['MarkDown1'] >17776).astype(int)
df_mm = x1 + 2*x2 + 3*x3 + 4*x4 + 5*x5

In [8]:
# getting the anomaly labels for our dataset (evaluating sequence of 5 values and anomaly = less than 20% probable)
df_anomaly = markovAnomaly(df_mm, 5, 0.20)
df_anomaly = pd.Series(df_anomaly)
print(df_anomaly.value_counts())

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     5
([1, 1, 1, 1, 1], 0.0003200000000000001, [[0.811656236297, 0.121094641112, 0.0356643947863, 0.0149702229762, 0.0166145048287], [0.211386666503, 0.592422502307, 0.111818786985, 0.0421870340209, 0.0421850101839], [0.0602813271488, 0.108270478853, 0.770382697427, 0.0255371923928, 0.0355283041776], [0.0905276992505, 0.146143512556, 0.0913646964696, 0.586481114515, 0.085482977209], [0.0457805697744, 0.0665885079555, 0.0579189023944, 0.0389511439298, 0.790760875946]])    1
([1, 1, 1, 1, 2], 0.0003200000000000001, [[0.81165623629