# Outlier detection with LOF (Local Outlier Factor)

Based on the clickstream event frequency pattern in Q2Q3_input.csv, apply the LOF algorithm to
calculate the LOF score of each point under each of the following settings:
1. k = 2 with Manhattan distance.
2. k = 3 with Euclidean distance.
3. For each setting, report the top 5 outliers (the points with the highest LOF).


In [1]:
#python 2.7
%matplotlib inline
%pylab inline
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.spatial.distance import pdist, squareform

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the clickstream event-frequency table (Q2Q3_input.csv) directly from
# GitHub; this cell needs network access.
data_input = pd.read_csv('https://raw.githubusercontent.com/gulabpatel/Statistics/master/Dataset/Q2Q3_input.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
data_input.head()

Unnamed: 0,user_id,load_video,pause_video,play_video,seek_video,speed_change_video,stop_video
0,0,2.0,1.0,4.0,1.0,0.0,1.0
1,1,6.0,14.0,14.0,0.0,0.0,1.0
2,2,1.0,0.0,0.0,0.0,0.0,0.0
3,3,2.0,2.0,2.0,0.0,0.0,1.0
4,4,1.0,3.0,22.0,18.0,0.0,0.0


In [4]:
# Value at row position 1, column position 1, read with one positional lookup.
# Chaining [1] on the row Series instead relies on integer keys being treated
# as positions on a label-indexed Series, which recent pandas deprecates.
data_input.iloc[1, 1]

6.0

In [5]:
#Reachdist function
def reachdist(distance_df, observation, index):
    """Look up the stored distance between two points.

    Despite the name, no reachability maximum is taken here; the entry of
    ``distance_df`` is returned unchanged.

    Parameters
    ----------
    distance_df : container indexed first by ``observation``, then by ``index``
        For example a square pandas DataFrame of pairwise distances, where
        the first key selects a column and the second a row label.
    observation : key of the point the distance is measured from.
    index : key of the other point.

    Returns
    -------
    The value stored at ``distance_df[observation][index]``.
    """
    distances_from_observation = distance_df[observation]
    return distances_from_observation[index]

In [6]:
#LOF algorithm implementation from scratch
def LOF_algorithm(data_input, distance_metric="cityblock", p=5, k=None):
    """Rank observations by their Local Outlier Factor (LOF).

    For every row of ``data_input`` the function computes the k-distance, the
    k-distance neighbourhood, the local reachability density (lrd) and finally
    the LOF, i.e. the mean lrd of the neighbours divided by the row's own lrd.

    Parameters
    ----------
    data_input : pandas.DataFrame
        One row per observation. Every column is passed to ``pdist`` as a
        feature, so identifier columns should be removed by the caller.
    distance_metric : str
        Metric name forwarded to ``scipy.spatial.distance.pdist``.
    p : int
        Number of top-scoring observations to return.
    k : int, optional
        Neighbourhood size. When omitted it falls back to the original
        assignment settings: 2 for ``"cityblock"`` and 3 for any other metric.

    Returns
    -------
    list of (int, float)
        At most ``p`` pairs ``(row_position, LOF)`` sorted by decreasing LOF.
        ``row_position`` is the 0-based position of the row in ``data_input``,
        not its index label.

    Raises
    ------
    ValueError
        If an explicit ``k`` smaller than 1 is given.
    IndexError
        If a point has fewer than ``k + 1`` usable distances (for example
        when ``data_input`` has ``k`` rows or fewer).
    """
    if k is None:
        # Backward-compatible default: k was historically tied to the metric.
        k = 2 if distance_metric == "cityblock" else 3
    elif k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))

    dist_matrix = squareform(pdist(data_input.values, metric=distance_metric))
    distance_df = pd.DataFrame(dist_matrix)
    observations = distance_df.columns

    # k-distance of every point, computed once. Each column holds the point's
    # own zero distance, so position k of the k + 1 smallest values is the
    # distance to its k-th nearest neighbour.
    k_distance = {}
    for observation in observations:
        k_distance[observation] = distance_df[observation].nsmallest(k + 1).iloc[k]

    neighbours = {}
    reach_total = {}
    lrd = {}
    for observation in observations:
        column = distance_df[observation]
        # Neighbourhood: every other point within the k-distance (ties kept).
        indexes = column[column <= k_distance[observation]].drop(observation).index
        # Reachability distance to a neighbour is the larger of that
        # neighbour's k-distance and the actual distance to it.
        total = sum(max(k_distance[index], column[index]) for index in indexes)
        neighbours[observation] = indexes
        reach_total[observation] = total
        # A zero total means the point sits on at least k exact duplicates;
        # its density is infinite rather than a division by zero.
        lrd[observation] = float("inf") if total == 0 else len(indexes) / total

    scores = []
    for observation in observations:
        indexes = neighbours[observation]
        total = reach_total[observation]
        if total == 0:
            # All neighbours are duplicates with infinite density as well, so
            # the density ratio is 1. Without this branch the score would be
            # inf * 0 = NaN, which makes the sort order unreliable.
            lof = 1.0
        else:
            # mean(lrd of neighbours) / lrd(observation), with
            # lrd(observation) = len(indexes) / total.
            lof = sum(lrd[index] for index in indexes) * total / len(indexes) ** 2
        scores.append((observation, lof))

    return sorted(scores, key=lambda item: item[1], reverse=True)[:p]

In [7]:
# Setting 1: Manhattan distance (the default metric, "cityblock"), for which
# LOF_algorithm uses k = 2. Returns the 5 highest-LOF rows as (row, LOF) pairs.
# NOTE(review): data_input is passed whole, so every column (apparently
# including user_id) contributes to the distances; confirm this is intended.
LOF_algorithm(data_input, p = 5)

[(19, 11.07),
 (525, 8.867228661749209),
 (66, 5.026785714285714),
 (638, 4.334727219682972),
 (177, 3.6292633292633294)]

In [8]:
# Setting 2: Euclidean distance, for which LOF_algorithm uses k = 3.
# Returns the 5 highest-LOF rows as (row, LOF) pairs.
LOF_algorithm(data_input, p = 5, distance_metric = 'euclidean')

[(638, 3.0800716645705695),
 (525, 3.010316256261629),
 (19, 2.8402916620868903),
 (66, 2.801410266169121),
 (65, 2.6456528412196416)]

----------------------

# Apply scikit-learn's LocalOutlierFactor

Documentation : https://scikit-learn.org/stable/modules/outlier_detection.html#isolation-forest

https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_anomaly_comparison.html

In [9]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor

In [10]:
# Work on a copy so the scikit-learn experiment leaves data_input untouched.
df = data_input.copy()

In [11]:
# Hold out 33% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test = train_test_split(df, test_size=0.33, random_state=42)

In [12]:
# novelty=True fits the model for novelty detection, which is what allows
# predict / decision_function to be called on rows not seen during fit.
# All other settings are library defaults (the repr below shows
# n_neighbors=20 and the Minkowski metric with p=2), not the k = 2 / k = 3
# settings used by the from-scratch implementation.
lof = LocalOutlierFactor(novelty=True)
lof.fit(X_train)



LocalOutlierFactor(algorithm='auto', contamination='legacy', leaf_size=30,
          metric='minkowski', metric_params=None, n_jobs=None,
          n_neighbors=20, novelty=True, p=2)

In [13]:
# Classify the held-out rows: inliers are labeled 1, outliers are labeled -1.
lof.predict(X_test)

array([-1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
       -1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,
        1,  1, -1, -1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1, -1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1,  1,  1,
        1,  1,  1,  1,  1

In [14]:
# Signed outlier score for each held-out row: negative values fall on the
# outlier side of the fitted threshold, positive values on the inlier side
# (larger means more normal).
lof.decision_function(X_test)

array([-0.22286463,  0.40121049,  0.39382115,  0.37299218,  0.39349483,
       -0.64078132,  0.39626584,  0.39433938,  0.38184346,  0.395146  ,
       -0.20723262,  0.37445958,  0.38395417,  0.33009596,  0.38503253,
       -0.15691907,  0.23951273,  0.39115323,  0.36645194,  0.34440098,
       -0.61824998,  0.37799249,  0.39715424,  0.40278667,  0.37995124,
        0.38409949,  0.37502625,  0.37463909,  0.37173208,  0.41110184,
        0.32771888,  0.39461345,  0.3877664 ,  0.41798382,  0.17272477,
        0.39190799,  0.39170294,  0.39037857, -0.12602858,  0.38612983,
        0.37923225,  0.32274755,  0.35989714,  0.38964145,  0.39841785,
        0.13624181,  0.40628758,  0.36318703,  0.33587816,  0.3746578 ,
       -0.30842027, -0.30322515,  0.38127225, -0.11446579,  0.35012343,
        0.39455884,  0.27578984,  0.29388924,  0.35252845,  0.38465442,
        0.30475718,  0.39427387,  0.37814206,  0.3240583 , -0.20092219,
        0.38624432, -0.0172637 ,  0.37628522,  0.39376907,  0.41

------------------------------