In [89]:
import numpy as np
import math

In [59]:
#function used to parse the csv file to array
def get_data(file="click-stream event.csv", columns = range(1,7)):
    return np.loadtxt(file,delimiter=",",skiprows=1, usecols=columns)

In [60]:
#function to obtain the manhatten distance of two points
def manhatten_distance(p1, p2):
    return np.sum(np.square(p1 - p2))

In [61]:
# function to obtain the euclidean distance of two points
def euclidean_distance(p1, p2):
    return np.sum(np.absolute(p1 - p2))**(1/2)

In [62]:
#function that outputs distance array between all points
def distance_of_points(data, manhatten):
    distance_array = np.zeros((len(data),len(data)))
    for i in range(len(data)):
        for j in range(i,len(data)):
            distance = manhatten_distance(data[i],data[j]) if manhatten else euclidean_distance(data[i],data[j])
            distance_array[i][j] = distance
            distance_array[j][i] = distance
    return distance_array

In [63]:
#function that outputs kth nearest neighbor distance
def  kth_nearest_distance(distance_array, k):
    list = []
    for row in distance_array:
        row.sort()
        list.append(row[k])
    return list

In [64]:
#function that outputs kth  nearest neighbor list
def  kth_nearest(distance_array, kth):
    list = []
    for i in range(len(distance_array)):
        temp = []
        for j in range(len(distance_array[i])):
            if distance_array[i][j]<=kth[i] and i!=j:
                temp.append(j)
        list.append(temp)
    return list

In [65]:
# function that outputs lrdk
def lrdk_per_row(kth, kth_nearest_row, distance_array, index):
    reachdistsum = 0
    for i in kth_nearest_row:
        reachdistsum += reachdist(i, index, kth, distance_array)
    return len(kth_nearest_row)/reachdistsum

In [66]:
# function that outputs reach distance
def reachdist(p1, p2, kth, distance_array):
    return max(kth[p1],distance_array[p1][p2])

In [67]:
def lrdk (kth, kth_nearest_array, distance_array):
    lrdk_array = []
    for  i in range(len(kth_nearest_array)):
        lrdk_array.append(lrdk_per_row(kth, kth_nearest_array[i], distance_array, i))
    return lrdk_array

In [68]:
# function that outputs lof
def lof(kth_nearest_array, lrdk_array):
    list = []
    for i in range(len(lrdk_array)):
        sumoflrdk = 0
        for j in kth_nearest_array[i]:
            sumoflrdk += lrdk_array[j]/lrdk_array[i]
        list.append(sumoflrdk/len(kth_nearest_array[i]))
    return list

In [69]:
# function  that finds top 10 biggest lof
def top10(lof_array):
    lof_arr = np.array(lof_array)
    top = np.argsort(lof_arr)
    return top[-10:]

In [70]:
data = get_data()

In [71]:
distance_array = distance_of_points(data, False)

In [72]:
kth =  kth_nearest_distance(distance_array, 2)

In [73]:
kth_nearest_array = kth_nearest(distance_array, kth)

In [74]:
lrdk_array = lrdk(kth, kth_nearest_array, distance_array)

In [75]:
lof_array = lof(kth_nearest_array, lrdk_array)

In [76]:
top10(lof_array)

array([679, 682, 684, 691, 687, 688, 693, 689, 692, 694])

# Scikitlearn

In [22]:
from sklearn.neighbors import LocalOutlierFactor

In [58]:
clf = LocalOutlierFactor(n_neighbors=2,metric='manhattan')

In [61]:
y_pred = clf.fit_predict(get_data())

# Cell-Based

In [130]:
def cell_dimensions(data, d):
    d = d/2/2**(1/2)
    x, y = np.max(data,axis=0)
    x_dim = math.ceil(x/d)
    y_dim = math.ceil(y/d)
    return x_dim, y_dim

In [147]:
def cell_and_count(data, d):
    d = d/2/2**(1/2)
    x_dim, y_dim = cell_dimensions(data, d)
    cell = [[[] for i in range(y_dim)] for i in range(x_dim)]
    for i in range(len(data)):
        x = int(data[i][0]/d)
        y = int(data[i][1]/d)
        if x == x_dim:
            x -= 1
        if y == y_dim:
            y -= 1
        cell[x][y].append(i)
    count = np.zeros((x_dim, y_dim))
    print(count.shape)
    return cell

In [148]:
def label_red (cell):
#   0: white 1: pink 2:red
    labels = np.zeros()

In [149]:
data = get_data(columns=[2,3])

In [150]:
cell = cell_and_count(data,2)

(293, 937)


In [151]:
cell

[[[2, 12, 14, 16, 21, 59, 213, 242, 669],
  [11,
   23,
   28,
   36,
   60,
   72,
   99,
   109,
   118,
   220,
   342,
   396,
   472,
   522,
   528,
   558,
   609,
   610,
   684],
  [101, 112, 176, 182, 191, 202, 234, 259, 343, 409, 453, 464, 568, 654],
  [],
  [108, 117, 190, 250, 386, 554, 628, 659],
  [41, 103, 132, 161, 188, 595, 631, 642, 645],
  [],
  [20, 71, 221, 354, 597],
  [15, 45, 115, 346, 503, 626],
  [224, 521],
  [],
  [38, 215, 413, 629],
  [258, 295, 379, 478, 691],
  [],
  [520, 586],
  [81, 547],
  [48, 280, 518, 523, 674],
  [],
  [389],
  [226],
  [],
  [],
  [],
  [],
  [538],
  [230, 334],
  [432],
  [],
  [444],
  [],
  [],
  [],
  [],
  [483],
  [],
  [],
  [],
  [],
  [],
  [459],
  [],
  [83],
  [],
  [],
  [],
  [571],
  [30],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [442],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
  [],
 