<a href="https://colab.research.google.com/github/disha-cpu/Resources-For-Stat-Data/blob/master/2D_DBSCAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from plotnine import *
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
import plotly.express as px
import math
%matplotlib inline

In [None]:
# Load the temperature log and derive the two features used for clustering.
df = pd.read_csv('/content/Temperature-data.csv')
# NOTE: df1 is a second name for the same DataFrame object (not a copy), so it
# also sees the parsed 'Time' values and the 'Minutes' column added below.
df1 = df
df['Time'] = pd.to_datetime(df['Time'])
# Minute-of-hour of each reading, stored as float.
df['Minutes'] = df['Time'].dt.minute.astype(float)

# Keep only the leading part of the log. DataFrame.size counts cells
# (rows x 2 columns here), so size // 10 keeps roughly the first fifth of rows.
# NOTE(review): if a tenth of the rows was intended, use len(d1) // 10 - confirm.
# .copy() makes d1 an independent frame: later cells assign into it, and
# writing into a slice of df is a chained assignment.
d1 = df[["Minutes", "MRI Room"]]
d1 = d1.head(d1.size // 10).copy()
d1

Unnamed: 0,Minutes,MRI Room
0,52.0,19.0
1,53.0,19.0
2,54.0,18.9
3,55.0,18.9
4,56.0,18.9
...,...,...
50519,42.0,18.0
50520,43.0,18.1
50521,44.0,18.1
50522,45.0,18.1


In [None]:
# Scatter plot of the two features, drawn with small markers.
raw_fig = px.scatter(d1, x='Minutes', y='MRI Room')
raw_fig.update_traces(marker=dict(size=5))
raw_fig

In [None]:
# Standardise both features in place.
# The feature columns are selected by name (instead of d1.values) so this cell
# still works if it is re-run after the 'assignments' column has been added.
feature_cols = ["Minutes", "MRI Room"]
d1[feature_cols] = StandardScaler().fit_transform(d1[feature_cols].to_numpy())

In [None]:
# Build the sorted k-distance curve that find_eps() uses to choose DBSCAN's eps.
# mins is the neighbourhood size; it is also passed to DBSCAN as min_samples.
# NOTE(review): mins is derived from len(df) (all rows) although only the d1
# subset is clustered - confirm this is intended.
mins = len(df)//1000
feature_frame = d1[["Minutes", "MRI Room"]]

# Ask for mins + 1 neighbours: the query points are the fitted points, so
# column 0 is typically each point itself at distance 0.
nn = NearestNeighbors(n_neighbors=mins+1)
nn.fit(feature_frame)
distances, neighbors = nn.kneighbors(feature_frame)

# keep only the distance to the mins-th neighbour, sorted ascending
distances = np.sort(distances[:, mins], axis = 0)

# curve as a frame: position on the x axis, k-distance on the y axis
distances_df = pd.DataFrame({"distances": distances,
                             "index": list(range(0,len(distances)))})

# This figure is not the last expression of the cell (function definitions
# follow), so it has to be shown explicitly or it is never rendered.
px.line(distances_df, x='index', y='distances').show()

def calc_distance(x1, y1, a, b, c):
  """Return the perpendicular distance from (x1, y1) to the line a*x + b*y + c = 0.

  x1 and y1 may be scalars or NumPy arrays of equal shape; with arrays the
  distance is computed element-wise. a and b are scalars and must not both
  be zero.
  """
  return abs(a * x1 + b * y1 + c) / math.hypot(a, b)

def find_eps(k_distances=None):
  """Pick DBSCAN's eps as the knee of a sorted k-distance curve.

  The curve is treated as the points (i, k_distances[i]). A straight line is
  drawn from the first point to the last one, and the k-distance whose point
  lies farthest from that line is returned (first one on ties).

  Args:
    k_distances: 1-D sequence of sorted k-nearest-neighbour distances.
      Defaults to the notebook-level ``distances`` array.

  Returns:
    The k-distance at the knee. A single-value curve returns that value.

  Raises:
    ValueError: if the curve is empty.
  """
  if k_distances is None:
    k_distances = distances
  y = np.asarray(k_distances, dtype=float)
  if y.size == 0:
    raise ValueError("find_eps needs a non-empty k-distance curve")
  if y.size == 1:
    # One point has no chord: the line below would degenerate to a = b = 0.
    return y[0]

  # x is the position along the curve (0 .. n-1).
  x = np.arange(y.size)
  # Line through (x1, y1) = first point and (x2, y2) = last point:
  #   (y1 - y2) * x + (x2 - x1) * y + (x1 * y2 - x2 * y1) = 0
  a = y[0] - y[-1]
  b = x[-1] - x[0]
  c = x[0] * y[-1] - x[-1] * y[0]

  # One vectorised call instead of a Python loop over every point.
  gaps = calc_distance(x, y, a, b, c)
  return y[np.argmax(gaps)]

In [None]:
# Cluster on the two scaled feature columns only. Fitting on the whole frame
# would also use 'assignments' as a feature if this cell is run a second time.
db1 = DBSCAN(eps = find_eps(), min_samples = mins).fit(d1[["Minutes", "MRI Room"]])

d1["assignments"] = db1.labels_
# Labels as strings, used as the colour argument of the cluster scatter plot.
color = d1["assignments"].astype(str)
# Points labelled -1 are counted as noise.
n_noise_ = int((db1.labels_ == -1).sum())
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of noise points: 373


In [None]:
# Scatter plot of the two features, coloured by cluster label.
cluster_fig = px.scatter(d1, x='Minutes', y='MRI Room', color=color)
cluster_fig.update_traces(marker=dict(size=5))
cluster_fig

In [None]:
# Rows labelled -1 by the clustering step are reported as anomalies.
anomaly = d1[d1.assignments == -1]
anomalies_index = list(anomaly.index)

# d1 holds the standardised values, so the timestamp and the original
# temperature are looked up in df1 by row label. A single .loc call replaces
# the per-row Python loop and yields the same table with a fresh 0..n-1 index.
anomaly_df = df1.loc[anomalies_index, ['Time', 'MRI Room']].reset_index(drop=True)
anomaly_df

Unnamed: 0,Time,MRI Room
0,2021-04-12 06:28:00,35.5
1,2021-04-12 06:29:00,35.7
2,2021-04-12 06:30:00,35.7
3,2021-04-12 06:31:00,35.5
4,2021-04-12 06:32:00,35.3
...,...,...
368,2021-05-10 12:02:00,36.4
369,2021-05-10 12:03:00,36.0
370,2021-05-10 12:04:00,35.8
371,2021-05-10 12:05:00,35.7


In [None]:
from sklearn import metrics
# Score the clustering in the two-feature space only. d1 also contains the
# 'assignments' column by now; passing the whole frame would feed the labels
# in as a third coordinate and distort the score.
# NOTE(review): points labelled -1 are scored as if they formed one cluster -
# confirm whether they should be excluded.
metrics.silhouette_score(d1[["Minutes", "MRI Room"]], d1['assignments'])

0.4676408928055916