#### import modules and packages

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

import matplotlib.pyplot as plt
from datetime import datetime as dt

from onc.onc import ONC

import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from collections import Counter
from pylab import rcParams

# Default size for every figure drawn below: 14 wide by 6 tall.
rcParams['figure.figsize'] = (14, 6)

%matplotlib inline

#### Load data

In [None]:
import os

# Prefer a token supplied through the ONC_TOKEN environment variable so the
# credential does not have to live in the notebook.
# NOTE(review): the literal fallback is a credential committed in plain text.
# It is kept only so existing runs keep working; revoke it, export ONC_TOKEN
# instead, and delete the fallback.
onc = ONC(os.environ.get('ONC_TOKEN', '533c0804-3482-4efb-813b-681d357440d6'))


# Baynes location codes (suffixes copied from the original notes):
#   BSM.J1   5mbss
#   BSM.J2   20mbss
#   BSM.J3   40mbss

# Request raw conductivity and temperature readings from the CTD at BSM.J1
# for the whole of 2020-01-19 (UTC timestamps).
filters = dict(
    locationCode='BSM.J1',
    deviceCategoryCode='CTD',
    qualityControl='raw',
    sensorCategoryCodes='conductivity,temperature',
    dateFrom='2020-01-19T00:00:00.000Z',
    dateTo='2020-01-19T23:59:59.999Z',
)
result = onc.getDirectByLocation(filters)
# Uncomment to inspect the raw response:
#onc.print(result)

# Put the two returned value series side by side as named columns.
# NOTE(review): this assumes sensorData[0] is conductivity and sensorData[1]
# is temperature — confirm against the response before relying on it.
d = {
    'Conductivity (S/m)': result['sensorData'][0]['data']['values'],
    'Temperature (C)': result['sensorData'][1]['data']['values'],
}
df = pd.DataFrame(d)
df

In [None]:
# Raw readings: conductivity on the x axis, temperature on the y axis.
fig = df.plot(kind='scatter', x='Conductivity (S/m)', y='Temperature (C)')

In [None]:
# Print a summary of the frame (column dtypes and non-null counts).
df.info()

#### Preparing the DBSCAN model (train the model and identify outliers)

In [None]:
# prepare data for model
X = df[['Conductivity (S/m)','Temperature (C)']]

In [None]:
X = X.values.astype('float32',copy =False)
X

In [None]:
# DBSCAN hyper-parameters: ms is passed as min_samples, ep1 as eps.
ms, ep1 = 10, 0.01
db = DBSCAN(min_samples=ms, eps=ep1).fit(X)

#### Visualize results

In [None]:
labels = db.labels_

In [None]:
len(set(labels))

In [None]:
# Separate outliers from clustered data
outliers_X = X[labels == -1]
clusters_X = X[labels != -1]

In [None]:
colors_clusters = labels[labels != -1]
colors_outliers = 'black'

In [None]:
# Count how many points received each label.
clusters = Counter(labels)
print(clusters)
# -1 is the noise label, not a cluster: discount it only when it is actually
# present, otherwise a noise-free result would be under-counted by one.
n_clusters = len(clusters) - (1 if -1 in clusters else 0)
print('Number of cluster = {}'.format(n_clusters))

In [None]:
def show_clusters(X, labels, xlabel="Conductivity (S/m)", ylabel="Temperature (C)"):
    """Scatter-plot the rows of ``X`` with one colour per cluster label.

    Parameters
    ----------
    X : 2-D array
        Column 0 is drawn on the x axis and column 1 on the y axis.
    labels : sequence of int
        One cluster label per row of ``X``. Label -1 is drawn in black,
        0 in blue and 1 in red; any other label takes a colour from a
        repeating fallback palette instead of raising ``KeyError``.
    xlabel, ylabel : str, optional
        Axis captions. The defaults match the column order used when ``X``
        is built in this notebook (conductivity first, temperature second).
    """
    df = pd.DataFrame(dict(x=X[:, 0], y=X[:, 1], label=labels))
    color = {-1: 'black', 0: 'blue', 1: 'red'}
    # Fallback palette for labels beyond the three fixed colours above.
    extra_colors = ('green', 'orange', 'purple', 'brown',
                    'magenta', 'cyan', 'olive', 'gray')
    fig, ax = plt.subplots()
    for key, group in df.groupby('label'):
        point_color = color.get(key, extra_colors[(key - 2) % len(extra_colors)])
        group.plot(ax=ax, kind='scatter', x='x', y='y', label=key,
                   color=point_color)
    # Column 0 is on the x axis, so the x caption must describe column 0.
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

In [None]:
# Draw the labelled scatter plot for the first DBSCAN fit.
show_clusters(X, labels)

In [None]:
# plot clusters and outliers
fig = plt.figure()
ax = fig.add_axes([.1, .1, 1, 1])

# Outliers first (single colour), then clustered points coloured by label.
ax.scatter(outliers_X[:, 0], outliers_X[:, 1], c=colors_outliers, s=50)
ax.scatter(clusters_X[:, 0], clusters_X[:, 1], c=colors_clusters, s=50)
# Column 0 of X is conductivity and column 1 is temperature, so the x axis
# carries conductivity (the captions were previously swapped).
ax.set_xlabel("Conductivity (S/m)")
ax.set_ylabel("Temperature (C)")
plt.tight_layout()

In [None]:
from sklearn.metrics import silhouette_score
# Mean silhouette coefficient for the first DBSCAN fit. The labels are passed
# unchanged, so any noise points (label -1) are scored as if they were a cluster.
silhouette_score(X,labels)

In [None]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors = 2)
nbrs = neigh.fit(X)

In [None]:
distances, indices = nbrs.kneighbors(X)

In [None]:
distances = np.sort(distances, axis = 0)
distances = distances[:,1]
plt.plot(distances)

In [None]:
db1 = DBSCAN(eps=ep1, min_samples=0.01).fit(X)

In [None]:
labels1 = db1.labels_

In [None]:
silhouette_score(X,labels1)

In [None]:
db2 = DBSCAN(eps=ep1, min_samples=0.01, algorithm ='ball_tree').fit(X)

In [None]:
labels2 = db2.labels_

In [None]:
silhouette_score(X,labels2)

In [None]:
db3 = DBSCAN(eps=ep1, min_samples=0.01, algorithm ='kd_tree').fit(X)

In [None]:
labels3 = db3.labels_

In [None]:
silhouette_score(X,labels3)

In [None]:
db4 = DBSCAN(eps=ep1, min_samples=0.01, algorithm ='brute').fit(X)

In [None]:
labels4 = db4.labels_

In [None]:
silhouette_score(X,labels4)

In [None]:
# Ball-tree DBSCAN using the haversine metric.
# NOTE(review): haversine is normally applied to (latitude, longitude) pairs
# in radians; X holds conductivity and temperature — confirm this is intended.
db5 = DBSCAN(eps=ep1, min_samples=10, algorithm='ball_tree', metric='haversine')
labels5 = db5.fit_predict(X)
silhouette_score(X, labels5)

In [None]:
def pnorm(x, y, p=1):
    """Return the p-norm (Minkowski) distance between vectors ``x`` and ``y``.

    Parameters
    ----------
    x, y : array-like of equal shape
        The two points to compare.
    p : int or float, optional
        Order of the norm, ``>= 1``. The default of 1 gives the Manhattan
        distance, which is what this function always computed before ``p``
        became a parameter; ``numpy.inf`` gives the Chebyshev (maximum)
        distance.

    Returns
    -------
    The scalar ``sum(|x - y| ** p) ** (1 / p)``.

    Raises
    ------
    ValueError
        If ``p < 1``; such orders do not satisfy the triangle inequality.
    """
    if p < 1:
        raise ValueError("p must be >= 1")
    diff = np.abs(x - y)
    if np.isinf(p):
        # Limit of the p-norm as p grows: the largest coordinate difference.
        return np.max(diff)
    return np.sum(diff**p)**(1/p)

# Ball-tree DBSCAN using the custom pnorm callable as the distance metric.
db6 = DBSCAN(eps=ep1, min_samples=10, algorithm='ball_tree', metric=pnorm).fit(X)
# Take the labels from db6 (they were previously read from db5, so this cell
# re-scored the haversine run instead of the custom-metric one).
labels6 = db6.labels_
silhouette_score(X, labels6)