# DBSCAN Model

### Import packages and data

In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import DistanceMetric
import warnings
warnings.simplefilter("ignore")
import scipy.cluster.hierarchy as shc
%matplotlib inline

from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, homogeneity_completeness_v_measure

import matplotlib.pyplot as plt
from matplotlib import cm
from scipy.cluster import hierarchy
from scipy.spatial import distance

In [3]:
# Load the customer table; the path is relative to the notebook's working directory.
data = pd.read_csv("Data/customer_data.csv")

In [4]:
# Remove the "Unnamed: 0" column (presumably an index saved into the CSV — confirm).
# drop(..., errors="ignore") builds a new frame and does nothing when the column
# is already gone, so this cell can be re-run without raising KeyError.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

In [5]:
# Optional: uncomment to keep only the first 10,000 rows. Useful for quick
# experiments, since the pairwise distance matrices built below are n x n.
#data = data[0:10000]

In [6]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,UniqueID,FrequencyofVisits,Age,Sex,Payer,Facility
0,5962ec84553dcb84e508e2fc45789278,12,37.0,Feminino,ADVANCECARE,CCB
1,547c0d52bbfe1b93b9411d7b55f3bd82,3,37.0,Feminino,ADVANCECARE,HCP
2,1770ad573874abe83cd04bfaf89c8eb6,2,37.0,Feminino,ENTIDADES COM DESCONTO,CCB
3,11b9573f2fc9dd49594cd1f301cb7be6,16,37.0,Feminino,MÉDIS,HCS
4,84fdd6d40ab2a9b95f3c957237225d33,1,37.0,Feminino,PARTICULARES,CCB


In [None]:
# Numeric-only subset used for the two-feature clustering.
# .copy() gives an independent frame, so the cluster-label columns assigned to
# data1 later are written to data1 itself rather than to a slice of `data`
# (which triggers SettingWithCopyWarning and may not persist).
data1 = data[["Age", "FrequencyofVisits"]].copy()

In [None]:
# Preview the two-column subset.
data1.head()

In [6]:
# One boolean per column: True when that column contains any missing value.
data.isna().any(axis=0).to_numpy()

array([False, False, False, False, False, False])

### Gower Distance

In [7]:
def gower_distance(X):
    """Return the pairwise Gower distance matrix for the rows of ``X``.

    Each column contributes one per-feature distance in [0, 1]; the result is
    the unweighted mean of those per-feature distances.

    * Numeric (and boolean) columns: Manhattan distance divided by the range
      of the column (https://en.wikipedia.org/wiki/Taxicab_geometry).
    * All other columns (object, string, category, ...): treated as nominal.
      The Dice distance between the one-hot encodings of a single nominal
      feature is 0 when the categories match and 1 otherwise, so it is
      computed directly as a mismatch indicator
      (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient).

    Parameters
    ----------
    X : pandas.DataFrame
        One row per observation, one column per feature.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(n_rows, n_rows)`` float array with a zero diagonal.

    Raises
    ------
    ValueError
        If ``X`` has no columns (the mean over zero features is undefined).

    Notes
    -----
    * A numeric column with zero range contributes a distance of 0 for every
      pair instead of the NaN produced by dividing 0 by 0.
    * Two missing values in a nominal column are treated as equal.
    * Missing values in a numeric column still propagate as NaN.
    """
    n_rows, n_features = X.shape
    if n_features == 0:
        raise ValueError("gower_distance requires at least one column")

    # Accumulate a running sum rather than stacking every (n, n) matrix, which
    # keeps peak memory at a couple of matrices instead of n_features of them.
    total = np.zeros((n_rows, n_rows))

    for i in range(n_features):
        column = X.iloc[:, i]

        if pd.api.types.is_numeric_dtype(column.dtype):
            values = column.to_numpy(dtype=float, na_value=np.nan)
            value_range = np.ptp(values)
            if value_range == 0:
                # Constant column: every pair is at distance 0.
                continue
            total += np.abs(values[:, None] - values[None, :]) / value_range
        else:
            # factorize maps each category to an integer code (NaN -> -1).
            codes = pd.factorize(column)[0]
            mismatch = codes[:, None] != codes[None, :]
            total += mismatch

    return total / n_features

### Cosine distance 

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows (Y=None compares the input with itself).
# NOTE(review): `data_processed` is not defined anywhere in the visible notebook —
# confirm where it comes from, otherwise this cell fails on a fresh kernel.
sim=cosine_similarity(data_processed,  Y=None, dense_output=True)

In [None]:
# Wrap the similarity matrix in a DataFrame, then turn it into a cosine
# distance (1 - similarity) rounded to four decimal places.
df_sim = pd.DataFrame(sim)
df_sim1 = df_sim.sub(1.0).mul(-1).round(4)

In [None]:
# Preview the cosine distance matrix.
df_sim1.head()

In [8]:
# Feature matrix for the two-feature (Age, FrequencyofVisits) clustering.
# Select the columns explicitly and copy: a plain alias of data1 would pick up
# the cluster-label columns added to data1 further down, so re-running the
# distance computation would silently treat those labels as features.
X1 = data1[["Age", "FrequencyofVisits"]].copy()

NameError: name 'data1' is not defined

In [8]:
# Feature matrix for the full-data clustering.
# UniqueID is a per-row identifier, not a feature: left in, it is treated as a
# nominal variable and pushes every pair of rows with different IDs further
# apart by the same amount, which only dilutes the real features in the Gower
# average.
X = data.drop(columns=["UniqueID"])

In [None]:
# Pairwise Gower distance matrix for X1, stored as dist1.
dist1=gower_distance(X1)

In [9]:
# Pairwise Gower distance matrix for X, stored as dist.
dist=gower_distance(X)

In [None]:
# Shape of the Gower matrix for X1.
dist1.shape

In [None]:
# Shape of the Gower matrix for X.
dist.shape

In [None]:
# Rows and columns of the two-column subset.
data1.shape

In [None]:
# Rows and columns of the full table.
data.shape

In [None]:
# Wrap both distance matrices in DataFrames (the same names are rebound).
dist1=pd.DataFrame(dist1)
dist=pd.DataFrame(dist)

In [None]:
# Preview the first five rows of dist1.
dist1.head(5)

In [None]:
# Preview the first rows of dist.
dist.head()

In [None]:
# Replace any NaN entries in the distance matrices with 0.
# NOTE(review): NaNs presumably come from a zero-range division inside
# gower_distance — confirm that a distance of 0 is the intended value for them.
dist1=dist1.fillna(0)
dist=dist.fillna(0)

In [None]:
# Preview dist1 after filling NaNs.
dist1.head(5)

In [None]:
# Preview dist after filling NaNs.
dist.head(5)

### DBSCAN

In [None]:
# K-means baseline with 4 clusters. random_state pins the centroid
# initialisation so the labels (and every score derived from them) are
# reproducible across kernel restarts.
# NOTE(review): k-means is fit on the rows of the distance matrix used as
# feature vectors, not on the original features — confirm this is intended.
estimator_kmeans = KMeans(n_clusters=4, random_state=42)
kmeans_labels = estimator_kmeans.fit(dist1).labels_

In [None]:
# Distinct k-means cluster labels.
np.unique(kmeans_labels)

In [None]:
# Rows and columns of X1.
X1.shape

In [None]:
# DBSCAN configured for a precomputed distance matrix (metric="precomputed"),
# so fit() must be given an (n, n) matrix of pairwise distances.
# eps is the neighbourhood radius in distance units, min_samples the number of
# neighbours needed for a core point; n_jobs=-1 uses all CPU cores.
dbscan = DBSCAN(eps=0.3, min_samples=10, metric="precomputed", n_jobs=-1)

In [None]:
# Cluster on the precomputed Gower matrix for X1.
# fit() returns the estimator itself, so `db` is the same object as `dbscan`;
# fitting `dbscan` again later also changes `db.labels_`.
db=dbscan.fit(dist1.values)

In [None]:
# Cluster on the cosine distance matrix with a *separate* estimator.
# fit() returns the estimator itself, so reusing `dbscan` here would overwrite
# the labels_ of any earlier fit made with the same object (the Gower result).
# Building a new DBSCAN from the same parameters keeps both results available.
db_cos = DBSCAN(**dbscan.get_params()).fit(df_sim1.values)

In [None]:
# Distinct labels found by `db` (DBSCAN marks noise points as -1).
np.unique(db.labels_)

In [None]:
# Distinct labels found by `db_cos` (DBSCAN marks noise points as -1).
np.unique(db_cos.labels_)

In [None]:
# Attach the labels from `db` to each row; this adds a column to data1 in place.
data1["cluster_id"] = db.labels_

In [None]:
# Attach the labels from `db_cos` to each row; this adds a column to data1 in place.
data1["cluster_cos"] = db_cos.labels_

In [None]:
# Preview data1 with the cluster-label columns attached.
data1.head()

In [None]:
# Split data1 by dtype: numeric columns on one side, object/category columns
# on the other.
numerical_data = data1.select_dtypes(include=np.number)
categorical_data = data1.select_dtypes(include=[object, "category"])

In [None]:
def cluster_summary(cluster_id):
    """Summarise one cluster of ``data1`` as a flat dict.

    Reads the notebook-level ``data1`` (must contain a ``cluster_id`` column)
    and ``categorical_data`` (its columns name the categorical features).

    Parameters
    ----------
    cluster_id : int
        Label of the cluster to summarise, compared against ``data1.cluster_id``.

    Returns
    -------
    dict
        The mode of each categorical column, the mean of each numeric column,
        and ``"cluster_id"`` set to the requested label.
    """
    cluster = data1[data1.cluster_id == cluster_id]

    # mode() yields no rows when there are no categorical columns or the
    # cluster is empty; fall back to an empty dict instead of indexing [0].
    mode_rows = cluster[categorical_data.columns].mode().to_dict(orient="records")
    summary = dict(mode_rows[0]) if mode_rows else {}

    # numeric_only=True: recent pandas raises on non-numeric columns otherwise.
    summary.update(cluster.mean(numeric_only=True).to_dict())

    # Store the label itself rather than relying on the float mean of the
    # cluster_id column, which is NaN for an empty cluster.
    summary["cluster_id"] = cluster_id
    return summary

In [None]:
def cluster_comparison(*cluster_ids):
    """Build a side-by-side comparison table for the given clusters.

    Calls ``cluster_summary`` once per id, stacks the resulting dicts into a
    DataFrame, indexes it by ``"cluster_id"`` and transposes it, so the result
    has one column per cluster and one row per summarised feature.
    """
    rows = [cluster_summary(cid) for cid in cluster_ids]
    table = pd.DataFrame(rows)
    return table.set_index("cluster_id").T

In [None]:
# Compare clusters 0, 1 and 2 side by side.
cluster_comparison(0,1,2)

### Evaluate the clustering technique

In [None]:
from sklearn.metrics import silhouette_score

# scikit-learn renamed the misspelled `calinski_harabaz_score` to
# `calinski_harabasz_score` and later removed the old name. Import whichever
# exists and expose both spellings so the cells below work on any version.
try:
    from sklearn.metrics import calinski_harabasz_score
except ImportError:
    from sklearn.metrics import calinski_harabaz_score as calinski_harabasz_score
calinski_harabaz_score = calinski_harabasz_score

In [None]:
# dist1 is already a pairwise distance matrix, so tell silhouette_score to use
# it as such. Without metric="precomputed" each row of the matrix is treated as
# a feature vector and Euclidean distances between rows are scored instead.
silhouette_score(dist1, db.labels_, metric="precomputed")

In [None]:
# NOTE(review): this index is defined on a feature matrix
# (n_samples, n_features); here the rows of the distance matrix are used as
# the features — confirm this is intended.
calinski_harabaz_score(dist1, db.labels_)

In [None]:
# NOTE(review): no metric is given, so this scores Euclidean distances between
# the rows of dist1. That matches how k-means was fit, but is not the same as
# scoring on the Gower distances via metric="precomputed" — confirm which one
# is wanted.
silhouette_score(dist1, kmeans_labels)

In [None]:
# Calinski-Harabasz index for the k-means labels, using the rows of dist1 as
# feature vectors (the same representation k-means was fit on).
calinski_harabaz_score(dist1, kmeans_labels)

In [None]:
# df_sim1 holds pairwise cosine distances, so score it as a precomputed
# matrix instead of letting silhouette_score treat its rows as feature vectors.
# NOTE(review): precomputed input must have a zero diagonal — this holds as
# long as no row has zero cosine similarity with itself (all-zero rows); confirm.
silhouette_score(df_sim1, db_cos.labels_, metric="precomputed")

In [None]:
# One cut height drives both the flat clustering and the dendrogram colouring,
# so the two can no longer drift apart. Change it here only.
cut_height = 0.8

# Condense the square Gower matrix, then run complete-linkage clustering.
dissimilarity = distance.squareform(dist.values)
linkage = hierarchy.linkage(dissimilarity, method="complete")
clusters = hierarchy.fcluster(linkage, cut_height, criterion="distance")

# Create the figure *before* drawing: calling plt.figure() after the plot opens
# a new empty figure and leaves the dendrogram at the default size.
fig, ax = plt.subplots(figsize=(5, 5))
hierarchy.dendrogram(linkage, color_threshold=cut_height, ax=ax)
ax.set_xlabel("Row")
ax.set_ylabel("Dissimilarity")
plt.show()

In [None]:
# Draw the same linkage with default colouring; `dend` holds the dictionary
# returned by dendrogram().
dend = shc.dendrogram(linkage)