In [9]:
%%time 
import numpy as np 
import pandas as pd 
import scipy
import matplotlib
import matplotlib.pyplot as plt

from sklearn.cluster import DBSCAN 
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize 
from sklearn.decomposition import PCA 
from sklearn import metrics 
from sklearn.metrics import silhouette_score, homogeneity_completeness_v_measure
from sklearn.metrics.pairwise import euclidean_distances
from hdbscan import HDBSCAN
from scipy.stats import spearmanr 

import warnings 
warnings.simplefilter("ignore") 



CPU times: user 281 µs, sys: 989 µs, total: 1.27 ms
Wall time: 2.06 ms


In [10]:
df = pd.read_pickle('picklefinal/postFAMD.pkl')

In [11]:
df.shape

(1468101, 10)

In [4]:
df = df.drop(columns = [8,9])

In [5]:
df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
00000f7264c27ba6fea0c837ed6aa0aa,2738.866104,-1283.474094,-937.754525,2742.902283,-569.733342,-1439.002363,926.613108,817.699409
00001e984eba85527fd3122056451279,3489.873372,-582.13316,-1559.529845,-1089.873401,138.202214,-269.501375,-86.50472,863.355542
0000219e4b37d2504fb6b8c28e24a2d4,2937.887853,1631.360292,126.488715,867.030033,-3073.741307,-759.911687,-866.664264,552.896295
000026c67a83fa72aec14512887bb173,2920.606795,-1378.205303,2014.482509,-663.424336,122.193676,-1189.100587,1762.811919,1095.543238
000028899fe7782862d40bb1b87807ee,3092.45447,-380.875608,-169.559082,425.601597,3208.156099,-714.99451,-525.517818,-361.401155


In [5]:
### NORMALIZE DATA 
newdf = df[['Freq', 'Age']]
newdf

from sklearn import preprocessing

x = newdf.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
newdf = pd.DataFrame(x_scaled)
newdf.head()

newdf.columns = ['Freq','Age']

df = df.drop(columns = ['Freq','Age'])

df['Age'] = newdf['Age'].values
df['Freq'] = newdf['Freq'].values

In [7]:
list_of_names = list(df.columns)

In [8]:
df.columns = list_of_names


In [9]:
df.head()

Unnamed: 0,Freq,Age,Large_Hospital,Medium_Hospital,Payer_Out_of_Pocket,Payer_Special_Program,Payer_State,region_Porto,region_Santarem,region_Viseu,General_Service_Emergency,General_Service_Examination,General_Service_Others,General_Service_Surgery,General_Service_Treatment,Sex_Masculino,Sex_Missing
0,-0.279657,0.297376,0.171623,-0.555808,-0.493714,-0.189121,-0.538143,1.852339,-0.221605,-0.164676,0.563824,0.583638,-0.134845,-0.336768,-0.384018,1.116113,-0.055554
1,0.02368,1.016946,1.036258,-0.555808,-0.493714,-0.189121,-0.028397,-0.543157,-0.221605,-0.164676,-0.43676,0.898848,-0.134845,-0.336768,-0.384018,-0.895966,-0.055554
2,-0.50716,-1.343594,-1.125331,-0.555808,-0.493714,-0.189121,-0.538143,1.852339,-0.221605,-0.164676,-0.43676,0.268427,-0.134845,-0.336768,-0.384018,-0.895966,-0.055554
3,-0.431325,-0.158936,-1.125331,1.974664,-0.493714,-0.189121,-0.538143,-0.543157,-0.221605,-0.164676,-0.43676,1.056454,-0.134845,1.815089,-0.384018,1.116113,-0.055554
4,-0.431325,-1.277779,1.036258,-0.555808,-0.493714,-0.189121,0.736222,-0.543157,-0.221605,-0.164676,4.566161,-1.307628,-0.134845,-0.336768,-0.384018,1.116113,-0.055554


In [10]:
### drop some columns so we can have a smaller dataframe 

df = df.drop(columns = ["Large_Hospital","Medium_Hospital","Freq"])

### KNN to find epsilon

In [None]:
%%time 
neigh = NearestNeighbors(n_neighbors=14)
nbrs = neigh.fit(df)  
distances, indices = nbrs.kneighbors(df)

In [None]:
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.plot(distances)
plt.xlim(600000,800000)

In [6]:
#function for plotting nearest neighbors graph 
import numpy as np
import pandas as pd
import math

def k_distances(X, n=None, dist_func=None):
    """Function to return array of k_distances.

    X - DataFrame matrix with observations
    n - number of neighbors that are included in returned distances (default number of attributes + 1)
    dist_func - function to count distance between observations in X (default euclidean function)
    """
    if type(X) is pd.DataFrame:
        X = X.values
    k=0
    if n == None:
        k=X.shape[1]+2
    else:
        k=n+1

    if dist_func == None:
        # euclidean distance square root of sum of squares of differences between attributes
        dist_func = lambda x, y: math.sqrt(
            np.sum(
                np.power(x-y, np.repeat(2,x.size))
            )
        )

    Distances = pd.DataFrame({
        "i": [i//10 for i in range(0, len(X)*len(X))],
        "j": [i%10 for i in range(0, len(X)*len(X))],
        "d": [dist_func(x,y) for x in X for y in X]
    })
    return np.sort([g[1].iloc[k].d for g in iter(Distances.groupby(by="i"))])


In [None]:
#plot nearest neighbors graph 
import matplotlib.pyplot as plt

d = k_distances(scaled_ds,51,spearmanr(scaled_ds))
plt.plot(d)
plt.ylabel("k-distances")
plt.grid(True) 
plt.show()


### DBSCAN

In [27]:
# setting up DBSCAN   
db = DBSCAN(eps = 20, min_samples = 15).fit(df)
#fitting model 
model = db.fit(df)

In [28]:
#retrieve labels 
dblabels = model.labels_

In [29]:
#number of labels 
np.unique(dblabels).shape

(10178,)

In [30]:
unique, counts = np.unique(dblabels, return_counts=True)
dict(zip(unique, counts))

{-1: 108985,
 0: 697,
 1: 22,
 2: 358,
 3: 282,
 4: 875,
 5: 298,
 6: 3731,
 7: 840,
 8: 522,
 9: 997,
 10: 151,
 11: 39,
 12: 629,
 13: 35,
 14: 825,
 15: 96,
 16: 914,
 17: 861,
 18: 767,
 19: 1645,
 20: 119,
 21: 2553,
 22: 929,
 23: 823,
 24: 288,
 25: 446,
 26: 1956,
 27: 2053,
 28: 1792,
 29: 310,
 30: 2296,
 31: 486,
 32: 16,
 33: 191,
 34: 485,
 35: 756,
 36: 160,
 37: 169,
 38: 1334,
 39: 203,
 40: 1100,
 41: 717,
 42: 551,
 43: 97,
 44: 179,
 45: 313,
 46: 227,
 47: 28,
 48: 366,
 49: 19,
 50: 47,
 51: 132,
 52: 1099,
 53: 522,
 54: 751,
 55: 393,
 56: 968,
 57: 2483,
 58: 1307,
 59: 138,
 60: 139,
 61: 1781,
 62: 202,
 63: 236,
 64: 319,
 65: 16,
 66: 1786,
 67: 614,
 68: 173,
 69: 43,
 70: 111,
 71: 187,
 72: 1183,
 73: 2776,
 74: 1028,
 75: 81,
 76: 321,
 77: 169,
 78: 690,
 79: 315,
 80: 463,
 81: 3406,
 82: 152,
 83: 899,
 84: 131,
 85: 1706,
 86: 44,
 87: 62,
 88: 206,
 89: 1538,
 90: 51,
 91: 125,
 92: 196,
 93: 246,
 94: 1194,
 95: 209,
 96: 99,
 97: 104,
 98: 17,
 99

### Identifying core points

In [None]:
#identifying core samples 

core_samples = np.zeros_like(dblabels, dtype=bool)

core_samples[dbscan.core_sample_indices_]= True 
print(core_samples)

### Calculate number of clusters 

In [None]:
n_clusters = len(set(dblabels)) - (1 if -1 in labels else 0)

In [None]:
n_clusters 