In [85]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math


# Data pre-processing

In [86]:
df = pd.read_csv('df_formatted.csv')
df.dtypes

df_num = df[['Satisfaction_Level','Last_Evaluation',
            'Average_Montly_Hours',
             'Time_Spend_Company','Number_Projects']]
df_num.dtypes


Satisfaction_Level      float64
Last_Evaluation         float64
Average_Montly_Hours      int64
Time_Spend_Company        int64
Number_Projects           int64
dtype: object

## Data standardization

In [87]:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(copy=True, feature_range=(0, 1))

scaler.fit(df['Number_Projects'].values.reshape(-1, 1))
df_num['Number_Projects'] = scaler.transform(df_num['Number_Projects'].values.
                                        reshape(-1, 1))
scaler.fit(df['Average_Montly_Hours'].values.reshape(-1, 1))
df_num['Average_Montly_Hours'] = scaler.transform(df_num['Average_Montly_Hours'].
                                              values.reshape(-1, 1))
scaler.fit(df['Time_Spend_Company'].values.reshape(-1, 1))
df_num['Time_Spend_Company'] = scaler.transform(df_num['Time_Spend_Company'].values.
                                            reshape(-1, 1))
df_num['Average_Montly_Hours'] = [round(i, 2) for i in df_num['Average_Montly_Hours']]
df_num['Time_Spend_Company'] = [round(i, 2) for i in df_num['Time_Spend_Company']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.p

In [88]:
df_num.head()

Unnamed: 0,Satisfaction_Level,Last_Evaluation,Average_Montly_Hours,Time_Spend_Company,Number_Projects
0,0.38,0.53,0.29,0.13,0.0
1,0.8,0.86,0.78,0.5,0.6
2,0.11,0.88,0.82,0.25,1.0
3,0.72,0.87,0.59,0.38,0.6
4,0.37,0.52,0.29,0.13,0.0


## Scelta di un intervallo per eps e MinPts tramite k-NN

In [89]:
from sklearn.neighbors import NearestNeighbors

# grafico k-nearest-neighbors
plt.close()
plt.figure(figsize=(12,6))

# k =  numero di vicinik = 4

for k in range(3,20,1): 
#for k in [3,5,7,10,100]:

    # applico l'algoritmo k-NN con distanza euclidea e k-MinPts
    nbrs = NearestNeighbors(n_neighbors=k, metric='minkowski', p=2).fit(df_num.values) 

    # estraggo la matrice delle distanze, ogni colonna rappresenta le distanze dal k-esimo vicino
    # l'ultima colonna è dunque la distanza che ci interessa
    Mdistances = nbrs.kneighbors(df_num.values)[0]


    df_kdist = pd.DataFrame(Mdistances) # converto la matrice delle distanze in DataFrame
    df_kdist.columns = [str(i) for i in range(1,k+1)] #rinomino le colonne
    df_kdist = df_kdist.sort_values(str(k)) # metto in ordine di distanza
    kdist = df_kdist[str(k)] # seleziono massima distanza tra i k-vicini

    x=np.arange(1,15000,1)
    y= kdist

    plt.plot(x,y)
    plt.title('Nearest Neighbors')
    plt.ylabel('k-distances for k nearest neighbors')
    plt.xlabel('Data objects sorted by k-distances')
    plt.legend(loc=2)
    plt.tight_layout()

# linee orizzontali per intervallo eps    
lista_h = [0.15,0.25]
for h in lista_h:
    plt.annotate(str(h),xy=(10000,h*1.1))
    plt.axhline(h)


plt.grid()
plt.savefig('nearestneighbors.pdf')    

# Prove totali DBSCAN

In [93]:
from sklearn import metrics
from sklearn.cluster import DBSCAN

eps = []
sil = []
num_cl = []
range_MinPts = range(3,20,1)
range_eps = np.linspace(0.15,0.25,10) 
for e in range_eps:
    for num in range_MinPts:
        db = DBSCAN(eps=e, min_samples=num,metric='euclidean',
                         algorithm='auto',n_jobs=-1).fit(df_num)
        labels = db.labels_

        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        silhouette = metrics.silhouette_score(df_num, labels)
        if(n_clusters_ < 10):
            print ('con eps = %f e min samples %d' % (e,num))
            print('Estimated number of clusters: %d' % n_clusters_)
            print("Silhouette Coefficient: %0.3f"
              % metrics.silhouette_score(df_num, labels))
            eps.append(e)
            num_cl.append(n_clusters_)
            sil.append(metrics.silhouette_score(df_num, labels))

con eps = 0.150000 e min samples 14
Estimated number of clusters: 9
Silhouette Coefficient: 0.075
con eps = 0.150000 e min samples 16
Estimated number of clusters: 9
Silhouette Coefficient: 0.099
con eps = 0.150000 e min samples 17
Estimated number of clusters: 9
Silhouette Coefficient: 0.096
con eps = 0.150000 e min samples 18
Estimated number of clusters: 9
Silhouette Coefficient: 0.093
con eps = 0.150000 e min samples 19
Estimated number of clusters: 8
Silhouette Coefficient: 0.099
con eps = 0.161111 e min samples 19
Estimated number of clusters: 9
Silhouette Coefficient: 0.103
con eps = 0.172222 e min samples 13
Estimated number of clusters: 8
Silhouette Coefficient: 0.067
con eps = 0.172222 e min samples 14
Estimated number of clusters: 8
Silhouette Coefficient: 0.081
con eps = 0.172222 e min samples 15
Estimated number of clusters: 9
Silhouette Coefficient: 0.068
con eps = 0.172222 e min samples 16
Estimated number of clusters: 8
Silhouette Coefficient: 0.078
con eps = 0.172222 e

con eps = 0.238889 e min samples 19
Estimated number of clusters: 4
Silhouette Coefficient: 0.309
con eps = 0.250000 e min samples 3
Estimated number of clusters: 4
Silhouette Coefficient: 0.316
con eps = 0.250000 e min samples 4
Estimated number of clusters: 4
Silhouette Coefficient: 0.316
con eps = 0.250000 e min samples 5
Estimated number of clusters: 3
Silhouette Coefficient: 0.331
con eps = 0.250000 e min samples 6
Estimated number of clusters: 3
Silhouette Coefficient: 0.331
con eps = 0.250000 e min samples 7
Estimated number of clusters: 2
Silhouette Coefficient: 0.342
con eps = 0.250000 e min samples 8
Estimated number of clusters: 2
Silhouette Coefficient: 0.342
con eps = 0.250000 e min samples 9
Estimated number of clusters: 2
Silhouette Coefficient: 0.342
con eps = 0.250000 e min samples 10
Estimated number of clusters: 2
Silhouette Coefficient: 0.342
con eps = 0.250000 e min samples 11
Estimated number of clusters: 2
Silhouette Coefficient: 0.342
con eps = 0.250000 e min sa

## Assegnazione cluster

In [95]:
from sklearn import metrics
from sklearn.cluster import DBSCAN

db = DBSCAN(eps=0.227778, min_samples=13,metric='euclidean',
                 algorithm='auto',n_jobs=-1).fit(df_num)
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print ('con eps = 0.22 e min samples 15' )
print('Estimated number of clusters: %d' % n_clusters_)
print("Silhouette Coefficient: %0.3f"
  % metrics.silhouette_score(df_num, labels))

con eps = 0.22 e min samples 15
Estimated number of clusters: 3
Silhouette Coefficient: 0.310


In [96]:
df_num['cluster'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [97]:
df_num

Unnamed: 0,Satisfaction_Level,Last_Evaluation,Average_Montly_Hours,Time_Spend_Company,Number_Projects,cluster
0,0.38,0.53,0.29,0.13,0.0,0
1,0.80,0.86,0.78,0.50,0.6,0
2,0.11,0.88,0.82,0.25,1.0,0
3,0.72,0.87,0.59,0.38,0.6,0
4,0.37,0.52,0.29,0.13,0.0,0
5,0.41,0.50,0.27,0.13,0.0,0
6,0.10,0.77,0.71,0.25,0.8,0
7,0.92,0.85,0.76,0.38,0.6,0
8,0.89,1.00,0.60,0.38,0.6,0
9,0.42,0.53,0.21,0.13,0.0,0


# Dati csv

In [98]:
df_num.groupby('cluster').describe().to_csv('dbscan.csv')