Skip to content

Commit

Permalink
add silhouetteIndex from clustering result
Browse files Browse the repository at this point in the history
#73
update documentation
  • Loading branch information
bvenn committed Jun 10, 2020
1 parent 28d06fc commit bca8f92
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 33 deletions.
6 changes: 5 additions & 1 deletion docsrc/content/Clustering.fsx
Original file line number Diff line number Diff line change
Expand Up @@ -336,14 +336,18 @@ Reference: 'Review on Determining of Cluster in K-means Clustering'; Kodinariya
*)


// The following example expects raw (unclustered) data and clusters it by k-means clustering.
// If you already have clustered data, use the 'silhouetteIndex' function instead.

// Example input: every line of the tab-separated file is parsed into a two-element
// coordinate array built from its first two columns.
let silhouetteData =
    let dataPath = __SOURCE_DIRECTORY__ + "/data/silhouetteIndexData.txt"
    System.IO.File.ReadAllLines dataPath
    |> Array.map (fun line ->
        let fields = line.Split '\t'
        [|float fields.[0]; float fields.[1]|])

let sI =
ML.Unsupervised.ClusterNumber.silhouetteIndex
ML.Unsupervised.ClusterNumber.silhouetteIndexKMeans
50 // number of bootstraps
(kmeans euclideanNaNSquared (randomCentroids rnd) silhouetteData)
silhouetteData // input data
Expand Down
72 changes: 40 additions & 32 deletions src/FSharp.Stats/ML/Unsupervised/GapStatistics.fs
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,46 @@ module ClusterNumber =
SilhouetteIndexStDev= std
}

let silhouetteIndex (bootstraps:int) (iClustering:int -> KClusteringResult<float []>) (data:float [] []) maxk =
//ToDo: change input to generic clustering result
//silhouetteIndex (bootstraps:int) (iClustering:int -> float [] [] []) (data:float [] []) maxk =
/// Calculates the silhouette index of an already clustered data set.
/// clusteredData holds one array per cluster; every data point of a cluster is given by its coordinates as float [].
/// Dissimilarities are computed with ML.DistanceMetrics.Array.euclideanNaNSquared.
/// The index ranges from -1 (bad clustering result) to 1 (perfect clustering result).
/// Empty clusters are ignored. Returns nan if no data point is present at all.
/// Raises System.ArgumentException if only one non-empty cluster is given, because the
/// distance to a neighboring cluster is undefined in that case.
let silhouetteIndex (clusteredData:float [] [] []) =
    // mean dissimilarity between one point and all members of a (non-empty) cluster
    let averageDistance (item: float []) (cluster:float[][]) =
        cluster
        |> Array.averageBy (fun j -> ML.DistanceMetrics.Array.euclideanNaNSquared item j)

    // Array.averageBy throws on an empty cluster, so empty clusters are dropped up front
    let nonEmptyClusters =
        clusteredData
        |> Array.filter (fun cluster -> cluster.Length > 0)

    match nonEmptyClusters.Length with
    | 0 -> nan
    | 1 -> invalidArg "clusteredData" "The silhouette index requires at least two non-empty clusters."
    | _ ->
        nonEmptyClusters
        |> Array.mapi (fun i cluster ->
            // all clusters except the one the current points belong to
            let externalClusters =
                nonEmptyClusters
                |> Array.indexed
                |> Array.filter (fun (j,_) -> j <> i)
                |> Array.map snd
            let clustersize = float cluster.Length
            cluster
            |> Array.map (fun point ->
                let intraCluster =
                    // averageDistance includes the point itself; rescale sum/n to sum/(n-1).
                    // max keeps the divisor at 1 for singletons, giving an intra cluster distance of 0.
                    averageDistance point cluster * clustersize / (max 1. (clustersize - 1.))
                let interCluster =
                    externalClusters
                    |> Array.map (averageDistance point)
                    |> Array.min //defines the neighboring cluster
                let denominator = max interCluster intraCluster
                // both distances are 0 (e.g. duplicated points): score the point as 0 instead of 0/0 = nan
                if denominator = 0. then 0.
                else (interCluster - intraCluster) / denominator))
        // the index is the mean silhouette score over all data points of all clusters
        |> Array.concat
        |> Array.average

/// The silhouette index can be used to determine the optimal cluster number in k means clustering.
/// bootstraps is the number of times the k means clustering is performed for each k, and maxK is the maximal cluster number that is tested.
let silhouetteIndexKMeans (bootstraps:int) (iClustering:int -> KClusteringResult<float []>) (data:float [] []) maxK =
[|2..maxK|]
|> Array.map (fun k ->
printfn "iteration k = %i" k
[|1..bootstraps|]
Expand All @@ -67,39 +99,15 @@ module ClusterNumber =
|> Array.map (fun (index,cluster) ->
cluster
|> Array.map (fun ((index,centroid),data) -> data))
let silhouetteIndex_k =
clusteredData
|> Array.mapi (fun i cluster ->
let externalPoints =
clusteredData
|> Array.indexed
|> Array.filter (fun (j,cl) -> j <> i)
|> Array.map snd
cluster
|> Array.map (fun point ->
let clustersize = float cluster.Length
let intraCluster =
averageDistance point cluster
//correction for datapoint itself sum/(n-1) not sum/n
|> fun intra -> intra * clustersize / (max 1. (clustersize - 1.)) //max ensures correct result at singletons
let interCluster =
//filters out cluster of current point to get interCluster distance
externalPoints
|> Array.map (averageDistance point)
|> Array.min //defines the neighboring cluster
(interCluster - intraCluster) / (max interCluster intraCluster))
|> fun tmp -> tmp.Length,Seq.mean tmp)
|> fun silhouetteIndices ->
let count = Array.sumBy fst silhouetteIndices
silhouetteIndices
|> Array.sumBy (fun (n,sI) -> (float n) * sI)
|> fun sISum -> sISum / float count
silhouetteIndex_k
silhouetteIndex clusteredData
)
|> PSeq.toArray
|> fun x -> createSilhouetteResult k (Seq.mean x) (Seq.stDev x)
)




module GapStatistics =

open FSharp.Stats.ML
Expand Down

0 comments on commit bca8f92

Please sign in to comment.