Skip to content

Commit

Permalink
documentation, tests, clean up
Browse files Browse the repository at this point in the history
cleaned up, added docs and code examples, adjusted tests
  • Loading branch information
s-weil committed Oct 27, 2023
1 parent 4b6f1cf commit 9919c4d
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 139 deletions.
158 changes: 70 additions & 88 deletions src/FSharp.Stats/ML/Unsupervised/KNN.fs
Expand Up @@ -7,7 +7,7 @@ type LabeledPoint<'a, 'l> = {
}

with
static member create(p, l)= {
static member create(p, l) = {
p = p
label = l
}
Expand All @@ -19,23 +19,31 @@ module KNN =

module Array =

/// <summary>TODO.</summary>
/// <remarks>May mutate the order of `labeledPoints` and is not thread safe.</remarks>
/// <summary>
/// The [k-nearest neighbors algorithm](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm) to classify a new data point into the target class,
/// depending on the features of its neighboring data points.
/// </summary>
/// <remarks>May mutate the order of `labeledPoints`.</remarks>
/// <param name="distance">the distance function, e.g. `euclidean`</param>
/// <param name="labeledPoints">second vector</param>
/// <param name="k">The number of nearest neighbors to look from x</param>
/// <param name="labeledPoints">the array of classified (or labeled) points, used for the classification</param>
/// <param name="k">The number of nearest neighbors from x to look for.</param>
/// <param name="x">The point to classify</param>
/// <returns>The most common labels from the k nearest neighbors for x.</returns>
/// <returns>The most common label from the k nearest neighbors for x.</returns>
/// <example>
/// <code>
/// TODO
/// <code>
/// let reds = [| [ 2.0; 4.0 ]; [ 1.0; 3.0 ]; [ 2.0; 4.0 ]; [ 3.0; 2.0 ]; [ 2.0; 1.0 ] |] |> Array.map (fun p -> LabeledPoint<float list, string>.create(p, "red"))
/// let blues = [| [ 5.0; 6.0 ]; [ 4.0; 5.0 ]; [ 4.0; 6.0 ]; [ 6.0; 6.0 ]; [ 5.0; 4.0 ] |] |> Array.map (fun p -> LabeledPoint<float list, string>.create(p, "blue"))
///
/// let labeledPoints = Array.append reds blues
/// let prediction = FSharp.Stats.ML.Unsupervised.KNN.Array.predict FSharp.Stats.DistanceMetrics.euclidean labeledPoints 3
///
/// let color = prediction [3.0; 3.0] // should be: Some "red"
/// let color = prediction [6.0; 6.0] // should be: Some "blue"
/// </code>
/// </example>
let inline predict (distance : Distance<'a>) (labeledPoints: LabeledPoint<'a, 'l> array) (k : int) (x: 'a) : 'l option =
if Array.isEmpty labeledPoints || k <= 0 then
None
elif k = 1 then
Some labeledPoints.[0].label
else
labeledPoints |> Array.sortInPlaceBy (fun lp -> distance lp.p x)

Expand All @@ -49,48 +57,28 @@ module KNN =

Some label

/// <summary>
/// The [k-nearest neighbors algorithm](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm) to classify a new data point into the target class,
/// depending on the features of its neighboring data points. The labeled points are read through a read-only reference.
/// </summary>
/// <remarks>
/// Does not reorder `labeledPoints`: the (label, distance) pairs are built and sorted in fresh arrays.
/// If `k` exceeds the number of labeled points, all points take part in the vote.
/// How ties (equal distances or equal vote counts) are resolved is not specified.
/// </remarks>
/// <param name="distance">the distance function, e.g. `euclidean`</param>
/// <param name="labeledPoints">the array of classified (or labeled) points, used for the classification</param>
/// <param name="k">The number of nearest neighbors from x to look for.</param>
/// <param name="x">The point to classify</param>
/// <returns>The most common label among the k nearest neighbors of x, or None if `labeledPoints` is empty or `k` is not positive.</returns>
let inline predictInRef<'l when 'l: equality and 'l: comparison>
    (distance : Distance<'a>)
    (labeledPoints: inref<LabeledPoint<'a, 'l> array>)
    (k : int)
    (x : 'a)
    : 'l option =

    if Array.isEmpty labeledPoints || k <= 0 then
        None
    else
        // k = 1 needs no special case: it is simply the label of the single nearest point.
        let label =
            labeledPoints
            |> Array.map (fun lp -> lp.label, distance lp.p x)
            |> Array.sortBy snd     // snd = distance value
            |> Array.truncate k     // at most k neighbors; never throws when k > length
            |> Seq.countBy fst      // (label, number of votes)
            |> Seq.maxBy snd        // pick by vote count, not by label ordering
            |> fst

        Some label



module Seq =

let inline predict<'l when 'l: equality and 'l: comparison>
/// <summary>
/// The [k-nearest neighbors algorithm](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm) to classify a new data point into the target class,
/// depending on the features of its neighboring data points.
/// </summary>
/// <param name="distance">the distance function, e.g. `euclidean`</param>
/// <param name="labeledPoints">the sequence of classified (or labeled) points, used for the classification</param>
/// <param name="k">The number of nearest neighbors from x to look for.</param>
/// <param name="x">The point to classify</param>
/// <returns>The most common label from the k nearest neighbors for x.</returns>
/// <example>
/// <code>
/// let points = seq { [ 2.0; 4.0 ]; [ 1.0; 3.0 ]; [ 2.0; 4.0 ]; [ 3.0; 2.0 ]; [ 2.0; 1.0 ]; [ 5.0; 6.0 ]; [ 4.0; 5.0 ]; [ 4.0; 6.0 ]; [ 6.0; 6.0 ]; [ 5.0; 4.0 ] }
/// let labels = seq { "red"; "red"; "red"; "red"; "red"; "blue"; "blue"; "blue"; "blue"; "blue" }
/// let prediction = FSharp.Stats.ML.Unsupervised.KNN.Seq.predict FSharp.Stats.DistanceMetrics.euclidean points labels 3
///
/// let color = prediction [3.0; 3.0] // should be: Some "red"
/// let color = prediction [6.0; 6.0] // should be: Some "blue"
/// </code>
/// </example>
let inline predict<'a, 'l when 'l: equality and 'l: comparison>
(distance : Distance<'a>)
(points : 'a seq)
(labels : 'l seq)
Expand All @@ -100,10 +88,7 @@ module KNN =

if Seq.isEmpty points || Seq.length points <> Seq.length labels || k <= 0 then
None
elif k = 1 then
Some (Seq.head labels)
else

else
let distanceIndices=
points
|> Seq.mapi (fun idx p -> idx, distance p x)
Expand All @@ -121,39 +106,37 @@ module KNN =

Some label

// let inline predict<'l when 'l: equality and 'l: comparison>
// (distance : Distance<'a>)
// (labeledPoints : LabeledPoint<'a, 'l> seq)
// (k : int)
// (x : 'a)
// : 'l option =

// if Seq.isEmpty labeledPoints || k <= 0 then
// None
// elif k = 1 then
// Some (Seq.head labeledPoints).label
// else

// let distanceIndices =
// labeledPoints
// |> Seq.map (fun p -> p, distance p.p x)

// let kNearestNeighborIndices =
// distanceIndices
// |> Seq.sortBy snd // snd = distance value
// |> Seq.take k

// let label =
// kNearestNeighborIndices
// |> Seq.countBy (fun (p, _) -> p.label)
// |> Seq.maxBy fst
// |> fst

// Some label



/// Python Style KNeighborsClassifier

/// <summary>
/// The [k-nearest neighbors algorithm](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm) to classify new data points into their target classes,
/// depending on the features of their neighboring data points.
/// </summary>
/// <remarks>Convenience methods for using `KNN.Array.predict`, similar to `KNeighborsClassifier` in `sklearn.neighbors`.</remarks>
/// <param name="distance">the distance function, e.g. `euclidean`</param>
/// <param name="k">The number of nearest neighbors from x to look for.</param>
/// <returns>The most common label from the k nearest neighbors for x.</returns>
/// <example>
/// <code>
/// let reds = [| [ 2.0; 4.0 ]; [ 1.0; 3.0 ]; [ 2.0; 4.0 ]; [ 3.0; 2.0 ]; [ 2.0; 1.0 ] |]
/// let blues = [| [ 5.0; 6.0 ]; [ 4.0; 5.0 ]; [ 4.0; 6.0 ]; [ 6.0; 6.0 ]; [ 5.0; 4.0 ] |]
///
/// let knnClassifier = KNN.Classifier(FSharp.Stats.DistanceMetrics.euclidean, 3)
///
/// // fit the classifier and predict new points
/// // version 1.
/// let labeledPoints = Map [ "blue", blues; "red", reds ]
/// knnClassifier.fit(labeledPoints)
/// let color = knnClassifier.predict [3.0; 3.0] // should be: Some "red"
/// let colors = knnClassifier.predict [| [3.0; 3.0]; [6.0; 6.0] |] // should be: [| Some "red"; Some "blue" |]
///
/// // version 2.
/// let points = Array.append reds blues
/// let labels = [| "red"; "red"; "red"; "red"; "red"; "blue"; "blue"; "blue"; "blue"; "blue" |]
/// knnClassifier.fit(points, labels)
/// let color = knnClassifier.predict [3.0; 3.0] // should be: Some "red"
/// let colors = knnClassifier.predict [| [3.0; 3.0]; [6.0; 6.0] |] // should be: [| Some "red"; Some "blue" |]
/// </code>
/// </example>
type Classifier<'a, 'l when 'l: equality and 'l: comparison>(distance: Distance<'a>, k: int) =

[<DefaultValue>] val mutable labeledPoints : LabeledPoint<'a, 'l> array
Expand All @@ -179,7 +162,6 @@ module KNN =
points |> Array.map (fun p -> LabeledPoint.create<'a, 'l>(p, label)))
this.labeledPoints <- Seq.toArray lps


member this.predict(x, ?overwriteK) : 'l option =
Array.predict distance this.labeledPoints (defaultArg overwriteK this.K) x

Expand Down
113 changes: 62 additions & 51 deletions tests/FSharp.Stats.Tests/ML.fs
Expand Up @@ -390,20 +390,22 @@ module hClust =
module KNN =
open FSharp.Stats.ML.Unsupervised
open FSharp.Stats.ML.Unsupervised.KNN.Array
open FSharp.Stats.ML.Unsupervised.KNN.Seq
open FSharp.Stats.Vector

[<Tests>]
let knnTests =
testList "KNN Tests" [
testCase "blueVsRedPoints" <| fun () ->
let blues =
testCase "Array.blueVsRedPoints" <| fun () ->
let reds =
[|
[ 2.0; 4.0 ]
[ 1.0; 3.0 ]
[ 2.0; 4.0 ]
[ 3.0; 2.0 ]
[ 2.0; 1.0 ]
|] |> Array.map (fun p -> LabeledPoint<float list, string>.create(p, "red"))
let reds =
let blues =
[|
[ 5.0; 6.0 ]
[ 4.0; 5.0 ]
Expand All @@ -412,8 +414,8 @@ module KNN =
[ 5.0; 4.0 ]
|] |> Array.map (fun p -> LabeledPoint<float list, string>.create(p, "blue"))

let labeledPoints = Array.append blues reds
let prediction = predict FSharp.Stats.DistanceMetrics.euclidean labeledPoints
let labeledPoints = Array.append reds blues
let prediction = KNN.Array.predict FSharp.Stats.DistanceMetrics.euclidean labeledPoints

let predicted = prediction 3 [3.0; 3.0]

Expand All @@ -425,30 +427,63 @@ module KNN =
Expect.isTrue predicted.IsSome "Has Label"
Expect.equal predicted.Value "blue" "label should be blue"

testCase "symmetricallyDistributedPoints" <| fun () ->
let points = Array.init 20 (fun idx -> 0.1 * float idx)

testCase "Seq.blueVsRedPoints" <| fun () ->
let points = seq {
vector [ 2.0; 4.0 ]
vector [ 1.0; 3.0 ]
vector [ 2.0; 4.0 ]
vector [ 3.0; 2.0 ]
vector [ 2.0; 1.0 ]
vector [ 5.0; 6.0 ]
vector [ 4.0; 5.0 ]
vector [ 4.0; 6.0 ]
vector [ 6.0; 6.0 ]
vector [ 5.0; 4.0 ]
}
let labels = seq { "red"; "red"; "red"; "red"; "red"; "blue"; "blue"; "blue"; "blue"; "blue" }
let prediction = KNN.Seq.predict FSharp.Stats.DistanceMetrics.Vector.euclidean points labels 3

let predicted = prediction (vector [3.0; 3.0])

let blues =
points |> Array.map (fun p -> LabeledPoint<float, string>.create(p, "blue"))
let reds =
points |> Array.map (fun p -> LabeledPoint<float, string>.create(-p, "red"))
Expect.isTrue predicted.IsSome "Has Label"
Expect.equal predicted.Value "red" "label should be red"

let labeledPoints = Array.append blues reds
let predicted = prediction (vector [6.0; 6.0])

let distance a b = abs (a - b)
let prediction = KNN.Array.predict distance labeledPoints
Expect.isTrue predicted.IsSome "Has Label"
Expect.equal predicted.Value "blue" "label should be blue"

// '0' is an ambiguous case due to the symmetry; the result may depend on the initial sorting, ...
for sample in 1..100 do
let predicted = prediction 3 (float sample)
Expect.isTrue predicted.IsSome "Has Label"
Expect.equal predicted.Value "blue" "label should be blue"

let predicted = prediction 3 (float -sample)
Expect.isTrue predicted.IsSome "Has Label"
Expect.equal predicted.Value "red" "label should be red"
testCase "KnnClassifier.blueVsRedPoints" <| fun () ->
let knnClassifier = KNN.Classifier(FSharp.Stats.DistanceMetrics.euclidean, 3)

let reds = [| [ 2.0; 4.0 ]; [ 1.0; 3.0 ]; [ 2.0; 4.0 ]; [ 3.0; 2.0 ]; [ 2.0; 1.0 ] |]
let blues = [| [ 5.0; 6.0 ]; [ 4.0; 5.0 ]; [ 4.0; 6.0 ]; [ 6.0; 6.0 ]; [ 5.0; 4.0 ] |]

testCase "symmetricallyDistributedPointsWithClassifier" <| fun () ->
let labeledPoints = Map [ "blue", blues; "red", reds ]
knnClassifier.fit(labeledPoints)

let color = knnClassifier.predict [3.0; 3.0]
Expect.isTrue color.IsSome "Has Label"
Expect.equal color.Value "red" "label should be red"

let color = knnClassifier.predict [6.0; 6.0]
Expect.isTrue color.IsSome "Has Label"
Expect.equal color.Value "blue" "label should be blue"

let points = Array.append reds blues
let labels = [| "red"; "red"; "red"; "red"; "red"; "blue"; "blue"; "blue"; "blue"; "blue" |]
knnClassifier.fit(points, labels)

let color = knnClassifier.predict [3.0; 3.0]
Expect.isTrue color.IsSome "Has Label"
Expect.equal color.Value "red" "label should be red"

let color = knnClassifier.predict [6.0; 6.0]
Expect.isTrue color.IsSome "Has Label"
Expect.equal color.Value "blue" "label should be blue"

testCase "KnnClassifier.1d" <| fun () ->
let points = Array.init 200 (fun idx -> 0.1 * float idx)

let labeledPoints = Map [
Expand All @@ -460,8 +495,8 @@ module KNN =
let knnClassifier = KNN.Classifier(distance, 5)
knnClassifier.fit(labeledPoints)

let positiveSamples = Array.init 100 (fun idx -> float (idx + 1))
let negativeSamples = Array.init 100 (fun idx -> float -(idx + 1))
let positiveSamples = Array.init 300 (fun idx -> float (idx + 1))
let negativeSamples = Array.init 300 (fun idx -> float -(idx + 1))

let positivePredictions = knnClassifier.predict positiveSamples
let negativePredictions = knnClassifier.predict negativeSamples
Expand All @@ -474,30 +509,6 @@ module KNN =

Expect.isTrue negLabel.IsSome "Has Label"
Expect.equal negLabel.Value "red" "label should be red"
)


// testCase "symmetricallyDistributedPointsPARALLEL" <| fun () ->
// let points = Array.init 20 (fun idx -> 0.1 * float idx)

// let blues =
// points |> Array.map (fun p -> LabeledPoint<float, string>.create(p, "blue"))
// let reds =
// points |> Array.map (fun p -> LabeledPoint<float, string>.create(-p, "red"))

// let labeledPoints = Array.append blues reds

// let distance a b = abs (a - b)
// let prediction = KNN.Array.predict distance labeledPoints

// Array.init 200 (fun idx -> 1.0 + float idx * float (sign idx))
// |> Array.Parallel.iter (fun x ->
// let prediction = KNN.Array.Parallel.predictInRef distance &labeledPoints 3 x

// Expect.isTrue prediction.IsSome "Has Label"
// Expect.equal prediction.Value "blue" "label should be blue"
// )


)
]

0 comments on commit 9919c4d

Please sign in to comment.