-
Notifications
You must be signed in to change notification settings - Fork 54
/
Distributions.fsx
1013 lines (739 loc) · 29.3 KB
/
Distributions.fsx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
(**
---
title: Probability distributions
index: 1
category: Documentation
categoryindex: 0
---
*)
(*** hide ***)
(*** condition: prepare ***)
#I "../src/FSharp.Stats/bin/Release/netstandard2.0/"
#r "FSharp.Stats.dll"
#r "nuget: Plotly.NET, 4.0.0"
Plotly.NET.Defaults.DefaultDisplayOptions <-
Plotly.NET.DisplayOptions.init (PlotlyJSReference = Plotly.NET.PlotlyJSReference.NoReference)
(*** condition: ipynb ***)
#if IPYNB
#r "nuget: Plotly.NET, 4.0.0"
#r "nuget: Plotly.NET.Interactive, 4.0.0"
#r "nuget: FSharp.Stats"
#endif // IPYNB
(**
# Probability Distributions
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/fslaborg/FSharp.Stats/gh-pages?urlpath=/tree/home/jovyan/Distributions.ipynb)
[![Notebook]({{root}}img/badge-notebook.svg)]({{root}}{{fsdocs-source-basename}}.ipynb)
_Summary:_ this tutorial shows how to use the various types of probability distributions in FSharp.Stats.
### Table of contents
- [Continuous](#Continuous)
- [Normal distribution](#Normal-distribution)
- [Multivariate normal distribution](#Multivariate-normal-distribution)
- [F distribution](#F-distribution)
- [Discrete](#Discrete)
- [Bernoulli distribution](#Bernoulli-distribution)
- [Binomial distribution](#Binomial-distribution)
- [Hypergerometric distribution](#Hypergerometric-distribution)
- [Poisson distribution](#Poisson-distribution)
- [Gamma distribution](#Gamma-distribution)
- [Negative binomial distribution](#Negative-binomial-distribution)
- [Empirical](#Empirical)
- [Density estimation](#Density-estimation)
- [Distance](#Distance)
FSharp.Stats provides a wide range of probability distributions. Given the
distribution parameters they can be used to investigate their statistical properties
or to sample non-uniform random numbers.
For every distribution the probability density function (PDF) and cumulative probability function (CDF) can be accessed.
By using the PDF you can access the probability for exactly X=k success states. The CDF is used when the cumulative probabilities of X<=k is required.
## Continuous
### Normal distribution
The normal or Gaussian distribution is a very common continuous probability distribution.
Due to the central limit theorem, randomly sampled means of a random and independent distribution tend to approximate a normal distribution
It describes the probability, that under a given mean and
a given dispersion (standard deviation) an event occurs exactly k times.
It is defined by two parameters N(mu,tau):
- mu = mean
- tau = standard deviation
Example: The distribution of bread weights of a local manufacturer follows a normal distribution with mean 500 g and a standard
deviation of 20 g.
NormA: What is the probability of bread weights to be lower than 470 g?
NormB: What is the probability of bread weights to be higher than 505 g?
NormC: Sample independently 10 values from the normal distribution and calculate their mean.
*)
open FSharp.Stats
open FSharp.Stats.Distributions
// Creates a normal distribution with µ = 500 and tau = 20
let normal = ContinuousDistribution.normal 500. 20.
// NormA: What is the probability of bread weights to be equal or lower than 470 g?
let normA = normal.CDF 470.
// Output: 0.06681 = 6.68 %
// NormB: What is the probability of bread weights to be higher than 505 g?
let normB = 1. - (normal.CDF 505.)
// Output: 0.401294 = 40.13 %
// NormC: Sample independently 10 values from the normal distribution and calculate their mean.
let normC =
Seq.init 10 (fun _ -> normal.Sample())
|> Seq.mean
(*** include-value:normC ***)
// Set a seed so that sampling is reproducible
let seed = 1
Random.SetSampleGenerator(Random.RandThreadSafe(seed))
List.init 3 (fun _ -> normal.Sample())
(*** include-it ***)
Random.SetSampleGenerator(Random.RandThreadSafe(seed))
List.init 3 (fun _ -> normal.Sample())
(*** include-it ***)
// Get back to unseeded sampling
Random.SetSampleGenerator(Random.RandThreadSafe())
List.init 3 (fun _ -> normal.Sample())
(*** include-it ***)
open Plotly.NET
let plotNormal =
[400. .. 600.]
|> List.map (fun x -> x,normal.PDF x)
|> Chart.Area
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withTitle "N(500,20) PDF"
(*** condition: ipynb ***)
#if IPYNB
plotNormal
#endif // IPYNB
(***hide***)
plotNormal |> GenericChart.toChartHTML
(***include-it-raw***)
let plotNormalCDF =
[400. .. 600.]
|> List.map (fun x -> x,normal.CDF x)
|> Chart.Area
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withTitle "N(500,20) CDF"
(*** condition: ipynb ***)
#if IPYNB
plotNormalCDF
#endif // IPYNB
(***hide***)
plotNormalCDF |> GenericChart.toChartHTML
(***include-it-raw***)
(**
### Multivariate normal distribution
Multivariate normal distributions are initialized with a mean vector and a covariance matrix.
*)
let mvn = ContinuousDistribution.multivariateNormal (vector [-1.;5.]) (matrix [[0.5;1.];[0.25;1.2]])
let axisXRange = [-5. .. 0.2 .. 5.]
let axisYRange = [ 0. .. 0.2 .. 10.]
// probability density function
let mvnPdfs =
axisYRange |> List.map (fun y ->
axisXRange
|> List.map (fun x ->
mvn.PDF (vector [x;y])
)
)
let mvnSamples =
Array.init 1000 (fun _ -> mvn.Sample())
let surface = Chart.Surface(mvnPdfs,axisXRange,axisYRange)
let samples =
mvnSamples
|> Array.map (fun t -> t.[0],t.[1])
|> Array.unzip
|> fun (x,y) -> Chart.Scatter3D(x,y,Array.init x.Length (fun _ -> Random.rndgen.NextFloat() / 3.),StyleParam.Mode.Markers)
let mvnChart =
[surface;samples]
|> Chart.combine
|> Chart.withTitle "Bivariate normal distribution with sampling"
(*** condition: ipynb ***)
#if IPYNB
mvnChart
#endif // IPYNB
(***hide***)
mvnChart |> GenericChart.toChartHTML
(***include-it-raw***)
(**
###Students t distribution
*)
let studentTParams = [(0.,1.,1.);(0.,1.,2.);(0.,1.,5.);]
let xStudentT = [-10. ..0.1.. 10.]
let pdfStudentT mu tau dof =
xStudentT
|> List.map (Continuous.StudentT.PDF mu tau dof)
|> List.zip xStudentT
let cdfStudentT mu tau dof =
xStudentT
|> List.map (Continuous.StudentT.CDF mu tau dof)
|> List.zip xStudentT
let v =
studentTParams
|> List.map (fun (mu,tau,dof) -> Chart.Spline(pdfStudentT mu tau dof,Name=sprintf "mu=%.1f tau=%.1f dof=%.1f" mu tau dof,ShowMarkers=false))
|> Chart.combine
|> Chart.withTemplate ChartTemplates.lightMirrored
(*** condition: ipynb ***)
#if IPYNB
v
#endif // IPYNB
(***hide***)
v |> GenericChart.toChartHTML
(***include-it-raw***)
(**
### F distribution
The F distribution or Fisher distribution, also known as Fisher-Snedecor distribution, is a continuous probability distribution.
An F-distributed random variable results from the quotient of two Chi-square-distributed random variables each divided by the associated number of degrees of freedom.
The F-distribution has two independent degrees of freedom(dof) as parameters, and thus forms a two-parameter distribution family.
Generally speaking, the F-tests and the resulting F-Distribution is utilized for comparing multiple levels of independent variables with multiple groups.
In practice, it is most commonly used to compare the variances within a group to the variance between different groups, as seen in the Analysis of varaince.
*)
let fParams = [(2.,1.);(5.,2.);(10.,1.);(100.,100.)]
let xF = [0. .. 1. .. 5.]
let pdfF a b =
xF
|> List.map (Continuous.F.PDF a b)
|> List.zip xF
let fPDFs =
fParams
|> List.map (fun (a,b) -> Chart.Line(pdfF a b,Name=sprintf "dof1=%.1f dof2=%.1f" a b,LineWidth=3.) )
|> Chart.combine
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withTitle "Different F-Distributions PDFs, x=[0,5]"
(*** condition: ipynb ***)
#if IPYNB
fPDFs
#endif // IPYNB
(***hide***)
fPDFs |> GenericChart.toChartHTML
(***include-it-raw***)
(**
*)
let cdfF a b =
xF
|> List.map (Continuous.F.CDF a b)
|> List.zip xF
let fCDFs =
fParams
|> List.map (fun (a,b) -> Chart.Line(cdfF a b,Name=sprintf "dof1=%.1f dof2=%.1f" a b,LineWidth=3.) )
|> Chart.combine
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withTitle "Different F-Distributions CDFs, x=[0,5]"
(*** condition: ipynb ***)
#if IPYNB
fCDFs
#endif // IPYNB
(***hide***)
fCDFs |> GenericChart.toChartHTML
(***include-it-raw***)
(**
## Discrete
### Bernoulli distribution
> A Bernoulli distribution is a (..) random experiment that has only two outcomes
(usually called a "Success" or a "Failure"). For example, the probability of getting a heads (a "success") while flipping a coin is 0.5.
The probability of "failure" is 1 – P (1 minus the probability of success, which also equals 0.5 for a coin toss). It is a special case of the
binomial distribution for n = 1. In other words, it is a [binomial distribution](#Binomial-distribution) with a single trial (e.g. a single coin toss).
<br>_~ by [statisticshowto](https://www.statisticshowto.com/bernoulli-distribution/)_
Mathematically, "success" and "failure" are represented as 1.0 and 0.0, respectively.
It is defined by one parameter B(p):
- p = probability of success
Example: A weighted coin with a probability of 0.6 to land on tails. Most bernoulli distribution calculations are rather intuitive:
BernA: What is the mean of a bernoulli distribution with the weighted coin?
BernB: What is the probability to land on heads?
*)
open FSharp.Stats
open FSharp.Stats.Distributions
// Assumes "tails" to be success
let bernoulli = DiscreteDistribution.bernoulli 0.6
// BernA: What is the mean of a bernoulli distribution with the weighted coin?
let bernA = bernoulli.Mean
// Output: 0.6
// Altough the bernoulli distribution can never return 0.6 (only 0.0 or 1.0) on average it will return heads at the same probability it has to land on heads.
// BernB: What is the probability to land on heads?
let bernB = bernoulli.PMF 0
// Output: 0.4
// Again: Heads = 0.0 = failure and tails = 1.0 = success.
let plotBernoulli =
[0; 1]
|> List.map (fun x -> x, bernoulli.PMF x)
|> Chart.Column
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withTitle "B(0.6)"
(*** condition: ipynb ***)
#if IPYNB
plotBernoulli
#endif // IPYNB
(***hide***)
plotBernoulli |> GenericChart.toChartHTML
(***include-it-raw***)
(**
### Binomial distribution
The binomial distribution describes the probability, that under a given success probability and
a given number of draws an event occurs exactly k times (with replacement).
It is defined by two parameters B(n,p):
- n = number of draws
- p = probability of success
Example: The school bus is late with a probability of 0.10.
BinoA: What is the probability of running late exactly 5 times during a 30 day month?
BinoB: What is the probability of running late for a maximum of 5 times?
BinoC: What is the probability of running late for at least 5 times?
*)
open FSharp.Stats
open FSharp.Stats.Distributions
// Creates a binomial distribution with n=30 and p=0.90
let binomial = DiscreteDistribution.binomial 0.1 30
// BinoA: What is the probability of running late exactly 5 times during a 30 day month?
let binoA = binomial.PMF 5
// Output: 0.1023 = 10.23 %
// BinoB: What is the probability of running late for a maximum of 5 times?
let binoB = binomial.CDF 5.
// Output: 0.9268 = 92.68 %
// BinoC: What is the probability of running late for at least 5 times?
let binoC = 1. - (binomial.CDF 4.)
// Output: 0.1755 = 17.55 %
let plotBinomial =
[0..30]
|> List.map (fun x -> x,binomial.PMF x)
|> Chart.Column
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withTitle "B(30,0.1)"
(*** condition: ipynb ***)
#if IPYNB
plotBinomial
#endif // IPYNB
(***hide***)
plotBinomial |> GenericChart.toChartHTML
(***include-it-raw***)
(**
### Hypergerometric distribution
The hypergeometric distribution describes the probability, that under a given number of success and failure events and
a given number of draws an event occurs exactly k times (without replacement).
It is defined by three parameters:
- N = finite population representing the total number of events.
- K = number of success events in this population.
- n = number of draws from the population.
Example: You participate in a lottery, where you have to choose 6 numbers out of 49. The lottery queen draws 6 numbers randomly,
where the order does not matter.
HypA: What is the probability that your 6 numbers are right?
HypB: How to simulate artificial draws from the distribution?
HypC: What is the probability that you have at least 3 right ones?
HypD: What is the probability that you have a maximum of 3 right ones?
*)
// Creates a hypergeometric distribution with N=49, K=6, n=6.
let hyper = DiscreteDistribution.hypergeometric 49 6 6
// HypA: What is the probability that your 6 numbers are right?
let hypA = hyper.PMF 6
// Output: 7.15-08
// HypB: How to simulate artificial draws from the distribution?
let hypB = hyper.Sample()
// Output: Number of success events randomly sampled from the distribution.
// HypC: What is the probability that you have at least 3 right ones?
// CDF is implemented to calculate P(X <= k)
let hypC = 1. - hyper.CDF 2
// Output: 0.01864 = 1.86 %
// HypD: What is the probability that you have a maximum of 3 right ones?
let hypD = hyper.CDF 3
// Output: 0.99901 = 99.90 %
(***hide***)
let plotHyper =
[0..6]
|> List.map (fun x -> x,hyper.PMF x)
|> Chart.Column
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withTitle "N=49, K=6, n=6"
(*** condition: ipynb ***)
#if IPYNB
plotHyper
#endif // IPYNB
(***hide***)
plotHyper |> GenericChart.toChartHTML
(***include-it-raw***)
(**
### Poisson distribution
The poisson distribution describes what the probability for a number of events is, occurring in a certain time, area, or volume.
It is defined by just one parameters Po(lambda) where lambda is the average rate.
Example: During a year, a forest is struck by a lightning 5.5 times.
PoA: What is the probability that the lightning strikes exactly 3 times?
PoB: What is the probability that the lightning strikes less than 2 times?
PoC: What is the probability that the lightning strikes more than 7 times?
*)
// Creates a poisson distribution with lambda= .
let poisson = DiscreteDistribution.poisson 5.5
(*** do-not-eval ***)
// PoA: What is the probability that the lightning strikes exactly 3 times?
let poA = poisson.PMF 3
// Output: 0.11332 = 11.33 %
// PoB: What is the probability that the lightning strikes less or equal to 2 times?
let poB =
// CDF not implemented yet
//poisson.CDF 2.
[0 .. 2] |> List.sumBy poisson.PMF
// Output: 0.088376 = 8.84 %
// PoC: What is the probability that the lightning strikes more than 7 times?
let poC = 1. - poisson.CDF 6.
// Output: Not implemented yet. Manual addition necessary
(** *)
let plotPo =
[0..20]
|> List.map (fun x -> x,poisson.PMF x)
|> Chart.Column
|> Chart.withTemplate ChartTemplates.lightMirrored
//|> Chart.withSize(600.,400.)
|> Chart.withTitle "Po(5.5)"
(*** condition: ipynb ***)
#if IPYNB
plotPo
#endif // IPYNB
(***hide***)
plotPo |> GenericChart.toChartHTML
(***include-it-raw***)
(**
### Gamma distribution
*)
let gammaParams = [(1.,2.);(2.,2.);(3.,2.);(5.,1.);(9.0,0.5);(7.5,1.);(0.5,1.);]
let xgamma = [0. ..0.1.. 20.]
let pdfGamma a b =
xgamma
|> List.map (Continuous.Gamma.PDF a b)
|> List.zip xgamma
let gammaPlot =
gammaParams
|> List.map (fun (a,b) -> Chart.Point(pdfGamma a b,Name=sprintf "a=%.1f b=%.1f" a b) )//,ShowMarkers=false))
|> Chart.combine
|> Chart.withTemplate ChartTemplates.lightMirrored
(*** condition: ipynb ***)
#if IPYNB
gammaPlot
#endif // IPYNB
(***hide***)
gammaPlot |> GenericChart.toChartHTML
(***include-it-raw***)
let cdfGamma a b =
xgamma
|> List.map (Continuous.Gamma.CDF a b)
|> List.zip xgamma
let gammaCDFPlot=
gammaParams
|> List.map (fun (a,b) -> Chart.Spline(cdfGamma a b,Name=sprintf "a=%.1f b=%.1f" a b) )//,ShowMarkers=false))
|> Chart.combine
|> Chart.withTemplate ChartTemplates.lightMirrored
(*** condition: ipynb ***)
#if IPYNB
gammaCDFPlot
#endif // IPYNB
(***hide***)
gammaCDFPlot |> GenericChart.toChartHTML
(***include-it-raw***)
(**
### Negative binomial distribution
_Note_: There is no unique definition of the negative binomial distribution. In definition _A_ the random variable `x` is the number of trials, while in definition _B_
the random variable `x` is the number of failures. The definition of `r` (number of successes) and `p` (probability of each bernoulli trial) is the same for both definitions.
In FSharp.Stats both definitions are implemented. The number of trials (A) can be modelled with `negativeBinomial_trials`, while the number of failures can be modelled with `negativeBinomial_failures`.
For further discussion about this issue check out [#256](https://github.com/fslaborg/FSharp.Stats/issues/256).
#### A - Negative binomial distribution (trials)
The distribution models the number of trials needed (`x`) to get the `r`<sup>th</sup> success in repeated independent Bernoulli trials.
Until the (x-1)<sup>th</sup> trial, (r-1) successes must be accomplished, which matches the standard binomial distribution.
Therefore, to get the `r`<sup>th</sup> success in the `x`<sup>th</sup> trial, you have to multiply `Binom(p,x-1,r-1)` by `p`.
#### B - Negative binomial distribution (faiures)
The distribution models the number of failures (`k`) prior to the `r`<sup>th</sup> success in repeated independent Bernoulli trials.
Until the (k+r-1)<sup>th</sup> trial, (r-1) successes must be accomplished, which matches the standard binomial distribution.
Therefore, to get the `r`<sup>th</sup> success after `k`<sup>th</sup> failures, you have to multiply `Binom(p,k+r-1,r-1)` by `p`.
The PDF and CDF of both distributions can be converted into each other by:
```
(negativeBinomial_trials r p).PMF x = (negativeBinomial_failures r p).PMF (x-r)
(negativeBinomial_failures r p).PMF k = (negativeBinomial_trials r p).PMF (k+r)
(negativeBinomial_trials r p).CDF x = (negativeBinomial_failures r p).CDF (x-r)
(negativeBinomial_failures r p).CDF k = (negativeBinomial_trials r p).CDF (k+r)
```
> Task1:
> What is the probability of obtaining the third success on the 10 trial. The individual success probability is 0.09.
*)
//number of trials
let x = 10
//probability of the independent bernoulli trials
let p = 0.09
//number of successes
let r = 3
// number of failures
let k = x - 3
//Solution A:
let negB_A = Discrete.NegativeBinomial_trials.PMF r p x
//returns 0.01356187619
//Solution B:
let negB_B = Discrete.NegativeBinomial_failures.PMF r p k
//returns 0.01356187619
(**
**Comparison of PMF**
*)
let negBinom_trials =
Distributions.DiscreteDistribution.negativeBinomial_trials 3 0.3
let negBinom_failures =
Distributions.DiscreteDistribution.negativeBinomial_failures 3 0.3
negBinom_trials.CDF 1
let pmfComparison =
let pmfs_trials = [0..30] |> List.map (fun x -> x,negBinom_trials.PMF x)
let pmfs_failures = [0..30] |> List.map (fun x -> x,negBinom_failures.PMF x)
[
Chart.Column (pmfs_trials, Opacity=0.5, Name= "trials (x)")
Chart.Column (pmfs_failures, Opacity=0.5, Name= "failures (k)")
]
|> Chart.combine
|> Chart.withLayoutStyle(BarMode=StyleParam.BarMode.Overlay)
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withTitle "PMF"
|> Chart.withXAxisStyle "x or k"
|> Chart.withYAxisStyle "P(X=x) or P(X=k)"
(*** condition: ipynb ***)
#if IPYNB
pmfComparison
#endif // IPYNB
(***hide***)
pmfComparison |> GenericChart.toChartHTML
(***include-it-raw***)
(**
You can clearly see, that the PMF distributions are shifted according to the number of successes because: $failures (k) = trials (x) - successes (r)$.
**Comparison of CDF**
*)
let cdfComparison =
let cdfs_trials = [0..30] |> List.map (fun x -> x,negBinom_trials.CDF x)
let cdfs_failures = [0..30] |> List.map (fun x -> x,negBinom_failures.CDF x)
[
Chart.Column (cdfs_trials, Opacity=0.5, Name= "trials (x)")
Chart.Column (cdfs_failures, Opacity=0.5, Name= "failures (k)")
]
|> Chart.combine
|> Chart.withLayoutStyle(BarMode=StyleParam.BarMode.Overlay)
|> Chart.withTemplate ChartTemplates.lightMirrored
|> Chart.withTitle "CDF"
|> Chart.withXAxisStyle "x or k"
|> Chart.withYAxisStyle "P(X=x) or P(X=k)"
(*** condition: ipynb ***)
#if IPYNB
cdfComparison
#endif // IPYNB
(***hide***)
cdfComparison |> GenericChart.toChartHTML
(***include-it-raw***)
(**
You can clearly see, that the CDF distributions are shifted according to the number of successes because: $failures (k) = trials (x) - successes (r)$.
## Empirical
You can create empirically derived distributions and sample randomly from these.
In this example 100,000 random samples from a student t distribution are drawn and visualized
*)
// Randomly taken samples; in this case from a gaussian normal distribution.
let sampleDistribution =
let template = ContinuousDistribution.normal 5. 2.
Seq.init 100000 (fun _ -> template.Sample())
//creates an empirical distribution with bandwidth 0.1
let empiricalDistribution =
EmpiricalDistribution.create 0.1 sampleDistribution
(***hide***)
let plotEmpirical =
empiricalDistribution
|> Empirical.getZip
|> Chart.Column
|> Chart.withTemplate ChartTemplates.lightMirrored
(*** condition: ipynb ***)
#if IPYNB
plotEmpirical
#endif // IPYNB
(***hide***)
plotEmpirical |> GenericChart.toChartHTML
(***include-it-raw***)
(**
### Categorical data
You also can create probability mass functions (PMF) from categorical (nominal) data.
You can check for just the elements present in your input sequence, or include elements of a predefined set.
*)
let myText =
"Hello World, I am a test Text with all kind of Characters!112"
// Define your set of characters that should be checked for
// Any character that is not present in these sets is neglected
let mySmallAlphabet = "abcdefghijklmnopqrstuvwxyz" |> Set.ofSeq
let myAlphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" |> Set.ofSeq
let myAlphabetNum = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" |> Set.ofSeq
(**
These alphabets can be used to create the probability maps.
*)
// takes the characters and determines their probabilities without considering non-existing characters
let myFrequencies0 = EmpiricalDistribution.createNominal() myText
// takes upper and lower case characters and determines their probability
let myFrequencies1 = EmpiricalDistribution.createNominal(Template=myAlphabet) myText
// takes upper and lower case characters and numbers and determines their probability
let myFrequencies2 = EmpiricalDistribution.createNominal(Template=myAlphabetNum) myText
// takes only lower case characters and determines their probability
// The big M is neglected because it is not present in the template alphabet.
let myFrequencies3 = EmpiricalDistribution.createNominal(Template=mySmallAlphabet) myText
(**
An additional field for transforming the input sequence may be beneficial if it does not matter if an character is lower case or upper case:
*)
// converts all characters to lower case characters and determines their probability
let myFrequencies4 = EmpiricalDistribution.createNominal(Template=mySmallAlphabet,Transform=System.Char.ToLower) myText
(**
With a template set, you can access the probability of 'z' even if it is not in your original input sequence.
*)
myFrequencies3.['z'] //results in 0.0
(**
#### Visualization of the PMF
*)
let categoricalDistribution =
[
Chart.Column(myFrequencies0 |> Map.toArray,"0_noTemplate") |> Chart.withYAxisStyle "probability"
Chart.Column(myFrequencies1 |> Map.toArray,"1_bigAlphabet") |> Chart.withYAxisStyle "probability"
Chart.Column(myFrequencies2 |> Map.toArray,"2_numAlphabet") |> Chart.withYAxisStyle "probability"
Chart.Column(myFrequencies3 |> Map.toArray,"3_smallAlphabet") |> Chart.withYAxisStyle "probability"
Chart.Column(myFrequencies4 |> Map.toArray,"4_toLower + smallAlphabet") |> Chart.withYAxisStyle "probability"
]
|> Chart.Grid(4,1)
|> Chart.withTemplate ChartTemplates.lightMirrored
(*** condition: ipynb ***)
#if IPYNB
categoricalDistribution
#endif // IPYNB
(***hide***)
categoricalDistribution |> GenericChart.toChartHTML
(***include-it-raw***)
(**
### Distribution merging
You can merge two distributions by using `Empirical.merge`, subroutines like `Empirical.add`, or the generic function `Empirical.mergeBy`.
Merging two distributions leads to a combined distribution. If keys are present in both distributions the value at `distA` is superseded with
the value at `distB`.
Please note, that when handling continuous data, the binning of both input distributions must be identical! When using categorical data,
the binning does not matter and the parameter can be set to `true`.
*)
let a =
[("k1",1);("k2",3)]
|> Map.ofList
let b =
[("k2",2);("k3",4)]
|> Map.ofList
let mergedDist = Empirical.merge true a b
(*** condition: ipynb ***)
#if IPYNB
mergedDist
#endif // IPYNB
(***hide***)
(sprintf "mergeDist = %A" mergedDist)
(***include-it-raw***)
(**
Adding two distributions leads to a combined distribution. If keys are present in both distributions the values at `distA` and `distB` are added.
*)
let addedDist = Empirical.add true a b
(*** condition: ipynb ***)
#if IPYNB
addedDist
#endif // IPYNB
(***hide***)
(sprintf "addedDist = %A" addedDist)
(***include-it-raw***)
(**
A custom merging function can be defined:
*)
let customDist = Empirical.mergeBy true (fun valueA valueB -> valueA * valueB) a b
(*** condition: ipynb ***)
#if IPYNB
customDist
#endif // IPYNB
(***hide***)
(sprintf "customDist = %A" customDist)
(***include-it-raw***)
(**
## Density estimation
*)
let nv = Array.init 1000 (fun _ -> Distributions.Continuous.Normal.Sample 5. 2.)
let xy = KernelDensity.estimate KernelDensity.Kernel.gaussian 1.0 nv
let kernelChart =
Chart.SplineArea xy
|> Chart.withTemplate ChartTemplates.lightMirrored
(*** condition: ipynb ***)
#if IPYNB
kernelChart
#endif // IPYNB
(***hide***)
kernelChart |> GenericChart.toChartHTML
(***include-it-raw***)
(**
## Distance
In this example we will calculate the Wasserstein Metric between 3 distributions. One can imagine this metric as the amount of work needed to move the area (pile of dirt) of one distribution to the area of the other. That's why it's also called Earth Movers Distance.
Be aware that this distance measure is dependent on the actual absolute values of the distributions.
*)
let distribution1 =
let normal = ContinuousDistribution.normal 300. 15.
Array.init 1000 (fun _ -> normal.Sample())
let distribution2 =
let normal = ContinuousDistribution.normal 350. 20.
Array.init 500 (fun _ -> normal.Sample())
let distribution3 =
let normal = ContinuousDistribution.normal 500. 20.
Array.init 1000 (fun _ -> normal.Sample())
let pilesOfDirt =
[
Chart.Histogram(distribution1,Name = "Distribution1")
Chart.Histogram(distribution2,Name = "Distribution2")
Chart.Histogram(distribution3,Name = "Distribution3")
]
|> Chart.combine
(*** condition: ipynb ***)
#if IPYNB
pilesOfDirt
#endif // IPYNB
(***hide***)
pilesOfDirt |> GenericChart.toChartHTML
(***include-it-raw***)
let distance1and2 = Distance.OneDimensional.wassersteinDistance distribution1 distribution2
(***include-value: distance1and2 ***)
let distance1and3 = Distance.OneDimensional.wassersteinDistance distribution1 distribution3
(***include-value: distance1and3 ***)
(**
As expected the distance between Distribution 1 and 2 is the lowest:
*)
let distributions =
[|distribution1;distribution2;distribution3|]
let mapColor min max value =
let proportionR =
((255. - 200.) * (value - min) / (max - min))
|> int
|> fun x -> 255 - x
let proportionG =
((255. - 200.) * (value - min) / (max - min))
|> int
|> fun x -> 255 - x
let proportionB =
((255. - 200.) * (value - min) / (max - min))
|> int
|> fun x -> 255 - x
Color.fromARGB 1 proportionR proportionG proportionB
let distancesTable =
let headerColors = ["white";"#1f77b4";"#ff7f0e";"#2ca02c"] |> List.map Color.fromString |> Color.fromColors
let distances =
distributions
|> Array.map (fun x ->
distributions
|> Array.map (fun y ->
Distance.OneDimensional.wassersteinDistance x y
|> (sprintf "%.2f")
)
)
|> Array.transpose
|> Array.append [|[|"Distribution1";"Distribution2";"Distribution3"|]|]
|> Array.transpose
let cellColors =
distances
|> Array.mapi (fun i a ->
a
|> Array.mapi (fun j v ->
if j = 0 then
if i = 0 then Color.fromHex "#1f77b4"
elif i = 1 then Color.fromHex "#ff7f0e"
else Color.fromHex "#2ca02c"
else
mapColor 0. 200. (float v)
)
)
|> Array.transpose
|> Seq.map Color.fromColors
|> Color.fromColors
Chart.Table(