-
Notifications
You must be signed in to change notification settings - Fork 3
/
InformationTheorySpec.scala
110 lines (94 loc) · 3.13 KB
/
InformationTheorySpec.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
package com.elbauldelprogramador.utils
import com.elbauldelprogramador.BddSpec
import com.elbauldelprogramador.utils.InformationTheory._
import java.util.concurrent.TimeUnit
import org.apache.flink.api.common.restartstrategy.RestartStrategies
import org.apache.flink.api.common.time.Time
import org.apache.flink.api.scala._
import org.apache.flink.ml.common.LabeledVector
import org.apache.flink.ml.math.DenseVector
/**
 * Spec for the functions imported from `InformationTheory` (entropy, conditional
 * entropy, mutual information and symmetrical uncertainty).
 *
 * The fixture is the `/abalone.csv` classpath resource, read as nine numeric
 * columns: the first eight become the feature vector and the ninth the label.
 * Expected values are hard-coded and compared after rounding/truncating to a
 * fixed number of decimals.
 */
class InformationTheorySpec extends BddSpec with Serializable {

  // Local Flink environment used by every test in this spec.
  private val env = ExecutionEnvironment.createLocalEnvironment()
  env.setParallelism(4)
  env.setRestartStrategy(
    RestartStrategies.fixedDelayRestart(
      3, // number of restart attempts
      Time.of(10, TimeUnit.SECONDS) // delay
    ))

  /** Raw rows of the Abalone fixture, one tuple per CSV line. */
  val data = {
    // `getResource` yields null when the resource is absent; fail with a
    // readable message instead of a bare NullPointerException.
    val csv = Option(getClass.getResource("/abalone.csv")).getOrElse(
      throw new IllegalStateException("Test resource /abalone.csv was not found on the classpath"))
    env.readCsvFile[(Int, Double, Double, Double, Double, Double, Double, Double, Int)](csv.getPath)
  }

  /** Rows converted to `LabeledVector`: label = column 8, features = columns 0 to 7. */
  val dataSet = data map { row ⇒
    // Every field is an Int or a Double; widen explicitly so the result is an Array[Double].
    val values = row.productIterator.map {
      case d: Double ⇒ d
      case i: Int ⇒ i.toDouble
    }.toArray
    LabeledVector(values(8), DenseVector(values.take(8)))
  }

  // https://stackoverflow.com/a/11107546/1612432
  /**
   * Rounds `n` to `p` decimal places using `math.round`.
   *
   * @param p number of decimal places to keep
   * @param n value to round
   * @return `n` scaled by 10^p, rounded, and scaled back
   */
  def roundAt(p: Int)(n: Double): Double = {
    val scale = math.pow(10, p)
    math.round(n * scale) / scale
  }

  /**
   * Truncates `n` to `p` decimal places using `math.floor`.
   *
   * The product `n * 10^p` is computed in binary floating point before flooring,
   * so the result inherits any representation error of that product.
   *
   * @param p number of decimal places to keep
   * @param n value to truncate
   * @return `n` scaled by 10^p, floored, and scaled back
   */
  def truncateAt(p: Int)(n: Double): Double = {
    val scale = math.pow(10, p)
    math.floor(n * scale) / scale
  }

  /** Truncates `n` to 2 decimal places. */
  def truncAt2(n: Double): Double = truncateAt(2)(n)

  /** Rounds `n` to 4 decimal places. */
  def roundAt4(n: Double): Double = roundAt(4)(n)

  /** Truncates `n` to 3 decimal places. */
  def truncAt3(n: Double): Double = truncateAt(3)(n)

  /** Rounds `n` to 2 decimal places. */
  def roundAt2(n: Double): Double = roundAt(2)(n)

  /** (label, first feature) pairs for every row. */
  val column0 = dataSet.map(lv ⇒ (lv.label, lv.vector(0)))

  // Materialised locally so the plain-collection functions can be exercised.
  val column0v = column0.collect
  val y = column0v map (_._1)
  val x = column0v map (_._2)

  "Information Theory Spec" - {
    "When computing entropy for the first column of Abalone" - {
      "Should return entropy H(X) equal to 0.9474" in {
        assert(roundAt4(entropy(x)) === 0.9474)
      }
    }

    "When computing conditional entropy on the first column with label H(X|Y)" - {
      "Should be 0.9215" in {
        assert(roundAt4(conditionalEntropy(x, y)) === 0.9215)
      }
    }

    "When computing Mutual Information on the first column with label" - {
      "Should be 0.0259" in {
        assert(roundAt4(mutualInformation(x, y)) === 0.0259)
      }
    }

    "When computing Symmetrical Uncertainty on the first column with label" - {
      "Should be 0.026" in {
        assert(truncAt3(symmetricalUncertainty(column0)) === 0.026)
      }
    }

    "When computing Symmetrical Uncertainty on the whole Abalone Dataset" - {
      """Should be [
0.02,
0.06,
0.07,
0.09,
0.11,
0.08,
0.07,
0.09
]""" in {
        // Expected values go through the same truncation as the computed ones,
        // presumably so both sides share the same floating-point path.
        val expected = List(
          0.02,
          0.06,
          0.07,
          0.09,
          0.11,
          0.08,
          0.07,
          0.09).map(truncAt2)

        // One symmetrical-uncertainty score per feature column (0 to 7).
        val su = (0 until 8) map { i ⇒
          symmetricalUncertainty(dataSet.map(lv ⇒ (lv.label, lv.vector(i))))
        }

        assert(su.map(truncAt2) === expected)
      }
    }
  }
}