# Data Science from Scratch - In Scala

In [1]:
import $ivy.`org.apache.spark::spark-sql:2.4.0` // Or use any other 2.x version here
import $ivy.`sh.almond::almond-spark:0.8.2` // Not required since almond 0.7.0 (will be automatically added when importing spark)
import $ivy.`org.scalanlp::breeze:1.0`
import $ivy.`org.typelevel::spire:0.14.1`

Downloading https://repo1.maven.org/maven2/org/typelevel/spire_2.12/0.14.1/spire_2.12-0.14.1.pom
Downloaded https://repo1.maven.org/maven2/org/typelevel/spire_2.12/0.14.1/spire_2.12-0.14.1.pom
Downloading https://repo1.maven.org/maven2/org/scala-lang/scala-library/2.12.1/scala-library-2.12.1.pom
Downloading https://repo1.maven.org/maven2/org/typelevel/spire-macros_2.12/0.14.1/spire-macros_2.12-0.14.1.pom
Downloading https://repo1.maven.org/maven2/org/typelevel/machinist_2.12/0.6.1/machinist_2.12-0.6.1.pom
Downloading https://repo1.maven.org/maven2/org/typelevel/algebra_2.12/0.7.0/algebra_2.12-0.7.0.pom
Downloaded https://repo1.maven.org/maven2/org/scala-lang/scala-library/2.12.1/scala-library-2.12.1.pom
Downloaded https://repo1.maven.org/maven2/org/typelevel/machinist_2.12/0.6.1/machinist_2.12-0.6.1.pom
Downloaded https://repo1.maven.org/maven2/org/typelevel/spire-macros_2.12/0.14.1/spire-macros_2.12-0.14.1.pom
Downloaded https://repo1.maven.org/maven2/org/typelevel/algebra_2.12/0.7.0/algebra_2.12-0.7.0.pom

import $ivy.$                                   // Or use any other 2.x version here
import $ivy.$                               // Not required since almond 0.7.0 (will be automatically added when importing spark)
import $ivy.$
import $ivy.$

In [2]:
import breeze.linalg._
import breeze.numerics._
import spire.implicits._
import spire.math._

import breeze.linalg._
import breeze.numerics._
import spire.implicits._
import spire.math._

---

## Chapter 5: Statistics

### Central Tendencies

In [55]:
// Sample data (20 observations) used by the example cells in this chapter.
val x = DenseVector[Double](
    1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 4.0, 3.0, 2.0, 2.0,
    1.0, 3.0, 3.0, 3.0, 3.0, 4.0, 3.0, 3.0, 2.0, 2.0
)

[36mx[39m: [32mDenseVector[39m[[32mDouble[39m] = DenseVector(1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 4.0, 3.0, 2.0, 2.0, 1.0, 3.0, 3.0, 3.0, 3.0, 4.0, 3.0, 3.0, 2.0, 2.0)

In [56]:
// Most frequent value of x paired with its number of occurrences:
// count each distinct value, sort ascending by count, take the last entry.
x.toArray
    .groupBy(identity)
    .mapValues(_.length)
    .toSeq
    .sortBy(_._2)
    .reverse(0)

[36mres55[39m: ([32mDouble[39m, [32mInt[39m) = ([32m3.0[39m, [32m8[39m)

In [57]:
/** Implicit conversion from `Int` to `Boolean`: zero maps to `false`, every other value to `true`. */
implicit def int2bool(i: Int): Boolean = i != 0

/**
 * Arithmetic mean of `xs`: the sum of its elements divided by its length.
 *
 * @param xs data vector
 * @return the sum of `xs` divided by the number of elements
 */
def mean(xs: DenseVector[Double]): Double = {
    val total = xs.sum
    val count = xs.length
    total / count
}

/**
 * Median of `xs`.
 *
 * The data is sorted ascending. For an odd number of elements the middle element is
 * returned; for an even number, the average of the two middle elements.
 *
 * @param xs non-empty data vector
 * @return the median value
 * @throws IllegalArgumentException if `xs` is empty
 */
def median(xs: DenseVector[Double]): Double = {
    val sortedXs = xs.toArray.sorted
    val n = sortedXs.length
    require(n > 0, "median of an empty vector is undefined")
    if (n % 2 == 1) {
        // Odd count: a single middle element exists.
        sortedXs(n / 2)
    } else {
        // Even count: the two middle elements sit at indices n/2 - 1 and n/2.
        val midHi = n / 2
        val midLo = midHi - 1
        // Average the element VALUES at those positions, not the positions themselves.
        (sortedXs(midLo) + sortedXs(midHi)) / 2.0
    }
}

/**
 * Value of `xs` at quantile `p`: the element at index `floor(n * p)` of the data sorted
 * ascending, where `n` is the number of elements.
 *
 * @param xs non-empty data vector
 * @param p  fraction in `[0, 1]`
 * @return the selected element; for `p == 1.0` this is the largest element
 * @throws IllegalArgumentException if `xs` is empty or `p` lies outside `[0, 1]`
 */
def quantile(xs: DenseVector[Double], p: Double): Double = {
    require(xs.length > 0, "quantile of an empty vector is undefined")
    require(p >= 0.0 && p <= 1.0, s"p must be within [0, 1], got $p")
    val sortedXs = xs.toArray.sorted
    val rawIdx = (xs.length * p).floor.toInt
    // floor(n * p) equals n when p == 1.0, which is one past the last valid index;
    // clamp it so the top quantile yields the maximum instead of failing.
    val pIdx = if (rawIdx >= sortedXs.length) sortedXs.length - 1 else rawIdx
    sortedXs(pIdx)
}

/**
 * Most frequently occurring value in `xs`.
 *
 * Counts the occurrences of every distinct value, orders the (value, count) pairs by
 * ascending count and returns the value of the last pair.
 *
 * @param xs data vector
 * @return the value with the highest occurrence count
 */
def mode(xs: DenseVector[Double]): Double = {
    val occurrences = xs.toArray.groupBy(identity).mapValues(group => group.length)
    val byAscendingCount = occurrences.toSeq.sortBy { case (_, count) => count }
    val (mostFrequent, _) = byAscendingCount.reverse(0)
    mostFrequent
}

defined [32mfunction[39m [36mint2bool[39m
defined [32mfunction[39m [36mmean[39m
defined [32mfunction[39m [36mmedian[39m
defined [32mfunction[39m [36mquantile[39m
defined [32mfunction[39m [36mmode[39m

### Dispersion

In [69]:
// Pairs x with the product of x with itself.
// NOTE(review): the recorded output for this cell is a compile error (it shows `x.:*(x)`).
// Breeze documents `*:*` as the element-wise product, so `x *:* x` is presumably what was
// intended here; confirm before relying on this cell.
(x, x * x)

cmd69.sc:1: could not find implicit value for parameter ev: spire.algebra.RightModule[breeze.linalg.DenseVector[Double],breeze.linalg.DenseVector[Double]]
val res69 = (x, x.:*(x))
                    ^Compilation Failed

: 

In [71]:
// Sum of all elements of x.
// NOTE(review): the recorded result (15.0) does not match the 20-element x defined earlier
// in this chapter, whose elements add up to 58.0; the output looks stale, so re-run the cell.
x.sum

[36mres70[39m: [32mDouble[39m = [32m15.0[39m

In [82]:
/**
 * Centers `xs` by subtracting its mean from every element.
 *
 * @param xs data vector
 * @return a vector holding each element's deviation from the mean of `xs`
 */
def demean(xs: DenseVector[Double]): DenseVector[Double] =
    xs - mean(xs)

/**
 * Sample variance of `xs`: the sum of squared deviations from the mean divided by `n - 1`.
 *
 * The sum of squares is taken as the dot product of the demeaned vector with itself, the
 * same formulation `covariance` uses. This avoids `*` between two vectors, which in Breeze
 * is the shaped (matrix-style) product; the element-wise product is `*:*`.
 *
 * The divisor is `xs.size - 1`, so the result is not meaningful for fewer than two elements.
 *
 * @param xs data vector
 * @return the sample variance of `xs`
 */
def variance(xs: DenseVector[Double]): Double = {
    val deviationFromMean: DenseVector[Double] = demean(xs)
    deviationFromMean.dot(deviationFromMean) / (xs.size - 1)
}

/**
 * Sample standard deviation of `xs`: the square root of its variance.
 *
 * @param xs data vector
 * @return the square root of `variance(xs)`
 */
def stdev(xs: DenseVector[Double]): Double =
    scala.math.sqrt(variance(xs))

/**
 * Interquartile range of `xs`: the 0.75 quantile minus the 0.25 quantile.
 *
 * @param xs data vector
 * @return the difference between the upper and lower quartile values
 */
def interquartileRange(xs: DenseVector[Double]): Double = {
    val upperQuartile = quantile(xs, 0.75)
    val lowerQuartile = quantile(xs, 0.25)
    upperQuartile - lowerQuartile
}

defined [32mfunction[39m [36mdemean[39m
defined [32mfunction[39m [36mvariance[39m
defined [32mfunction[39m [36mstdev[39m
defined [32mfunction[39m [36minterquartileRange[39m

### Correlation

In [94]:
/**
 * Sample covariance of `xs` and `ys`: the dot product of the two demeaned vectors divided
 * by `xs.size - 1`.
 *
 * @param xs first data vector
 * @param ys second data vector
 * @return the sample covariance of the two vectors
 */
def covariance(xs: DenseVector[Double], ys: DenseVector[Double]): Double = {
    val centeredXs = demean(xs)
    val centeredYs = demean(ys)
    val divisor = xs.size - 1
    centeredXs.dot(centeredYs) / divisor
}

/**
 * Correlation of `xs` and `ys`: their covariance divided by the product of their
 * standard deviations.
 *
 * @param xs first data vector
 * @param ys second data vector
 * @return `covariance(xs, ys)` divided by `stdev(xs) * stdev(ys)`
 */
def correlation(xs: DenseVector[Double], ys: DenseVector[Double]): Double = {
    val sharedVariation = covariance(xs, ys)
    val spreadProduct = stdev(xs) * stdev(ys)
    sharedVariation / spreadProduct
}

defined [32mfunction[39m [36mcovariance[39m
defined [32mfunction[39m [36mcorrelation[39m

---

## Chapter 6: Probability