* [data-science-overview](https://kotlinlang.org/docs/reference/data-science-overview.html)
* [擁抱科學](https://www.kotliner.cn/2020/01/making-kotlin-ready-for-data-science/)
* [data-science-resource](https://github.com/thomasnield/kotlin-data-science-resources)
* [kotlin-jupyter](https://github.com/Kotlin/kotlin-jupyter)
* [KotlinConf 2019 觀視指南 – 數據科學篇](https://blog.jetbrains.com/cn/2020/03/kotlinconf-2019-ds/)

# 初體驗

In [60]:
1+1

2

In [1]:
var c = 1
var d = 2
print(c + d)

3

In [2]:
// Variables must be declared (with `val` or `var`) before they are used;
// as written, this cell fails with "Unresolved reference" for both names.
a = 1
b = 2
a + b

Unresolved reference: a
Unresolved reference: b
Unresolved reference: a
Unresolved reference: b

In [3]:
var str1 = "Hello kotlin"
str1

Hello kotlin

In [6]:
d * 2

4

In [13]:
var mylist = listOf(1,2,3)
println(mylist)

[1, 2, 3]


In [1]:
:help

Commands:
    :help - display help
    :classpath - show current classpath

Magics
    %use - include supported libraries
        Usage: %use klaxon(5.0.1), lets-plot
    %trackClasspath - log current classpath changes
    %output - setup output settings
        Usage: %output --max-cell-size=1000 --no-stdout --max-time=100 --max-buffer=400

Supported libraries:
    dataframe https://github.com/nikitinas/krangl-typed
    deeplearning4j-cuda https://github.com/eclipse/deeplearning4j
    deeplearning4j https://github.com/eclipse/deeplearning4j
    default 
    exposed https://github.com/JetBrains/Exposed
    fuel https://github.com/kittinunf/fuel
    gral https://github.com/eseifert/gral
    khttp https://github.com/jkcclemens/khttp
    klaxon https://github.com/cbeust/klaxon
    kmath https://github.com/mipt-npm/kmath
    koma https://koma.kyonifer.com/index.html
    kotlin-statistics https://github.com/thomasnield/kotlin-statistics
    krangl https://github.com/holgerbrandl/krangl
    

In [5]:
%use spark

In [6]:
:classpath

Current classpath (182 paths):
C:\Users\Me\.julia\conda\3\Lib\site-packages\run_kotlin_kernel\jars\jupyter-lib-0.8.1.98.jar
C:\Users\Me\.julia\conda\3\Lib\site-packages\run_kotlin_kernel\jars\kotlin-stdlib-1.4.20-dev-1121.jar
C:\Users\Me\.julia\conda\3\Lib\site-packages\run_kotlin_kernel\jars\kotlin-stdlib-common-1.4.20-dev-1121.jar
C:\Users\Me\.julia\conda\3\Lib\site-packages\run_kotlin_kernel\jars\annotations-13.0.jar
C:\Users\Me\.ivy2\cache\org.jetbrains\kotlin-numpy\jars\kotlin-numpy-0.1.0.jar
C:\Users\Me\.ivy2\cache\org.apache.spark\spark-mllib_2.11\jars\spark-mllib_2.11-2.4.4.jar
C:\Users\Me\.ivy2\cache\org.scala-lang.modules\scala-parser-combinators_2.11\bundles\scala-parser-combinators_2.11-1.1.0.jar
C:\Users\Me\.ivy2\cache\org.scala-lang\scala-library\jars\scala-library-2.11.12.jar
C:\Users\Me\.ivy2\cache\org.apache.spark\spark-core_2.11\jars\spark-core_2.11-2.4.4.jar
C:\Users\Me\.ivy2\cache\org.apache.avro\avro\bundles\avro-1.8.2.jar
C:\Users\Me\.ivy2\cache\org.codehaus.jacks

# include supported libraries (%use)
## Default repositories
* Bintray JCenter
* Maven Central
* JitPack

In [1]:
%use lets-plot
// %use klaxon(5.2) 

In [3]:
// Synthetic data: 200 standard-normal ratings labelled "A" followed by
// 200 ratings scaled by 1.5 and shifted by 1.5 labelled "B".
val rand = java.util.Random()
val data1 = mapOf<String, Any>(
    "rating" to List(400) { i ->
        if (i < 200) rand.nextGaussian() else rand.nextGaussian() * 1.5 + 1.5
    },
    "cond" to List(400) { i -> if (i < 200) "A" else "B" }
)

// Density layer over "rating", filled per condition.
var p = lets_plot(data1) + geom_density(color = "dark_green", alpha = .3) {
    x = "rating"
    fill = "cond"
}
// The last expression of the cell is what gets rendered.
p + ggsize(500, 250)

Configuring Maven dependencies

In [1]:
@file:Repository("https://repo1.maven.org/maven2")
@file:DependsOn("org.nield:kotlin-statistics:1.2.1")

In [2]:
import java.time.LocalDate
import java.time.temporal.ChronoUnit
import org.nield.kotlinstatistics.*

/**
 * One patient record used as input to the clustering example.
 *
 * [age] is declared in the class body rather than in the primary constructor, so it is
 * not part of the generated `toString()`, `equals()` or `copy()`.
 */
data class Patient(
    val firstName: String,
    val lastName: String,
    val gender: Gender,
    val birthday: LocalDate,
    val whiteBloodCellCount: Int
) {
    /** Whole years between [birthday] and the date on which this instance was created. */
    val age: Long = ChronoUnit.YEARS.between(birthday, LocalDate.now())
}

val patients = listOf(
        Patient("John", "Simone", Gender.MALE, LocalDate.of(1989, 1, 7), 4500),
        Patient("Sarah", "Marley", Gender.FEMALE, LocalDate.of(1970, 2, 5), 6700),
        Patient("Jessica", "Arnold", Gender.FEMALE, LocalDate.of(1980, 3, 9), 3400),
        Patient("Sam", "Beasley", Gender.MALE, LocalDate.of(1981, 4, 17), 8800),
        Patient("Dan", "Forney", Gender.MALE, LocalDate.of(1985, 9, 13), 5400),
        Patient("Lauren", "Michaels", Gender.FEMALE, LocalDate.of(1975, 8, 21), 5000),
        Patient("Michael", "Erlich", Gender.MALE, LocalDate.of(1985, 12, 17), 4100),
        Patient("Jason", "Miles", Gender.MALE, LocalDate.of(1991, 11, 1), 3900),
        Patient("Rebekah", "Earley", Gender.FEMALE, LocalDate.of(1985, 2, 18), 4600),
        Patient("James", "Larson", Gender.MALE, LocalDate.of(1974, 4, 10), 5100),
        Patient("Dan", "Ulrech", Gender.MALE, LocalDate.of(1991, 7, 11), 6000),
        Patient("Heather", "Eisner", Gender.FEMALE, LocalDate.of(1994, 3, 6), 6000),
        Patient("Jasper", "Martin", Gender.MALE, LocalDate.of(1971, 7, 1), 6000)
)

enum class Gender {
    MALE,
    FEMALE
}

val clusters = patients.multiKMeansCluster(k = 3,
        maxIterations = 10000,
        trialCount = 50,
        xSelector = { it.age.toDouble() },
        ySelector = { it.whiteBloodCellCount.toDouble() }
)

In [3]:
// Print each cluster's index followed by its member points, one per tab-indented line.
clusters.forEachIndexed { centroidIndex, centroid ->
    println("CENTROID: $centroidIndex")
    for (patient in centroid.points) {
        println("\t$patient")
    }
}

CENTROID: 0
	Patient(firstName=Dan, lastName=Forney, gender=MALE, birthday=1985-09-13, whiteBloodCellCount=5400)
	Patient(firstName=Lauren, lastName=Michaels, gender=FEMALE, birthday=1975-08-21, whiteBloodCellCount=5000)
	Patient(firstName=James, lastName=Larson, gender=MALE, birthday=1974-04-10, whiteBloodCellCount=5100)
	Patient(firstName=Dan, lastName=Ulrech, gender=MALE, birthday=1991-07-11, whiteBloodCellCount=6000)
	Patient(firstName=Heather, lastName=Eisner, gender=FEMALE, birthday=1994-03-06, whiteBloodCellCount=6000)
	Patient(firstName=Jasper, lastName=Martin, gender=MALE, birthday=1971-07-01, whiteBloodCellCount=6000)
CENTROID: 1
	Patient(firstName=John, lastName=Simone, gender=MALE, birthday=1989-01-07, whiteBloodCellCount=4500)
	Patient(firstName=Jessica, lastName=Arnold, gender=FEMALE, birthday=1980-03-09, whiteBloodCellCount=3400)
	Patient(firstName=Michael, lastName=Erlich, gender=MALE, birthday=1985-12-17, whiteBloodCellCount=4100)
	Patient(firstName=Jason, l

In [39]:
@file:Repository("https://jcenter.bintray.com")
@file:DependsOn("com.beust:klaxon:5.0.1")
import com.beust.klaxon.*

In [40]:
// %use klaxon
%use klaxon(5.2) // pin a specific version

## Koma

In [10]:
%use koma

:: problems summary ::
		module not found: com.kyonifer#koma-core-ejml;0.12ABCDEF

	==== jcenter.bintray.com: tried

	  https://jcenter.bintray.com/com/kyonifer/koma-core-ejml/0.12ABCDEF/koma-core-ejml-0.12ABCDEF.pom

	  -- artifact com.kyonifer#koma-core-ejml;0.12ABCDEF!koma-core-ejml.jar:

	  https://jcenter.bintray.com/com/kyonifer/koma-core-ejml/0.12ABCDEF/koma-core-ejml-0.12ABCDEF.jar

	==== repo.maven.apache.org: tried

	  https://repo.maven.apache.org/maven2/com/kyonifer/koma-core-ejml/0.12ABCDEF/koma-core-ejml-0.12ABCDEF.pom

	  -- artifact com.kyonifer#koma-core-ejml;0.12ABCDEF!koma-core-ejml.jar:

	  https://repo.maven.apache.org/maven2/com/kyonifer/koma-core-ejml/0.12ABCDEF/koma-core-ejml-0.12ABCDEF.jar

	==== jitpack.io: tried

	  https://jitpack.io/com/kyonifer/koma-core-ejml/0.12ABCDEF/koma-core-ejml-0.12ABCDEF.pom

	  -- artifact com.kyonifer#koma-core-ejml;0.12ABCDEF!koma-core-ejml.jar:

	  https://jitpack.io/com/kyonifer/koma-core-ejml/0.12ABCDEF/koma-core-ejml-0.12ABC

	  https://jcenter.bintray.com/kyonifer/maven/com/kyonifer/koma-core-ejml/0.13/koma-core-ejml-0.13.pom

	  -- artifact com.kyonifer#koma-core-ejml;0.13!koma-core-ejml.jar:

	  https://jcenter.bintray.com/kyonifer/maven/com/kyonifer/koma-core-ejml/0.13/koma-core-ejml-0.13.jar

	==== jcenter.bintray.com: tried

	  https://jcenter.bintray.com/kyonifer/maven/com/kyonifer/koma-core-ejml/0.13/koma-core-ejml-0.13.pom

	  -- artifact com.kyonifer#koma-core-ejml;0.13!koma-core-ejml.jar:

	  https://jcenter.bintray.com/kyonifer/maven/com/kyonifer/koma-core-ejml/0.13/koma-core-ejml-0.13.jar

	==== jcenter.bintray.com: tried

	  https://jcenter.bintray.com/kyonifer/maven/com/kyonifer/koma-core-ejml/0.13/koma-core-ejml-0.13.pom

	  -- artifact com.kyonifer#koma-core-ejml;0.13!koma-core-ejml.jar:

	  https://jcenter.bintray.com/kyonifer/maven/com/kyonifer/koma-core-ejml/0.13/koma-core-ejml-0.13.jar

	==== jcenter.bintray.com: tried

	  https://jcenter.bintray.com/kyonifer/maven/com/kyonifer/koma-cor

Failed to resolve com.kyonifer:koma-core-ejml:0.13:
File 'com.kyonifer:koma-core-ejml:0.13' not found
unresolved dependency: com.kyonifer#koma-core-ejml;0.13: not found
Failed to resolve com.kyonifer:koma-plotting:0.13:
File 'com.kyonifer:koma-plotting:0.13' not found
unresolved dependency: com.kyonifer#koma-plotting;0.13: not found

In [9]:
@file:Repository("https://jcenter.bintray.com/kyonifer/maven")
@file:DependsOn("com.kyonifer:koma-core-ejml:0.12")
@file:DependsOn("com.kyonifer:koma-plotting:0.12")

:: problems summary ::
		module not found: com.kyonifer#koma-plotting;0.1ABCDEF

	==== jcenter.bintray.com: tried

	  https://jcenter.bintray.com/com/kyonifer/koma-plotting/0.1ABCDEF/koma-plotting-0.1ABCDEF.pom

	  -- artifact com.kyonifer#koma-plotting;0.1ABCDEF!koma-plotting.jar:

	  https://jcenter.bintray.com/com/kyonifer/koma-plotting/0.1ABCDEF/koma-plotting-0.1ABCDEF.jar

	==== repo.maven.apache.org: tried

	  https://repo.maven.apache.org/maven2/com/kyonifer/koma-plotting/0.1ABCDEF/koma-plotting-0.1ABCDEF.pom

	  -- artifact com.kyonifer#koma-plotting;0.1ABCDEF!koma-plotting.jar:

	  https://repo.maven.apache.org/maven2/com/kyonifer/koma-plotting/0.1ABCDEF/koma-plotting-0.1ABCDEF.jar

	==== jitpack.io: tried

	  https://jitpack.io/com/kyonifer/koma-plotting/0.1ABCDEF/koma-plotting-0.1ABCDEF.pom

	  -- artifact com.kyonifer#koma-plotting;0.1ABCDEF!koma-plotting.jar:

	  https://jitpack.io/com/kyonifer/koma-plotting/0.1ABCDEF/koma-plotting-0.1ABCDEF.jar

	==== jetbrains.bintray.co

In [42]:
import koma.*
import koma.extensions.*

var a = randn(100,2)
var b = cumsum(a)

figure(1)
// Second parameter is color
plot(a, 'b', "First Run")
plot(a+1, 'y', "First Run Offset")
xlabel("Time (s)")
ylabel("Magnitude")
title("White Noise")

figure(2)
plot(b, 'g') // green
xlabel("Velocity (lightweeks/minute)")
ylabel("Intelligence")
title("Random Walk")

In [43]:
Klaxon()

com.beust.klaxon.Klaxon@3a1237db

# parse json

In [7]:
// Element type for the user records parsed from JSON with Klaxon.
// NOTE(review): property names (including snake_case `first_name`) presumably mirror the
// keys in data/user.json — confirm against the file before renaming.
data class User(val id: Int, val first_name:String, val email:String, val gender:String)

In [47]:
import java.io.*
var users = Klaxon().parseArray<User>(File("data/user.json").readText())

In [48]:
users?.count()

4

In [15]:
users?.filter{
    it.gender.contains("Male")
}?.count()

2

In [49]:
/** Greets the person identified by [name]. */
class Greeter(val name: String) {
    /** Writes `Hello <name>` followed by a line break to standard output. */
    fun greet() = println("Hello $name")
}
Greeter("Gx").greet()

Hello Gx


# Plot: lets-plot
* Lets-Plot is a library for declaratively creating plots based on tabular data. It is inspired by R’s ggplot and The Grammar of Graphics, and is integrated tightly with the Kotlin kernel. It is multi-platform and can be used not just with JVM, but also from JS and Python.

In [44]:
%use lets-plot

In [50]:
val data = mapOf("gender" to users?.map{it.gender})
lets_plot(data) + stat_count() + ggsize(500,250)

In [52]:
// Two parallel categorical columns of equal length: "cat1" is mapped to the x axis
// and "cat2" to the bar fill.
val data = mapOf<String, Any>(
    "cat1" to listOf("a","a","b","a","a","a","b","b"),
    "cat2" to listOf("c","c","d","d","d","c","d","c")
)
val p = ggplot(data)
// Aesthetic mappings are plain assignments inside the lambda, so they are separated
// by line breaks (or `;`), never by commas.
val layer = geom_bar {
    x = "cat1"
    fill = "cat2"
}
// The last expression of the cell is what gets rendered.
(p+layer)

In [2]:
%use krangl

In [1]:
// @file:Repository("https://jitpack.io")
// @file:DependsOn("com.github.User:Repo:Tag")
@file:DependsOn("com.github.holgerbrandl:kravis:-SNAPSHOT")
// %use kravis

In [16]:
import krangl.*
import kravis.*
import krangl.sleepData

sleepData
    .addColumn("rem_proportion") { it["sleep_rem"] / it["sleep_total"] }
        // Analyze correlation
    .plot(x = "sleep_total", y = "rem_proportion", color = "vore", size = "brainwt")
        .geomPoint(alpha = 0.7)
        .guides(size = LegendType.none)
        .title("Correlation between dream and total sleep time")

ggplot(mapping=aes(x=`sleep_total`,y=`rem_proportion`,color=`vore`,size=`brainwt`), data=data01) + 
	geom_point(stat='identity', position=position_identity(), na.rm=FALSE, inherit.aes=TRUE, alpha=0.7) + 
	guides(size='none') + 
	ggtitle("Correlation between dream and total sleep time")

In [26]:
sleepData.rows.forEach{
    println(it)
}

{name=Cheetah, genus=Acinonyx, vore=carni, order=Carnivora, conservation=lc, sleep_total=12.1, sleep_rem=null, sleep_cycle=null, awake=11.9, brainwt=null, bodywt=50.0}
{name=Owl monkey, genus=Aotus, vore=omni, order=Primates, conservation=null, sleep_total=17.0, sleep_rem=1.8, sleep_cycle=null, awake=7.0, brainwt=0.0155, bodywt=0.48}
{name=Mountain beaver, genus=Aplodontia, vore=herbi, order=Rodentia, conservation=nt, sleep_total=14.4, sleep_rem=2.4, sleep_cycle=null, awake=9.6, brainwt=null, bodywt=1.35}
{name=Greater short-tailed shrew, genus=Blarina, vore=omni, order=Soricomorpha, conservation=lc, sleep_total=14.9, sleep_rem=2.3, sleep_cycle=0.133333333, awake=9.1, brainwt=2.9E-4, bodywt=0.019}
{name=Cow, genus=Bos, vore=herbi, order=Artiodactyla, conservation=domesticated, sleep_total=4.0, sleep_rem=0.7, sleep_cycle=0.666666667, awake=20.0, brainwt=0.423, bodywt=600.0}
{name=Three-toed sloth, genus=Bradypus, vore=herbi, order=Pilosa, conservation=null, sleep_total=14.4, sleep_rem=2

{name=Rabbit, genus=Oryctolagus, vore=herbi, order=Lagomorpha, conservation=domesticated, sleep_total=8.4, sleep_rem=0.9, sleep_cycle=0.416666667, awake=15.6, brainwt=0.0121, bodywt=2.5}
{name=Sheep, genus=Ovis, vore=herbi, order=Artiodactyla, conservation=domesticated, sleep_total=3.8, sleep_rem=0.6, sleep_cycle=null, awake=20.2, brainwt=0.175, bodywt=55.5}
{name=Chimpanzee, genus=Pan, vore=omni, order=Primates, conservation=null, sleep_total=9.7, sleep_rem=1.4, sleep_cycle=1.416666667, awake=14.3, brainwt=0.44, bodywt=52.2}
{name=Tiger, genus=Panthera, vore=carni, order=Carnivora, conservation=en, sleep_total=15.8, sleep_rem=null, sleep_cycle=null, awake=8.2, brainwt=null, bodywt=162.564}
{name=Jaguar, genus=Panthera, vore=carni, order=Carnivora, conservation=nt, sleep_total=10.4, sleep_rem=null, sleep_cycle=null, awake=13.6, brainwt=0.157, bodywt=100.0}
{name=Lion, genus=Panthera, vore=carni, order=Carnivora, conservation=vu, sleep_total=13.5, sleep_rem=null, sleep_cycle=null, awake

In [6]:
import kravis.* 
import krangl.irisData 

// Box plot of petal length per species, with jittered points overlaid.
// NOTE(review): with the kravis version loaded here `ggplot`, `x` and `y` do not resolve
// (see the errors recorded for this cell). The sleepData cell earlier in this file uses
// `.plot(x = "...", y = "...")` instead, which is presumably the entry point to use
// here as well — confirm against the kravis documentation.
irisData.ggplot("Species" to x, "Petal.Length" to y)
    .geomBoxplot()
    .geomPoint(position = PositionJitter(width = 0.1), alpha = 0.3)
    .title("Petal Length by Species")

Line_6.jupyter.kts (4:10 - 16) Unresolved reference: ggplot
Line_6.jupyter.kts (4:30 - 31) Unresolved reference: x
Line_6.jupyter.kts (4:51 - 52) Unresolved reference: y

# {K}otlin DSL for data w{rangl}ing
* krangl provides functionality for data manipulation using a functional-style API; it allows you to filter, transform, aggregate, and reshape tabular data.

In [None]:
%use krangl

In [49]:
// Read data-frame from disk
val iris = DataFrame.readCSV("data/iris.csv")

In [56]:
println(iris.head(5))
println("iris is DataFrame: ${iris is DataFrame}")

A DataFrame: 5 x 5
    Sepal.Length   Sepal.Width   Petal.Length   Petal.Width   Species
1            5.1           3.5            1.4           0.2    setosa
2            4.9             3            1.4           0.2    setosa
3            4.7           3.2            1.3           0.2    setosa
4            4.6           3.1            1.5           0.2    setosa
5              5           3.6            1.4           0.2    setosa
iris is DataFrame: true


In [53]:
iris.schema()

DataFrame with 150 observations
Sepal.Length  [Dbl]  5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6, 5, 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4,...
Sepal.Width   [Dbl]  3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3, 3, 4, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7...
Petal.Length  [Dbl]  1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.6, 1.4, 1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5, ...
Petal.Width   [Dbl]  0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.2, 0.1, 0.1, 0.2, 0.4, 0.4, 0.3, 0.3, 0.3, ...
Species       [Str]  setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, setosa, seto...


In [58]:
iris.sortedBy("Sepal.Width").head(5)

Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
5.0,2.0,3.5,1.0,versicolor
6.0,2.2,4.0,1.0,versicolor
6.2,2.2,4.5,1.5,versicolor
6.0,2.2,5.0,1.5,virginica
4.5,2.3,1.3,0.3,setosa


In [59]:
iris.sortedByDescending("Sepal.Width").head(5)

Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
5.7,4.4,1.5,0.4,setosa
5.5,4.2,1.4,0.2,setosa
5.2,4.1,1.5,0.1,setosa
5.8,4.0,1.2,0.2,setosa
5.4,3.9,1.3,0.4,setosa


In [84]:
// Read data-frame from disk
// val iris = DataFrame.readCSV("data/iris.csv")


// Create data-frame in memory
val df: DataFrame = dataFrameOf(
    "first_name", "last_name", "age", "weight")(
    "Max", "Doe", 23, 55,
    "Franz", "Smith", 23, 88,
    "Horst", "Keanes", 12, 82
)

// Or from csv
// val otherDF = DataFrame.readCSV("path/to/file")

// Print rows
// NOTE(review): the recorded output contains only the table without column names, so the
// bare `df` expression apparently renders nothing when it is not the cell's last expression.
df                              // with implicit string conversion using default options
df.print(colNames = false)      // with custom printing options

// Print structure (row count plus name, type and leading values of every column)
df.schema()


// Add columns with mutate
// by adding constant values as new column
df.addColumn("salary_category") { 3 }

// by doing basic column arithmetics
df.addColumn("age_3y_later") { it["age"] + 3 }

// Note: krangl dataframes are immutable so we need to (re)assign results to preserve changes.
val newDF = df.addColumn("full_name") { it["first_name"] + " " + it["last_name"] }

// // Also feel free to mix types here since krangl overloads  arithmetic operators like + for dataframe-columns
df.addColumn("user_id") { it["last_name"] + "_id" + rowNumber }

// // Create new attributes with string operations like matching, splitting or extraction.
df.addColumn("with_anz") { it["first_name"].asStrings().map { it!!.contains("anz") } }

// // Note: krangl is using 'null' as missing value, and provides convenience methods to process non-NA bits
df.addColumn("first_name_initial") { it["first_name"].map<String>{ it.first() } }

print("newDF:${newDF.head(5)}")

// // or add multiple columns at once
// df.addColumns(
//     "age_plus3" to { it["age"] + 3 },
//     "initials" to { it["first_name"].map<String> { it.first() } concat it["last_name"].map<String> { it.first() } }
// )


// // Sort your data with sortedBy
df.sortedBy("age")
// // and add secondary sorting attributes as varargs
df.sortedBy("age", "weight")
df.sortedByDescending("age")
df.sortedBy { it["weight"].asInts() }


// // Subset columns with select
// df.select{ it is IntCol } // functional style column selection
df.select("last_name", "weight")    // positive selection
df.remove("weight", "age")  // negative selection
df.select({ endsWith("name") })    // selector mini-language


// // Subset rows with vectorized filter
df.filter { it["age"] eq 23 }
df.filter { it["weight"] gt 50 }
// df.filter({ it["last_name"].isMatching { startsWith("Do")  }})

// // In case vectorized operations are not possible or available we can also filter tables by row
// which allows for scalar operators
df.filterByRow { it["age"] as Int > 5 }
df.filterByRow { (it["age"] as Int).rem(10) == 0 } // round birthdays :-)


// // Summarize

// // do simple cross tabulations
df.count("age", "last_name")

// // ... or calculate single summary statistic
// df.summarize("mean_age" to { it["age"].mean(true) })

// // ... or multiple summary statistics
// df.summarize(
//     "min_age" to { it["age"].min() },
//     "max_age" to { it["age"].max() }
// )

// // for sake of r and python adoptability you can also use `=` here
// df.summarize(
//     "min_age" `=` { it["age"].min() },
//     "max_age" `=` { it["age"].max() }
// )

// // Grouped operations
// val groupedDf: DataFrame = df.groupBy("age") // or provide multiple grouping attributes with varargs
// val sumDF = groupedDf.summarize(
//     "mean_weight" to { it["weight"].mean(removeNA = true) },
//     "num_persons" to { nrow }
// )

// // Optionally ungroup the data
// sumDF.ungroup().print()

// // generate object bindings for kotlin.
// // Unfortunately the syntax is a bit odd since we can not access the variable name by reflection
// sumDF.printDataClassSchema("Person")

// // This will generate and print the following conversion code:
// data class Person(val age: Int, val mean_weight: Double, val num_persons: Int)

// val records = sumDF.rows.map { row -> Person(row["age"] as Int, row["mean_weight"] as Double, row["num_persons"] as Int) }

// // Now we can use the krangl result table in a strongly typed way
// records.first().mean_weight

// // Vice versa we can also convert an existing set of objects into
// val recordsDF = records.asDataFrame()
// recordsDF.print()

// // to populate a data-frame with selected properties only, we can do
// val deparsedDF = records.deparseRecords { mapOf("age" to it.age, "weight" to it.mean_weight) }

A DataFrame: 3 x 4
1          Max         Doe    23       55
2        Franz       Smith    23       88
3        Horst      Keanes    12       82

DataFrame with 3 observations
first_name  [Str]  Max, Franz, Horst
last_name   [Str]  Doe, Smith, Keanes
age         [Int]  23, 23, 12
weight      [Int]  55, 88, 82
newDF:A DataFrame: 3 x 5
    first_name   last_name   age   weight      full_name
1          Max         Doe    23       55        Max Doe
2        Franz       Smith    23       88    Franz Smith
3        Horst      Keanes    12       82   Horst Keanes

age,last_name,n
23,Doe,1
23,Smith,1
12,Keanes,1


In [34]:
%use kmath

In [37]:
val a = floor(10 * Random.random(2, 2))
val b = floor(10 * Random.random(2, 2))

Unresolved reference: Random
Unresolved reference: Random

In [36]:
%use kotlin-statistics

In [7]:
// NOTE(review): median() is presumably the kotlin-statistics extension (package
// org.nield.kotlinstatistics). The "Unresolved reference" recorded for this cell suggests
// that package was not imported when it ran — confirm, and add the import if so.
val median = sequenceOf(1.0, 3.0, 5.0).median()
println(median) // expected to print "3.0" once median() resolves

Unresolved reference: median

In [8]:
class Item(val name: String, val value: Double)

val sequence = sequenceOf(
        Item("Alpha", 4.0),
        Item("Beta", 6.0),
        Item("Gamma", 7.2),
        Item("Delta", 9.2),
        Item("Epsilon", 6.8),
        Item("Zeta", 2.4),
        Item("Iota", 8.8)
)

In [9]:
val standardDeviationsByLength = sequence
        .map { it.name.length to it.value }
        .standardDeviationBy()
println("Std Devs by lengths: $standardDeviationsByLength")

Std Devs by lengths: {5=2.6229754097208, 4=3.208322510804258, 7=0.0}


In [10]:
//declare Product class
class Product(val id: Int,
              val name: String,
              val category: String,
              val section: Int,
              val defectRate: Double)

// Create list of Products
val products = listOf(Product(1, "Rayzeon", "ABR", 3, 1.1),
        Product(2, "ZenFire", "ABZ", 4, 0.7),
        Product(3, "HydroFlux", "ABR", 3, 1.9),
        Product(4, "IceFlyer", "ZBN", 1, 2.4),
        Product(5, "FireCoyote", "ABZ", 4, 3.2),
        Product(6, "LightFiber", "ABZ",2,  5.1),
        Product(7, "PyroKit", "ABR", 3, 1.4),
        Product(8, "BladeKit", "ZBN", 1, 0.5),
        Product(9, "NightHawk", "ZBN", 1, 3.5),
        Product(10, "NoctoSquirrel", "ABR", 2, 1.1),
        Product(11, "WolverinePack", "ABR", 3, 1.2)
        )

// Data Class for Grouping
data class Key(val category: String, val section: Int)

// Number of products per (category, section) pair
val countByCategoryAndSection =
        products.countBy { product -> Key(product.category, product.section) }

println("Counts by Category and Section")
for (entry in countByCategoryAndSection.entries) {
    println(entry)
}

// Mean defect rate per (category, section) pair
val averageDefectByCategoryAndSection =
        products.averageBy(
                keySelector = { product -> Key(product.category, product.section) },
                doubleSelector = { product -> product.defectRate }
        )

println("\nAverage Defect Rate by Category and Section")
for (entry in averageDefectByCategoryAndSection.entries) {
    println(entry)
}

Counts by Category and Section
Key(category=ABR, section=3)=4
Key(category=ABZ, section=4)=2
Key(category=ZBN, section=1)=3
Key(category=ABZ, section=2)=1
Key(category=ABR, section=2)=1

Average Defect Rate by Category and Section
Key(category=ABR, section=3)=1.4000000000000001
Key(category=ABZ, section=4)=1.9500000000000002
Key(category=ZBN, section=1)=2.1333333333333333
Key(category=ABZ, section=2)=5.1
Key(category=ABR, section=2)=1.1


In [11]:
class Email(val message: String, val isSpam: Boolean)

val emails = listOf(
        Email("Hey there! I thought you might find this interesting. Click here.", isSpam = true),
        Email("Get viagra for a discount as much as 90%", isSpam = true),
        Email("Viagra prescription for less", isSpam = true),
        Email("Even better than Viagra, try this new prescription drug", isSpam = true),

        Email("Hey, I left my phone at home. Email me if you need anything. I'll be in a meeting for the afternoon.", isSpam = false),
        Email("Please see attachment for notes on today's meeting. Interesting findings on your market research.", isSpam = false),
        Email("An item on your Amazon wish list received a discount", isSpam = false),
        Email("Your prescription drug order is ready", isSpam = false),
        Email("Your Amazon account password has been reset", isSpam = false),
        Email("Your Amazon order", isSpam = false)
)

val nbc = emails.toNaiveBayesClassifier(
        featuresSelector = { it.message.splitWords().toSet() },
        categorySelector = {it.isSpam }
)



/**
 * Splits this string into lower-cased, letters-only words.
 *
 * The receiver is split on single whitespace characters; every character outside
 * `A-Z`/`a-z` is then stripped from each token, the remainder is lower-cased, and
 * tokens that end up empty (for example pure digits or punctuation) are dropped.
 *
 * @return a lazily evaluated sequence of the surviving words, in their original order.
 */
fun String.splitWords(): Sequence<String> {
    // Compile both patterns once per call instead of once per token inside the lazy pipeline.
    val whitespace = Regex("\\s")
    val nonLetters = Regex("[^A-Za-z]")
    return split(whitespace).asSequence()
         // toLowerCase() is kept (rather than lowercase()) because this notebook runs on Kotlin 1.4.
         .map { it.replace(nonLetters, "").toLowerCase() }
         .filter { it.isNotEmpty() }
}

In [12]:
 // TEST 1
val input = "discount viagra wholesale, hurry while this offer lasts".splitWords().toSet()
val predictedCategory = nbc.predict(input)
// Assert.assertTrue(predictedCategory == true)
println(predictedCategory == true)

// TEST 2
val input2 = "interesting meeting on amazon cloud services discount program".splitWords().toSet()
val predictedCategory2 = nbc.predict(input2)
// Assert.assertTrue(predictedCategory2 == false)
println(predictedCategory2 == false)

true
true


In [13]:
val a = floor(10 * Random.random(2, 2))
val b = floor(10 * Random.random(2, 2))

// stack arrays row wise
val v = vstack(a, b)
println(v.shape.joinToString())
// 4, 2

// stack arrays column wise
val h = hstack(a, b)
println(h.shape.joinToString())
// 2, 4

Unresolved reference: floor
Unresolved reference: Random
Unresolved reference: floor
Unresolved reference: Random
Unresolved reference: vstack
Unresolved reference: hstack

In [26]:
%use Komputation

Failed to process '%use Komputation' command. Unknown library 'Komputation'

In [14]:
(1..3).sum()

6

In [15]:
(1..5).map{ i-> 10*i}.sum()

150

In [17]:
/** Returns 10 plus three times the sum of the squares of 0..[n] (the sum is 0 when [n] is negative). */
fun f(n: Int): Int {
    var sumOfSquares = 0
    for (k in 0..n) {
        sumOfSquares += k * k
    }
    return 10 + 3 * sumOfSquares
}
f(10)

1165

In [19]:
(1..10).flatMap { i -> (4..20).map { j -> 2 *i * j } }.sum()

22440

In [20]:
val myVector = doubleArrayOf(1.0, 5.2, 2.4)

In [21]:
val myVector = rowVectorOf(1.0, 5.2, 2.4)

Unresolved reference: rowVectorOf

In [24]:
val xValues = listOf(23, 65, 45, 23, 66)
val xAverage = xValues.average()
xAverage

44.4

Use Kotlin numpy wrapper

In [4]:
@file:Repository("https://dl.bintray.com/kotlin/kotlin-numpy/org/jetbrains/kotlin-numpy/")
@file:DependsOn("org.jetbrains:kotlin-numpy:0.1.0")

:: problems summary ::
:::: ERRORS
	unknown resolver kotlin.bintray.com

	unknown resolver kotlin.bintray.com



In [3]:
%use kotlin-numpy

Failed to process '%use kotlin-numpy' command. Unknown library 'kotlin-numpy'

In [2]:
import org.jetbrains.numkt.core.*
import org.jetbrains.numkt.math.*
import org.jetbrains.numkt.*


val aa = arange(15).reshape(3,5)
print(aa)

/**
 * Walks through a few array operations: building and reshaping a range, inspecting
 * shape/ndim/dtype, element-wise power and product, and a transpose followed by a
 * dot product. Each result is printed.
 *
 * Not invoked by this cell: the `main()` call after the function is commented out.
 */
fun main() {
    val a = arange(15).reshape(3, 5) // KtNDArray<Int>([[ 0,  1,  2,  3,  4],
                                                     // [ 5,  6,  7,  8,  9],
                                                     // [10, 11, 12, 13, 14]])

    println(a.shape.contentEquals(intArrayOf(3, 5))) // true
    println(a.ndim == 2)                             // true
    println(a.dtype)                                 // class java.lang.Integer

    // create an array of ints, square each element, then reshape it to (3, 5)
    val b = (arange(15) `**` 2).reshape(3, 5)

    // c is the element-wise product of a and b (i.e. the cubes of 0..14)
    val c = a * b
    println(c)
    // Output:
    // [[   0    1    8   27   64]
    //  [ 125  216  343  512  729]
    //  [1000 1331 1728 2197 2744]]
    
    // d is the dot product of the transposed c (5 x 3) and a (3 x 5), giving a 5 x 5 result
    val d = c.transpose().dot(a)
    println(d)
    // Output:
    // [[10625 11750 12875 14000 15125]
    //  [14390 15938 17486 19034 20582]
    //  [18995 21074 23153 25232 27311]
    //  [24530 27266 30002 32738 35474]
    //  [31085 34622 38159 41696 45233]]

}

// main()

Line_1.jupyter.kts (1:22 - 27) Unresolved reference: numkt
Line_1.jupyter.kts (2:22 - 27) Unresolved reference: numkt
Line_1.jupyter.kts (3:22 - 27) Unresolved reference: numkt
Line_1.jupyter.kts (6:10 - 16) Unresolved reference: arange
Line_1.jupyter.kts (7:1 - 6) Overload resolution ambiguity: 
public inline fun print(message: Any?): Unit defined in kotlin.io
public inline fun print(message: Boolean): Unit defined in kotlin.io
public inline fun print(message: Byte): Unit defined in kotlin.io
public inline fun print(message: Char): Unit defined in kotlin.io
public inline fun print(message: CharArray): Unit defined in kotlin.io
public inline fun print(message: Double): Unit defined in kotlin.io
public inline fun print(message: Float): Unit defined in kotlin.io
public inline fun print(message: Int): Unit defined in kotlin.io
public inline fun print(message: Long): Unit defined in kotlin.io
public inline fun print(message: Short): Unit defined in kotlin.io
Line_1.jupyter.kts (10:13 - 19)

In [1]:
val a = ones<Int>(2, 3)
val b = Random.random(2, 3)
a *= 3

b += a
b

Line_0.jupyter.kts (1:9 - 13) Unresolved reference: ones
Line_0.jupyter.kts (2:9 - 15) Unresolved reference: Random
Line_0.jupyter.kts (3:3 - 5) Function '<ERROR FUNCTION>' should return Unit to be used by corresponding operator '*='
Line_0.jupyter.kts (5:3 - 5) Function '<ERROR FUNCTION>' should return Unit to be used by corresponding operator '+='

In [58]:
var a = 1
var b = 2
println(a+b)

3


In [61]:
val s = "hello kotlin"
s

hello kotlin