In [1]:
//If using Almond dependency
import $ivy.`org.vegas-viz::vegas:0.3.11`

[32mimport [39m[36m$ivy.$                            [39m

In [None]:
//If using Apache Toree dependency
%AddDeps org.vegas-viz vegas_2.11 0.3.11 --transitive

In [None]:
//If using Apache Toree dependency
implicit val render = vegas.render.ShowHTML(kernel.display.content("text/html", _))

# Data Exploration with Functional Programming using Jupyter Notebook, Scala and Vegas
## A Statistical Analysis of the Titanic Dataset

Titanic survivor dataset captures the various details of people who survived or not survived in the shipwreck. Using this data, we want to build a model which predicts the propability of someone's survival. It is a classification problem that maps all attributes like sex, fare, age on the most probable state: Survived or not

![Titanic](Titanic.jpg)
(Source: https://commons.wikimedia.org/wiki/RMS_Titanic)


The dataset contains the following attributes (for more information: see Kaggle):

| **Variable** | **Definition**                                | **Key**                                           |
|--------------|-----------------------------------------------|---------------------------------------------------|
| survival     | Survival                                      | 1=Yes, 0= No                                      |
| pclass       | Ticket class                                  | 1 = 1st, 2 = 2nd, 3 = 3rd                         |
| sex          | Sex                                           |                                                   |
| age          | Age                                           | Age in years                                      |
| sibsp        |  # of siblings / spouses aboard   the Titanic |                                                   |
| parch        |  # of parents / children aboard   the Titanic |                                                   |
| ticket       | Ticket number                                 |                                                   |
| fare         | Passenger fare                                |                                                   |
| cabin        | Cabin number                                  |                                                   |
| embarked     | Port of Embarkation                           |  C = Cherbourg, Q = Queenstown,   S = Southampton |


The dataset is splittet into three files:
* A Training Dataset (train.csv)
* A Test Dataset (test.csv)
* A Set which contains sample data for the submission (gender_submission.csv).

At first, we need to load the data creating maps for each set.

In [2]:
import vegas._
import vegas.data.External._
import java.io.PrintWriter
import scala.util.Try

// Regular Expressions for extracting the information
val DATA_ACCESS_PATTERN_test = """(\d+),(\d),"(.+)",(male|female),([0-9]*\.[0-9]+|[0-9]+|d*),(\d*),(\d*),(.*),([0-9]*\.[0-9]+|[0-9]+|d*),(.*),(\w*)""".r
val DATA_ACCESS_PATTERN_train=  """(\d+),(\d),(\d),"(.+)",(male|female),([0-9]*\.[0-9]+|[0-9]+|d*),(\d*),(\d*),(.*),([0-9]*\.[0-9]+|[0-9]+|d*),(.*),(\w*)""".r

// Reading text file
// Stores the information in a map consisting of a property name (key) and its value
def loadDataCSV(filename:String):List[Map[String,Any]]= {

  val src = scala.io.Source.fromFile(filename)
  val iter = src.getLines().drop(1) //skip first line (property names)
    
    val result= (for (row <- iter) yield readData(row)).toList
   
    src.close
    result.flatMap(_ match{ case p:Option[Map[String,Any]]=>p})
}
  

// Extracting all information storing it into a Map[String,Any]
def readData(line:String):Option[Map[String,Any]]={
    
    def toInt(key: String, s: String): Option[(String, Int)] = Try(s.toInt).toOption.map((key, _))

    def toFloat(key: String, s: String): Option[(String, Float)] = Try(s.toFloat).toOption.map((key, _))
  
    def toString(key:String, s:String):Option[(String,String)]=
        if (s.nonEmpty) Some((key,s)) else None

    def createPassengerMap(t1:String,t2:String,t3:String,t4:String,t5:String,t6:String,t7:String,
                           t8:String,t9:String,t10:String,t11:String,t12:String):Option[Map[String,Any]]={
        
        val l=List(
            toInt("passengerID",t1),
            toInt("survived",t2),
            toInt("pclass",t3),
            toString("name",t4),
            toString("sex",t5),
            toFloat("age",t6),
            toInt("sibsp",t7),
            toInt("parch",t8),
            toString("ticket",t9),
            toFloat("fare",t10),
            toString("cabin",t11),
            {if (t12.length>0) Some(("embarked",t12(0))) else None})
         Some(l.flatMap(_ match{ case p:Option[(String,Any)]=>p}).toMap)        
    }
    
    val result = line match{
       case DATA_ACCESS_PATTERN_test(t1,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12) => 
                 createPassengerMap(t1,"-1",t3,t4,t5,t6,t7,t8,t9,t10,t11,t12)
       
       case DATA_ACCESS_PATTERN_train(t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12) => {
                  createPassengerMap(t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12)
       }
       
     }
     result
}

// Method for printing a passenger in a readable manner
def printPassenger(p:Map[String,Any]):Unit={
    
    println("\n---------------------------------------------------------------------")
    println("passengerID:"+p.getOrElse("passengerID",-1))
    println("survived:"+p.getOrElse("survived",-1))
    println("pclass:"+p.getOrElse("pclass",-1))
    println("name:"+p.getOrElse("name","-"))
    println("sex:"+p.getOrElse("sex","-"))
    println("age:"+p.getOrElse("age",-1))
    println("sibsp:"+p.getOrElse("sibsp",-1))
    println("parch:"+p.getOrElse("parch",-1))
    println("ticket:"+p.getOrElse("ticket","-"))
    println("fare:"+p.getOrElse("fare",-1))
    println("cabin:"+p.getOrElse("cabin",-1))
    println("embarked:"+p.getOrElse("embarked",'-'))
    println("---------------------------------------------------------------------\n")
}
  
//produces sometimes an missing argument list error - can be ignored
def applyModel[CLASS,ID](model:(Map[String,Any],String)=> (ID,CLASS), 
            testdata: Seq[Map[String,Any]], idKey:String):Seq[(ID,CLASS)]= {
    
    testdata.map(d => model(d,idKey))
}  

def createSubmitFile[ID,CLASS](filename:String, data:Seq[(ID,CLASS)],header:String):Unit= {
    val pw = new PrintWriter(filename)
    pw.println(header)
    data.foreach(e=>pw.println(e._1.toString+","+e._2.toString))
    pw.close
}

[32mimport [39m[36mvegas._
[39m
[32mimport [39m[36mvegas.data.External._
[39m
[32mimport [39m[36mjava.io.PrintWriter
[39m
[32mimport [39m[36mscala.util.Try

// Regular Expressions for extracting the information
[39m
[36mDATA_ACCESS_PATTERN_test[39m: [32mscala[39m.[32mutil[39m.[32mmatching[39m.[32mRegex[39m = (\d+),(\d),"(.+)",(male|female),([0-9]*\.[0-9]+|[0-9]+|d*),(\d*),(\d*),(.*),([0-9]*\.[0-9]+|[0-9]+|d*),(.*),(\w*)
[36mDATA_ACCESS_PATTERN_train[39m: [32mscala[39m.[32mutil[39m.[32mmatching[39m.[32mRegex[39m = (\d+),(\d),(\d),"(.+)",(male|female),([0-9]*\.[0-9]+|[0-9]+|d*),(\d*),(\d*),(.*),([0-9]*\.[0-9]+|[0-9]+|d*),(.*),(\w*)
defined [32mfunction[39m [36mloadDataCSV[39m
defined [32mfunction[39m [36mreadData[39m
defined [32mfunction[39m [36mprintPassenger[39m
defined [32mfunction[39m [36mapplyModel[39m
defined [32mfunction[39m [36mcreateSubmitFile[39m

In [3]:
val train= loadDataCSV("train.csv")
val test= loadDataCSV("test.csv")
val all= train ++ test
  
println("Train Dataset:"+ train.size+" Elements")
println("Test Dataset:"+ test.size+" Elements")
println("Whole Dataset:"+ all.size+" Elements")

Train Dataset:891 Elements
Test Dataset:418 Elements
Whole Dataset:1309 Elements


[36mtrain[39m: [32mList[39m[[32mMap[39m[[32mString[39m, [32mAny[39m]] = [33mList[39m(
  [33mMap[39m(
    [32m"name"[39m -> [32m"Braund, Mr. Owen Harris"[39m,
    [32m"fare"[39m -> [32m7.25F[39m,
    [32m"parch"[39m -> [32m0[39m,
    [32m"age"[39m -> [32m22.0F[39m,
    [32m"ticket"[39m -> [32m"A/5 21171"[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"passengerID"[39m -> [32m1[39m,
    [32m"pclass"[39m -> [32m3[39m,
    [32m"sibsp"[39m -> [32m1[39m,
    [32m"embarked"[39m -> [32m'S'[39m,
    [32m"survived"[39m -> [32m0[39m
  ),
  [33mMap[39m(
    [32m"name"[39m -> [32m"Cumings, Mrs. John Bradley (Florence Briggs Thayer)"[39m,
    [32m"fare"[39m -> [32m71.2833F[39m,
    [32m"parch"[39m -> [32m0[39m,
    [32m"age"[39m -> [32m38.0F[39m,
    [32m"ticket"[39m -> [32m"PC 17599"[39m,
    [32m"cabin"[39m -> [32m"C85"[39m,
    [32m"sex"[39m -> [32m"female"[39m,
    [32m"passengerID"[39m -> [32m2[39m,
 

Now we can examine a small sample of the data set

In [4]:
all.take(2).foreach(printPassenger)


---------------------------------------------------------------------
passengerID:1
survived:0
pclass:3
name:Braund, Mr. Owen Harris
sex:male
age:22.0
sibsp:1
parch:0
ticket:A/5 21171
fare:7.25
cabin:-1
embarked:S
---------------------------------------------------------------------


---------------------------------------------------------------------
passengerID:2
survived:1
pclass:1
name:Cumings, Mrs. John Bradley (Florence Briggs Thayer)
sex:female
age:38.0
sibsp:1
parch:0
ticket:PC 17599
fare:71.2833
cabin:C85
embarked:C
---------------------------------------------------------------------



Count the missing values in a passenger set.

In [5]:
val attList= List("passengerID","pclass","survived","name","sex","age","sibsp","parch",
        "ticket","fare","cabin","embarked")


def countAllMissingValues(data:List[Map[String,Any]],attList:List[String]):Map[String,Int] = {
  attList.map(att => (att,0)).toMap.
    map(x => (x._1, data.count(passenger => passenger.get(x._1).isEmpty)))
}


val train_mv = countAllMissingValues(train,attList)
print(train_mv)
val test_mv= countAllMissingValues(test,attList)
assert(train_mv("cabin")== 687 && train_mv("age")==177 && train_mv("embarked")== 2)
assert(test_mv("cabin")== 327 && test_mv("age")==86 && test_mv("fare")== 1)

Map(name -> 0, fare -> 0, parch -> 0, age -> 177, ticket -> 0, cabin -> 687, sex -> 0, passengerID -> 0, pclass -> 0, sibsp -> 0, embarked -> 2, survived -> 0)

[36mattList[39m: [32mList[39m[[32mString[39m] = [33mList[39m(
  [32m"passengerID"[39m,
  [32m"pclass"[39m,
  [32m"survived"[39m,
  [32m"name"[39m,
  [32m"sex"[39m,
  [32m"age"[39m,
  [32m"sibsp"[39m,
  [32m"parch"[39m,
  [32m"ticket"[39m,
  [32m"fare"[39m,
  [32m"cabin"[39m,
  [32m"embarked"[39m
)
defined [32mfunction[39m [36mcountAllMissingValues[39m
[36mtrain_mv[39m: [32mMap[39m[[32mString[39m, [32mInt[39m] = [33mMap[39m(
  [32m"name"[39m -> [32m0[39m,
  [32m"fare"[39m -> [32m0[39m,
  [32m"parch"[39m -> [32m0[39m,
  [32m"age"[39m -> [32m177[39m,
  [32m"ticket"[39m -> [32m0[39m,
  [32m"cabin"[39m -> [32m687[39m,
  [32m"sex"[39m -> [32m0[39m,
  [32m"passengerID"[39m -> [32m0[39m,
  [32m"pclass"[39m -> [32m0[39m,
  [32m"sibsp"[39m -> [32m0[39m,
  [32m"embarked"[39m -> [32m2[39m,
  [32m"survived"[39m -> [32m0[39m
)
[36mtest_mv[39m: [32mMap[39m[[32mString[39m, [32mInt[39m] = [33mMap[39m

# DATA VISUALISATION

In [6]:
Vegas("Passengers split by sex" ).
    withData(train).
    mark(Bar).
    encodeX("sex", Ordinal,axis=Axis(title="Sex")).
    encodeY("passengerID", Quantitative,AggOps.Count,axis=Axis(title="Passengers")).
    show

In [7]:
val passengers= train.size
val survivedPass= (train.filter(m=>m("survived")==1)).size
val rate= survivedPass.toDouble/passengers
println("propability of surviving:"+rate)

Vegas("Passengers classified by survival" ).
    withData(train).
    mark(Bar).
    addTransform("survival", "datum.survived == 0 ? \"Dead\" : \"Alive\"").
    encodeX("survival", Ordinal,axis=Axis(title="Survival")).
    encodeY("passengerID", Quantitative,AggOps.Count,axis=Axis(title="Passengers")).show

propability of surviving:0.3838383838383838


[36mpassengers[39m: [32mInt[39m = [32m891[39m
[36msurvivedPass[39m: [32mInt[39m = [32m342[39m
[36mrate[39m: [32mDouble[39m = [32m0.3838383838383838[39m

In [8]:
Vegas("Survival split by sex").
      withData(train).
      mark(Bar).
      addTransform("survival", "datum.survived == 0 ? \"No\" : \"Yes\"").
      encodeY("passengerID",Quantitative, AggOps.Count, axis=Axis(title="Passengers")).
      encodeX("sex", Ord).
      encodeColor("survival", Nominal, scale=Scale(rangeNominals=List("#EA98D2", "#659CCA"))).
      show

In [9]:
Vegas("Age Distribution of all Passengers").
    withData(train).
    mark(Bar).
    addTransformCalculation("age","datum.age-(datum.age % 10)").
    encodeX("age", dataType= Nominal,scale=Scale(bandSize=20.0)).
    encodeY("passengerID", Quant ,AggOps.Count, axis=Axis(title="Passengers")).
    show

In [10]:
Vegas("Age Distribution splitted by Survival").
    withData(train).
    addTransform("survival", "datum.survived == 0 ? \"No\" : \"Yes\"").
    mark(Bar).
    encodeX("age", Quant, bin=Bin(step=10.0,min= 0.0),axis=Axis(title="Age")).
    encodeY("passengerID", Quantitative ,AggOps.Count,axis=Axis(title="Passengers")).
    encodeColor("survival", Nominal, scale=Scale(rangeNominals=List("#EA98D2", "#659CCA"))).
    show

In [11]:
Vegas("Age Distribution splitted by Survival").
    withData(train).
    addTransform("survival", "datum.survived == 0 ? \"No\" : \"Yes\"").
    addTransform("Age Description", "datum.age <= 4 ? \"1-Infant\" : datum.age<=15 ? \"2-Child\" : datum.age<=50 ? \"3-Adult\" : \"4-Old\"").
    mark(Bar).
    encodeX("Age Description", Ordinal).
    encodeY("passengerID", Quantitative ,AggOps.Count,axis=Axis(title="Passengers")).
    encodeColor("survival", Nominal, scale=Scale(rangeNominals=List("#EA98D2", "#659CCA"))).
    show

In [12]:
Vegas("Survival split by age").
    withData(train).
    mark(Bar).
    addTransformCalculation("age", "datum.age-(datum.age % 10)").
    addTransform("survival", "datum.survived == 0 ? \"No\" : \"Yes\"").
    encodeY("passengerID", Quantitative ,AggOps.Count,axis=Axis(title="Passengers")).
    encodeX("age", Nominal, scale = Scale(bandSize = 20.0)).
    encodeColor("survival", Nominal, scale=Scale(rangeNominals=List("#EA98D2", "#659CCA"))).
    show

In [12]:
// could be intresting, maybe survival is higher when cabin close to rescue boats
// https://www.encyclopedia-titanica.org/cabins.html
// to many missing values

//Vegas("Survival by cabin").
//    withData(train).
//    mark(Bar).
//    addTransform("survival", "datum.survived == 0 ? \"No\" : \"Yes\"").
//    addTransformCalculation("cabin", "(datum.cabin").
//    encodeX("cabin", Nominal, scale = Scale(bandSize = 20.0)).
//    encodeY("passengerID", Quantitative ,AggOps.Count,axis=Axis(title="Passengers")).
//    encodeColor("survival", Nominal, scale=Scale(rangeNominals=List("#EA98D2", "#659CCA"))).
//    show

In [13]:
Vegas("Survival split by embarked").
    withData(train).
    mark(Bar).
    addTransform("survival", "datum.survived == 0 ? \"No\" : \"Yes\"").
    encodeY("passengerID", Quantitative ,AggOps.Count,axis=Axis(title="Passengers")).
    encodeX("embarked", Nominal, scale = Scale(bandSize = 20.0)).
    encodeColor("survival", Nominal, scale=Scale(rangeNominals=List("#EA98D2", "#659CCA"))).
    show

In [14]:
//fare (cost for ticket)
val nrClasses = 3
val sorted_train = train.filter(x => x.get("fare")!=None).sortBy(x=>x("fare").asInstanceOf[Float])
val sizeSet = sorted_train.size
val sizeIntervals = sizeSet/nrClasses
val intervals = for (x <- 1 to nrClasses-1) yield sorted_train(x*sizeIntervals)("fare").asInstanceOf[Float]
val new_Train= for(pass <- sorted_train) 
    yield pass.updated("fareclass",intervals.count(_ < pass("fare").asInstanceOf[Float]))
Vegas("Fare Distribution splitted by Calculated Intervals").
    withData(new_Train).
    addTransform("survival", "datum.survived == 0 ? \"No\" : \"Yes\"").
    mark(Bar).
    encodeX("fareclass", Ord).
    encodeY("passengerID", Quantitative ,AggOps.Count,axis=Axis(title="Passengers")).
    encodeColor("survival", Nominal, scale=Scale(rangeNominals=List("#EA98D2", "#659CCA"))).
    show

[36mnrClasses[39m: [32mInt[39m = [32m3[39m
[36msorted_train[39m: [32mList[39m[[32mMap[39m[[32mString[39m, [32mAny[39m]] = [33mList[39m(
  [33mMap[39m(
    [32m"name"[39m -> [32m"Leonard, Mr. Lionel"[39m,
    [32m"fare"[39m -> [32m0.0F[39m,
    [32m"parch"[39m -> [32m0[39m,
    [32m"age"[39m -> [32m36.0F[39m,
    [32m"ticket"[39m -> [32m"LINE"[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"passengerID"[39m -> [32m180[39m,
    [32m"pclass"[39m -> [32m3[39m,
    [32m"sibsp"[39m -> [32m0[39m,
    [32m"embarked"[39m -> [32m'S'[39m,
    [32m"survived"[39m -> [32m0[39m
  ),
  [33mMap[39m(
    [32m"name"[39m -> [32m"Harrison, Mr. William"[39m,
    [32m"fare"[39m -> [32m0.0F[39m,
    [32m"parch"[39m -> [32m0[39m,
    [32m"age"[39m -> [32m40.0F[39m,
    [32m"ticket"[39m -> [32m"112059"[39m,
    [32m"cabin"[39m -> [32m"B94"[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"passengerID"[39m -> [32

In [15]:
Vegas("pclass Distribution splitted by Calculated Intervals").
    withData(train).
    addTransform("survival", "datum.survived == 0 ? \"No\" : \"Yes\"").
    mark(Bar).
    encodeX("pclass", Ord).
    encodeY("passengerID", Quantitative ,AggOps.Count,axis=Axis(title="Passengers")).
    encodeColor("survival", Nominal, scale=Scale(rangeNominals=List("#EA98D2", "#659CCA"))).
    show

In [16]:
Vegas("number of Siblings Distribution splitted by Calculated Intervals").
    withData(train).
    addTransform("survival", "datum.survived == 0 ? \"No\" : \"Yes\"").
    mark(Bar).
    encodeX("sibsp", Ord).
    encodeY("passengerID", Quantitative ,AggOps.Count,axis=Axis(title="Passengers")).
    encodeColor("survival", Nominal, scale=Scale(rangeNominals=List("#EA98D2", "#659CCA"))).
    show

In [17]:
Vegas("number of parents/children Distribution splitted by Calculated Intervals").
    withData(train).
    addTransform("survival", "datum.survived == 0 ? \"No\" : \"Yes\"").
    mark(Bar).
    encodeX("parch", Ord).
    encodeY("passengerID", Quantitative ,AggOps.Count,axis=Axis(title="Passengers")).
    encodeColor("survival", Nominal, scale=Scale(rangeNominals=List("#EA98D2", "#659CCA"))).
    show

#  DATA PREPARATION

Name, Cabin & Ticket are deleted from the dataset

In [18]:

def data_preparation(data:List[Map[String,Any]]):List[Map[String,Any]]={
    // drop name, cabin, ticket
    val a = data.map(passenger => passenger.-("name"))
    val b = a.map(passenger => passenger.-("cabin"))
    val c = b.map(passenger => passenger.-("ticket"))
    // add random age to the missing values
    val r = scala.util.Random
    val train_age = train.filter(passenger => passenger.contains("age")).map(x => x("age").asInstanceOf[Float])
    val train_avg_age = c.map(passenger => 
                              if (!passenger.contains("age")) {
                                (passenger.+("age" -> r.nextInt(80).asInstanceOf[Float]))
                            }else{
                                passenger
                             })
    // add 2 values freely (Q), because only 2 missing values
    val train_embarked = train_avg_age.map(passenger => if (!passenger.contains("embarked")){
        (passenger.+("embarked" -> 'Q'))
    }else{
        passenger
    })
    val train_2 = train_embarked
    train_2
}

val t = data_preparation(train)


// drop passengerID, because no relevant information
val train_no_id = t.map(passenger => passenger.-("passengerID"))

defined [32mfunction[39m [36mdata_preparation[39m
[36mt[39m: [32mList[39m[[32mMap[39m[[32mString[39m, [32mAny[39m]] = [33mList[39m(
  [33mMap[39m(
    [32m"fare"[39m -> [32m7.25F[39m,
    [32m"parch"[39m -> [32m0[39m,
    [32m"age"[39m -> [32m22.0F[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"passengerID"[39m -> [32m1[39m,
    [32m"pclass"[39m -> [32m3[39m,
    [32m"sibsp"[39m -> [32m1[39m,
    [32m"embarked"[39m -> [32m'S'[39m,
    [32m"survived"[39m -> [32m0[39m
  ),
  [33mMap[39m(
    [32m"fare"[39m -> [32m71.2833F[39m,
    [32m"parch"[39m -> [32m0[39m,
    [32m"age"[39m -> [32m38.0F[39m,
    [32m"sex"[39m -> [32m"female"[39m,
    [32m"passengerID"[39m -> [32m2[39m,
    [32m"pclass"[39m -> [32m1[39m,
    [32m"sibsp"[39m -> [32m1[39m,
    [32m"embarked"[39m -> [32m'C'[39m,
    [32m"survived"[39m -> [32m1[39m
  ),
  [33mMap[39m(
    [32m"fare"[39m -> [32m7.925F[39m,
    [32m"parch"

transform every attribute to categorical attributes


Age -> into 4 classes
    infant=0 => (age <= 4)
    child=1  => (age <= 15)
    adult=2  => (age <= 50)
    old=3    => (age > 50)
    
    
fare (costs) -> into 3 classes

In [19]:
def categorize_data(data:List[Map[String,Any]]):List[Map[String,Any]]={
    // fare
    val nrClasses = 3
    val sorted_train = data.sortBy(x => x("fare").asInstanceOf[Float])
    val sizeSet = sorted_train.size
    val sizeIntervals = sizeSet/nrClasses
    val intervals = for (x <- 1 to nrClasses-1) yield sorted_train(x*sizeIntervals)("fare").asInstanceOf[Float]
    val new_Train= for(pass <- sorted_train) yield pass.updated("fareClass",intervals.count(_ < pass("fare").asInstanceOf[Float]))
    
    // age
    val new_Train_2 = for(passenger <- new_Train) yield passenger.updated("ageClass", passenger("age").asInstanceOf[Float] match {
      case x if (x <= 4) => 0
      case x if (x > 4) && (x <= 15) => 1
      case x if (x > 15) && (x <= 50) => 2
      case x if (x > 50) => 3
      case _ => -1
    })
    
    //drop the fare & age feature, because categorized into fareClass & ageClass
    val train_no_fare = new_Train_2.map(passenger => passenger.-("fare"))
    val train_no_age = train_no_fare.map(passenger => passenger.-("age"))
    val train_final = train_no_age
    train_final
}

val train_final = categorize_data(train_no_id)

defined [32mfunction[39m [36mcategorize_data[39m
[36mtrain_final[39m: [32mList[39m[[32mMap[39m[[32mString[39m, [32mAny[39m]] = [33mList[39m(
  [33mMap[39m(
    [32m"parch"[39m -> [32m0[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"ageClass"[39m -> [32m2[39m,
    [32m"pclass"[39m -> [32m3[39m,
    [32m"sibsp"[39m -> [32m0[39m,
    [32m"fareClass"[39m -> [32m0[39m,
    [32m"embarked"[39m -> [32m'S'[39m,
    [32m"survived"[39m -> [32m0[39m
  ),
  [33mMap[39m(
    [32m"parch"[39m -> [32m0[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"ageClass"[39m -> [32m2[39m,
    [32m"pclass"[39m -> [32m1[39m,
    [32m"sibsp"[39m -> [32m0[39m,
    [32m"fareClass"[39m -> [32m0[39m,
    [32m"embarked"[39m -> [32m'S'[39m,
    [32m"survived"[39m -> [32m0[39m
  ),
  [33mMap[39m(
    [32m"parch"[39m -> [32m0[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"ageClass"[39m -> [32m2[39m,
    [32m"pclass"

# NAIVE BAYES IMPLEMENTATION

In [20]:
val data_passengers_survived = train_final.filter(passenger => passenger("survived").equals(1))
val data_passengers_not_survived = train_final.filter(passenger => passenger("survived").equals(0))

[36mdata_passengers_survived[39m: [32mList[39m[[32mMap[39m[[32mString[39m, [32mAny[39m]] = [33mList[39m(
  [33mMap[39m(
    [32m"parch"[39m -> [32m0[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"ageClass"[39m -> [32m2[39m,
    [32m"pclass"[39m -> [32m3[39m,
    [32m"sibsp"[39m -> [32m0[39m,
    [32m"fareClass"[39m -> [32m0[39m,
    [32m"embarked"[39m -> [32m'S'[39m,
    [32m"survived"[39m -> [32m1[39m
  ),
  [33mMap[39m(
    [32m"parch"[39m -> [32m0[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"ageClass"[39m -> [32m2[39m,
    [32m"pclass"[39m -> [32m3[39m,
    [32m"sibsp"[39m -> [32m0[39m,
    [32m"fareClass"[39m -> [32m0[39m,
    [32m"embarked"[39m -> [32m'S'[39m,
    [32m"survived"[39m -> [32m1[39m
  ),
  [33mMap[39m(
    [32m"parch"[39m -> [32m0[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"ageClass"[39m -> [32m2[39m,
    [32m"pclass"[39m -> [32m3[39m,
    [32m"sibsp"[3

In [21]:
def count_occurrence_of_att(data:List[Map[String,Any]]):Map[(String,Any),Int] = {
    data.flatten.groupBy(identity).mapValues(_.size)
}

val occ_survived = count_occurrence_of_att(data_passengers_survived)
val occ_not_survived = count_occurrence_of_att(data_passengers_not_survived)

defined [32mfunction[39m [36mcount_occurrence_of_att[39m
[36mocc_survived[39m: [32mMap[39m[([32mString[39m, [32mAny[39m), [32mInt[39m] = [33mMap[39m(
  ([32m"sex"[39m, [32m"male"[39m) -> [32m109[39m,
  ([32m"survived"[39m, [32m1[39m) -> [32m342[39m,
  ([32m"sibsp"[39m, [32m1[39m) -> [32m112[39m,
  ([32m"embarked"[39m, [32m'C'[39m) -> [32m93[39m,
  ([32m"parch"[39m, [32m5[39m) -> [32m1[39m,
  ([32m"sibsp"[39m, [32m2[39m) -> [32m13[39m,
  ([32m"fareClass"[39m, [32m2[39m) -> [32m165[39m,
  ([32m"embarked"[39m, [32m'S'[39m) -> [32m217[39m,
  ([32m"parch"[39m, [32m3[39m) -> [32m3[39m,
  ([32m"ageClass"[39m, [32m2[39m) -> [32m244[39m,
  ([32m"ageClass"[39m, [32m3[39m) -> [32m36[39m,
  ([32m"pclass"[39m, [32m2[39m) -> [32m87[39m,
  ([32m"parch"[39m, [32m2[39m) -> [32m40[39m,
  ([32m"sibsp"[39m, [32m3[39m) -> [32m4[39m,
  ([32m"ageClass"[39m, [32m1[39m) -> [32m33[39m,
  ([32m"pclass"[39m

Creating 2 maps (for class survived & not survived) containing the conditional probabilities

In [22]:
// helper get the number of different values for specific attribute
// ex: attribute pclass has 3 different values 
def getNbOfDiffValuesOfAttr(data:List[Map[String,Any]],att:String):Int = {
    data.map(passenger => passenger(att)).distinct.size
}


// smoothing
// conditional probability of each attribute
// p(att)=(#(d^c)+1)/(#(c)+m) with m = nb of different attribute values
def conditional_prob_of_att(occ:Map[(String,Any),Int],survived:Int):Map[(String,Any),Double]= {
      occ.map(x 
              => 
              ((x._1._1,x._1._2), ((x._2+1)/(occ("survived",survived)+getNbOfDiffValuesOfAttr(train_final,x._1._1)).asInstanceOf[Double])))
}

val prob_survived = conditional_prob_of_att(occ_survived,1)
val prob_not_survived = conditional_prob_of_att(occ_not_survived,0)

defined [32mfunction[39m [36mgetNbOfDiffValuesOfAttr[39m
defined [32mfunction[39m [36mconditional_prob_of_att[39m
[36mprob_survived[39m: [32mMap[39m[([32mString[39m, [32mAny[39m), [32mDouble[39m] = [33mMap[39m(
  ([32m"sex"[39m, [32m"male"[39m) -> [32m0.31976744186046513[39m,
  ([32m"survived"[39m, [32m1[39m) -> [32m0.997093023255814[39m,
  ([32m"sibsp"[39m, [32m1[39m) -> [32m0.3237822349570201[39m,
  ([32m"embarked"[39m, [32m'C'[39m) -> [32m0.27246376811594203[39m,
  ([32m"parch"[39m, [32m5[39m) -> [32m0.0057306590257879654[39m,
  ([32m"sibsp"[39m, [32m2[39m) -> [32m0.04011461318051576[39m,
  ([32m"fareClass"[39m, [32m2[39m) -> [32m0.4811594202898551[39m,
  ([32m"embarked"[39m, [32m'S'[39m) -> [32m0.6318840579710145[39m,
  ([32m"parch"[39m, [32m3[39m) -> [32m0.011461318051575931[39m,
  ([32m"ageClass"[39m, [32m2[39m) -> [32m0.708092485549133[39m,
  ([32m"ageClass"[39m, [32m3[39m) -> [32m0.106936416184

# PREDICTION

TestData preparation

Check for missing values

In [23]:
val missing_values = countAllMissingValues(test,attList)

[36mmissing_values[39m: [32mMap[39m[[32mString[39m, [32mInt[39m] = [33mMap[39m(
  [32m"name"[39m -> [32m0[39m,
  [32m"fare"[39m -> [32m1[39m,
  [32m"parch"[39m -> [32m0[39m,
  [32m"age"[39m -> [32m86[39m,
  [32m"ticket"[39m -> [32m0[39m,
  [32m"cabin"[39m -> [32m327[39m,
  [32m"sex"[39m -> [32m0[39m,
  [32m"passengerID"[39m -> [32m0[39m,
  [32m"pclass"[39m -> [32m0[39m,
  [32m"sibsp"[39m -> [32m0[39m,
  [32m"embarked"[39m -> [32m0[39m,
  [32m"survived"[39m -> [32m0[39m
)

Add missing value fare manually (only 1 missing value)

In [24]:
val test_1 = test.map(passenger => 
                      if (!passenger.contains("fare")){
                        (passenger.+("fare" -> 12.5F))
                    }else{
                        passenger
                    })

[36mtest_1[39m: [32mList[39m[[32mMap[39m[[32mString[39m, [32mAny[39m]] = [33mList[39m(
  [33mMap[39m(
    [32m"name"[39m -> [32m"Kelly, Mr. James"[39m,
    [32m"fare"[39m -> [32m7.8292F[39m,
    [32m"parch"[39m -> [32m0[39m,
    [32m"age"[39m -> [32m34.5F[39m,
    [32m"ticket"[39m -> [32m"330911"[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"passengerID"[39m -> [32m892[39m,
    [32m"pclass"[39m -> [32m3[39m,
    [32m"sibsp"[39m -> [32m0[39m,
    [32m"embarked"[39m -> [32m'Q'[39m,
    [32m"survived"[39m -> [32m-1[39m
  ),
  [33mMap[39m(
    [32m"name"[39m -> [32m"Wilkes, Mrs. James (Ellen Needs)"[39m,
    [32m"fare"[39m -> [32m7.0F[39m,
    [32m"parch"[39m -> [32m0[39m,
    [32m"age"[39m -> [32m47.0F[39m,
    [32m"ticket"[39m -> [32m"363272"[39m,
    [32m"sex"[39m -> [32m"female"[39m,
    [32m"passengerID"[39m -> [32m893[39m,
    [32m"pclass"[39m -> [32m3[39m,
    [32m"sibsp"[39m -> [32m1

In [25]:
val test_data = data_preparation(test_1)

[36mtest_data[39m: [32mList[39m[[32mMap[39m[[32mString[39m, [32mAny[39m]] = [33mList[39m(
  [33mMap[39m(
    [32m"fare"[39m -> [32m7.8292F[39m,
    [32m"parch"[39m -> [32m0[39m,
    [32m"age"[39m -> [32m34.5F[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"passengerID"[39m -> [32m892[39m,
    [32m"pclass"[39m -> [32m3[39m,
    [32m"sibsp"[39m -> [32m0[39m,
    [32m"embarked"[39m -> [32m'Q'[39m,
    [32m"survived"[39m -> [32m-1[39m
  ),
  [33mMap[39m(
    [32m"fare"[39m -> [32m7.0F[39m,
    [32m"parch"[39m -> [32m0[39m,
    [32m"age"[39m -> [32m47.0F[39m,
    [32m"sex"[39m -> [32m"female"[39m,
    [32m"passengerID"[39m -> [32m893[39m,
    [32m"pclass"[39m -> [32m3[39m,
    [32m"sibsp"[39m -> [32m1[39m,
    [32m"embarked"[39m -> [32m'S'[39m,
    [32m"survived"[39m -> [32m-1[39m
  ),
  [33mMap[39m(
    [32m"fare"[39m -> [32m9.6875F[39m,
    [32m"parch"[39m -> [32m0[39m,
    [32m"age"[39m

Change the data to only categorical 

In [26]:
val data_test_2 = categorize_data(test_data)

[36mdata_test_2[39m: [32mList[39m[[32mMap[39m[[32mString[39m, [32mAny[39m]] = [33mList[39m(
  [33mMap[39m(
    [32m"parch"[39m -> [32m0[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"ageClass"[39m -> [32m2[39m,
    [32m"passengerID"[39m -> [32m1158[39m,
    [32m"pclass"[39m -> [32m1[39m,
    [32m"sibsp"[39m -> [32m0[39m,
    [32m"fareClass"[39m -> [32m0[39m,
    [32m"embarked"[39m -> [32m'S'[39m,
    [32m"survived"[39m -> [32m-1[39m
  ),
  [33mMap[39m(
    [32m"parch"[39m -> [32m0[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"ageClass"[39m -> [32m2[39m,
    [32m"passengerID"[39m -> [32m1264[39m,
    [32m"pclass"[39m -> [32m1[39m,
    [32m"sibsp"[39m -> [32m0[39m,
    [32m"fareClass"[39m -> [32m0[39m,
    [32m"embarked"[39m -> [32m'S'[39m,
    [32m"survived"[39m -> [32m-1[39m
  ),
  [33mMap[39m(
    [32m"parch"[39m -> [32m1[39m,
    [32m"sex"[39m -> [32m"male"[39m,
    [32m"ageCla

In [27]:
def predict_for_passenger(prob_survived:Map[(String, Any), Double],
                            prob_not_survived:Map[(String, Any), Double],
                            passenger:Map[String,Any]):(Int,Int) = {
    val passenger_id  = passenger("passengerID").asInstanceOf[Int]
    val data_size = prob_survived.size.asInstanceOf[Double] + prob_not_survived.size.asInstanceOf[Double]
    val prior_prob_survived = (prob_survived.size/data_size).asInstanceOf[Double]
    val prior_prob_not_survived = (prob_not_survived.size/data_size).asInstanceOf[Double]
    
    // only probabilities relevant for passenger (class survived)
    val tmp_prob_survived = prob_survived.filter(x => x._1._2.equals(passenger(x._1._1))).
        values.map(x=>scala.math.log(x))
    // only probabilities relevant for passenger (class not survived)
    val tmp_prob_not_survived = prob_not_survived.filter(x => x._1._2.equals(passenger(x._1._1))).
        values.map(x=>scala.math.log(x))
    
    val res_survived = scala.math.log(prior_prob_survived) + tmp_prob_survived.foldLeft(0.0)(_+_)
    val res_not_survived = scala.math.log(prior_prob_not_survived) + tmp_prob_not_survived.foldLeft(0.0)(_+_)
    if(res_survived > res_not_survived){
        (passenger_id,1)
    }else{
        (passenger_id,0)
    }
}

defined [32mfunction[39m [36mpredict_for_passenger[39m

In [28]:
// prediction Seq[(ID,CLASS)]
// ID of passenger
// Class => predicted class for passenger (survived/not survived)
def predict(prob_survived:Map[(String, Any), Double],
            prob_not_survived:Map[(String, Any), Double],
            test_data:List[Map[String,Any]]):Seq[(Int,Int)] = {
    test_data.map(passenger => predict_for_passenger(prob_survived,prob_not_survived,passenger))
}

defined [32mfunction[39m [36mpredict[39m

In [29]:
val prediction_for_test_data = predict(prob_survived,prob_not_survived,data_test_2)

[36mprediction_for_test_data[39m: [32mSeq[39m[([32mInt[39m, [32mInt[39m)] = [33mList[39m(
  ([32m1158[39m, [32m0[39m),
  ([32m1264[39m, [32m0[39m),
  ([32m913[39m, [32m0[39m),
  ([32m1008[39m, [32m0[39m),
  ([32m1025[39m, [32m0[39m),
  ([32m1124[39m, [32m0[39m),
  ([32m1183[39m, [32m0[39m),
  ([32m893[39m, [32m0[39m),
  ([32m1055[39m, [32m0[39m),
  ([32m1103[39m, [32m0[39m),
  ([32m1217[39m, [32m0[39m),
  ([32m909[39m, [32m0[39m),
  ([32m911[39m, [32m1[39m),
  ([32m919[39m, [32m0[39m),
  ([32m1028[39m, [32m0[39m),
  ([32m1063[39m, [32m0[39m),
  ([32m1129[39m, [32m0[39m),
  ([32m1166[39m, [32m0[39m),
  ([32m1203[39m, [32m0[39m),
  ([32m1224[39m, [32m0[39m),
  ([32m900[39m, [32m1[39m),
  ([32m927[39m, [32m0[39m),
  ([32m1065[39m, [32m0[39m),
  ([32m1180[39m, [32m0[39m),
  ([32m1184[39m, [32m0[39m),
  ([32m1213[39m, [32m0[39m),
  ([32m1229[39m, [32m0[39m),
  ([32m1231[39

In [58]:
// submit file for kaggle
def createSubmitFile[ID, CLASS](filename: String, data: Seq[(ID, CLASS)], header: String): Unit = {
    val pw = new PrintWriter(filename)
    pw.println(header)
    data.foreach(e => pw.println(e._1.toString + "," + e._2.toString))
    pw.close()
  }

createSubmitFile("mySubmission.csv",prediction_for_test_data,"PassengerId,Survived")

defined [32mfunction[39m [36mcreateSubmitFile[39m