In [1]:
from pyspark.ml import Pipeline
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.functions import col, when, lit
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier


In [2]:
sc

In [3]:
Titanic_train = sqlContext.read.option('header', True) \
    .option('delimiter', ',') \
    .option('inferSchema', True) \
    .csv('file:///home/ckkhandare/spark-3.1.2-bin-hadoop3.2/datasets/train.csv')\
    .withColumn('Sex', when(col('Sex') == 'male', 1.0).otherwise(0.0))

In [4]:
Titanic_Test = sqlContext.read.option('header', True) \
    .option('delimiter', ',') \
    .option('inferSchema', True) \
    .csv('file:///home/ckkhandare/spark-3.1.2-bin-hadoop3.2/datasets/test.csv')\
    .withColumn('Sex', when(col('Sex') == 'male', 1.0).otherwise(0.0))

In [5]:
Titanic_train.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: double (nullable = false)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [6]:
Titanic_train.show()

+-----------+--------+------+--------------------+---+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+---+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|1.0|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|0.0|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|0.0|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|0.0|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|1.0|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|1.0|null|    0|    0|          330877| 8.4583| null|  

In [36]:
Titanic_train = Titanic_train.drop('Cabin').drop('Ticket').drop('Name')

In [37]:
Titanic_train = Titanic_train.drop('PassengerId')

In [38]:
Titanic_train = Titanic_train.drop('Embarked')

In [39]:
Titanic_train.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: double (nullable = false)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)



In [40]:
Titanic_train = Titanic_train.na.drop()

In [41]:
Titanic_train.filter(Titanic_train.Age.isNull()).collect()

[]

In [31]:
Titanic_train.filter(Titanic_train.Parch.isNull()).collect()

[]

In [43]:
Titanic_train.show()

+--------+------+---+----+-----+-----+-------+
|Survived|Pclass|Sex| Age|SibSp|Parch|   Fare|
+--------+------+---+----+-----+-----+-------+
|       0|     3|1.0|22.0|    1|    0|   7.25|
|       1|     1|0.0|38.0|    1|    0|71.2833|
|       1|     3|0.0|26.0|    0|    0|  7.925|
|       1|     1|0.0|35.0|    1|    0|   53.1|
|       0|     3|1.0|35.0|    0|    0|   8.05|
|       0|     1|1.0|54.0|    0|    0|51.8625|
|       0|     3|1.0| 2.0|    3|    1| 21.075|
|       1|     3|0.0|27.0|    0|    2|11.1333|
|       1|     2|0.0|14.0|    1|    0|30.0708|
|       1|     3|0.0| 4.0|    1|    1|   16.7|
|       1|     1|0.0|58.0|    0|    0|  26.55|
|       0|     3|1.0|20.0|    0|    0|   8.05|
|       0|     3|1.0|39.0|    1|    5| 31.275|
|       0|     3|0.0|14.0|    0|    0| 7.8542|
|       1|     2|0.0|55.0|    0|    0|   16.0|
|       0|     3|1.0| 2.0|    4|    1| 29.125|
|       0|     3|0.0|31.0|    1|    0|   18.0|
|       0|     2|1.0|35.0|    0|    0|   26.0|
|       1|   

In [59]:
assembler = VectorAssembler(inputCols=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch','Fare'], outputCol='features')\
    .setHandleInvalid('skip')
vectorDF = assembler.transform(Titanic_train)
vectorDF.printSchema()

root
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: double (nullable = false)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- features: vector (nullable = true)



In [64]:
rf = RandomForestClassifier(labelCol="Survived", featuresCol="features", numTrees=10)

In [65]:
pipeline = Pipeline(stages=[assembler, rf])
pipelineModel = pipeline.fit(Titanic_train)

In [66]:
Titanic_Test.show()

+-----------+---+----+-----+-----+----------------+-------+-----+--------+--------+
|PassengerId|Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Survived|
+-----------+---+----+-----+-----+----------------+-------+-----+--------+--------+
|        892|1.0|34.5|    0|    0|          330911| 7.8292| null|       Q|       1|
|        893|0.0|47.0|    1|    0|          363272|    7.0| null|       S|       1|
|        894|1.0|62.0|    0|    0|          240276| 9.6875| null|       Q|       1|
|        895|1.0|27.0|    0|    0|          315154| 8.6625| null|       S|       1|
|        896|0.0|22.0|    1|    1|         3101298|12.2875| null|       S|       1|
|        897|1.0|14.0|    0|    0|            7538|  9.225| null|       S|       1|
|        898|0.0|30.0|    0|    0|          330972| 7.6292| null|       Q|       1|
|        899|1.0|26.0|    1|    1|          248738|   29.0| null|       S|       1|
|        900|0.0|18.0|    0|    0|            2657| 7.2292| null|       C|  

In [81]:
Titanic_Test = Titanic_Test.drop('Cabin').drop('Ticket').drop('Name').drop('Embarked')

In [82]:
Titanic_Test.show()

+-----------+------+---+----+-----+-----+-------+
|PassengerId|Pclass|Sex| Age|SibSp|Parch|   Fare|
+-----------+------+---+----+-----+-----+-------+
|        892|     3|1.0|34.5|    0|    0| 7.8292|
|        893|     3|0.0|47.0|    1|    0|    7.0|
|        894|     2|1.0|62.0|    0|    0| 9.6875|
|        895|     3|1.0|27.0|    0|    0| 8.6625|
|        896|     3|0.0|22.0|    1|    1|12.2875|
|        897|     3|1.0|14.0|    0|    0|  9.225|
|        898|     3|0.0|30.0|    0|    0| 7.6292|
|        899|     2|1.0|26.0|    1|    1|   29.0|
|        900|     3|0.0|18.0|    0|    0| 7.2292|
|        901|     3|1.0|21.0|    2|    0|  24.15|
|        902|     3|1.0|null|    0|    0| 7.8958|
|        903|     1|1.0|46.0|    0|    0|   26.0|
|        904|     1|0.0|23.0|    1|    0|82.2667|
|        905|     2|1.0|63.0|    1|    0|   26.0|
|        906|     1|0.0|47.0|    1|    0| 61.175|
|        907|     2|0.0|24.0|    1|    0|27.7208|
|        908|     2|1.0|35.0|    0|    0|  12.35|


In [83]:
pipelinePrediction = pipelineModel.transform(Titanic_Test)

In [84]:
pipelinePrediction.summary()

DataFrame[summary: string, PassengerId: string, Pclass: string, Sex: string, Age: string, SibSp: string, Parch: string, Fare: string, prediction: string]

In [85]:
pipelinePrediction.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Sex: double (nullable = false)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [87]:
pipelinePrediction.select('PassengerId','prediction','features').show()

+-----------+----------+--------------------+
|PassengerId|prediction|            features|
+-----------+----------+--------------------+
|        892|       0.0|[3.0,1.0,34.5,0.0...|
|        893|       0.0|[3.0,0.0,47.0,1.0...|
|        894|       0.0|[2.0,1.0,62.0,0.0...|
|        895|       0.0|[3.0,1.0,27.0,0.0...|
|        896|       1.0|[3.0,0.0,22.0,1.0...|
|        897|       0.0|[3.0,1.0,14.0,0.0...|
|        898|       1.0|[3.0,0.0,30.0,0.0...|
|        899|       0.0|[2.0,1.0,26.0,1.0...|
|        900|       1.0|[3.0,0.0,18.0,0.0...|
|        901|       0.0|[3.0,1.0,21.0,2.0...|
|        903|       0.0|[1.0,1.0,46.0,0.0...|
|        904|       1.0|[1.0,0.0,23.0,1.0...|
|        905|       0.0|[2.0,1.0,63.0,1.0...|
|        906|       1.0|[1.0,0.0,47.0,1.0...|
|        907|       1.0|[2.0,0.0,24.0,1.0...|
|        908|       0.0|[2.0,1.0,35.0,0.0...|
|        909|       0.0|[3.0,1.0,21.0,0.0...|
|        910|       1.0|[3.0,0.0,27.0,1.0...|
|        911|       0.0|[3.0,0.0,4