In [None]:
# Install into the running kernel's environment (%pip, not bare pip) and pin the
# version so a fresh runtime reproduces the same PySpark build.
%pip install pyspark==3.0.1

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/f0/26/198fc8c0b98580f617cb03cb298c6056587b8f0447e20fa40c5b634ced77/pyspark-3.0.1.tar.gz (204.2MB)
[K     |████████████████████████████████| 204.2MB 73kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 43.0MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612242 sha256=c15c3baa32ab8d7a0fe5a6e8a023137e7f7265df1841f85a56f00f479a043374
  Stored in directory: /root/.cache/pip/wheels/5e/bd/07/031766ca628adec8435bb40f0bd83bb676ce65ff4007f8e73f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [None]:
from pyspark.sql import SparkSession

In [None]:
# Start (or reuse) the Spark session for this notebook; the app is named 'titanic'.
spark = (
    SparkSession.builder
    .appName('titanic')
    .getOrCreate()
)

In [None]:
# Load the Titanic CSV: the first row is treated as the header and column
# types are inferred from the data.
data = spark.read.csv(
    '/content/sample_data/titanic.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Show the inferred schema (column names, types, nullability).
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [None]:
# Peek at the first six rows, returned as Row objects.
data.head(6)

[Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S'),
 Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C'),
 Row(PassengerId=3, Survived=1, Pclass=3, Name='Heikkinen, Miss. Laina', Sex='female', Age=26.0, SibSp=0, Parch=0, Ticket='STON/O2. 3101282', Fare=7.925, Cabin=None, Embarked='S'),
 Row(PassengerId=4, Survived=1, Pclass=1, Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Sex='female', Age=35.0, SibSp=1, Parch=0, Ticket='113803', Fare=53.1, Cabin='C123', Embarked='S'),
 Row(PassengerId=5, Survived=0, Pclass=3, Name='Allen, Mr. William Henry', Sex='male', Age=35.0, SibSp=0, Parch=0, Ticket='373450', Fare=8.05, Cabin=None, Embarked='S'),
 Row(PassengerId=6, Survived=0, Pclass=3, Name='Moran, Mr. James', Sex='male',

In [None]:
# List the available column names before choosing the model inputs.
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [None]:
# Keep only the label ('Survived') and the columns that feed the model;
# PassengerId, Name, Ticket and Cabin are left out.
my_data = data.select([
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
])

In [None]:
# Discard every row that has a null in any of the selected columns.
# NOTE: despite its name, my_cols is a DataFrame of complete rows, not a list of columns.
my_cols = my_data.na.drop(how='any')

In [None]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [None]:
# Two-step categorical encoding for the 'Sex' column.
#
# Step 1 - StringIndexer assigns each distinct string a numeric index:
#     A B C  ->  0 1 2
# Step 2 - OneHotEncoder turns that index into a binary indicator vector:
#     with key (A, B, C), the value A becomes [1, 0, 0]
#
# NOTE(review): Spark's OneHotEncoder is believed to drop the last category by
# default, so the real vector may have one slot fewer than this sketch - confirm.
gender_indexer = StringIndexer(outputCol='SexIndex', inputCol='Sex')
gender_encoder = OneHotEncoder(outputCol='SexVec', inputCol='SexIndex')

In [None]:
# Same index-then-one-hot encoding, applied to the 'Embarked' column.
embark_indexer = StringIndexer(
    outputCol='EmbarkIndex',
    inputCol='Embarked',
)
embark_encoder = OneHotEncoder(
    outputCol='EmbarkVec',
    inputCol='EmbarkIndex',
)

In [None]:
# Combine the numeric columns and the two one-hot vectors into the single
# 'features' vector column that the classifier reads.
assembler = VectorAssembler(
    inputCols=[
        'Pclass', 'SexVec', 'EmbarkVec', 'Age', 'SibSp', 'Parch', 'Fare',
    ],
    outputCol='features',
)

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
from pyspark.ml import Pipeline
# Pipeline sets up the stages for the different steps the data has to pass through, in order

In [None]:
# Logistic-regression classifier: predicts 'Survived' from the assembled 'features' vector.
log_titanic = LogisticRegression(
    labelCol='Survived',
    featuresCol='features',
)

In [None]:
# Chain preprocessing and the classifier so they are fitted and applied as one unit.
# Stage order matters: indexers run before their encoders, and the assembler
# runs before the model that consumes its output.
pipeline = Pipeline(
    stages=[
        gender_indexer,
        embark_indexer,
        gender_encoder,
        embark_encoder,
        assembler,
        log_titanic,
    ]
)

In [None]:
# 70/30 train/test split. The seed is fixed so that the split - and every
# metric computed downstream - is the same on each Restart & Run All.
train_data,test_data=my_cols.randomSplit([0.7,0.3],seed=42)

In [None]:
# Fit every pipeline stage (indexers, encoders, assembler, classifier) on the training split.
model=pipeline.fit(train_data)

In [None]:
# Apply the fitted pipeline to the held-out split; the output includes the
# 'prediction' column inspected further down.
results=model.transform(test_data)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Score with the model's continuous 'rawPrediction' output rather than the
# thresholded 0/1 'prediction' labels: hard labels collapse the ROC curve to a
# single operating point and misreport the area under it.
my_eval=BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='Survived')

In [None]:
# Compare true labels with predicted labels side by side (show() prints the first 20 rows).
results.select('Survived','prediction').show()

+--------+----------+
|Survived|prediction|
+--------+----------+
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       1.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
|       0|       0.0|
+--------+----------+
only showing top 20 rows



In [None]:
# Score the test-set predictions with the evaluator.
# NOTE(review): presumably the evaluator's default metric is area under the ROC
# curve, hence the variable name - confirm.
AUC=my_eval.evaluate(results)

In [None]:
# Display the evaluation score as the cell output.
AUC

0.7614863940927117