In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark regression example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
df = spark.read.format('csv').options(header='true', inferschema='true').\
load('data/project/classification/titanic/train.csv')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/18 15:24:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/18 15:24:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


<font size=5> Show statistics of each column, including feature columns and label column (medv)  </font>

In [2]:
df.describe().toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
PassengerId,891,446.0,257.3538420152301,1,891
Survived,891,0.3838383838383838,0.48659245426485753,0,1
Pclass,891,2.308641975308642,0.8360712409770491,1,3
Name,891,,,"""Andersson, Mr. August Edvard (""""Wennerstrom"""")""","van Melkebeke, Mr. Philemon"
Sex,891,,,female,male
Age,714,29.69911764705882,14.526497332334035,0.42,80.0
SibSp,891,0.5230078563411896,1.1027434322934315,0,8
Parch,891,0.38159371492704824,0.8060572211299488,0,6
Ticket,891,260318.54916792738,471609.26868834975,110152,WE/P 5735


In [3]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [4]:
catcols=[]
for i in df.dtypes:
    if i[1] == 'string':
        catcols.append(i[0])

In [5]:
num_cols=[]
for i in df.dtypes:
    if i[1] !='string' and i[0] != 'Survived':
        num_cols.append(i[0])

In [6]:
labelCol='Survived'

In [7]:
catcols

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [8]:
num_cols

['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [14]:
df=df.dropna()

In [15]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# categorical columns
categorical_columns = catcols

In [16]:
indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) for c in categorical_columns ]

In [17]:
encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), \
                           outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers ]

In [18]:
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() \
                                       for encoder in encoders] + num_cols, outputCol="features")

In [19]:
pipeline = Pipeline(stages=indexers + encoders + [assembler])
model=pipeline.fit(df)
data = model.transform(df)
data = data.withColumn('label',col(labelCol))
data=data.select('features','label')
data.show(5, truncate=False)

+-------------------------------------------------------------------------------------------+-----+
|features                                                                                   |label|
+-------------------------------------------------------------------------------------------+-----+
|(449,[56,299,401,442,443,444,445,446,448],[1.0,1.0,1.0,1.0,2.0,1.0,38.0,1.0,71.2833])      |1    |
|(449,[74,197,326,441,443,444,445,446,448],[1.0,1.0,1.0,1.0,4.0,1.0,35.0,1.0,53.1])         |1    |
|(449,[115,182,264,433,441,443,444,445,448],[1.0,1.0,1.0,1.0,1.0,7.0,1.0,54.0,51.8625])     |0    |
|(449,[149,223,311,441,443,444,445,446,447,448],[1.0,1.0,1.0,1.0,11.0,3.0,4.0,1.0,1.0,16.7])|1    |
|(449,[35,245,381,441,443,444,445,448],[1.0,1.0,1.0,1.0,12.0,1.0,58.0,26.55])               |1    |
+-------------------------------------------------------------------------------------------+-----+
only showing top 5 rows



In [20]:
from pyspark.ml.feature import VectorIndexer
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous. 
# Update metadata accordingly.
featureIndexer =VectorIndexer(inputCol="features", \
                                  outputCol="indexedFeatures", \
                                  maxCategories=4).fit(data)


In [21]:
data=featureIndexer.transform(data)
data.show(5)

+--------------------+-----+--------------------+
|            features|label|     indexedFeatures|
+--------------------+-----+--------------------+
|(449,[56,299,401,...|    1|(449,[56,299,401,...|
|(449,[74,197,326,...|    1|(449,[74,197,326,...|
|(449,[115,182,264...|    0|(449,[115,182,264...|
|(449,[149,223,311...|    1|(449,[149,223,311...|
|(449,[35,245,381,...|    1|(449,[35,245,381,...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [22]:
# Split the data into training and test sets (40% held out for testing)
(trainingData, testData) = data.randomSplit([0.6, 0.4])
trainingData.show(5,False)
testData.show(5,False)

+----------------------------------------------------------------------------------------------+-----+----------------------------------------------------------------------------------------------+
|features                                                                                      |label|indexedFeatures                                                                               |
+----------------------------------------------------------------------------------------------+-----+----------------------------------------------------------------------------------------------+
|(449,[0,272,315,441,443,444,445,448],[1.0,1.0,1.0,1.0,346.0,2.0,24.0,13.0])                   |1    |(449,[0,272,315,441,443,444,445,448],[1.0,1.0,1.0,1.0,346.0,1.0,24.0,13.0])                   |
|(449,[1,252,350,442,443,444,445,446,448],[1.0,1.0,1.0,1.0,557.0,1.0,48.0,1.0,39.6])           |1    |(449,[1,252,350,442,443,444,445,446,448],[1.0,1.0,1.0,1.0,557.0,0.0,48.0,1.0,39.6])           |
|(449,[2,1

<font size=5>

Let's do Naive Baynes first
    
</font>

In [23]:
from pyspark.ml.classification import NaiveBayes
naivebayes = NaiveBayes(featuresCol="indexedFeatures", labelCol="label")

In [25]:
nb_model=naivebayes.fit(trainingData)

<font size=5>

Test the model with testData

  
    
</font>

In [27]:
nb_predictions = nb_model.transform(testData)
nb_predictions.select("prediction","label","indexedFeatures").show(5)


+----------+-----+--------------------+
|prediction|label|     indexedFeatures|
+----------+-----+--------------------+
|       0.0|    1|(449,[3,182,290,3...|
|       1.0|    1|(449,[8,191,321,4...|
|       1.0|    1|(449,[9,182,187,3...|
|       1.0|    0|(449,[10,187,312,...|
|       1.0|    0|(449,[11,187,312,...|
+----------+-----+--------------------+
only showing top 5 rows



In [30]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(nb_predictions)
print(f"Accuracy = {accuracy}")
print("Test Error = %g" % (1.0 - accuracy))


Accuracy = 0.7
Test Error = 0.3


In [32]:
print('training data (f1):', evaluator.setMetricName('f1').evaluate(nb_predictions), "\n",
     'training data (weightedPrecision): ', evaluator.setMetricName('weightedPrecision').evaluate(nb_predictions),"\n",
     'training data (weightedRecall): ', evaluator.setMetricName('weightedRecall').evaluate(nb_predictions),"\n",
     'training data (accuracy): ', evaluator.setMetricName('accuracy').evaluate(nb_predictions))


training data (f1): 0.7012557832121613 
 training data (weightedPrecision):  0.7027972027972028 
 training data (weightedRecall):  0.7 
 training data (accuracy):  0.7


<font size=5>

Let's do Logistic Regression
    
</font>

In [33]:
from pyspark.ml.classification import LogisticRegression
logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='label')

In [34]:
lg_model=logr.fit(trainingData)

23/05/18 15:39:56 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/18 15:39:56 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/05/18 15:39:56 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/05/18 15:39:56 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [35]:
lg_predictions = lg_model.transform(testData)
lg_predictions.select("prediction","label","indexedFeatures").show(5)


+----------+-----+--------------------+
|prediction|label|     indexedFeatures|
+----------+-----+--------------------+
|       1.0|    1|(449,[3,182,290,3...|
|       1.0|    1|(449,[8,191,321,4...|
|       1.0|    1|(449,[9,182,187,3...|
|       1.0|    0|(449,[10,187,312,...|
|       1.0|    0|(449,[11,187,312,...|
+----------+-----+--------------------+
only showing top 5 rows



In [37]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(lg_predictions)
print(f"Accuracy = {accuracy}")
print("Test Error = %g" % (1.0 - accuracy))


Accuracy = 0.5857142857142857
Test Error = 0.414286


<font size=5>

Let's do Decision Tree
    
</font>

In [38]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol='indexedFeatures', labelCol='label')

In [39]:
dt_model=dt.fit(trainingData)

In [41]:
dt_predictions=dt_model.transform(testData)
dt_predictions.select("prediction","label","indexedFeatures").show(5)


+----------+-----+--------------------+
|prediction|label|     indexedFeatures|
+----------+-----+--------------------+
|       0.0|    1|(449,[3,182,290,3...|
|       1.0|    1|(449,[8,191,321,4...|
|       1.0|    1|(449,[9,182,187,3...|
|       1.0|    0|(449,[10,187,312,...|
|       1.0|    0|(449,[11,187,312,...|
+----------+-----+--------------------+
only showing top 5 rows



In [42]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(lg_predictions)
print(f"Accuracy = {accuracy}")
print("Test Error = %g" % (1.0 - accuracy))


Accuracy = 0.5857142857142857
Test Error = 0.414286


<font size=5>

Let's do Linear Support Vector Machine
    
</font>

In [43]:
from pyspark.ml.classification import LinearSVC
lsvc = LinearSVC(featuresCol="indexedFeatures", labelCol="label", maxIter=50)

In [44]:
svc_model=lsvc.fit(trainingData)

In [46]:
svc_predictions=svc_model.transform(testData)
svc_predictions.select("prediction","label","indexedFeatures").show(5)

+----------+-----+--------------------+
|prediction|label|     indexedFeatures|
+----------+-----+--------------------+
|       1.0|    1|(449,[3,182,290,3...|
|       1.0|    1|(449,[8,191,321,4...|
|       1.0|    1|(449,[9,182,187,3...|
|       1.0|    0|(449,[10,187,312,...|
|       1.0|    0|(449,[11,187,312,...|
+----------+-----+--------------------+
only showing top 5 rows



In [47]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(svc_predictions)
print(f"Accuracy = {accuracy}")
print("Test Error = %g" % (1.0 - accuracy))


Accuracy = 0.7428571428571429
Test Error = 0.257143


<font size=5>

Let's do Random Forest
    
</font>

In [49]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol='indexedFeatures', labelCol='label')

In [50]:
rf_model=rf.fit(trainingData)

In [51]:
rf_predictions=rf_model.transform(testData)
rf_predictions.select("prediction","label","indexedFeatures").show(5)

+----------+-----+--------------------+
|prediction|label|     indexedFeatures|
+----------+-----+--------------------+
|       1.0|    1|(449,[3,182,290,3...|
|       1.0|    1|(449,[8,191,321,4...|
|       1.0|    1|(449,[9,182,187,3...|
|       1.0|    0|(449,[10,187,312,...|
|       1.0|    0|(449,[11,187,312,...|
+----------+-----+--------------------+
only showing top 5 rows



In [52]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(rf_predictions)
print(f"Accuracy = {accuracy}")
print("Test Error = %g" % (1.0 - accuracy))


Accuracy = 0.6428571428571429
Test Error = 0.357143


<font size=5>

Let's do Gradient Boosted Tree
    
</font>

In [53]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(featuresCol='indexedFeatures', labelCol='label')

In [54]:
gbt_model=gbt.fit(trainingData)

In [56]:
gbt_predictions=gbt_model.transform(testData)
gbt_predictions.select("prediction","label","indexedFeatures").show(5)

+----------+-----+--------------------+
|prediction|label|     indexedFeatures|
+----------+-----+--------------------+
|       0.0|    1|(449,[3,182,290,3...|
|       1.0|    1|(449,[8,191,321,4...|
|       1.0|    1|(449,[9,182,187,3...|
|       1.0|    0|(449,[10,187,312,...|
|       1.0|    0|(449,[11,187,312,...|
+----------+-----+--------------------+
only showing top 5 rows



In [58]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(gbt_predictions)
print(f"Accuracy = {accuracy}")
print("Test Error = %g" % (1.0 - accuracy))

Accuracy = 0.8142857142857143
Test Error = 0.185714


<font size=5>

This concludes the testing of Spark ML Classifier

</font>

### conclusion