In [1]:
import findspark
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructField,StructType,IntegerType,StringType,TimestampType,ArrayType,DoubleType
from pyspark.ml.feature import VectorAssembler,VectorIndexer,StringIndexer,IndexToString
from pyspark.ml.stat import Correlation
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from pyspark.ml.evaluation import *
from pyspark.ml.classification import LogisticRegression

In [2]:
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1) 

  after removing the cwd from sys.path.


In [3]:
findspark.init()

In [130]:
spark.stop()

In [4]:
# By default 12 executors based on CPU core if not specified
spark=SparkSession.builder.appName("SparkML").master("local[4]").getOrCreate()
sc=spark.sparkContext

In [89]:
rdd1=sc.textFile("data/iris/iris.data")
rdd1=rdd1.map(lambda x: x.split(","))
rdd1=rdd1.map(lambda x: Row(sepal_l=float(x[0]), sepal_w=float(x[1]),petal_l=float(x[2]),petal_w=float(x[3]),cls=x[4]))
rdd1.collect()

[Row(cls='Iris-setosa', petal_l=1.4, petal_w=0.2, sepal_l=5.1, sepal_w=3.5),
 Row(cls='Iris-setosa', petal_l=1.4, petal_w=0.2, sepal_l=4.9, sepal_w=3.0),
 Row(cls='Iris-setosa', petal_l=1.3, petal_w=0.2, sepal_l=4.7, sepal_w=3.2),
 Row(cls='Iris-setosa', petal_l=1.5, petal_w=0.2, sepal_l=4.6, sepal_w=3.1),
 Row(cls='Iris-setosa', petal_l=1.4, petal_w=0.2, sepal_l=5.0, sepal_w=3.6),
 Row(cls='Iris-setosa', petal_l=1.7, petal_w=0.4, sepal_l=5.4, sepal_w=3.9),
 Row(cls='Iris-setosa', petal_l=1.4, petal_w=0.3, sepal_l=4.6, sepal_w=3.4),
 Row(cls='Iris-setosa', petal_l=1.5, petal_w=0.2, sepal_l=5.0, sepal_w=3.4),
 Row(cls='Iris-setosa', petal_l=1.4, petal_w=0.2, sepal_l=4.4, sepal_w=2.9),
 Row(cls='Iris-setosa', petal_l=1.5, petal_w=0.1, sepal_l=4.9, sepal_w=3.1),
 Row(cls='Iris-setosa', petal_l=1.5, petal_w=0.2, sepal_l=5.4, sepal_w=3.7),
 Row(cls='Iris-setosa', petal_l=1.6, petal_w=0.2, sepal_l=4.8, sepal_w=3.4),
 Row(cls='Iris-setosa', petal_l=1.4, petal_w=0.1, sepal_l=4.8, sepal_w=3.0),

In [90]:
schema1=StructType([
    StructField("sepal_l",DoubleType(),False),
    StructField("sepal_w",DoubleType(),False),
    StructField("petal_l",DoubleType(),False),
    StructField("petal_w",DoubleType(),False),
    StructField("cls",StringType(),False),
])
df1=spark.createDataFrame(rdd1,schema1)
print(df1.printSchema())
print(df1.show(5))

root
 |-- sepal_l: double (nullable = false)
 |-- sepal_w: double (nullable = false)
 |-- petal_l: double (nullable = false)
 |-- petal_w: double (nullable = false)
 |-- cls: string (nullable = false)

None
+-------+-------+-------+-------+-----------+
|sepal_l|sepal_w|petal_l|petal_w|        cls|
+-------+-------+-------+-------+-----------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|
|    4.6|    3.1|    1.5|    0.2|Iris-setosa|
|    5.0|    3.6|    1.4|    0.2|Iris-setosa|
+-------+-------+-------+-------+-----------+
only showing top 5 rows

None


In [91]:
df1.stat.corr("sepal_l","sepal_w")

-0.10936924995064869

In [92]:
assembler1 = VectorAssembler(inputCols = ["sepal_l", "sepal_w", "petal_l", "petal_w"], outputCol = "features")
assembled1 = assembler1.transform(df1)
assembled1.show(3)
Correlation.corr(assembled1, "features").toPandas()

+-------+-------+-------+-------+-----------+-----------------+
|sepal_l|sepal_w|petal_l|petal_w|        cls|         features|
+-------+-------+-------+-------+-----------+-----------------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
+-------+-------+-------+-------+-----------+-----------------+
only showing top 3 rows



Unnamed: 0,pearson(features)
0,"DenseMatrix([[ 1. , -0.10936925, 0.87175416, 0.81795363],\n [-0.10936925, 1. , -0.4205161 , -0.35654409],\n [ 0.87175416, -0.4205161 , 1. , 0.9627571 ],\n [ 0.81795363, -0.35654409, 0.9627571 , 1. ]])"


In [93]:
featureIndexer=VectorIndexer(inputCol="features",outputCol="indexedFeatures",
                             maxCategories=2)

In [94]:
string_to_idx=StringIndexer(inputCol="cls",outputCol="indexedCls")

In [95]:
lr=LogisticRegression(featuresCol="indexedFeatures",labelCol="indexedCls")

In [96]:
idx_to_string=IndexToString(inputCol="prediction",outputCol="predCls")

In [108]:
df_featureIndexer1_fit=featureIndexer.fit(assembled1)
df_featureIndexer1_tran=df_featureIndexer1_fit.transform(assembled1)
df_featureIndexer1_tran.show(5,True)

df_string_to_idx1_fit=string_to_idx.fit(df_featureIndexer1)
df_string_to_idx1_tran=df_string_to_idx1_fit.transform(df_featureIndexer1)
df_string_to_idx1_tran.show(5)

df_lr_fit=lr.fit(df_string_to_idx1_tran)
df_lr_trans=df_lr_fit.transform(df_string_to_idx1_tran)
df_lr_trans.show(5)

idx_to_string.setLabels(df_string_to_idx1_fit.labels)
df_idx_to_string_trans=idx_to_string.transform(df_lr)
df_idx_to_string_trans.show(5)

+-------+-------+-------+-------+-----------+-----------------+-----------------+
|sepal_l|sepal_w|petal_l|petal_w|        cls|         features|  indexedFeatures|
+-------+-------+-------+-------+-----------+-----------------+-----------------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|[5.1,3.5,1.4,0.2]|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|[4.9,3.0,1.4,0.2]|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|[4.7,3.2,1.3,0.2]|
|    4.6|    3.1|    1.5|    0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|[4.6,3.1,1.5,0.2]|
|    5.0|    3.6|    1.4|    0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|[5.0,3.6,1.4,0.2]|
+-------+-------+-------+-------+-----------+-----------------+-----------------+
only showing top 5 rows

+-------+-------+-------+-------+-----------+-----------------+-----------------+----------+
|sepal_l|sepal_w|petal_l|petal_w|        cls|         features|  indexedFeatures|indexedCls|
+-------+-------+-------+-------+-----------+------

In [98]:
assembled2=assembled1.drop("sepal_l","sepal_w","petal_l","petal_w")
(trainingData,testData)=assembled2.randomSplit([0.6,0.4])
trainingData.show(5)
testData.show(5)

+-----------+-----------------+
|        cls|         features|
+-----------+-----------------+
|Iris-setosa|[4.4,2.9,1.4,0.2]|
|Iris-setosa|[4.5,2.3,1.3,0.3]|
|Iris-setosa|[4.6,3.2,1.4,0.2]|
|Iris-setosa|[4.6,3.4,1.4,0.3]|
|Iris-setosa|[4.6,3.6,1.0,0.2]|
+-----------+-----------------+
only showing top 5 rows

+-----------+-----------------+
|        cls|         features|
+-----------+-----------------+
|Iris-setosa|[4.3,3.0,1.1,0.1]|
|Iris-setosa|[4.4,3.0,1.3,0.2]|
|Iris-setosa|[4.4,3.2,1.3,0.2]|
|Iris-setosa|[4.6,3.1,1.5,0.2]|
|Iris-setosa|[4.7,3.2,1.3,0.2]|
+-----------+-----------------+
only showing top 5 rows



In [99]:
pipeline1=Pipeline(stages=[string_to_idx,featureIndexer,lr,idx_to_string])
model1=pipeline1.fit(trainingData)
predictions1=model1.transform(testData)
predictions1.select("features","cls","predCls").show(5)

+-----------------+-----------+-----------+
|         features|        cls|    predCls|
+-----------------+-----------+-----------+
|[4.3,3.0,1.1,0.1]|Iris-setosa|Iris-setosa|
|[4.4,3.0,1.3,0.2]|Iris-setosa|Iris-setosa|
|[4.4,3.2,1.3,0.2]|Iris-setosa|Iris-setosa|
|[4.6,3.1,1.5,0.2]|Iris-setosa|Iris-setosa|
|[4.7,3.2,1.3,0.2]|Iris-setosa|Iris-setosa|
+-----------------+-----------+-----------+
only showing top 5 rows



## Transform: 
A feature transformer might take a DataFrame, read a column (e.g., text), map it into a new column (e.g., feature vectors), and output a new DataFrame with the mapped column appended.
A learning model might take a DataFrame, read the column containing feature vectors, predict the label for each feature vector, and output a new DataFrame with predicted labels appended as a column.

In [106]:
print(df_featureIndexer1_fit, df_featureIndexer1_tran)
print(df_string_to_idx1_fit, df_string_to_idx1_tran)
print(df_lr_fit, df_lr_trans)
print(df_idx_to_string_trans)
print(featureIndexer, string_to_idx, lr, idx_to_string)
print(pipeline1, model1, predictions1)

VectorIndexer_3f1d47cc302b DataFrame[sepal_l: double, sepal_w: double, petal_l: double, petal_w: double, cls: string, features: vector, indexedFeatures: vector]
StringIndexer_3ba1958b1b52 DataFrame[sepal_l: double, sepal_w: double, petal_l: double, petal_w: double, cls: string, features: vector, indexedFeatures: vector, indexedCls: double]
LogisticRegressionModel: uid = LogisticRegression_386b67b93762, numClasses = 3, numFeatures = 4 DataFrame[sepal_l: double, sepal_w: double, petal_l: double, petal_w: double, cls: string, features: vector, indexedFeatures: vector, indexedCls: double, rawPrediction: vector, probability: vector, prediction: double]
DataFrame[sepal_l: double, sepal_w: double, petal_l: double, petal_w: double, cls: string, features: vector, indexedFeatures: vector, indexedCls: double, rawPrediction: vector, probability: vector, prediction: double, predCls: string]
VectorIndexer_3f1d47cc302b StringIndexer_3ba1958b1b52 LogisticRegression_386b67b93762 IndexToString_a8550e037

In [84]:
evaluator=MulticlassClassificationEvaluator(
    labelCol="indexedCls",
    predictionCol="prediction",metricName="accuracy"
)
accu=evaluator.evaluate(predictions)
print("Test Error: %g, "%(1-accu))

Test Error: 0.0441176, 


In [85]:
predictions.filter(predictions.predCls!=predictions.cls).select("features","cls","predCls").show(5)

+-----------------+--------------+---------------+
|         features|           cls|        predCls|
+-----------------+--------------+---------------+
|[6.0,3.0,4.8,1.8]|Iris-virginica|Iris-versicolor|
|[6.1,2.6,5.6,1.4]|Iris-virginica|Iris-versicolor|
|[6.3,2.8,5.1,1.5]|Iris-virginica|Iris-versicolor|
+-----------------+--------------+---------------+



In [86]:
from pyspark.ml.linalg import Vectors
dfr = spark.createDataFrame([(Vectors.dense([-1.0, 0.0]),),
    (Vectors.dense([0.0, 1.0]),), (Vectors.dense([0.0, 2.0]),)], ["a"])
dfr.head()

Row(a=DenseVector([-1.0, 0.0]))

In [87]:
indexer = VectorIndexer(maxCategories=2, inputCol="a")
indexer.setOutputCol("indexed")
model2 = indexer.fit(dfr)
indexer.getHandleInvalid()
model2.transform(dfr).head()

Row(a=DenseVector([-1.0, 0.0]), indexed=DenseVector([1.0, 0.0]))

# Binarizer

In [109]:
from pyspark.ml.feature import Binarizer
binarizer=Binarizer(threshold=3.2,inputCol="sepal_w",outputCol="b_sepal_w")
binarizer.transform(df1).show(3)

+-------+-------+-------+-------+-----------+---------+
|sepal_l|sepal_w|petal_l|petal_w|        cls|b_sepal_w|
+-------+-------+-------+-------+-----------+---------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|      1.0|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|      0.0|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|      0.0|
+-------+-------+-------+-------+-----------+---------+
only showing top 3 rows



In [None]:
https://www.youtube.com/watch?v=jEyahxFp3ak