In [1]:
# Data types in machine learning: dense and sparse vectors (pyspark.ml.linalg)

In [2]:
# findspark has to run before the first pyspark import: init() locates the
# Spark installation and makes its Python bindings importable. Doing it here
# keeps the notebook working under Restart Kernel -> Run All.
import findspark
findspark.init()

from pyspark.ml.linalg import Vector, DenseVector, SparseVector

In [3]:
# A dense vector stores every entry explicitly, zeros included.
dv = DenseVector([1.0, 0.0, 0.0, 0.0, 4.5, 0.0])
dv

DenseVector([1.0, 0.0, 0.0, 0.0, 4.5, 0.0])

In [4]:
# A sparse vector stores only its size plus the positions and values of the
# non-zero entries (here: index 0 -> 1.0 and index 4 -> 4.5).
sv = SparseVector(6, [0, 4], [1.0, 4.5])
sv

SparseVector(6, {0: 1.0, 4: 4.5})

In [5]:
# Convert a sparse vector to a dense vector: toArray() expands the sparse
# vector into a full array (zeros included), which DenseVector then wraps.

DenseVector(sv.toArray())

DenseVector([1.0, 0.0, 0.0, 0.0, 4.5, 0.0])

In [6]:
# Convert a dense vector to a sparse vector, step 1:
# collect {position: value} for every non-zero entry of the dense vector.
active_elements_dict = {
    position: entry
    for position, entry in enumerate(dv)
    if entry != 0
}
active_elements_dict

{0: 1.0, 4: 4.5}

In [7]:
# Step 2: build the sparse vector from the dense vector's length and the
# {index: value} mapping of its non-zero entries.
SparseVector(len(dv), active_elements_dict)

SparseVector(6, {0: 1.0, 4: 4.5})

In [8]:
# Point this Python process at the local Spark installation.
# NOTE(review): init() only affects imports that run after it, so it should
# execute before any pyspark import in the notebook.
import findspark
findspark.init()

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [10]:
import pyspark
from pyspark.sql import SparkSession
# Create (or reuse) a Spark session that runs locally; 'local[*]' asks Spark
# to use as many worker threads as there are logical cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('TEKS-Spark-ML-Example')
    .getOrCreate()
)

In [11]:
# Small text corpus: one row per document, with its id and its raw sentence.
sentenceData = spark.createDataFrame(
    data=[
        (0, 'welcome to spark spark pyspark'),
        (1, 'Python SQL'),
        (2, 'Python SQL Transformation'),
        (3, 'Estimator and Transformer'),
    ],
    schema=['document', 'sentence'],
)

In [12]:
# Print the corpus; truncate=False keeps the full sentence text visible.
sentenceData.show(truncate=False)

+--------+------------------------------+
|document|sentence                      |
+--------+------------------------------+
|0       |welcome to spark spark pyspark|
|1       |Python SQL                    |
|2       |Python SQL Transformation     |
|3       |Estimator and Transformer     |
+--------+------------------------------+



In [13]:
from pyspark.ml.feature import CountVectorizer


# Two tokenised documents to count terms over.
df = spark.createDataFrame(
    data=[
        (0, 'welcome to spark spark pyspark sql'.split(' ')),
        (1, 'pyspark sql welcome'.split(' ')),
    ],
    schema=['id', 'words'],
)

# vocabSize=3 caps the vocabulary at three terms; minDF=2.0 keeps only terms
# that occur in at least two documents.
cv = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=3,
    minDF=2.0,
)

# Learn the vocabulary, then encode each document as a sparse count vector.
model = cv.fit(df)
result = model.transform(df)
result.show(truncate=False)

+---+-----------------------------------------+-------------------------+
|id |words                                    |features                 |
+---+-----------------------------------------+-------------------------+
|0  |[welcome, to, spark, spark, pyspark, sql]|(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[pyspark, sql, welcome]                  |(3,[0,1,2],[1.0,1.0,1.0])|
+---+-----------------------------------------+-------------------------+



In [18]:
from pyspark.ml.feature import CountVectorizer


# NOTE: this cell rebinds df, cv, model and result from the previous example.
# Same idea with a slightly different second document.
df = spark.createDataFrame(
    data=[
        (0, 'welcome to spark spark pyspark sql'.split(' ')),
        (1, 'helo hi pyspark sql welcome'.split(' ')),
    ],
    schema=['id', 'words'],
)

# No vocabSize / minDF given: the vectorizer's defaults apply, so every
# distinct term of this tiny corpus ends up in the vocabulary.
cv = CountVectorizer(
    inputCol='words',
    outputCol='features',
)

# Learn the vocabulary, then encode each document as a sparse count vector.
model = cv.fit(df)
result = model.transform(df)
result.show(truncate=False)

+---+-----------------------------------------+-------------------------------------+
|id |words                                    |features                             |
+---+-----------------------------------------+-------------------------------------+
|0  |[welcome, to, spark, spark, pyspark, sql]|(7,[0,1,2,3,6],[1.0,2.0,1.0,1.0,1.0])|
|1  |[helo, hi, pyspark, sql, welcome]        |(7,[0,2,3,4,5],[1.0,1.0,1.0,1.0,1.0])|
+---+-----------------------------------------+-------------------------------------+



In [19]:
from pyspark.ml.feature import StopWordsRemover
# NOTE: this rebinds sentenceData, replacing the (document, sentence) frame
# created earlier with a frame of pre-tokenised sentences.
sentenceData = spark.createDataFrame(
    data=[
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
    schema=["id", "raw"],
)

# No stop-word list is passed, so the remover falls back to its default list;
# for this data it drops "I", "the", "had" and "a".
remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
remover.transform(sentenceData).show(truncate=False)

+---+----------------------------+--------------------+
|id |raw                         |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+

