In [1]:
from pyspark.ml.feature import VectorAssembler
# Three-row demo frame with integer columns A, B and C, rendered right away.
dataFrame = spark.createDataFrame(
    [(1, 5, 7), (2, 6, 8), (3, 7, 9)],
    schema=['A', 'B', 'C'],
)
display(dataFrame)

A,B,C
1,5,7
2,6,8
3,7,9


In [2]:
# Pack columns A, B and C into one vector column called 'vec_feature'.
ve = VectorAssembler(
    outputCol='vec_feature',
    inputCols=['A', 'B', 'C'],
)
out = ve.transform(dataFrame)
display(out)

A,B,C,vec_feature
1,5,7,"List(1, 3, List(), List(1.0, 5.0, 7.0))"
2,6,8,"List(1, 3, List(), List(2.0, 6.0, 8.0))"
3,7,9,"List(1, 3, List(), List(3.0, 7.0, 9.0))"


In [3]:
from pyspark.ml.feature import StandardScaler, MinMaxScaler

# Two scalers configured on the assembled 'vec_feature' column; each writes
# to its own output column.
mm_scaler = MinMaxScaler(
    outputCol='minmax_scaled',
    inputCol='vec_feature',
)
ss_scaler = StandardScaler(
    withMean=True,
    withStd=True,
    outputCol='standard_scaled',
    inputCol='vec_feature',
)

In [4]:
# Fit both scalers on the assembled frame (min-max first, then standard).
mm, ss = mm_scaler.fit(out), ss_scaler.fit(out)

In [5]:
from pyspark.ml.feature import StopWordsRemover

In [6]:
# Exploration aid: print the built-in help text for StopWordsRemover.
help(StopWordsRemover)

In [7]:
from pyspark.ml.feature import FeatureHasher

# Small mixed-type frame: a float, a boolean, a numeric string and a
# categorical string per row.
dataset = spark.createDataFrame(
    [
        (2.2, True, "1", "foo"),
        (3.3, False, "2", "bar"),
        (4.4, False, "3", "baz"),
        (5.5, False, "4", "foo"),
    ],
    ["real", "bool", "stringNum", "string"],
)

# Hash every column of `dataset` into a single "features" column.
hasher = FeatureHasher(outputCol="features", inputCols=dataset.columns)

In [8]:
# Run the FeatureHasher over `dataset`; the result carries the hasher's output column.
featurized = hasher.transform(dataset)

In [9]:
# Print the hashed frame; truncate=False keeps long cell values from being cut off.
featurized.show(truncate=False)

In [10]:
# Exploration aid: print the built-in help text for FeatureHasher.
help(FeatureHasher)

In [11]:
from pyspark.ml.linalg import Vectors
# Single-column frame whose "features" cells are 2-element dense vectors.
df = spark.createDataFrame(
    [
        (Vectors.dense(pair),)
        for pair in ([2.0, 1.0], [0.0, 0.0], [3.0, -1.0])
    ],
    ["features"],
)

In [12]:
# Collect the Spark frame to the driver as a pandas DataFrame.
d = df.toPandas()

In [13]:
# Write the pandas frame to 'a.csv' (relative path) using to_csv's default options.
d.to_csv('a.csv')

In [14]:
# Two-row frame with a single string column named "sentence".
df = spark.createDataFrame(
    [("foo bar",), ("hello world",)],
    ["sentence"],
)

In [15]:
# Print the sentence frame to confirm its contents.
df.show()

In [16]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
 
class StringLengthTransformer(Transformer, HasInputCol, HasOutputCol):
    """Custom ML Transformer that appends the length of a string column.

    For every row, the value of ``inputCol`` is measured with Python's
    ``len`` and stored as an integer in ``outputCol``. Null input values
    produce a null output instead of failing the job.

    Params:
        inputCol: name of the string column to measure.
        outputCol: name of the integer column to add.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(StringLengthTransformer, self).__init__()
        # @keyword_only stashes the keyword arguments on the instance.
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Set ``inputCol`` / ``outputCol`` and return the result of ``_set``."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        """Return ``dataset`` with the string-length column added."""
        # Guard against null cells: len(None) would raise TypeError inside
        # the Python worker and abort the whole Spark job.
        string_length = udf(
            lambda sentence: None if sentence is None else len(sentence),
            IntegerType(),
        )
        return dataset.withColumn(
            self.getOutputCol(),
            string_length(dataset[self.getInputCol()]),
        )

In [17]:
# Transformer instance reading "sentence" and writing to "len".
strlength = StringLengthTransformer(
    outputCol="len",
    inputCol="sentence",
)

In [18]:
# UDF returning the length of a string as an integer.
# NOTE: despite the name `reverse`, nothing is reversed; the name is kept
# because a later cell calls it. Null inputs map to null rather than
# raising TypeError from len(None).
reverse = udf(
    lambda sentence: None if sentence is None else len(sentence),
    IntegerType(),
)

In [19]:
# Rebuild the two-row "sentence" frame.
df = spark.createDataFrame(
    [("foo bar",), ("hello world",)],
    ["sentence"],
)

In [20]:
# Print the sentence frame before the UDF is applied.
df.show()

In [21]:
# Apply the `reverse` UDF to the sentence column, store it as 'len', and print.
with_len = df.withColumn('len', reverse(df['sentence']))
with_len.show()

In [22]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
 
class StringLengthTransformer(Transformer, HasInputCol, HasOutputCol):
    """Custom ML Transformer that appends a vowel count for a string column.

    NOTE: despite the class name, this version does not measure length. For
    every row it counts the lowercase vowels (a, e, i, o, u) in ``inputCol``
    and stores the total as an integer in ``outputCol``. Uppercase vowels
    are not counted. The name is kept because a later cell instantiates
    ``StringLengthTransformer``; this definition replaces any earlier class
    of the same name in the session.

    Null input values produce a null output instead of failing the job.

    Params:
        inputCol: name of the string column to scan.
        outputCol: name of the integer column to add.
    """

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(StringLengthTransformer, self).__init__()
        # @keyword_only stashes the keyword arguments on the instance.
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None):
        """Set ``inputCol`` / ``outputCol`` and return the result of ``_set``."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, dataset):
        """Return ``dataset`` with the vowel-count column added."""
        def cv(s):
            # Null cells arrive as None; None.count would raise
            # AttributeError in the Python worker and abort the job.
            if s is None:
                return None
            return sum(s.count(vowel) for vowel in 'aeiou')

        count_vowel = udf(cv, IntegerType())
        return dataset.withColumn(
            self.getOutputCol(),
            count_vowel(dataset[self.getInputCol()]),
        )

In [23]:
# Fresh "sentence" frame plus a transformer that writes its result to "len".
df = spark.createDataFrame(
    [("foo bar",), ("hello world",)],
    ["sentence"],
)
strlength = StringLengthTransformer(
    outputCol="len",
    inputCol="sentence",
)

In [24]:
# Run the custom transformer on the sentence frame and print the result.
strlength.transform(df).show()