In [1]:
from pyspark.sql import SparkSession 
# Build (or reuse) the SparkSession shared by every example in this notebook.
spark = (
    SparkSession.builder
    .appName('learn_ml')
    .master('local[1]')
    .getOrCreate()
)

ml 模块 三个抽象类：
转换器（Transformer）、评估器（Estimator）和管道（Pipeline）

### pyspark.ml.feature.Binarizer(self, threshold=0.0, inputCol=None, outputCol=None)
根据指定的阈值将连续变量转换为对应的二进制

In [2]:
# Single-column DataFrame of floats used by the Binarizer examples.
df = spark.createDataFrame(
    [(0.5,), (1.0,), (1.5,)],
    ['values'],
)

In [3]:
# Print the input DataFrame before any transformation is applied.
df.show()

+------+
|values|
+------+
|   0.5|
|   1.0|
|   1.5|
+------+



In [4]:
from pyspark.ml.feature import Binarizer
# Binarize the `values` column against a 0.7 threshold into a new `features` column.
binarizer = Binarizer(
    threshold=0.7,
    inputCol="values",
    outputCol="features",
)
binarizer.transform(df).show()

+------+--------+
|values|features|
+------+--------+
|   0.5|     0.0|
|   1.0|     1.0|
|   1.5|     1.0|
+------+--------+



In [5]:
# Reconfigure the existing transformer through setParams, then transform again.
binarizer.setParams(outputCol="freqs")
binarizer.transform(df).show()

+------+-----+
|values|freqs|
+------+-----+
|   0.5|  0.0|
|   1.0|  1.0|
|   1.5|  1.0|
+------+-----+



In [6]:
# Supply parameter overrides as a {Param: value} map passed to transform().
params = {
    binarizer.threshold: -0.5,
    binarizer.outputCol: "vector",
}
binarizer.transform(df, params).show()

+------+------+
|values|vector|
+------+------+
|   0.5|   1.0|
|   1.0|   1.0|
|   1.5|   1.0|
+------+------+



In [7]:
# Save the transformer's configuration under the current working directory.
import os

temp_path = os.path.abspath('.')
binarizerPath = "file://{}/binarizer".format(temp_path)
# A plain save() raises when the target path already exists, which makes this
# cell fail on every re-run; write().overwrite() replaces the previous copy.
binarizer.write().overwrite().save(binarizerPath)

In [8]:
# Load the saved configuration back and check that the threshold round-trips.
loadedBinarizer = Binarizer.load(binarizerPath)
binarizer.getThreshold() == loadedBinarizer.getThreshold()

True

### pyspark.ml.feature.Bucketizer(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error")
与Binarizer类似，该方法根据阈值列表（分割的参数），将连续变量转换为多项值（连续变量离散化到指定的范围区间）


In [9]:
from pyspark.ml.feature import Bucketizer
# Sample data, including NaN entries to exercise invalid-value handling.
values = [
    (0.1,), (0.4,), (1.2,), (1.5,),
    (float("nan"),), (float("nan"),),
]
df = spark.createDataFrame(values, ["values"])

# `splits` lists the bucket boundaries.
bucketizer = Bucketizer(
    splits=[-float("inf"), 0.5, 1.4, float("inf")],
    inputCol="values",
    outputCol="buckets",
)

# setHandleInvalid controls how NaN is treated: "error" (the default) raises
# on NaN, "keep" puts NaN into an extra bucket, "skip" ignores NaN values.
bucketizer.setHandleInvalid("keep")
bucketed = bucketizer.transform(df)
bucketed.show()

+------+-------+
|values|buckets|
+------+-------+
|   0.1|    0.0|
|   0.4|    0.0|
|   1.2|    1.0|
|   1.5|    2.0|
|   NaN|    3.0|
|   NaN|    3.0|
+------+-------+



In [10]:
# Change the output column name on the existing bucketizer, then transform again.
bucketizer.setParams(outputCol="b")
bucketizer.transform(df).show()

+------+---+
|values|  b|
+------+---+
|   0.1|0.0|
|   0.4|0.0|
|   1.2|1.0|
|   1.5|2.0|
|   NaN|3.0|
|   NaN|3.0|
+------+---+



### pyspark.ml.feature.ChiSqSelector(self, numTopFeatures=50, featuresCol="features", outputCol=None, labelCol="label", selectorType="numTopFeatures", percentile=0.1, fpr=0.05, fdr=0.05, fwe=0.05)
对于分类目标变量（即分类模型），此功能允许你选择预定义数量的特征（由numTopFeatures参数指定），以便最好地解释目标变量的变化。该方法需要两步：首先调用.fit()方法计算卡方检验——将DataFrame作为参数传入，返回一个ChiSqSelectorModel对象；然后使用该对象的.transform()方法来转换DataFrame。默认的选择方式是numTopFeatures，默认选取的特征数为50。
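percentile 类似于 numTopFeatures，但按百分比选取特征，而不是固定数量。
fpr 选择 p-value 低于阈值的所有特征，从而控制选择的假阳性率（false positive rate）。
fdr 使用 Benjamini-Hochberg 过程，选择错误发现率（false discovery rate）低于阈值的所有特征。
fwe 选择 p-value 低于阈值的所有特征，阈值按 1 / numFeatures 缩放，从而控制族错误率（family-wise error rate）。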

In [11]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import ChiSqSelector
# Three labelled four-dimensional feature vectors.
df = spark.createDataFrame(
    [
        (Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
        (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
        (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0),
    ],
    ["features", "label"],
)

# Fit a selector that keeps 2 features, then apply the fitted model.
selector = ChiSqSelector(
    numTopFeatures=2,
    outputCol="selectedFeatures",
)
model = selector.fit(df)
model.transform(df).show()

+------------------+-----+----------------+
|          features|label|selectedFeatures|
+------------------+-----+----------------+
|[0.0,0.0,18.0,1.0]|  1.0|      [18.0,1.0]|
|[0.0,1.0,12.0,0.0]|  0.0|      [12.0,0.0]|
|[1.0,0.0,15.0,0.1]|  0.0|      [15.0,0.1]|
+------------------+-----+----------------+



### pyspark.ml.feature.CountVectorizer(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None, outputCol=None)
从文档集合中提取词汇表并生成向量

In [12]:
from pyspark.ml.feature import CountVectorizer
# Two small token lists with integer labels.
df = spark.createDataFrame(
    [
        (0, ["a", "b", "c"]),
        (1, ["a", "b", "b", "c", "a"]),
    ],
    ["label", "raw"],
)

# Learn the vocabulary from `raw`, then emit per-row count vectors.
cv = CountVectorizer(
    inputCol="raw",
    outputCol="vectors",
)
model = cv.fit(df)
model.transform(df).show(truncate=False)

+-----+---------------+-------------------------+
|label|raw            |vectors                  |
+-----+---------------+-------------------------+
|0    |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1    |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+-----+---------------+-------------------------+



In [13]:
# Display the fitted model's vocabulary in sorted order.
sorted(model.vocabulary) 

['a', 'b', 'c']

In [14]:
# Save the fitted CountVectorizer model under the current working directory.
import os

temp_path = os.path.abspath('.')
modelPath = "file://{}/count-vectorizer-model".format(temp_path)
# A plain save() raises when the target path already exists, which makes this
# cell fail on every re-run; write().overwrite() replaces the previous copy.
model.write().overwrite().save(modelPath)

In [15]:
# Load the saved model and check that its vocabulary matches the in-memory one.
from pyspark.ml.feature import CountVectorizerModel

loadedModel = CountVectorizerModel.load(modelPath)
model.vocabulary == loadedModel.vocabulary

True

### pyspark.ml.feature.ElementwiseProduct(scalingVec=None, inputCol=None, outputCol=None)
使用提供的“权重”向量输出每个输入向量的阿达马乘积（即，逐元素乘积）。换句话说，它通过标量乘数缩放数据集的每一列。

In [16]:
from pyspark.ml.feature import ElementwiseProduct 
from pyspark.ml.linalg import Vectors
df = spark.createDataFrame(
    [(Vectors.dense([2.0, 1.0, 3.0]),)],
    ["values"],
)

# Multiply the input vector element-wise by the scaling vector.
ep = ElementwiseProduct(
    scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
    inputCol="values",
    outputCol="eprod",
)
ep.transform(df).show()

# Swap in a different scaling vector and transform again.
ep.setParams(scalingVec=Vectors.dense([2.0, 3.0, 5.0]))
ep.transform(df).show()


+-------------+-------------+
|       values|        eprod|
+-------------+-------------+
|[2.0,1.0,3.0]|[2.0,2.0,9.0]|
+-------------+-------------+

+-------------+--------------+
|       values|         eprod|
+-------------+--------------+
|[2.0,1.0,3.0]|[4.0,3.0,15.0]|
+-------------+--------------+



### pyspark.ml.feature.Imputer(*args, **kwargs)
用于完成缺失值的插补估计器，使用缺失值所在列的平均值或中值。 输入列应该是DoubleType或FloatType。 目前的Imputer不支持分类特征，可能会为分类特征创建不正确的值。
请注意，平均值/中值是在过滤掉缺失值之后计算的。输入列中的所有Null值都被视为缺失值，因此也会被插补。计算中位数时使用pyspark.sql.DataFrame.approxQuantile()，相对误差为0.001。


In [17]:
from pyspark.ml.feature import Imputer
# Two float columns in which NaN marks the missing entries.
df = spark.createDataFrame(
    [
        (1.0, float("nan")),
        (2.0, float("nan")),
        (float("nan"), 3.0),
        (4.0, 4.0),
        (5.0, 5.0),
    ],
    ["a", "b"],
)
imputer = Imputer(
    inputCols=["a", "b"],
    outputCols=["out_a", "out_b"],
)
model = imputer.fit(df)

df.show()                   # raw input
model.surrogateDF.show()    # per-column replacement values computed by fit()
model.transform(df).show()  # input columns plus the imputed output columns

+---+---+
|  a|  b|
+---+---+
|1.0|NaN|
|2.0|NaN|
|NaN|3.0|
|4.0|4.0|
|5.0|5.0|
+---+---+

+---+---+
|  a|  b|
+---+---+
|3.0|4.0|
+---+---+

+---+---+-----+-----+
|  a|  b|out_a|out_b|
+---+---+-----+-----+
|1.0|NaN|  1.0|  4.0|
|2.0|NaN|  2.0|  4.0|
|NaN|3.0|  3.0|  3.0|
|4.0|4.0|  4.0|  4.0|
|5.0|5.0|  5.0|  5.0|
+---+---+-----+-----+



In [18]:
# Switch the strategy to "median" with NaN as the missing-value marker,
# then refit and transform.
imputer.setStrategy("median")
imputer.setMissingValue(float("nan"))
imputer.fit(df).transform(df).show()

+---+---+-----+-----+
|  a|  b|out_a|out_b|
+---+---+-----+-----+
|1.0|NaN|  1.0|  4.0|
|2.0|NaN|  2.0|  4.0|
|NaN|3.0|  2.0|  3.0|
|4.0|4.0|  4.0|  4.0|
|5.0|5.0|  5.0|  5.0|
+---+---+-----+-----+



### pyspark.ml.feature.MaxAbsScaler(self, inputCol=None, outputCol=None)
通过除以每个特征的最大绝对值，将每个特征单独缩放到[-1,1]范围。它不会平移或中心化数据，因此不会破坏任何稀疏性。

In [19]:
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors
# Two one-dimensional vectors in column `a`.
df = spark.createDataFrame(
    [
        (Vectors.dense([1.0]),),
        (Vectors.dense([2.0]),),
    ],
    ["a"],
)
maScaler = MaxAbsScaler(
    inputCol="a",
    outputCol="scaled",
)
model = maScaler.fit(df)
model.transform(df).show()

+-----+------+
|    a|scaled|
+-----+------+
|[1.0]| [0.5]|
|[2.0]| [1.0]|
+-----+------+



### pyspark.ml.feature.MinMaxScaler(self, min=0.0, max=1.0, inputCol=None, outputCol=None)
使用列汇总统计信息，将每个特征单独线性缩放到指定范围[min，max]，这也称为最小-最大标准化或重新缩放（注意：由于零值可能会被转换为非零值，因此即使输入是稀疏的，转换器的输出也将是DenseVector）。默认 min=0.0、max=1.0，即数据被缩放到[0.0,1.0]范围内。特征E重新缩放后的值按下式计算：
Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (max - min) + min
For the case E_max == E_min, Rescaled(e_i) = 0.5 * (max + min)


In [42]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
# Two one-dimensional vectors in column `a`.
df = spark.createDataFrame(
    [
        (Vectors.dense([0.0]),),
        (Vectors.dense([2.0]),),
    ],
    ["a"],
)
mmScaler = MinMaxScaler(
    inputCol="a",
    outputCol="scaled",
)
model = mmScaler.fit(df)
# Show the per-feature minimum and maximum recorded by the fitted model.
print(model.originalMin, model.originalMax)
model.transform(df).show()

[0.0] [2.0]
+-----+------+
|    a|scaled|
+-----+------+
|[0.0]| [0.0]|
|[2.0]| [1.0]|
+-----+------+



### pyspark.ml.feature.NGram(n=2, inputCol=None, outputCol=None)
一种特征转换器，用于将输入的字符串数组转换为n-gram数组。输入数组中的空值将被忽略。它返回一个n-gram数组，其中每个n-gram由一个以空格分隔的单词串表示。当输入为空时，返回一个空数组。当输入数组长度小于n（每个n-gram的元素数）时，不返回任何n-gram。

In [23]:
from pyspark.ml.feature import NGram
from pyspark.sql import Row
# One row holding a five-token array.
df = spark.createDataFrame(
    [Row(inputTokens=["a", "b", "c", "d", "e"])]
)
# Turn the token array into space-joined bigrams.
ngram = NGram(
    n=2,
    inputCol="inputTokens",
    outputCol="nGrams",
)
ngram.transform(df).show()

+---------------+--------------------+
|    inputTokens|              nGrams|
+---------------+--------------------+
|[a, b, c, d, e]|[a b, b c, c d, d e]|
+---------------+--------------------+



In [24]:
# Change the n-gram length on the existing transformer, then transform again.
ngram.setParams(n=4)
ngram.transform(df).show()

+---------------+------------------+
|    inputTokens|            nGrams|
+---------------+------------------+
|[a, b, c, d, e]|[a b c d, b c d e]|
+---------------+------------------+



In [25]:
# Temporarily override the output column for this transform call.
ngram.transform(
    df,
    {ngram.outputCol: "output"},
).show()

+---------------+------------------+
|    inputTokens|            output|
+---------------+------------------+
|[a, b, c, d, e]|[a b c d, b c d e]|
+---------------+------------------+



### pyspark.ml.feature.Normalizer(self, p=2.0, inputCol=None, outputCol=None)
使用给定的p范数标准化矢量以得到单位范数（默认为L2）。

In [26]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
# One row with a dense and a sparse vector column.
svec = Vectors.sparse(4, {1: 4.0, 3: 3.0})
df = spark.createDataFrame(
    [(Vectors.dense([3.0, -4.0]), svec)],
    ["dense", "sparse"],
)
# Normalise the dense column with the p=2 norm.
normalizer = Normalizer(
    p=2.0,
    inputCol="dense",
    outputCol="features",
)
normalizer.transform(df).show()

+----------+-------------------+----------+
|     dense|             sparse|  features|
+----------+-------------------+----------+
|[3.0,-4.0]|(4,[1,3],[4.0,3.0])|[0.6,-0.8]|
+----------+-------------------+----------+



In [27]:
# Point the same normalizer at the sparse column and rename its output.
normalizer.setParams(inputCol="sparse", outputCol="freqs")
normalizer.transform(df).show()

+----------+-------------------+-------------------+
|     dense|             sparse|              freqs|
+----------+-------------------+-------------------+
|[3.0,-4.0]|(4,[1,3],[4.0,3.0])|(4,[1,3],[0.8,0.6])|
+----------+-------------------+-------------------+



### pyspark.ml.feature.OneHotEncoderEstimator(inputCols=None, outputCols=None, handleInvalid='error', dropLast=True)
(分类列编码为二进制向量列)
独热编码器（one-hot encoder），将一列类别索引映射到一列二进制向量，每行至多有一个值为1的元素，表示输入的类别索引。例如，对于5个类别，输入值2.0将映射到[0.0，0.0，1.0，0.0]的输出向量。默认不包含最后一个类别（可通过dropLast进行配置），因为包含它会使向量各元素之和恒为1，从而造成线性相关。所以4.0的输入值映射到[0.0，0.0，0.0，0.0]。这与scikit-learn的OneHotEncoder不同，后者保留所有类别。输出向量是稀疏的。
当handleInvalid配置为“keep”时，会添加一个指示无效值的额外“类别”作为最后一个类别。因此，当dropLast为true时，无效值将被编码为全零向量。

In [28]:
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml.linalg import Vectors
# Category indices 0.0, 1.0 and 2.0 in column `input`.
df = spark.createDataFrame(
    [(0.0,), (1.0,), (2.0,)],
    ["input"],
)
ohe = OneHotEncoderEstimator(
    inputCols=["input"],
    outputCols=["output"],
)
model = ohe.fit(df)
model.transform(df).show()

+-----+-------------+
|input|       output|
+-----+-------------+
|  0.0|(2,[0],[1.0])|
|  1.0|(2,[1],[1.0])|
|  2.0|    (2,[],[])|
+-----+-------------+



### pyspark.ml.feature.PCA(self, k=None, inputCol=None, outputCol=None)
PCA训练一个模型将向量投影到前k个主成分的较低维空间。

In [29]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
# Three five-dimensional vectors (one sparse, two dense).
data = [
    (Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
    (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
    (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),),
]
df = spark.createDataFrame(data, ["features"])

# Project onto the first k=2 principal components.
pca = PCA(
    k=2,
    inputCol="features",
    outputCol="pca_features",
)
model = pca.fit(df)
# truncate=False prints the full vectors instead of clipping long cells.
model.transform(df).show(truncate=False)

+---------------------+----------------------------------------+
|features             |pca_features                            |
+---------------------+----------------------------------------+
|(5,[1,3],[1.0,7.0])  |[1.6485728230883807,-4.013282700516296] |
|[2.0,0.0,3.0,4.0,5.0]|[-4.645104331781534,-1.1167972663619026]|
|[4.0,0.0,0.0,6.0,7.0]|[-6.428880535676489,-5.337951427775355] |
+---------------------+----------------------------------------+



In [30]:
# Display the explained-variance vector of the fitted PCA model (k=2 above).
model.explainedVariance

DenseVector([0.7944, 0.2056])

### pyspark.ml.feature.QuantileDiscretizer(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, handleInvalid="error")
与Bucketizer方法类似，但QuantileDiscretizer采用具有连续特征的列，并输出具有分箱分类特征的列。可以使用numBuckets参数设置区域的数量。所使用的桶的数量可能小于该值，例如，如果输入的不同值太少而不能创建足够的不同分位数。nan会占用一个新的分类

In [31]:
from pyspark.ml.feature import QuantileDiscretizer
# Sample data, including NaN entries to exercise invalid-value handling.
values = [
    (0.1,), (0.4,), (1.2,), (1.5,),
    (float("nan"),), (float("nan"),),
]
df = spark.createDataFrame(values, ["values"])
qds = QuantileDiscretizer(
    numBuckets=2,
    inputCol="values",
    outputCol="buckets",
    relativeError=0.01,
    handleInvalid="error",
)
# Switch NaN handling to "keep" before fitting so NaN rows get their own bucket.
qds.setHandleInvalid("keep")
# Fit once and reuse the model: the estimator used to be fitted twice, with
# the first result bound to `bucketizer` and never used.
bucketizer = qds.fit(df)
bucketizer.transform(df).show()

+------+-------+
|values|buckets|
+------+-------+
|   0.1|    0.0|
|   0.4|    1.0|
|   1.2|    1.0|
|   1.5|    1.0|
|   NaN|    2.0|
|   NaN|    2.0|
+------+-------+



### pyspark.ml.feature.RegexTokenizer(minTokenLength=1, gaps=True, pattern='\s+', inputCol=None, outputCol=None, toLowercase=True)
基于java正则表达式的标记生成器

In [32]:
from pyspark.ml.feature import RegexTokenizer
# One text row containing mixed case and a double space.
df = spark.createDataFrame(
    [("A B  c",)],
    ["text"],
)
reTokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="words",
)
reTokenizer.transform(df).show()

+------+---------+
|  text|    words|
+------+---------+
|A B  c|[a, b, c]|
+------+---------+



###  pyspark.ml.feature.SQLTransformer(statement=None)
实现由SQL语句定义的转换。目前仅支持形如 'SELECT ... FROM __THIS__' 的SQL语法，其中 '__THIS__' 代表输入数据集对应的底层表。

In [33]:
from pyspark.ml.feature import SQLTransformer
df = spark.createDataFrame(
    [(0, 1.0, 3.0), (2, 2.0, 5.0)],
    ["id", "v1", "v2"],
)
# __THIS__ in the statement stands for the DataFrame passed to transform().
sqlTrans = SQLTransformer(
    statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"
)
df.show()
sqlTrans.transform(df).show()


+---+---+---+
| id| v1| v2|
+---+---+---+
|  0|1.0|3.0|
|  2|2.0|5.0|
+---+---+---+

+---+---+---+---+----+
| id| v1| v2| v3|  v4|
+---+---+---+---+----+
|  0|1.0|3.0|4.0| 3.0|
|  2|2.0|5.0|7.0|10.0|
+---+---+---+---+----+



### pyspark.ml.feature.StandardScaler(self, withMean=False, withStd=True, inputCol=None, outputCol=None)
(标准化列，使其拥有零均值和等于1的标准差)
通过使用训练集中样本的列汇总统计消除平均值和缩放到单位方差来标准化特征。使用校正后的样本标准偏差计算“单位标准差”，该标准偏差计算为无偏样本方差的平方根。


In [34]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors
# Two one-dimensional vectors in column `a`.
df = spark.createDataFrame(
    [
        (Vectors.dense([0.0]),),
        (Vectors.dense([2.0]),),
    ],
    ["a"],
)
standardScaler = StandardScaler(
    inputCol="a",
    outputCol="scaled",
)
model = standardScaler.fit(df)
# Show the per-feature mean and standard deviation held by the fitted model.
print(model.mean, model.std)
model.transform(df).show()

[1.0] [1.4142135623730951]
+-----+-------------------+
|    a|             scaled|
+-----+-------------------+
|[0.0]|              [0.0]|
|[2.0]|[1.414213562373095]|
+-----+-------------------+



### pyspark.ml.feature.StopWordsRemover(inputCol=None, outputCol=None, stopWords=None, caseSensitive=False)
一个特征转换器，用于过滤掉输入中的停用词。

In [35]:
from pyspark.ml.feature import StopWordsRemover
df = spark.createDataFrame(
    [(["a", "b", "c"],)],
    ["text"],
)
# Filter the custom stop word "b" out of each token array.
remover = StopWordsRemover(
    inputCol="text",
    outputCol="words",
    stopWords=["b"],
)
remover.transform(df).show()

+---------+------+
|     text| words|
+---------+------+
|[a, b, c]|[a, c]|
+---------+------+



### pyspark.ml.feature.Tokenizer(inputCol=None, outputCol=None)
一个标记生成器，它将输入字符串转换为小写，然后用空格分隔它。

In [36]:
from pyspark.ml.feature import Tokenizer
df = spark.createDataFrame(
    [("ASD VA c",)],
    ["text"],
)
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)
tokenizer.transform(df).show()

+--------+------------+
|    text|       words|
+--------+------------+
|ASD VA c|[asd, va, c]|
+--------+------------+



### pyspark.ml.feature.VectorSlicer(inputCol=None, outputCol=None, indices=None, names=None)
此类接收一个特征向量，并输出由原始特征的子数组构成的新特征向量。可以使用索引（setIndices()）或名称（setNames()）指定特征子集。必须至少选择一个特征。不允许选择重复的特征，因此所选索引和名称之间不能重叠。输出向量中的特征先按所选索引（按给定顺序）排列，然后是所选名称（按给定顺序）。

In [37]:
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
# Three five-dimensional dense vectors.
df = spark.createDataFrame(
    [
        (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]),),
        (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]),),
        (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]),),
    ],
    ["features"],
)
# Keep only the entries at indices 1 and 4 of each vector.
vs = VectorSlicer(
    inputCol="features",
    outputCol="sliced",
    indices=[1, 4],
)
# truncate=False prints the full vectors instead of clipping long cells.
vs.transform(df).show(truncate=False)

+-----------------------+----------+
|features               |sliced    |
+-----------------------+----------+
|[-2.0,2.3,0.0,0.0,1.0] |[2.3,1.0] |
|[0.0,0.0,0.0,0.0,0.0]  |[0.0,0.0] |
|[0.6,-1.1,-3.0,4.5,3.3]|[-1.1,3.3]|
+-----------------------+----------+



### pyspark.ml.feature.VectorAssembler(inputCols=None, outputCol=None)
将多个列合并为单个向量列的特征转换器。

In [38]:
from pyspark.ml.feature import VectorAssembler
df = spark.createDataFrame(
    [(1, 0, 3)],
    ["a", "b", "c"],
)
# Combine columns a, b and c into a single vector column.
vecAssembler = VectorAssembler(
    inputCols=["a", "b", "c"],
    outputCol="features",
)
vecAssembler.transform(df).show()

+---+---+---+-------------+
|  a|  b|  c|     features|
+---+---+---+-------------+
|  1|  0|  3|[1.0,0.0,3.0]|
+---+---+---+-------------+



### pyspark.ml.feature.Word2Vec(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, inputCol=None, outputCol=None, windowSize=5, maxSentenceLength=1000)
Word2Vec训练一个Map(String, Vector)模型，即把每个单词转换为一个向量编码，以便用于后续的自然语言处理或机器学习流程。

In [39]:
from pyspark.ml.feature import Word2Vec
# 100 repetitions of "a b" followed by 10 of "a c". Because the text ends
# with a space, split(" ") also yields one trailing empty-string token.
sent = ("a b " * 100 + "a c " * 10).split(" ")
doc = spark.createDataFrame(
    [(sent,), (sent,)],
    ["sentence"],
)
# A fixed seed keeps the example reproducible.
word2Vec = Word2Vec(
    vectorSize=5,
    seed=42,
    inputCol="sentence",
    outputCol="model",
)
model = word2Vec.fit(doc)
model.getVectors().show()

+----+--------------------+
|word|              vector|
+----+--------------------+
|   a|[0.09461779892444...|
|   b|[1.15474212169647...|
|   c|[-0.3794820010662...|
+----+--------------------+



In [40]:
# Find the word most similar to "a": findSynonyms returns a DataFrame (printed
# with show()), while findSynonymsArray's result is displayed as the cell value.
model.findSynonyms("a", 1).show()
model.findSynonymsArray("a", 1)

+----+-------------------+
|word|         similarity|
+----+-------------------+
|   b|0.25053444504737854|
+----+-------------------+



[('b', 0.25053444504737854)]

In [41]:
from pyspark.sql.functions import format_number as fmt
# Top-2 synonyms of "a", with the similarity formatted to 3 decimal places.
(
    model.findSynonyms("a", 2)
    .select("word", fmt("similarity", 3).alias("similarity"))
    .show()
)

+----+----------+
|word|similarity|
+----+----------+
|   b|     0.251|
|   c|    -0.698|
+----+----------+

