In [1]:
from pyspark.ml.feature import CountVectorizer

# Toy corpus: every row pairs an id with a tokenized bag of words.
corpus_rows = [
    (0, "a b c".split(" ")),
    (0, "a f c".split(" ")),
    (1, "a b b c a".split(" ")),
]
df = spark.createDataFrame(corpus_rows, ["id", "words"])

# Learn a vocabulary of at most 3 terms; minDF=2.0 drops terms that occur in
# fewer than 2 rows (here that removes "f").
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)
model = cv.fit(df)

# Encode each row as a sparse term-count vector and print it untruncated.
result = model.transform(df)
result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|0  |[a, f, c]      |(3,[0,1],[1.0,1.0])      |
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,1.0,2.0])|
+---+---------------+-------------------------+



In [2]:
# Terms kept by the fitted model; a term's position in this list is its index
# in the `features` vectors.
model.vocabulary

[u'a', u'c', u'b']

In [3]:
# Second toy corpus: every row carries an id, a tokenized bag of words and a
# numeric label.
labeled_rows = [
    (0, "hash1 hash2 hash3".split(" "), 1.0),
    (0, "hash1 hash2 hash3".split(" "), 0.0),
    (1, "hash4 hash2 hash4".split(" "), 0.0),
]
df2 = spark.createDataFrame(labeled_rows, ["id", "words2", "label"])

# Vectorizer for the `words2` column: at most 3 terms, each occurring in at
# least 2 rows. NOTE: this rebinds the name `cv`.
cv = CountVectorizer(
    inputCol="words2",
    outputCol="features2",
    vocabSize=3,
    minDF=2.0,
)
model2 = cv.fit(df2)

# Encode each row as a sparse term-count vector and print it untruncated.
result2 = model2.transform(df2)
result2.show(truncate=False)

+---+---------------------+-----+-------------------------+
|id |words2               |label|features2                |
+---+---------------------+-----+-------------------------+
|0  |[hash1, hash2, hash3]|1.0  |(3,[0,1,2],[1.0,1.0,1.0])|
|0  |[hash1, hash2, hash3]|0.0  |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[hash4, hash2, hash4]|0.0  |(3,[0],[1.0])            |
+---+---------------------+-----+-------------------------+



In [4]:
# Terms kept by the second fitted model; a term's position in this list is its
# index in the `features2` vectors.
model2.vocabulary

[u'hash2', u'hash1', u'hash3']

In [5]:
# Join the two vectorized frames on `id`. The key is not unique on either side
# (id 0 appears twice in both), so every matching pair of rows is produced.
c = result.join(result2, on="id")

In [6]:
# Print up to five joined rows without truncating the vector columns.
c.show(n=5, truncate=False)

+---+---------------+-------------------------+---------------------+-----+-------------------------+
|id |words          |features                 |words2               |label|features2                |
+---+---------------+-------------------------+---------------------+-----+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|[hash1, hash2, hash3]|1.0  |(3,[0,1,2],[1.0,1.0,1.0])|
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|[hash1, hash2, hash3]|0.0  |(3,[0,1,2],[1.0,1.0,1.0])|
|0  |[a, f, c]      |(3,[0,1],[1.0,1.0])      |[hash1, hash2, hash3]|1.0  |(3,[0,1,2],[1.0,1.0,1.0])|
|0  |[a, f, c]      |(3,[0,1],[1.0,1.0])      |[hash1, hash2, hash3]|0.0  |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,1.0,2.0])|[hash4, hash2, hash4]|0.0  |(3,[0],[1.0])            |
+---+---------------+-------------------------+---------------------+-----+-------------------------+



In [13]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


# Concatenate the two count vectors, in this order, into one `combine` vector.
assembler = VectorAssembler(inputCols=["features", "features2"], outputCol="combine")
output = assembler.transform(c)

# Show the assembled vector next to the two vectors it was built from.
output.select("combine", "features", "features2").show(truncate=False)

+-------------------------+-------------------------+-------------------------+
|combine                  |features                 |features2                |
+-------------------------+-------------------------+-------------------------+
|[1.0,1.0,1.0,1.0,1.0,1.0]|(3,[0,1,2],[1.0,1.0,1.0])|(3,[0,1,2],[1.0,1.0,1.0])|
|[1.0,1.0,1.0,1.0,1.0,1.0]|(3,[0,1,2],[1.0,1.0,1.0])|(3,[0,1,2],[1.0,1.0,1.0])|
|[1.0,1.0,0.0,1.0,1.0,1.0]|(3,[0,1],[1.0,1.0])      |(3,[0,1,2],[1.0,1.0,1.0])|
|[1.0,1.0,0.0,1.0,1.0,1.0]|(3,[0,1],[1.0,1.0])      |(3,[0,1,2],[1.0,1.0,1.0])|
|[2.0,1.0,2.0,1.0,0.0,0.0]|(3,[0,1,2],[2.0,1.0,2.0])|(3,[0],[1.0])            |
+-------------------------+-------------------------+-------------------------+



In [8]:
# Term names for the slots of `combine`: the first model's vocabulary followed
# by the second's, matching the order of the assembler's input columns.
model.vocabulary + model2.vocabulary

[u'a', u'c', u'b', u'hash2', u'hash1', u'hash3']

In [14]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors


# Chi-squared feature selection: keep the 3 entries of `combine` that rank
# highest against the `label` column.
selector = ChiSqSelector(numTopFeatures=3, featuresCol="combine",
                         outputCol="selectedFeatures", labelCol="label")

# Bind the fitted model to `smodel`: a later cell reads
# `smodel.selectedFeatures`, which raises NameError on a fresh kernel if the
# model is only used inline.
smodel = selector.fit(output)

# Store the transformed frame under its own name rather than rebinding
# `result`, which the join cell reads; rebinding it would make that cell join
# the wrong frame when re-run.
selected = smodel.transform(output)
selected.select("selectedFeatures", "combine").show(truncate=False)

+----------------+-------------------------+
|selectedFeatures|combine                  |
+----------------+-------------------------+
|[1.0,1.0,1.0]   |[1.0,1.0,1.0,1.0,1.0,1.0]|
|[1.0,1.0,1.0]   |[1.0,1.0,1.0,1.0,1.0,1.0]|
|[1.0,0.0,1.0]   |[1.0,1.0,0.0,1.0,1.0,1.0]|
|[1.0,0.0,1.0]   |[1.0,1.0,0.0,1.0,1.0,1.0]|
|[2.0,2.0,0.0]   |[2.0,1.0,2.0,1.0,0.0,0.0]|
+----------------+-------------------------+



In [11]:
# Indices into the `combine` vector of the features the fitted selector kept.
smodel.selectedFeatures

[0, 2, 4]