In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local').appName('ngram').config('spark.executor.memory', '5gb') \
    .config("spark.cores.max", "6") \
    .getOrCreate()

In [4]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame([
    (0.0, "Hi I heard about Spark"),
    (1.0, "I wish Java could use case classes"),
    (2.0, "Logistic regression models are neat")
], ["id", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wrdsData = tokenizer.transform(sentenceData)

wrdsData.printSchema()

root
 |-- id: double (nullable = true)
 |-- sentence: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [14]:
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10)
featurizedData = hashingTF.transform(wrdsData)

display(featurizedData.printSchema())
featurizedData.show()

root
 |-- id: double (nullable = true)
 |-- sentence: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)



None

+---+--------------------+--------------------+--------------------+
| id|            sentence|               words|         rawFeatures|
+---+--------------------+--------------------+--------------------+
|0.0|Hi I heard about ...|[hi, i, heard, ab...|(10,[0,5,7,9],[1....|
|1.0|I wish Java could...|[i, wish, java, c...|(10,[2,3,5,7,9],[...|
|2.0|Logistic regressi...|[logistic, regres...|(10,[3,4,5,6,8],[...|
+---+--------------------+--------------------+--------------------+



In [17]:
featurizedData.cache()

DataFrame[id: double, sentence: string, words: array<string>, rawFeatures: vector]

In [15]:
idf = IDF(inputCol="rawFeatures", outputCol="feature")
idfModel = idf.fit(featurizedData)

display(type(idfModel))

pyspark.ml.feature.IDFModel

In [16]:
rescaledData = idfModel.transform(featurizedData)
rescaledData.select('sentence','feature').show(truncate=False)

+-----------------------------------+----------------------------------------------------------------------------------------------------+
|sentence                           |feature                                                                                             |
+-----------------------------------+----------------------------------------------------------------------------------------------------+
|Hi I heard about Spark             |(10,[0,5,7,9],[0.6931471805599453,0.0,0.5753641449035617,0.28768207245178085])                      |
|I wish Java could use case classes |(10,[2,3,5,7,9],[0.6931471805599453,0.28768207245178085,0.0,0.28768207245178085,0.8630462173553426])|
|Logistic regression models are neat|(10,[3,4,5,6,8],[0.28768207245178085,0.6931471805599453,0.0,0.6931471805599453,0.6931471805599453]) |
+-----------------------------------+----------------------------------------------------------------------------------------------------+



In [45]:
for features_label in rescaledData.select("feature", "id").take(3):
        print(features_label) 

Row(feature=SparseVector(10, {0: 0.6931, 5: 0.0, 7: 0.5754, 9: 0.2877}), id=0.0)
Row(feature=SparseVector(10, {2: 0.6931, 3: 0.2877, 5: 0.0, 7: 0.2877, 9: 0.863}), id=1.0)
Row(feature=SparseVector(10, {3: 0.2877, 4: 0.6931, 5: 0.0, 6: 0.6931, 8: 0.6931}), id=2.0)


In [46]:
def tf_idf_feature(wordsData, numFeatures=20):
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    
    return rescaledData

In [47]:
gg = tf_idf_feature(wrdsData, 10)

### Test if the results are equivalent

In [61]:
for features_label in gg.select("features", "id").take(3):
        print(features_label) 
        
for features_label in rescaledData.select("feature", "id").take(3):
        print(features_label) 

Row(features=SparseVector(10, {0: 0.6931, 5: 0.0, 7: 0.5754, 9: 0.2877}), id=0.0)
Row(features=SparseVector(10, {2: 0.6931, 3: 0.2877, 5: 0.0, 7: 0.2877, 9: 0.863}), id=1.0)
Row(features=SparseVector(10, {3: 0.2877, 4: 0.6931, 5: 0.0, 6: 0.6931, 8: 0.6931}), id=2.0)
Row(feature=SparseVector(10, {0: 0.6931, 5: 0.0, 7: 0.5754, 9: 0.2877}), id=0.0)
Row(feature=SparseVector(10, {2: 0.6931, 3: 0.2877, 5: 0.0, 7: 0.2877, 9: 0.863}), id=1.0)
Row(feature=SparseVector(10, {3: 0.2877, 4: 0.6931, 5: 0.0, 6: 0.6931, 8: 0.6931}), id=2.0)


### SVD

In [40]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
mat = RowMatrix(rescaledData.rdd.map(lambda v: Vectors.dense(v.rawFeatures.toArray()) ))
result = mat.computeSVD(k=5)

In [41]:
U = result.U       # The U factor is a RowMatrix.
s = result.s       # The singular values are stored in a local dense vector.
V = result.V       # The V factor is a local dense matrix.

In [42]:
display(U)
display(s)
display(V)

None

DenseVector([4.1373, 2.1494, 1.8062, 0.0, 0.0])

DenseMatrix(10, 5, [-0.1244, -0.0, -0.2025, -0.2462, -0.0437, -0.3707, -0.0437, -0.4514, ..., 0.0918, -0.0662, -0.0976, 0.359, -0.0976, 0.2395, -0.0976, -0.208], 0)