## Working with Text Data - Chapter 4


In [2]:
from pyspark.sql import SparkSession 

# Create the Spark session used by every later cell, or reuse one that is
# already running; master 'local[*]' keeps everything on the local machine.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("Intro")
    .getOrCreate()
)

Define a `documentDF` (one bag of words per document) and fit Word2Vec on it

In [6]:
from pyspark.ml.feature import Word2Vec

# Input data: one row per sentence/document, each already split into words.
sentences = [
    "Hi I heard about Spark",
    "I wish Java could use case classes",
    "Logistic regression models are neat",
]
documentDF = spark.createDataFrame(
    [(sentence.split(" "),) for sentence in sentences],
    ["text"],
)

# Fit a Word2Vec model producing 3-dimensional vectors; minCount=0 keeps
# every word regardless of how rarely it occurs.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)

# The transformed frame has two columns (text, result); print them row by row.
result = model.transform(documentDF)
for text, vector in result.collect():
    print(f"Text: [{', '.join(text)}] => \nVector: {vector}\n")

Text: [Hi, I, heard, about, Spark] => 
Vector: [0.008409444987773896,0.005632373690605164,0.03362197317183018]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [-0.004308507378612245,0.0405282654932567,0.056872056680731475]

Text: [Logistic, regression, models, are, neat] => 
Vector: [-0.049474555999040604,0.021491939178667964,0.04208827670663595]



In [None]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# Local import: `size` is Spark's built-in array-length column function.
from pyspark.sql.functions import size

# Three sentences that use different separators: pipes, runs of spaces, commas.
sentenceDataFrame = spark.createDataFrame(
    [
        (0, "Hi|I|heard|about|Spark"),
        (1, "I     wish Java      could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ],
    ["id", "sentence"]
)

# Tokenizer lowercases the text and splits on whitespace only, so the pipe-
# and comma-separated sentences each stay a single token.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# RegexTokenizer splits on the given pattern instead; "\\W" is any non-word character.
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")

# Count tokens with the built-in `size` rather than a Python UDF wrapping
# `len`: same integer result for array columns, but evaluated inside Spark
# without shipping every row to a Python worker. The `countTokens` name is
# kept so existing call sites continue to work.
countTokens = size

tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("sentence", "words")\
    .withColumn("tokens", countTokens(col("words")))\
    .show(truncate=False)

regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words")))\
    .show(truncate=False)

+-------------------------------------------+------------------------------------------------------------+------+
|sentence                                   |words                                                       |tokens|
+-------------------------------------------+------------------------------------------------------------+------+
|Hi|I|heard|about|Spark                     |[hi|i|heard|about|spark]                                    |1     |
|I     wish Java      could use case classes|[i, , , , , wish, java, , , , , , could, use, case, classes]|16    |
|Logistic,regression,models,are,neat        |[logistic,regression,models,are,neat]                       |1     |
+-------------------------------------------+------------------------------------------------------------+------+

+-------------------------------------------+------------------------------------------+------+
|sentence                                   |words                                     |tokens|
+--------

In [19]:
from pyspark.ml.feature import StopWordsRemover

# Two already-tokenized English sentences.
sentenceData = spark.createDataFrame(
    [
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"])
    ],
    ["id", "raw"]
)

# `locale` controls how words are case-folded for case-insensitive matching.
# The sample text is English, so an English locale ("en_US") is used to match
# it; a Spanish locale would be misleading here.
remover = StopWordsRemover(inputCol="raw", outputCol="filtered", locale="en_US")

# Show each raw token list next to its stop-word-filtered version.
remover.transform(sentenceData).show(truncate=False)

25/10/04 18:46:33 WARN StopWordsRemover: Default locale set was [en_GT]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


+---+----------------------------+--------------------+
|id |raw                         |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+



# Code Example -> Tokenizer -> N-Gram

In [None]:
from pyspark.ml.feature import NGram
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Sample sentences; some words carry a trailing comma, which the whitespace
# tokenizer below leaves attached to the word.
raw_sentences = [
    (0, "Hi I heard about Spark "),
    (1, "I wish, wish Java, Java could"),
    (2, "Logistic regression, regression models"),
]
sentenceDataFrame = spark.createDataFrame(raw_sentences, ["id", "sentence"])

# Token-count UDF; it is defined here but not applied anywhere in this cell.
countTokens = udf(lambda tokens: len(tokens), IntegerType())

# Split every sentence into a "words" array.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordDataFrame = tokenizer.transform(sentenceDataFrame)

# Build bigrams (n=2) from consecutive words and display only that column.
ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordDataFrame)
ngramDataFrame.select("ngrams").show(truncate=False)

+-----------------------------------------------------------------+
|ngrams                                                           |
+-----------------------------------------------------------------+
|[hi i, i heard, heard about, about spark]                        |
|[i wish,, wish, wish, wish java,, java, java, java could]        |
|[logistic regression,, regression, regression, regression models]|
+-----------------------------------------------------------------+



In [None]:
from pyspark.ml.feature import Binarizer

# Three (id, feature) rows with continuous feature values.
continuousDataFrame = spark.createDataFrame(
    [(0, 5.1), (1, 5.8), (2, 0.2)],
    ["id", "feature"],
)

# Features greater than the threshold become 1.0; the rest become 0.0.
binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
binarizedDataFrame = binarizer.transform(continuousDataFrame)

# Report the threshold actually configured on the transformer, then the result.
print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
binarizedDataFrame.show()

Binarizer output with Threshold = 0.500000
+---+-------+-----------------+
| id|feature|binarized_feature|
+---+-------+-----------------+
|  0|    5.1|              1.0|
|  1|    5.8|              1.0|
|  2|    0.2|              0.0|
+---+-------+-----------------+



In [24]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

# Three 5-dimensional feature vectors: one sparse, two dense.
data = [
    (Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
    (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
    (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),),
]
df = spark.createDataFrame(data, ["features"])

# Fit PCA with k=3 so each 5-dimensional row is reduced to 3 components.
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)

# Keep only the projected column for display.
projected = model.transform(df)
result = projected.select("pcaFeatures")
result.show(truncate=False)

+------------------------------------------------------------+
|pcaFeatures                                                 |
+------------------------------------------------------------+
|[1.6485728230883814,-4.0132827005162985,-1.0091435193998504]|
|[-4.645104331781533,-1.1167972663619048,-1.0091435193998501]|
|[-6.428880535676488,-5.337951427775359,-1.009143519399851]  |
+------------------------------------------------------------+



In [25]:
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

# Two 2-dimensional input vectors.
df = spark.createDataFrame(
    [(Vectors.dense(values),) for values in ([2.0, 1.0], [0.0, 0.0])],
    ["features"],
)

# Expand each vector into its polynomial terms up to degree 5.
polyExpansion = PolynomialExpansion(
    degree=5,
    inputCol="features",
    outputCol="polyFeatures",
)
polyDF = polyExpansion.transform(df)

polyDF.show(truncate=False)

+---------+------------------------------------------------------------------------------------+
|features |polyFeatures                                                                        |
+---------+------------------------------------------------------------------------------------+
|[2.0,1.0]|[2.0,4.0,8.0,16.0,32.0,1.0,2.0,4.0,8.0,16.0,1.0,2.0,4.0,8.0,1.0,2.0,4.0,1.0,2.0,1.0]|
|[0.0,0.0]|[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]   |
+---------+------------------------------------------------------------------------------------+



In [27]:
from pyspark.ml.feature import Word2Vec, PolynomialExpansion
from pyspark.ml import Pipeline

# Three tokenized sentences, one bag of words per row.
documentDF = spark.createDataFrame(
    [
        ("Hi I heard about Spark".split(" "),),
        ("I wish Java could use case classes".split(" "),),
        ("Logistic regression models are neat".split(" "),),
    ],
    ["text"],
)

# Stage 1: embed each document as a 3-dimensional Word2Vec vector.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="word2vec_features")
# Stage 2: degree-2 polynomial expansion of the stage-1 output column.
polyExpansion = PolynomialExpansion(degree=2, inputCol="word2vec_features", outputCol="poly_features")

# Fit both stages as one pipeline, then apply it to the same data.
stages = [word2Vec, polyExpansion]
pipeline = Pipeline(stages=stages)
model = pipeline.fit(documentDF)
result = model.transform(documentDF)

shown_columns = ["text", "word2vec_features", "poly_features"]
result.select(*shown_columns).show(truncate=False)

+------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                      |word2vec_features                                               |poly_features                                                                                                                                                                                       |
+------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[Hi, I, heard, about, Spark]              |[0.008409444987773896,0.00563237369

In [28]:
from pyspark.ml.feature import DCT
from pyspark.ml.linalg import Vectors

# Three length-4 input vectors.
signals = [
    [0.0, 1.0, -2.0, 3.0],
    [-1.0, 2.0, 4.0, -7.0],
    [14.0, -2.0, -5.0, 1.0],
]
df = spark.createDataFrame(
    [(Vectors.dense(signal),) for signal in signals],
    ["features"],
)

# Forward transform (inverse=False) of every feature vector.
dct = DCT(inverse=False, inputCol="features", outputCol="featuresDCT")
dctDf = dct.transform(df)

dctDf.select("featuresDCT").show(truncate=False)

+----------------------------------------------------------------+
|featuresDCT                                                     |
+----------------------------------------------------------------+
|[1.0,-1.1480502970952693,2.0000000000000004,-2.7716385975338604]|
|[-1.0,3.378492794482933,-7.000000000000001,2.9301512653149677]  |
|[4.0,9.304453421915744,11.000000000000002,1.5579302036357163]   |
+----------------------------------------------------------------+



In [29]:
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors

# Three rows of 4-dimensional features with very different magnitudes per column.
dataFrame = spark.createDataFrame(
    [
        (0, Vectors.dense([1.0, 0.1, -8.0, 200])),
        (1, Vectors.dense([2.0, 1.0, -4.0, 2])),
        (2, Vectors.dense([4.0, 10.0, 8.0, 0])),
    ],
    ["id", "features"],
)

scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

# Fitting computes the summary statistics and yields a MaxAbsScalerModel.
scalerModel = scaler.fit(dataFrame)

# Applying the model rescales each feature into the range [-1, 1].
scaledData = scalerModel.transform(dataFrame)

scaledData.select("features", "scaledFeatures").show()

+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[1.0,0.1,-8.0,200.0]|[0.25,0.010000000...|
|  [2.0,1.0,-4.0,2.0]| [0.5,0.1,-0.5,0.01]|
|  [4.0,10.0,8.0,0.0]|   [1.0,1.0,1.0,0.0]|
+--------------------+--------------------+



In [30]:
from pyspark.ml.feature import Bucketizer

# Bucket boundaries; the infinite end points leave the outer buckets unbounded.
splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

# One single-column row per sample value.
data = [(value,) for value in (-999.9, -0.5, -0.3, 0.0, 0.2, 999.9)]
dataFrame = spark.createDataFrame(data, ["features"])

bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

# Replace each value with the index of the bucket it falls into.
bucketedData = bucketizer.transform(dataFrame)

# The number of buckets is one less than the number of split points.
bucket_count = len(bucketizer.getSplits()) - 1
print("Bucketizer output with %d buckets" % bucket_count)
bucketedData.show()

Bucketizer output with 4 buckets
+--------+----------------+
|features|bucketedFeatures|
+--------+----------------+
|  -999.9|             0.0|
|    -0.5|             1.0|
|    -0.3|             1.0|
|     0.0|             2.0|
|     0.2|             2.0|
|   999.9|             3.0|
+--------+----------------+



In [31]:
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

# Sample dense vectors (the original note says sparse vectors work as well).
data = [
    (Vectors.dense(values),)
    for values in ([1.0, 2.0, 3.0], [4.0, 5.0, 6.0])
]
df = spark.createDataFrame(data, ["vector"])

# Multiply every input vector element-wise by a fixed scaling vector.
scaling_vector = Vectors.dense([0.0, 1.0, 2.0])
transformer = ElementwiseProduct(
    scalingVec=scaling_vector,
    inputCol="vector",
    outputCol="transformedVector",
)

# Batch-transform the vectors into the new column and display the result.
transformer.transform(df).show()

+-------------+-----------------+
|       vector|transformedVector|
+-------------+-----------------+
|[1.0,2.0,3.0]|    [0.0,2.0,6.0]|
|[4.0,5.0,6.0]|   [0.0,5.0,12.0]|
+-------------+-----------------+



In [32]:
from pyspark.ml.feature import Imputer

# Two numeric columns in which NaN marks a missing value.
NAN = float("nan")
df = spark.createDataFrame(
    [
        (1.0, NAN),
        (2.0, NAN),
        (NAN, 3.0),
        (4.0, 4.0),
        (5.0, 5.0),
    ],
    ["a", "b"],
)

# Fit an Imputer with default settings; the filled values go into separate
# output columns so the original columns stay visible for comparison.
imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
model = imputer.fit(df)

model.transform(df).show()

+---+---+-----+-----+
|  a|  b|out_a|out_b|
+---+---+-----+-----+
|1.0|NaN|  1.0|  4.0|
|2.0|NaN|  2.0|  4.0|
|NaN|3.0|  3.0|  3.0|
|4.0|4.0|  4.0|  4.0|
|5.0|5.0|  5.0|  5.0|
+---+---+-----+-----+



In [38]:
# (id, sentence, sentiment) rows used by the following cells.
labelled_sentences = [
    (0, "Hi I think pyspark is cool ", "happy"),
    (1, "All I want is a pyspark cluster", "indifferent"),
    (2, "I finally understand how ML works", "Fulfilled"),
    (3, "Yet another sentence about pyspark and ML", "indifferent"),
    (4, "Why didn't I know about mllib before", "sad"),
    (5, "Yes, I can", "happy"),
]
sentence_data_frame = spark.createDataFrame(
    labelled_sentences,
    ["id", "sentence", "sentiment"],
)

sentence_data_frame.show()

+---+--------------------+-----------+
| id|            sentence|  sentiment|
+---+--------------------+-----------+
|  0|Hi I think pyspar...|      happy|
|  1|All I want is a p...|indifferent|
|  2|I finally underst...|  Fulfilled|
|  3|Yet another sente...|indifferent|
|  4|Why didn't I know...|        sad|
|  5|          Yes, I can|      happy|
+---+--------------------+-----------+



In [None]:
from pyspark.ml.feature import Tokenizer

# Add a "words" array column by tokenizing the "sentence" column.
tokenizer = Tokenizer(outputCol="words", inputCol="sentence")
tokenized = tokenizer.transform(sentence_data_frame)

# Show full cell contents instead of the default truncated view.
tokenized.show(truncate=False)

+---+-----------------------------------------+-----------+-------------------------------------------------+
|id |sentence                                 |sentiment  |words                                            |
+---+-----------------------------------------+-----------+-------------------------------------------------+
|0  |Hi I think pyspark is cool               |happy      |[hi, i, think, pyspark, is, cool]                |
|1  |All I want is a pyspark cluster          |indifferent|[all, i, want, is, a, pyspark, cluster]          |
|2  |I finally understand how ML works        |Fulfilled  |[i, finally, understand, how, ml, works]         |
|3  |Yet another sentence about pyspark and ML|indifferent|[yet, another, sentence, about, pyspark, and, ml]|
|4  |Why didn't I know about mllib before     |sad        |[why, didn't, i, know, about, mllib, before]     |
|5  |Yes, I can                               |happy      |[yes,, i, can]                                   |
+---+-----

In [39]:
from pyspark.ml.feature import StopWordsRemover

# Filter stop words out of the "words" column into "meaningful_words".
remover = StopWordsRemover(outputCol="meaningful_words", inputCol="words")
meaningful_data_frame = remover.transform(tokenized)

# Compare the token lists before and after filtering.
before_and_after = meaningful_data_frame.select("words", "meaningful_words")
before_and_after.show(truncate=False)

25/10/04 19:06:14 WARN StopWordsRemover: Default locale set was [en_GT]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


+-------------------------------------------------+-------------------------------------+
|words                                            |meaningful_words                     |
+-------------------------------------------------+-------------------------------------+
|[hi, i, think, pyspark, is, cool]                |[hi, think, pyspark, cool]           |
|[all, i, want, is, a, pyspark, cluster]          |[want, pyspark, cluster]             |
|[i, finally, understand, how, ml, works]         |[finally, understand, ml, works]     |
|[yet, another, sentence, about, pyspark, and, ml]|[yet, another, sentence, pyspark, ml]|
|[why, didn't, i, know, about, mllib, before]     |[know, mllib]                        |
|[yes,, i, can]                                   |[yes,]                               |
+-------------------------------------------------+-------------------------------------+



In [40]:
from pyspark.ml.feature import StringIndexer

# Encode each distinct "sentiment" string as a numeric "categoryIndex".
indexer = StringIndexer(inputCol="sentiment", outputCol="categoryIndex")

# Fitting learns the label-to-index mapping; transform then applies it.
indexer_model = indexer.fit(meaningful_data_frame)
indexed = indexer_model.transform(meaningful_data_frame)
indexed.show()

+---+--------------------+-----------+--------------------+--------------------+-------------+
| id|            sentence|  sentiment|               words|    meaningful_words|categoryIndex|
+---+--------------------+-----------+--------------------+--------------------+-------------+
|  0|Hi I think pyspar...|      happy|[hi, i, think, py...|[hi, think, pyspa...|          0.0|
|  1|All I want is a p...|indifferent|[all, i, want, is...|[want, pyspark, c...|          1.0|
|  2|I finally underst...|  Fulfilled|[i, finally, unde...|[finally, underst...|          2.0|
|  3|Yet another sente...|indifferent|[yet, another, se...|[yet, another, se...|          1.0|
|  4|Why didn't I know...|        sad|[why, didn't, i, ...|       [know, mllib]|          3.0|
|  5|          Yes, I can|      happy|      [yes,, i, can]|              [yes,]|          0.0|
+---+--------------------+-----------+--------------------+--------------------+-------------+



In [41]:
# Bare expression: the notebook displays the DataFrame's repr (column names and types), not its rows.
indexed

DataFrame[id: bigint, sentence: string, sentiment: string, words: array<string>, meaningful_words: array<string>, categoryIndex: double]

In [44]:
# One row per sentence id with a numeric value for each sentiment column.
sentiment_rows = [
    (0, 0.01, 0.43, 0.3, 0.5),
    (1, 0.097, 0.21, 0.2, 0.9),
    (2, 0.4, 0.329, 0.97, 0.4),
    (3, 0.7, 0.4, 0.3, 0.87),
    (4, 0.34, 0.4, 0.3, 0.78),
    (5, 0.1, 0.3, 0.31, 0.29),
]
sentiment_columns = ["sentence_id", "happy", "indifferent", "Fulfilled", "sad"]
sentiment_data_frame = spark.createDataFrame(sentiment_rows, sentiment_columns)

# Bare expression: displays the DataFrame repr (column names and types).
sentiment_data_frame

DataFrame[sentence_id: bigint, happy: double, indifferent: double, Fulfilled: double, sad: double]

In [43]:
# Select only the "happy" column, cast to double through a SQL expression.
cast_expression = "cast(happy as double)"
casted_data_frame = sentiment_data_frame.selectExpr(cast_expression)

casted_data_frame.show()

+-----+
|happy|
+-----+
| 0.01|
|0.097|
|  0.4|
|  0.7|
| 0.34|
|  0.1|
+-----+



In [45]:
# Print the column names, types and nullability of sentiment_data_frame as a tree.
sentiment_data_frame.printSchema()

root
 |-- sentence_id: long (nullable = true)
 |-- happy: double (nullable = true)
 |-- indifferent: double (nullable = true)
 |-- Fulfilled: double (nullable = true)
 |-- sad: double (nullable = true)



In [46]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# (label, sentence) rows to featurize.
labelled_sentences = [
    (0.0, "Hi I heard about Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic regression models are neat"),
]
sentenceData = spark.createDataFrame(labelled_sentences, ["label", "sentence"])

# Step 1: split each sentence into words.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# Step 2: hash the words into a 20-slot term-frequency vector.
# (CountVectorizer is an alternative way to obtain term-frequency vectors.)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

# Step 3: fit IDF on the term frequencies and rescale them.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.show()

+-----+--------------------+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(20,[6,8,13,16],[...|(20,[6,8,13,16],[...|
|  0.0|I wish Java could...|[i, wish, java, c...|(20,[0,2,7,13,15,...|(20,[0,2,7,13,15,...|
|  1.0|Logistic regressi...|[logistic, regres...|(20,[3,4,6,11,19]...|(20,[3,4,6,11,19]...|
+-----+--------------------+--------------------+--------------------+--------------------+



In [48]:
from pyspark.ml import Pipeline

# Run the same three steps as a single Pipeline. `tokenizer`, `hashingTF`,
# `idf` and `sentenceData` are not created in this cell; they must already
# exist in the kernel.
tfidf_stages = [tokenizer, hashingTF, idf]
pipeline = Pipeline(stages=tfidf_stages)

model = pipeline.fit(sentenceData)
rescaledData = model.transform(sentenceData)

rescaledData.show()

+-----+--------------------+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(20,[6,8,13,16],[...|(20,[6,8,13,16],[...|
|  0.0|I wish Java could...|[i, wish, java, c...|(20,[0,2,7,13,15,...|(20,[0,2,7,13,15,...|
|  1.0|Logistic regressi...|[logistic, regres...|(20,[3,4,6,11,19]...|(20,[3,4,6,11,19]...|
+-----+--------------------+--------------------+--------------------+--------------------+

