In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('nlp').getOrCreate()

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [4]:
sentence_df = spark.createDataFrame([
    (0, 'Hi I heard about Spark'),
    (1, 'I wish Java could use case classes'),
    (2, 'Logistic,regression,models,are,neat'),
    (3, 'I saw the red balloon'),
    (4, 'Mary had a little lamb')
], ['id', 'sentence'])

In [5]:
sentence_df.show(truncate=False)

+---+-----------------------------------+
|id |sentence                           |
+---+-----------------------------------+
|0  |Hi I heard about Spark             |
|1  |I wish Java could use case classes |
|2  |Logistic,regression,models,are,neat|
|3  |I saw the red balloon              |
|4  |Mary had a little lamb             |
+---+-----------------------------------+



In [6]:
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
regexTokenizer = RegexTokenizer(inputCol='sentence', outputCol='words', pattern='\\W')

countTokens = udf(lambda words: len(words), IntegerType())

In [7]:
tokenized_df = tokenizer.transform(sentence_df)
regex_tokenized_df = regexTokenizer.transform(sentence_df)

In [8]:
tokenized_df.withColumn('tokens', countTokens(col('words'))).show(truncate=False)
regex_tokenized_df.withColumn('tokens', countTokens(col('words'))).show(truncate=False)

+---+-----------------------------------+------------------------------------------+------+
|id |sentence                           |words                                     |tokens|
+---+-----------------------------------+------------------------------------------+------+
|0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |5     |
|1  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|7     |
|2  |Logistic,regression,models,are,neat|[logistic,regression,models,are,neat]     |1     |
|3  |I saw the red balloon              |[i, saw, the, red, balloon]               |5     |
|4  |Mary had a little lamb             |[mary, had, a, little, lamb]              |5     |
+---+-----------------------------------+------------------------------------------+------+

+---+-----------------------------------+------------------------------------------+------+
|id |sentence                           |words                                 

In [9]:
from pyspark.ml.feature import StopWordsRemover

In [10]:
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
remover.transform(regex_tokenized_df).show(truncate=False)


+---+-----------------------------------+------------------------------------------+------------------------------------+
|id |sentence                           |words                                     |filtered                            |
+---+-----------------------------------+------------------------------------------+------------------------------------+
|0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |[hi, heard, spark]                  |
|1  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|[wish, java, use, case, classes]    |
|2  |Logistic,regression,models,are,neat|[logistic, regression, models, are, neat] |[logistic, regression, models, neat]|
|3  |I saw the red balloon              |[i, saw, the, red, balloon]               |[saw, red, balloon]                 |
|4  |Mary had a little lamb             |[mary, had, a, little, lamb]              |[mary, little, lamb]                |
+---+-------------------

In [11]:
from pyspark.ml.feature import NGram

In [12]:
ngram = NGram(n=2, inputCol='words', outputCol='ngrams')

ngram.transform(regex_tokenized_df).show(truncate=False)

+---+-----------------------------------+------------------------------------------+------------------------------------------------------------------+
|id |sentence                           |words                                     |ngrams                                                            |
+---+-----------------------------------+------------------------------------------+------------------------------------------------------------------+
|0  |Hi I heard about Spark             |[hi, i, heard, about, spark]              |[hi i, i heard, heard about, about spark]                         |
|1  |I wish Java could use case classes |[i, wish, java, could, use, case, classes]|[i wish, wish java, java could, could use, use case, case classes]|
|2  |Logistic,regression,models,are,neat|[logistic, regression, models, are, neat] |[logistic regression, regression models, models are, are neat]    |
|3  |I saw the red balloon              |[i, saw, the, red, balloon]               |[i s

In [13]:
from pyspark.ml.feature import HashingTF, IDF

In [14]:
hashing_TF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=20)
featurized_df = hashing_TF.transform(regex_tokenized_df)

idf = IDF(inputCol='rawFeatures', outputCol='features')
idf_model = idf.fit(featurized_df)
rescaled_data = idf_model.transform(featurized_df)

In [15]:
rescaled_data.select('sentence', 'rawFeatures', 'features').show(truncate=100)

+-----------------------------------+-----------------------------------------------+----------------------------------------------------------------------------------------------------+
|                           sentence|                                    rawFeatures|                                                                                            features|
+-----------------------------------+-----------------------------------------------+----------------------------------------------------------------------------------------------------+
|             Hi I heard about Spark|             (20,[6,8,13,16],[1.0,1.0,1.0,2.0])|      (20,[6,8,13,16],[0.6931471805599453,1.0986122886681098,0.6931471805599453,0.8109302162163288])|
| I wish Java could use case classes|(20,[0,2,7,13,15,16],[1.0,1.0,2.0,1.0,1.0,1.0])|(20,[0,2,7,13,15,16],[0.6931471805599453,1.0986122886681098,1.3862943611198906,0.6931471805599453...|
|Logistic,regression,models,are,neat|       (20,[3,4,6,11,19],[1.

In [16]:
from pyspark.ml.feature import CountVectorizer

In [17]:
df = spark.createDataFrame([
    (0, 'a b c'.split()),
    (1, 'a b b c a'.split())
], ['id', 'words'])

cv = CountVectorizer(inputCol='words', outputCol='features', vocabSize=3, minDF=2.0)

model = cv.fit(df)

result = model.transform(df)
result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+



In [18]:
sms_spam = spark.read.csv('../data/SMSSpamCollection.tsv', inferSchema=True, sep='\t')

In [19]:
sms_spam.printSchema()
sms_spam.describe().show()
sms_spam.show(5)

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)

+-------+----+--------------------+
|summary| _c0|                 _c1|
+-------+----+--------------------+
|  count|5574|                5574|
|   mean|null|               645.0|
| stddev|null|                null|
|    min| ham| &lt;#&gt;  in mc...|
|    max|spam|… we r stayin her...|
+-------+----+--------------------+

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



In [20]:
from pyspark.sql.functions import length

In [21]:
sms_spam = sms_spam\
    .withColumnRenamed('_c0','class')\
    .withColumnRenamed('_c1','text')
sms_spam = sms_spam.withColumn('length', length(sms_spam['text']))

In [22]:
sms_spam.show(5)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
+-----+--------------------+------+
only showing top 5 rows



In [23]:
sms_spam.groupBy('class').agg({'length': 'mean', 'class': 'count'}).show()

+-----+-----------------+------------+
|class|      avg(length)|count(class)|
+-----+-----------------+------------+
|  ham|71.45431945307645|        4827|
| spam|138.6706827309237|         747|
+-----+-----------------+------------+



In [24]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, VectorAssembler

ham_spam_to_num = StringIndexer(inputCol='class',outputCol='label')

tokenizer = Tokenizer(inputCol='text', outputCol='tokens')
stopremove = StopWordsRemover(inputCol='tokens',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='count_vec')
idf = IDF(inputCol='count_vec', outputCol='tf_idf')
assembler = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

In [25]:
from pyspark.ml.classification import NaiveBayes

In [26]:
clf = NaiveBayes()

In [27]:
from pyspark.ml import Pipeline

In [28]:
ppln = Pipeline(stages=[
    tokenizer, stopremove, count_vec, idf, assembler, clf
])

In [29]:
sms_spam = ham_spam_to_num.fit(sms_spam).transform(sms_spam)
sms_spam.show(5)

+-----+--------------------+------+-----+
|class|                text|length|label|
+-----+--------------------+------+-----+
|  ham|Go until jurong p...|   111|  0.0|
|  ham|Ok lar... Joking ...|    29|  0.0|
| spam|Free entry in 2 a...|   155|  1.0|
|  ham|U dun say so earl...|    49|  0.0|
|  ham|Nah I don't think...|    61|  0.0|
+-----+--------------------+------+-----+
only showing top 5 rows



In [30]:
train_sms, test_sms = sms_spam.randomSplit([.7, .8], seed=42)

In [31]:
spam_model = ppln.fit(train_sms)

predictions = spam_model.transform(test_sms)

In [32]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [33]:
predictions.show(5)

+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|class|                text|length|label|              tokens|         stop_tokens|           count_vec|              tf_idf|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|  ham| &lt;#&gt;  in mc...|    36|  0.0|[, &lt;#&gt;, , i...|[, &lt;#&gt;, , m...|(8490,[4,13,6183]...|(8490,[4,13,6183]...|(8491,[4,13,6183,...|[-156.87817715538...|[1.0,1.0617857285...|       0.0|
|  ham| &lt;#&gt;  mins ...|    51|  0.0|[, &lt;#&gt;, , m...|[, &lt;#&gt;, , m...|(8490,[4,13,41,23...|(8490,[4,13,41,23...|(8491,[4,13,41,23...|[-308.06882545624...|[1.0,4.4088653034...|       0.0|


In [34]:
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction')

for metric in ['accuracy', 'weightedPrecision', 'weightedRecall', 'weightedFMeasure']:
    m = evaluator.setMetricName(metric).evaluate(predictions)
    print(f'{metric}: {m}')

accuracy: 0.9744811160258592
weightedPrecision: 0.974842530521143
weightedRecall: 0.9744811160258591
weightedFMeasure: 0.9746342366567277
